# Interpretable ML - Linear Regression
> Demo for Interpretable ML - Linear Regression

- toc:true
- branch: master
- badges: true
- comments: true
- author: Han Wu
- categories: [jupyter]

In [215]:
import pandas as pd

In [216]:
import numpy as np
np.set_printoptions(suppress=True)

# Read Dataset

In [217]:
bike_day = pd.read_csv("dataset/bike/day.csv")

In [218]:
bike_day.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


# Feature Engineering

### Numerical Features

In [219]:
bike_day.temp = bike_day.temp * (39 - (-8)) + (-8)
bike_day.atemp = bike_day.atemp * (50 - (16)) + (16)
bike_day.windspeed = bike_day.windspeed * 67
bike_day.hum = bike_day.hum * 100

### Datetime

In [220]:
bike_start_day = pd.to_datetime('2011-01-01')
bike_day['dteday'] = pd.to_datetime(bike_day['dteday'])
bike_day['days_since_2011'] = (bike_day.dteday - bike_start_day).dt.days

In [221]:
selected_features = ["temp", "atemp", "hum", "windspeed", "days_since_2011"]
X = bike_day[selected_features]
y = bike_day.cnt

### Categorical Features

Year

In [222]:
bike_day.yr[bike_day.loc[:, 'yr'] == 0] = '2011'
bike_day.yr[bike_day.loc[:, 'yr'] == 1] = '2012'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.yr[bike_day.loc[:, 'yr'] == 0] = '2011'


In [223]:
X = X.join(pd.get_dummies(bike_day.yr))

Season

In [224]:
bike_day.season[bike_day.season == 1] = 'SPRING'
bike_day.season[bike_day.season == 2] = 'SUMMER' 
bike_day.season[bike_day.season == 3] = 'FALL' 
bike_day.season[bike_day.season == 4] = 'WINTER' 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.season[bike_day.season == 1] = 'SPRING'


In [225]:
X = X.join(pd.get_dummies(bike_day.season))

Weather

In [226]:
bike_day.weathersit[bike_day.weathersit == 1] = 'GOOD'
bike_day.weathersit[bike_day.weathersit == 2] = 'MISTY'
bike_day.weathersit[bike_day.weathersit == 3] = 'RAIN/SNOW/STORM'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.weathersit[bike_day.weathersit == 1] = 'GOOD'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.weathersit[bike_day.weathersit == 2] = 'MISTY'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.weathersit[bike_day.weathersit == 3] = 'RAIN/SNOW/STORM'


In [227]:
X = X.join(pd.get_dummies(bike_day.weathersit))

Month

In [228]:
bike_day.mnth[bike_day.mnth == 1] = 'JAN'
bike_day.mnth[bike_day.mnth == 2] = 'FEB'
bike_day.mnth[bike_day.mnth == 3] = 'MAR'
bike_day.mnth[bike_day.mnth == 4] = 'APR'
bike_day.mnth[bike_day.mnth == 5] = 'MAY'
bike_day.mnth[bike_day.mnth == 6] = 'JUN'
bike_day.mnth[bike_day.mnth == 7] = 'JUL'
bike_day.mnth[bike_day.mnth == 8] = 'AUG'
bike_day.mnth[bike_day.mnth == 9] = 'SEP'
bike_day.mnth[bike_day.mnth == 10] = 'OCT'
bike_day.mnth[bike_day.mnth == 11] = 'NOV'
bike_day.mnth[bike_day.mnth == 12] = 'DEC'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bike_day.mnth[bike_day.mnth == 1] = 'JAN'


In [229]:
X = X.join(pd.get_dummies(bike_day.mnth))

# Train test split

In [230]:
from sklearn.model_selection import train_test_split

In [231]:
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(), y.to_numpy(), train_size = 0.7, random_state = 0)

# Training

In [232]:
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

Ridge

In [233]:
rr = Ridge()

In [234]:
rr.fit(X_train, y_train)

Ridge()

Lasso

In [235]:
lr = Lasso(normalize=True)

In [236]:
lr.fit(X_train, y_train)

Lasso(normalize=True)

# Metrics

In [237]:
from sklearn.metrics import mean_squared_error

Ridge

In [238]:
y_pred_r = rr.predict(X_test)

In [239]:
mean_squared_error(y_test, y_pred_r)

650885.6073230483

In [240]:
X.columns

Index(['temp', 'atemp', 'hum', 'windspeed', 'days_since_2011', '2011', '2012',
       'FALL', 'SPRING', 'SUMMER', 'WINTER', 'GOOD', 'MISTY',
       'RAIN/SNOW/STORM', 'APR', 'AUG', 'DEC', 'FEB', 'JAN', 'JUL', 'JUN',
       'MAR', 'MAY', 'NOV', 'OCT', 'SEP'],
      dtype='object')

In [241]:
rr.coef_

array([  88.56269292,   26.35233461,  -20.35055709,  -50.15770336,
          0.11737953, -931.02767102,  931.02767102,  129.9776705 ,
       -858.75619916,   52.74840848,  676.03012019,  633.50374363,
        356.83084875, -990.33459238,   77.61987747, -103.53517043,
       -229.15948447,  -66.61631165, -164.93492354, -531.25800384,
       -128.86883146,  332.58997771,  377.06743755, -389.4554859 ,
        245.35738634,  581.19353222])

In [242]:
rr.intercept_

3670.8640712920055

Lasso

In [243]:
y_pred_l = lr.predict(X_test)

In [244]:
mean_squared_error(y_test, y_pred_l)

659705.342130416

In [245]:
lr.coef_

array([   75.4720349 ,    33.18166704,   -16.06025245,   -43.72312824,
           0.        , -1891.66068971,     0.        ,    -0.        ,
       -1045.36798995,     0.        ,   240.19341828,   283.91219444,
          -0.        , -1414.16952065,     0.        ,     0.        ,
          -0.        ,    -0.        ,  -116.21917705,  -301.81734998,
          -0.        ,   251.78401898,   273.0091002 ,   -13.95883721,
         504.19655444,   651.96621325])

In [246]:
lr.intercept_

4739.781565530179

# Plot

In [276]:
import altair as alt

In [277]:
lr_pred_df = pd.DataFrame({'day': X_test[:, 4],
                          'y_test': y_test,
                          'y_pred': y_pred_l,
                          })

In [278]:
lr_pred_df = lr_pred_df.melt('day', var_name='data', value_name='value')

In [279]:
scales = alt.selection_interval(bind='scales')
alt.Chart(lr_pred_df).mark_line(point=True).encode(
    x = 'day:Q',
    y = 'value:Q',
    color='data:N',
    tooltip=[ 'data:N', 'value:N']
).add_selection(
    scales
).properties(width=800)