In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from statsmodels.formula.api import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")

In [144]:
# read in the sampled data
sample = pd.read_csv('sample.csv').drop('index', axis='columns')
sample = sample.drop(sample.columns[0], axis=1)
sample

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,...,DATE,AWND,PRCP,SNOW,SNWD,TAVG,bad weather,time,T_hour,type
0,2,2018-01-01 01:30:44,2018-01-01 01:53:15,2,17.1300,2,N,132,137,1,...,01-01,7.8300,0.0000,0.0000,0.0000,-10.5556,N,22.5167,1,holiday
1,2,2018-01-01 01:41:09,2018-01-01 02:27:03,1,30.9400,5,N,164,265,1,...,01-01,7.8300,0.0000,0.0000,0.0000,-10.5556,N,45.9000,1,holiday
2,1,2018-01-01 04:29:38,2018-01-01 04:55:48,3,9.1000,1,N,230,17,1,...,01-01,7.8300,0.0000,0.0000,0.0000,-10.5556,N,26.1667,4,holiday
3,2,2018-01-01 04:54:15,2018-01-01 05:18:48,1,16.4100,2,N,162,132,1,...,01-01,7.8300,0.0000,0.0000,0.0000,-10.5556,N,24.5500,4,holiday
4,1,2018-01-01 05:43:37,2018-01-01 06:11:02,2,17.7000,2,N,163,132,1,...,01-01,7.8300,0.0000,0.0000,0.0000,-10.5556,N,27.4167,5,holiday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22517,2,2018-06-30 23:00:22,2018-06-30 23:39:20,1,18.5800,2,N,132,262,1,...,06-30,2.2400,0.0000,0.0000,0.0000,28.0556,N,38.9667,23,weekday
22518,2,2018-06-30 23:03:18,2018-06-30 23:50:55,1,17.7800,2,N,132,100,1,...,06-30,2.2400,0.0000,0.0000,0.0000,28.0556,N,47.6167,23,weekday
22519,1,2018-06-30 23:08:17,2018-06-30 23:49:47,1,21.5000,2,N,132,233,1,...,06-30,2.2400,0.0000,0.0000,0.0000,28.0556,N,41.5000,23,weekday
22520,2,2018-06-30 23:21:35,2018-06-30 23:41:42,1,6.9300,1,N,138,41,1,...,06-30,2.2400,0.0000,0.0000,0.0000,28.0556,N,20.1167,23,weekday


In [145]:
# calc the ols of the selected features
COL_FILTER = ['passenger_count', 'trip_distance', 'fare_amount', "tip_amount", "total_amount", "tolls_amount", "time"]
filtered = sample.loc[:,COL_FILTER].reset_index(drop=True)
filtered

Unnamed: 0,passenger_count,trip_distance,fare_amount,tip_amount,total_amount,tolls_amount,time
0,2,17.1300,52.0000,11.7100,70.2700,5.7600,22.5167
1,1,30.9400,115.0000,24.3100,145.8700,5.7600,45.9000
2,3,9.1000,29.0000,1.0000,37.0600,5.7600,26.1667
3,1,16.4100,52.0000,14.6400,73.2000,5.7600,24.5500
4,2,17.7000,52.0000,2.0000,60.5600,5.7600,27.4167
...,...,...,...,...,...,...,...
22517,1,18.5800,52.0000,11.7100,70.2700,5.7600,38.9667
22518,1,17.7800,52.0000,5.0000,63.5600,5.7600,47.6167
22519,1,21.5000,52.0000,14.6000,73.1600,5.7600,41.5000
22520,1,6.9300,22.0000,5.8100,34.8700,5.7600,20.1167


In [146]:
# get the ols regession results
fit = ols(formula="tip_amount ~ passenger_count + trip_distance + fare_amount + total_amount+ tolls_amount + time",
         data=filtered).fit()
print(fit.summary())

                            OLS Regression Results                            
Dep. Variable:             tip_amount   R-squared:                       0.940
Model:                            OLS   Adj. R-squared:                  0.940
Method:                 Least Squares   F-statistic:                 5.917e+04
Date:                Sun, 15 Aug 2021   Prob (F-statistic):               0.00
Time:                        15:40:13   Log-Likelihood:                -31363.
No. Observations:               22522   AIC:                         6.274e+04
Df Residuals:                   22515   BIC:                         6.280e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -1.0323      0.027    -

In [156]:
# calc the ols with the feature passenger count removed
COL_FILTER2 = ['trip_distance', 'fare_amount', "tip_amount", "total_amount", "tolls_amount", "time"]
filtered2 = sample.loc[:,COL_FILTER2].reset_index(drop=True)
filtered2

Unnamed: 0,trip_distance,fare_amount,tip_amount,total_amount,tolls_amount,time
0,17.1300,52.0000,11.7100,70.2700,5.7600,22.5167
1,30.9400,115.0000,24.3100,145.8700,5.7600,45.9000
2,9.1000,29.0000,1.0000,37.0600,5.7600,26.1667
3,16.4100,52.0000,14.6400,73.2000,5.7600,24.5500
4,17.7000,52.0000,2.0000,60.5600,5.7600,27.4167
...,...,...,...,...,...,...
22517,18.5800,52.0000,11.7100,70.2700,5.7600,38.9667
22518,17.7800,52.0000,5.0000,63.5600,5.7600,47.6167
22519,21.5000,52.0000,14.6000,73.1600,5.7600,41.5000
22520,6.9300,22.0000,5.8100,34.8700,5.7600,20.1167


In [157]:
# get the ols regession results
fitter = ols(formula="tip_amount ~ trip_distance + fare_amount + + tolls_amount + total_amount + time",
         data=filtered2).fit()
print(fitter.summary())

                            OLS Regression Results                            
Dep. Variable:             tip_amount   R-squared:                       0.940
Model:                            OLS   Adj. R-squared:                  0.940
Method:                 Least Squares   F-statistic:                 7.100e+04
Date:                Sun, 15 Aug 2021   Prob (F-statistic):               0.00
Time:                        15:55:16   Log-Likelihood:                -31364.
No. Observations:               22522   AIC:                         6.274e+04
Df Residuals:                   22516   BIC:                         6.279e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -1.0214      0.026    -39.673

In [155]:
# calc the aic and bic for the full and reduced model
[fit.aic, fitter.aic], [fit.bic, fitter.bic]

([62740.74199716929, 62744.12072311419],
 [62796.897732387086, 62784.231962555474])

## Ridge Regression

In [161]:
cleaned_data = pd.read_feather('cleaned_data.feather').drop('index', axis=1)
data = cleaned_data.loc[:,COL_FILTER2].reset_index(drop=True)
data

Unnamed: 0,trip_distance,fare_amount,tip_amount,total_amount,tolls_amount,time
0,18.1000,52.0000,14.6000,73.1600,5.7600,36.0667
1,8.6000,25.5000,6.5000,39.0600,5.7600,16.0333
2,17.7700,52.0000,11.7100,70.2700,5.7600,23.0833
3,11.9000,34.5000,15.3800,53.8200,2.6400,25.0333
4,19.5000,52.0000,10.0000,68.5600,5.7600,30.2833
...,...,...,...,...,...,...
2252205,7.6100,22.0000,7.2600,36.3200,5.7600,14.1000
2252206,17.9800,52.0000,11.7100,70.2700,5.7600,1407.5167
2252207,5.5700,17.0000,4.5000,28.5600,5.7600,13.2167
2252208,3.2000,13.0000,1.0000,21.0600,5.7600,14.3000


In [162]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from scipy.stats import zscore
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.model_selection import train_test_split
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV

In [163]:
yCOLS = ['tip_amount']
xCOLS = ['trip_distance', 'fare_amount', "total_amount", "tolls_amount", "time"]

# standardize the data has mean 0 and var 1
df_standard = data[xCOLS].astype(float).apply(zscore)

In [164]:
# get the train and test set 
dms = pd.get_dummies(df_standard[xCOLS])

y = data[yCOLS]

X_ = data.drop(['tip_amount', 'trip_distance', 'fare_amount', "total_amount", "tolls_amount", "time"], axis=1).astype('float64')

X = pd.concat([X_, dms[xCOLS]], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1801768, 5), (450442, 5), (1801768, 1), (450442, 1))

In [177]:
# find the optimal alpha by Grid Search
grid = dict()
grid['alpha'] = 10**np.linspace(10,-2,100)*0.5
ridge_model = Ridge()
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

search = GridSearchCV(ridge_model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

In [178]:
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('MAE: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

MAE: -0.623
Config: {'alpha': 0.005}


In [180]:
# fit the ridge_model use a random alpha 5
ridge_model = Ridge(alpha = 5).fit(X_train, y_train)

In [181]:
# get the coeficients of the features and intercept
ridge_model.coef_

array([[ -0.04295058, -11.8204076 ,  15.04053314,  -1.69413432,
         -0.02682257]])

In [182]:
ridge_model.intercept_

array([8.51192373])

In [183]:
y_pred = ridge_model.predict(X_train)

In [184]:
RMSE = np.sqrt(mean_squared_error(y_train, y_pred))
RMSE

0.9657807193364206

In [172]:
# get the RMSE for the test set
y_pred = ridge_model.predict(X_test)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
RMSE

0.9696541976041579

In [173]:
# fit the ridge_model use the tuned alpha 0.005
ridge_tuned = Ridge(alpha = 0.005).fit(X_train, y_train)
y_pred = ridge_tuned.predict(X_train)
RMSE2 = np.sqrt(mean_squared_error(y_train, y_pred))
RMSE2

0.965780663214377

In [174]:
# get the RMSE for the test set
y_pred_test = ridge_tuned.predict(X_test)
RMSE_test =np.sqrt(mean_squared_error(y_test, y_pred_test))
RMSE_test

0.9696549832181395

In [185]:
# get the coeficients of the features and intercept
ridge_tuned.coef_

array([[ -0.04287731, -11.82183965,  15.04198482,  -1.69429155,
         -0.02681546]])

In [186]:
ridge_tuned.intercept_

array([8.51192373])