In [3]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

import pickle

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

In [4]:
def read_dataframe(filename = './data/green_tripdata_2021-01.parquet'):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps performed:

    1. Read the parquet file at ``filename``.
    2. Parse ``lpep_pickup_datetime`` and ``lpep_dropoff_datetime`` as datetimes.
    3. Add a ``duration`` column: drop-off minus pick-up, in minutes.
    4. Keep only rows whose duration is between 1 and 60 minutes (inclusive).
    5. Cast ``PULocationID`` and ``DOLocationID`` to ``str`` so they can be
       used as categorical features.

    Parameters
    ----------
    filename : str
        Path to the parquet file. Defaults to the 2021-01 file under ``./data``.

    Returns
    -------
    pandas.DataFrame
        A copy of the filtered rows, including the new ``duration`` column.
        The original index labels of the surviving rows are kept.
    """
    df = pd.read_parquet(filename)

    for column in ('lpep_pickup_datetime', 'lpep_dropoff_datetime'):
        df[column] = pd.to_datetime(df[column])

    # Use the vectorized ``.dt`` accessor instead of a per-row Python lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # ``between`` is inclusive on both ends, i.e. 1 <= duration <= 60.
    # ``.copy()`` detaches the result so the assignment below does not write
    # into a slice of the unfiltered frame.
    df = df[df['duration'].between(1, 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
# One month is used for fitting, the following month is held out for validation.
df_train = read_dataframe('./data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('./data/green_tripdata_2021-02.parquet')

In [6]:
# Build a combined pickup/drop-off feature of the form '<PULocationID>_<DOLocationID>'.
# Both ID columns are strings at this point (read_dataframe casts them),
# so `+` concatenates rather than adds.
for trips in (df_train, df_val):
    trips['PU_DU'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [7]:
# Features: the combined pickup/drop-off pair as the only categorical input
# (the individual location IDs are not used separately) plus the trip distance.
categorical = ['PU_DU']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# DictVectorizer consumes one dict per row.
train_dict = df_train[feature_columns].to_dict(orient='records')
val_dict = df_val[feature_columns].to_dict(orient='records')

# Fit the vectorizer on the training month only; the validation month is
# encoded with the already-fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [8]:
# Regression target: the `duration` column (minutes) added by read_dataframe.
target = 'duration'
y_train, y_val = (trips[target].values for trips in (df_train, df_val))

In [9]:
# Baseline model: LinearRegression with default settings on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# The reported error is the *root* mean squared error, so it is labelled RMSE.
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword is deprecated in recent scikit-learn releases and
# raises a TypeError once removed.
print('train')
y_train_hat = lr.predict(X_train)
print("RMSE", mean_squared_error(y_train, y_train_hat) ** 0.5)
print("R2", r2_score(y_train, y_train_hat))

print('val')
y_val_hat = lr.predict(X_val)
print("RMSE", mean_squared_error(y_val, y_val_hat) ** 0.5)
print("R2", r2_score(y_val, y_val_hat))

train
MSE 4.640808516968527
R2 0.8389206469633814
val
MSE 7.479562160810692
R2 0.622429782552913


In [None]:
# Store the fitted vectorizer together with the model so both can be loaded as
# one artifact. NOTE: `lr` is rebound by later cells (Lasso, Ridge), so this
# cell must run right after the LinearRegression cell to save the right model.
artifact = (dv, lr)
with open('models/lin_reg.bin', 'wb') as model_file:
    pickle.dump(artifact, model_file)
    

In [11]:
# Lasso with scikit-learn's default hyperparameters, same features as before.
# NOTE: this rebinds `lr`, replacing the previously fitted model object.
lr = Lasso()
lr.fit(X_train, y_train)

# The reported error is the *root* mean squared error, so it is labelled RMSE.
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword is deprecated in recent scikit-learn releases and
# raises a TypeError once removed.
print('train')
y_train_hat = lr.predict(X_train)
print("RMSE", mean_squared_error(y_train, y_train_hat) ** 0.5)
print("R2", r2_score(y_train, y_train_hat))

print('val')
y_val_hat = lr.predict(X_val)
print("RMSE", mean_squared_error(y_val, y_val_hat) ** 0.5)
print("R2", r2_score(y_val, y_val_hat))

train
MSE 11.562050466293025
R2 0.00017889723988795403
val
MSE 12.212583224318818
R2 -0.0066081699465667665


In [12]:
# Store the fitted vectorizer together with the Lasso model as one artifact.
# `lr` refers to whichever model was fitted most recently.
artifact = (dv, lr)
with open('models/lasso.bin', 'wb') as model_file:
    pickle.dump(artifact, model_file)
    

In [13]:
# Ridge with scikit-learn's default hyperparameters, same features as before.
# NOTE: this rebinds `lr`, replacing the previously fitted model object.
lr = Ridge()
lr.fit(X_train, y_train)

# The reported error is the *root* mean squared error, so it is labelled RMSE.
# The square root is taken explicitly instead of passing `squared=False`,
# because that keyword is deprecated in recent scikit-learn releases and
# raises a TypeError once removed.
print('train')
y_train_hat = lr.predict(X_train)
print("RMSE", mean_squared_error(y_train, y_train_hat) ** 0.5)
print("R2", r2_score(y_train, y_train_hat))

print('val')
y_val_hat = lr.predict(X_val)
print("RMSE", mean_squared_error(y_val, y_val_hat) ** 0.5)
print("R2", r2_score(y_val, y_val_hat))

train
MSE 10.712682923244786
R2 0.14168030268953524
val
MSE 11.342603943250333
R2 0.13169777481020617


In [14]:
# Store the fitted vectorizer together with the Ridge model as one artifact.
# `lr` refers to whichever model was fitted most recently.
artifact = (dv, lr)
with open('models/ridge.bin', 'wb') as model_file:
    pickle.dump(artifact, model_file)
    