### import

In [118]:
import os
import pickle

import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

### setup

In [106]:
# Input parquet files (green-taxi trip records, judging by the filenames).
# The 2022-01 file is used to fit the models, the 2022-02 file to evaluate them.
INFILE_TRAIN = '../data/green_tripdata_2022-01.parquet'
INFILE_TEST = '../data/green_tripdata_2022-02.parquet'
# Column the models predict; read_dataframe() creates it (trip duration in minutes).
TARGET = 'duration'

### read data

In [107]:
def read_dataframe(infile):
    """Load one trip-data parquet file and prepare it for modelling.

    Steps:
      1. keep only rows with ``trip_type == 1`` (street-hail trips; dispatch
         trips are ignored),
      2. add a ``duration`` column: drop-off minus pick-up time, in minutes,
      3. keep only trips with ``0 < duration <= 60``,
      4. cast ``PULocationID`` / ``DOLocationID`` to ``str`` so that a
         downstream DictVectorizer treats them as categories, not numbers.

    Parameters
    ----------
    infile : str or path-like
        Location of the parquet file, passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (original index labels preserved) with the extra
        ``duration`` column and string-typed location IDs.
    """
    trips = pd.read_parquet(infile)

    # keep street-hail trips (and ignore dispatch); copy so the column
    # assignments below act on an independent frame, not on a filtered slice
    trips = trips[trips['trip_type'] == 1].copy()

    # trip duration in minutes
    pickup = pd.to_datetime(trips['lpep_pickup_datetime'])
    dropoff = pd.to_datetime(trips['lpep_dropoff_datetime'])
    trips['duration'] = (dropoff - pickup).dt.total_seconds() / 60

    # discard non-positive durations and anything longer than one hour
    # (rows with a missing duration fail both comparisons and are dropped too)
    plausible = (trips['duration'] > 0) & (trips['duration'] <= 60)
    trips = trips[plausible].copy()

    cols_cat = ['PULocationID', 'DOLocationID']
    trips[cols_cat] = trips[cols_cat].astype(str)

    return trips

In [108]:
# Run both monthly files through the same cleaning routine (train first, then test).
df_train, df_test = (
    read_dataframe(path) for path in (INFILE_TRAIN, INFILE_TEST)
)

# Row counts after filtering, as a quick sanity check.
df_train.shape[0], df_test.shape[0]

(53119, 58344)

### build model

In [109]:
# Feature columns. They are defined here at notebook scope because the lists
# of the same name inside read_dataframe() are local to that function and are
# not visible to this cell on a fresh kernel (Restart & Run All -> NameError).
cols_cat = ['PULocationID', 'DOLocationID']
cols_num = ['trip_distance']

dv = DictVectorizer()

# Fit the vectorizer on the training rows only, then reuse it unchanged for
# the test rows so both matrices share the same feature columns.
train_dicts = df_train[cols_cat + cols_num].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train[TARGET].values

test_dicts = df_test[cols_cat + cols_num].to_dict(orient='records')
X_test = dv.transform(test_dicts)
y_test = df_test[TARGET].values

reg = LinearRegression()
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)

# RMSE on the test month (same unit as TARGET, i.e. minutes).
# The root is taken explicitly: the `squared=False` argument is deprecated
# in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred) ** 0.5

7.951400288803228

### try lasso

In [114]:
# Sweep the L1 penalty from strong to weak and print the test RMSE for each.
# NOTE: the loop rebinds `reg` and `y_pred`; once it finishes they refer to
# the last Lasso fit (alpha=0.001), not to the linear regression fitted earlier.
for alpha in [1, .1, .01, .001]:
    reg = Lasso(alpha=alpha)
    reg.fit(X_train, y_train)

    y_pred = reg.predict(X_test)

    # sqrt(MSE) instead of `squared=False`, which is deprecated in
    # scikit-learn 1.4 and removed in 1.6
    print(mean_squared_error(y_test, y_pred) ** 0.5)

9.775297042640023
9.382977763845506
8.320417259062861
7.958716487243303


### save the final model

In [117]:
# Refit plain linear regression as the model to save: the Lasso loop above
# rebinds `reg`, and in the recorded outputs none of the Lasso settings beat
# the linear-regression RMSE (7.951 vs. 7.959 for the best alpha).
reg = LinearRegression()
reg.fit(X_train, y_train)

In [120]:
# Persist the vectorizer together with the model: predictions on new data
# need the exact feature mapping that `dv` learned from the training rows.
MODEL_PATH = '../models/lin_reg.bin'

# Create the output directory if it is missing; open(..., 'wb') does not
# create parent directories and would fail with FileNotFoundError.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# NOTE: the file is a pickle; only load it back from a trusted location.
with open(MODEL_PATH, 'wb') as f:
    pickle.dump((dv, reg), f)