In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import pickle

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
def read_dataframe(filename):
    
    df = pd.read_parquet(filename)
    
    df['duration'] = df.dropOff_datetime - df.pickup_datetime
    df['duration'] = df['duration'].apply(lambda td: td.total_seconds()/60)
    
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ['PUlocationID', 'DOlocationID']
    df[categorical] = df[categorical].astype(str)
    
    return df

In [5]:
df_train = read_dataframe('../data/fhv_tripdata_2021-01.parquet')
df_val = read_dataframe('../data/fhv_tripdata_2021-02.parquet')

In [6]:
len(df_train), len(df_val)

(1109826, 990113)

In [7]:
df_train

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.000000
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.000000
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037,9.050000
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,8.750000
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,57.600000
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,16.200000
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,19.433333


In [8]:
df_train.duration.mean()

16.247253368247375

In [11]:
df_train.isnull().sum()/len(df_train)

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID                    0
DOlocationID                    0
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [10]:
df_train.PUlocationID.isnull().sum()

0

In [66]:
df_train.PUlocationID = df_train.PUlocationID.replace('nan', -1)
df_train.DOlocationID = df_train.DOlocationID.replace('nan', -1)
df_val.PUlocationID = df_val.PUlocationID.replace('nan', -1)
df_val.DOlocationID = df_val.DOlocationID.replace('nan', -1)

In [67]:
df_train.PUlocationID.describe()

count     1109826
unique        262
top            -1
freq       927008
Name: PUlocationID, dtype: int64

In [68]:
categorical = ['PUlocationID', 'DOlocationID']
#numerical = ['trip_distance']

dv = DictVectorizer()

train_dicts = df_train[categorical].to_dict(orient='records')
x_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical].to_dict(orient='records')
x_val = dv.transform(val_dicts)

In [69]:
train_dicts

[{'PUlocationID': -1, 'DOlocationID': -1},
 {'PUlocationID': -1, 'DOlocationID': -1},
 {'PUlocationID': -1, 'DOlocationID': '72.0'},
 {'PUlocationID': -1, 'DOlocationID': '61.0'},
 {'PUlocationID': -1, 'DOlocationID': '71.0'},
 {'PUlocationID': -1, 'DOlocationID': '91.0'},
 {'PUlocationID': -1, 'DOlocationID': '39.0'},
 {'PUlocationID': -1, 'DOlocationID': '37.0'},
 {'PUlocationID': -1, 'DOlocationID': '39.0'},
 {'PUlocationID': -1, 'DOlocationID': '72.0'},
 {'PUlocationID': -1, 'DOlocationID': '72.0'},
 {'PUlocationID': -1, 'DOlocationID': '89.0'},
 {'PUlocationID': -1, 'DOlocationID': '177.0'},
 {'PUlocationID': -1, 'DOlocationID': '225.0'},
 {'PUlocationID': -1, 'DOlocationID': '63.0'},
 {'PUlocationID': -1, 'DOlocationID': '67.0'},
 {'PUlocationID': -1, 'DOlocationID': '22.0'},
 {'PUlocationID': -1, 'DOlocationID': '61.0'},
 {'PUlocationID': -1, 'DOlocationID': '14.0'},
 {'PUlocationID': -1, 'DOlocationID': '14.0'},
 {'PUlocationID': -1, 'DOlocationID': '188.0'},
 {'PUlocationID': 

In [70]:
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [71]:
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_train)

mean_squared_error(y_train, y_pred, squared=False)

10.528519107211805

In [72]:
lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

mean_squared_error(y_val, y_pred, squared=False)

11.014283195752167

In [60]:
with open('../models/lin_reg_homework1.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)