In [2]:
import pandas as pd
import numpy as np

In [3]:
! ls data

fhv_tripdata_2021-01.parquet  fhv_tripdata_2021-02.parquet


In [104]:
dataframe = pd.read_parquet("data/fhv_tripdata_2021-01.parquet")

In [105]:
len(dataframe)

1154112

In [106]:
dataframe.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [107]:
dataframe.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
dtype: object

## Duration

In [108]:

dataframe['duration'] = dataframe['dropOff_datetime'] - dataframe['pickup_datetime']
dataframe['duration'] = dataframe['duration'].apply(lambda x: x.total_seconds()/60)
print(dataframe['duration'].mean())
filter_within_range = (dataframe['duration'] <= 60) & (dataframe['duration']  >= 1)
num_dropped = len(dataframe) - filter_within_range.sum()
dataframe = dataframe.loc[filter_within_range]


19.1672240937939


In [109]:
num_dropped

44286

In [110]:
filter_within_range

0           True
1           True
2          False
3           True
4           True
           ...  
1154107     True
1154108     True
1154109     True
1154110     True
1154111     True
Name: duration, Length: 1154112, dtype: bool

## Missing values

In [74]:
num_na = (dataframe['PUlocationID'].isna()| dataframe['DOlocationID'].isna()).sum()

In [75]:
num_na

938156

In [76]:
num_na_pickup = dataframe['PUlocationID'].isna().sum()

In [77]:
ratio = num_na_pickup/len(dataframe)

In [78]:
ratio

0.8352732770722617

In [79]:
dataframe.loc[:, 'PUlocationID'] = dataframe.loc[:, 'PUlocationID'].fillna(-1)

In [80]:
dataframe.loc[:, 'DOlocationID'] = dataframe.loc[:, 'DOlocationID'].fillna(-1)

In [81]:
dataframe['PUlocationID'].isna().sum()

0

## One-hot encoding

In [85]:
categorical = ['PUlocationID', 'DOlocationID']
dataframe[categorical] = dataframe[categorical].astype(str)

In [86]:
feat_list_dict = dataframe[categorical].to_dict(orient = 'records')

In [87]:
feat_list_dict

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [88]:
vectorizer = DictVectorizer()

In [89]:
X_train = vectorizer.fit_transform(feat_list_dict)

In [90]:
X_train

<1109826x525 sparse matrix of type '<class 'numpy.float64'>'
	with 2219652 stored elements in Compressed Sparse Row format>

In [91]:
len(dataframe)

1109826

In [92]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
target = 'duration'
y_train = dataframe[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

10.52851910722287

## Q6. Evaluating the model

In [94]:
validation_df = pd.read_parquet("data/fhv_tripdata_2021-02.parquet")

In [95]:
validation_df['duration'] = validation_df['dropOff_datetime'] - validation_df['pickup_datetime']
validation_df['duration'] = validation_df['duration'].apply(lambda x: x.total_seconds()/60)
filter_within_range = (validation_df['duration'] <= 60) & (dataframe['duration']  >= 1)
num_dropped = len(validation_df) - filter_within_range.sum()
validation_df = validation_df.loc[filter_within_range]



In [96]:
validation_df.loc[:, 'PUlocationID'] = validation_df.loc[:, 'PUlocationID'].fillna(-1)

In [97]:
validation_df.loc[:, 'DOlocationID'] = validation_df.loc[:, 'DOlocationID'].fillna(-1)

In [98]:
X_val = vectorizer.transform(validation_df[['PUlocationID', 'DOlocationID']].to_dict(orient = 'records'))

In [99]:
X_val

<963283x525 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [100]:
y_pred_val = lr.predict(X_val)

In [101]:
y_val = validation_df[target].values

In [102]:
mean_squared_error(y_pred_val, y_val, squared=False)

12.985449515655873