In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the January 2021 trip records; this frame becomes the training data below.
df = pd.read_parquet(path='data/fhv_tripdata_2021-01.parquet')

# Question 1

In [3]:
# (rows, columns) of the January frame as loaded, before any filtering.
df.shape

(1154112, 7)

# Question 2

In [4]:
# Trip duration in minutes (drop-off minus pick-up).
# Use the vectorized `.dt.total_seconds()` accessor instead of a row-wise
# `.apply(lambda ...)`: same values, but no Python-level call per row.
df['duration'] = (
    pd.to_datetime(df['dropOff_datetime']) - pd.to_datetime(df['pickup_datetime'])
).dt.total_seconds() / 60

In [5]:
# Average trip duration, in minutes, over all January records.
df['duration'].mean()

19.1672240937939

# Question: how many records fall outside the 1–60 minute duration range (and get dropped)?

In [6]:
# Shape of the subset of trips shorter than 1 minute or longer than 60 minutes.
df.loc[(df['duration'] < 1.0) | (df['duration'] > 60.0)].shape

(44286, 8)

In [7]:
# Keep only trips lasting between 1 and 60 minutes (inclusive).
# `.copy()` makes the result an independent frame: later cells assign new
# column values into `df`, which would otherwise raise SettingWithCopyWarning
# because a boolean-mask selection may be a view of the original frame.
df = df[(df.duration >= 1.0) & (df.duration <= 60.0)].copy()

In [8]:
# (rows, columns) after removing trips outside the 1–60 minute range.
df.shape

(1109826, 8)

# Question 3

In [9]:
# Fraction of the remaining rows whose pickup location ID is missing.
df['PUlocationID'].isna().sum() / len(df)

0.8352732770722617

In [10]:
# Pickup / drop-off location ID columns; these are the only model inputs
# and are handled as categorical (string) features.
categorical_features = ['PUlocationID', 'DOlocationID']

In [11]:
# Turn each location ID column into string labels; missing IDs become "-1".
for feat in categorical_features:
    filled_ids = df[feat].fillna(-1)
    df[feat] = filled_ids.astype(int).astype(str)

In [12]:
# One {column: value} dict per training row, the input format DictVectorizer takes.
train_dicts = df.loc[:, categorical_features].to_dict(orient='records')

In [13]:
# Fit the vectorizer on the training records and encode them in one pass.
# `dv` is kept so the same fitted mapping can be applied to other data later.
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_dicts)

# Question 4

In [32]:
# Number of columns in the vectorized training matrix.
X_train.shape[1]

525

In [14]:
# Regression target: trip duration in minutes, as a NumPy array.
y_train = df.duration.to_numpy()

In [15]:
# Linear regression with scikit-learn's default settings (no arguments passed).
model = LinearRegression()

In [16]:
# Fit the regression on the vectorized January features and durations.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [20]:
# In-sample predictions: the model scored on the same rows it was fitted on.
y_pred = model.predict(X=X_train)

In [None]:
# Compare the distribution of actual vs. predicted training durations.
# `sns.distplot` is deprecated in seaborn; `histplot(kde=True, stat='density')`
# is the documented replacement for its histogram-plus-KDE output.
# Colours are set explicitly so the two overlaid series stay distinguishable.
fig, ax = plt.subplots()
sns.histplot(y_train, label='ground truth', kde=True, stat='density', color='C0', ax=ax)
sns.histplot(y_pred, label='prediction', kde=True, stat='density', color='C1', ax=ax)
ax.set(
    xlabel='duration (minutes)',
    ylabel='density',
    title='Training set: actual vs. predicted trip duration',
)
ax.legend();

# Question 5

In [30]:
# Training RMSE, in minutes.
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on every version. Arguments follow the documented
# (y_true, y_pred) order (the value is the same either way for MSE).
np.sqrt(mean_squared_error(y_train, y_pred))

10.5285191072072

In [33]:
# Load the February 2021 trip records, used as the evaluation set.
df_eval = pd.read_parquet(path='data/fhv_tripdata_2021-02.parquet')

In [35]:
# Trip duration in minutes (drop-off minus pick-up) for the evaluation month.
# Use the vectorized `.dt.total_seconds()` accessor instead of a row-wise
# `.apply(lambda ...)`: same values, but no Python-level call per row.
df_eval['duration'] = (
    pd.to_datetime(df_eval['dropOff_datetime']) - pd.to_datetime(df_eval['pickup_datetime'])
).dt.total_seconds() / 60

In [36]:
# Keep only evaluation trips lasting between 1 and 60 minutes (inclusive).
# `.copy()` makes the result an independent frame: the next cell assigns new
# column values into `df_eval`, which would otherwise raise
# SettingWithCopyWarning on a boolean-mask selection.
df_eval = df_eval[(df_eval.duration >= 1.0) & (df_eval.duration <= 60.0)].copy()

In [None]:
# Turn each location ID column of the evaluation frame into string labels;
# missing IDs become "-1".
for feat in categorical_features:
    filled_ids = df_eval[feat].fillna(-1)
    df_eval[feat] = filled_ids.astype(int).astype(str)

In [38]:
# One {column: value} dict per evaluation row, matching the training record format.
eval_dicts = df_eval.loc[:, categorical_features].to_dict(orient='records')

In [39]:
# Encode with the vectorizer already fitted on the training records
# (transform only, so the column layout matches X_train).
X_eval = dv.transform(X=eval_dicts)

In [40]:
# Evaluation target: trip duration in minutes, as a NumPy array.
y_eval = df_eval.duration.to_numpy()

In [42]:
# Predictions of the January-fitted model on the February features.
y_pred_eval = model.predict(X=X_eval)

# Question 6

In [43]:
# Validation RMSE, in minutes, on the February data.
# The `squared=False` flag of `mean_squared_error` was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on every version.
np.sqrt(mean_squared_error(y_eval, y_pred_eval))

11.014283163400654