In [52]:
!python3 -V

Python 3.9.13


In [53]:
import pandas as pd

In [56]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [68]:
import warnings
warnings.simplefilter('ignore')

# Question 1: Number of columns in Jan 2022 Yellow Taxi Trip data

In [69]:
# Read both monthly parquet files: the 2022-01 file becomes the training frame and
# the 2022-02 file becomes the validation frame.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/yellow_tripdata_2022-01.parquet',
        './data/yellow_tripdata_2022-02.parquet',
    )
)

# Number of columns in the training frame.
df_train.shape[1]

19

# Question 2: Standard deviation of trip duration in the Jan 2022 Yellow Taxi Trip data

In [70]:
# Trip duration in minutes. The timedelta column is converted with the vectorised
# .dt accessor rather than a per-row Python lambda, which is much faster on a frame
# of this size and yields the same minutes value.
df_train['duration'] = (
    df_train['tpep_dropoff_datetime'] - df_train['tpep_pickup_datetime']
).dt.total_seconds() / 60

# Standard deviation of the duration column, rounded to two decimals.
round(df_train['duration'].std(), 2)

46.45

# Question 3: Fraction of records left after dropping the outliers (reported as a percentage)

In [71]:
# Keep trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the result an independent frame: a later cell assigns columns into it,
# and writing into a boolean-mask slice is the chained-assignment pattern pandas warns about.
df_train_clean = df_train[df_train['duration'].between(1, 60)].copy()

# Share of rows retained, as a whole-number percentage.
round(len(df_train_clean) / len(df_train) * 100)

98

In [72]:
# Rebind df_train to the outlier-filtered frame; every later cell that reads df_train
# therefore sees only the filtered rows.
df_train = df_train_clean

In [78]:
# The validation frame comes straight from the parquet file and has no `duration`
# column yet, so it is derived here before filtering; otherwise this cell fails on a
# fresh kernel. Duration is expressed in minutes.
df_val['duration'] = (
    df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
).dt.total_seconds() / 60

# Same 1..60 minute inclusive window as the training data. .copy() detaches the result
# from the unfiltered frame so later column assignments write to an independent frame.
df_val = df_val[df_val['duration'].between(1, 60)].copy()

# Question 4: Dimensionality after OHE

In [79]:
categorical = ['PULocationID', 'DOLocationID']

# Cast the location IDs to strings so DictVectorizer one-hot encodes them instead of
# passing them through as numeric features, then turn each row into a feature dict.
df_train[categorical] = df_train[categorical].astype(str)
train_dicts = df_train[categorical].to_dict(orient='records')

df_val[categorical] = df_val[categorical].astype(str)
val_dicts = df_val[categorical].to_dict(orient='records')

# The vocabulary is learned from the training rows only; the validation rows are
# encoded with that same fitted vectorizer.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

# Number of feature columns produced by the encoding.
X_train.shape[1]

515

# Question 5: RMSE on train

In [80]:
# Trip duration in minutes for the validation frame, using the vectorised .dt accessor
# instead of a per-row Python lambda.
df_val['duration'] = (
    df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [81]:
# Name of the column the model predicts.
target = 'duration'

# Target vectors as NumPy arrays, one per split.
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

In [82]:
# Fit an ordinary least-squares model on the encoded training features.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions on the same rows the model was fitted on (training error).
y_pred = lr.predict(X_train)

# RMSE is the square root of the MSE. The root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(y_train, y_pred) ** 0.5, 2)

6.99

# Question 6: RMSE on validation

In [83]:
# `lr` was already fitted on (X_train, y_train) in the preceding cell; refitting an
# identical model here would only repeat the same work, so it is reused as-is.
y_pred = lr.predict(X_val)

# RMSE is the square root of the MSE. The root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
round(mean_squared_error(y_val, y_pred) ** 0.5, 2)

7.79