In [45]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [46]:
# Load the raw trip records from parquet:
# the 2023-01 file becomes the training frame, the 2023-02 file the validation frame.
train_df, val_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/yellow_tripdata_2023-01.parquet',
        './data/yellow_tripdata_2023-02.parquet',
    )
)

In [47]:
#Q1 Reading the data for January. How many columns are there?

# shape is (n_rows, n_columns); the second entry is the column count
train_df.shape[1]

19

In [48]:
#Q2 What's the standard deviation of the trips duration in January?

# Trip duration in minutes. Dropoff minus pickup yields a timedelta column; the
# vectorized .dt accessor converts it to seconds in one pass instead of calling a
# Python lambda once per row through .apply.
train_df['duration'] = (
    train_df.tpep_dropoff_datetime - train_df.tpep_pickup_datetime
).dt.total_seconds() / 60

# Standard deviation of the duration (pandas default is the sample std, ddof=1)
train_df['duration'].std()

42.594351241920904

In [49]:
#Q3 What fraction of the records left after you dropped the outliers?

# Share of trips lasting between 1 and 60 minutes (both bounds inclusive):
# the mean of a boolean mask is the fraction of True values.
train_df['duration'].between(1, 60).mean()

0.9812202822125979

In [50]:
# Keep only trips lasting 1-60 minutes (both bounds inclusive).
# .copy() makes the filtered frame independent of the original, so the later
# column assignment on train_df does not trigger SettingWithCopyWarning on
# pandas versions without copy-on-write.
train_df = train_df[(train_df['duration'] >= 1) & (train_df['duration'] <= 60)].copy()

In [51]:
#Q4 What's the dimensionality of this matrix (number of columns)?

# Pickup / dropoff location IDs are the categorical inputs
categorical = ['PULocationID','DOLocationID']

# Cast the IDs to str so the vectorizer one-hot encodes them rather than
# passing them through as numeric values
train_df[categorical] = train_df[categorical].astype(str)

# One {column: value} dict per row, then fit the vectorizer on those records
train_dict = train_df[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)

# Number of columns in the encoded feature matrix
X_train.shape[1]

515

In [52]:
#Q5 What's the RMSE on train?

# Regression target: the duration column of the training frame
target = 'duration'
y_train = train_df[target].values

# Fit a plain linear regression on the one-hot features (fit returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# In-sample predictions, used for the training RMSE
y_pred = lr.predict(X_train)

In [31]:
# RMSE of the model's predictions on the data it was trained on
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.6492619633678824

In [54]:
#Q6 What's the RMSE on validation?

# Prepare val_df with the same steps that were applied to the training data.
# val_df was already loaded from the February parquet file in the data-loading
# cell, so it is not read from disk a second time here.

# Trip duration in minutes, computed with the vectorized .dt accessor
# rather than a per-row Python lambda.
val_df['duration'] = (
    val_df.tpep_dropoff_datetime - val_df.tpep_pickup_datetime
).dt.total_seconds() / 60

# Keep trips lasting 1-60 minutes (both bounds inclusive); .copy() makes the
# result an independent frame so the column assignment below does not trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
val_df = val_df[(val_df['duration'] >= 1) & (val_df['duration'] <= 60)].copy()

# Reuse the `categorical` column list defined for the training data and cast
# the IDs to str, matching what the vectorizer was fitted on.
val_df[categorical] = val_df[categorical].astype(str)
val_dict = val_df[categorical].to_dict(orient='records')

# transform (not fit_transform): apply the encoding learned on the training
# data so the validation matrix has the same columns the model was trained on.
X_val = dv.transform(val_dict)
y_val = val_df[target].values

# Predicting for validation
y_pred_val = lr.predict(X_val)

# Calculating the RMSE
root_mean_squared_error(y_val, y_pred_val)

7.81181893596011