In [31]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [32]:
# Download the January and February 2025 yellow-taxi trip files (parquet)
# straight from the public CDN; January is fetched first, then February.
jan_df, feb_df = (
    pd.read_parquet(parquet_url)
    for parquet_url in (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-02.parquet',
    )
)

In [33]:
# The second entry of `shape` is the number of columns in the frame.
print("Number of columns in January DataFrame:", jan_df.shape[1])

Number of columns in January DataFrame: 20


In [34]:
def _trip_minutes(trips):
    """Return the pickup-to-dropoff time of every row in `trips`, in minutes."""
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    return elapsed.dt.total_seconds() / 60


# Add the trip length as a new `duration` column on both monthly frames.
jan_df['duration'] = _trip_minutes(jan_df)
feb_df['duration'] = _trip_minutes(feb_df)

In [35]:
# Sample standard deviation (ddof=1 is the pandas default, spelled out here)
# of the January `duration` column.
print(
    "Standard deviation of duration in January DataFrame:",
    jan_df['duration'].std(ddof=1),
)

Standard deviation of duration in January DataFrame: 38.71358219498144


In [36]:
# Record the January row count at this point so a later cell can compare it
# with the row count after filtering.
jan_df_before_filter = jan_df.shape[0]
print("Jan dateframe length before filtering:", jan_df_before_filter)

Jan dateframe length before filtering: 3475226


In [37]:
# Keep only rows whose duration lies in [1, 60] (both bounds inclusive);
# `duration` is in minutes as computed from total_seconds() / 60.
jan_df = jan_df[jan_df['duration'].between(1, 60)]
feb_df = feb_df[feb_df['duration'].between(1, 60)]

# Row count of the filtered January frame.
len_jan_df_after_filter = jan_df.shape[0]
print("Jan dateframe length after filtering:", len_jan_df_after_filter)

Jan dateframe length after filtering: 3403248


In [38]:
# Share of January rows that survived the duration filter.
# NOTE: despite the "Fraction" label, the value is scaled by 100, i.e. it is
# a percentage.
fraction_jan_df_after_filter = len_jan_df_after_filter / jan_df_before_filter
fraction_jan_df_after_filter = fraction_jan_df_after_filter * 100
print(
    "Fraction of Jan dataframe after filtering:",
    round(fraction_jan_df_after_filter, 2),
)

Fraction of Jan dataframe after filtering: 97.93


In [39]:
# Columns used as categorical features; they are cast to strings here.
categorical = ['PULocationID', 'DOLocationID']

# `jan_df` / `feb_df` were produced by boolean filtering, so assigning columns
# into them (`df[cols] = ...`) can raise SettingWithCopyWarning on pandas
# versions without copy-on-write. `DataFrame.astype` with a column -> dtype
# mapping returns a new frame instead, so rebinding the names avoids writing
# into the filtered slices while producing the same string-typed columns.
jan_df = jan_df.astype(dict.fromkeys(categorical, str))
feb_df = feb_df.astype(dict.fromkeys(categorical, str))

In [40]:
# Show the dtype of every February column (bare last expression, so the
# notebook renders it as the cell output).
feb_df.dtypes

VendorID                          int32
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
Airport_fee                     float64
cbd_congestion_fee              float64
duration                        float64
dtype: object

In [41]:
# Turn each January row's categorical columns into a {column: value} record.
train_dicts = jan_df[categorical].to_dict(orient='records')

# Fit the vectorizer on the January records and build the training matrix.
# The fitted `dv` is kept so other data can be transformed the same way.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
print("Shape of X_train:", X_train.shape)

# Regression target: the `duration` column as a NumPy array.
y_train = jan_df['duration'].values

Shape of X_train: (3403248, 519)


In [42]:
# Fit an ordinary least-squares model on the January features
# (`fit` returns the estimator itself, so it can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Score the model on the same data it was trained on.
y_pred = lr.predict(X_train)
mse = mean_squared_error(y_train, y_pred)
rmse = np.sqrt(mse)  # root of the MSE, in the same units as the target
print("RMSE on training data:", rmse)

RMSE on training data: 7.707822122519484


In [43]:
# Validation target: the February `duration` column as a NumPy array.
y_val = feb_df['duration'].values

# Encode February with the already-fitted vectorizer (`transform`, not
# `fit_transform`), so the encoding learned from January is reused.
val_dicts = feb_df[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Predict on February and report the error.
# Note: `y_pred`, `mse` and `rmse` are rebound here, replacing any earlier values.
y_pred = lr.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE on validation data:", rmse)

RMSE on validation data: 7.950985572400415
