In [None]:
pip install pandas numpy scikit-learn pyarrow fastparquet

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
# --- January 2023: load data, build features, fit the baseline model ---
# Dataset loading
file_path = "/content/yellow_tripdata_2023-01.parquet"
df = pd.read_parquet(file_path)

# Q1: Number of columns in January dataset
print(f"Number of columns: {df.shape[1]}")

# Ensure both timestamp columns are datetime64 before subtracting them;
# pd.to_datetime leaves columns that are already datetimes unchanged.
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

# Computing trip duration in mins
df['trip_duration'] = (
    df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
).dt.total_seconds() / 60

# Q2: Standard deviation of trip duration (computed BEFORE outlier removal)
std_duration = df['trip_duration'].std()
print(f"Standard deviation of trip duration: {std_duration:.2f}")

# Removing outliers: keep trips lasting between 1 and 60 minutes, inclusive.
# .copy() makes df_filtered an independent frame, so the column assignment
# below does not write into a slice of df (avoids SettingWithCopyWarning and
# the view-vs-copy ambiguity of assigning into a filtered selection).
df_filtered = df[(df['trip_duration'] >= 1) & (df['trip_duration'] <= 60)].copy()
print(f"Fraction of records left: {len(df_filtered) / len(df):.2%}")

# One-hot encoding
# The location IDs are cast to str so DictVectorizer treats each ID as a
# category (one indicator column per distinct value) rather than as a number.
categorical = ['PULocationID', 'DOLocationID']
df_filtered[categorical] = df_filtered[categorical].astype(str)
dv = DictVectorizer(sparse=True)  # Use sparse matrices to save memory
X = dv.fit_transform(df_filtered[categorical].to_dict(orient='records'))
print(f"Feature matrix dimensionality: {X.shape[1]}")

# Train-test split
# NOTE(review): the 20% hold-out (X_val, y_val) is created here but is not
# scored anywhere in this section; the model is fitted on the 80% partition.
y = df_filtered['trip_duration']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Computing RMSE on training data
y_pred_train = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(f"RMSE on train: {rmse_train:.2f}")
# --- February 2023: evaluate the fitted model on a different month ---
# Loading dataset
# This must be the February file: validating on the January file would score
# the model on the same data it was fitted on.
# NOTE(review): assumes the February parquet sits next to the January one
# under /content with the same naming pattern -- confirm it has been uploaded.
file_path_feb = "/content/yellow_tripdata_2023-02.parquet"
df_feb = pd.read_parquet(file_path_feb)

# Same preprocessing as the training data: parse timestamps, derive the
# duration in minutes, keep trips between 1 and 60 minutes inclusive.
df_feb['tpep_pickup_datetime'] = pd.to_datetime(df_feb['tpep_pickup_datetime'])
df_feb['tpep_dropoff_datetime'] = pd.to_datetime(df_feb['tpep_dropoff_datetime'])
df_feb['trip_duration'] = (
    df_feb['tpep_dropoff_datetime'] - df_feb['tpep_pickup_datetime']
).dt.total_seconds() / 60

# .copy() yields an independent frame so the column assignment below does not
# write into a slice of df_feb (avoids SettingWithCopyWarning).
df_feb_filtered = df_feb[
    (df_feb['trip_duration'] >= 1) & (df_feb['trip_duration'] <= 60)
].copy()
df_feb_filtered[categorical] = df_feb_filtered[categorical].astype(str)

# transform (not fit_transform): reuse the feature mapping learned by dv so
# the columns line up with what the model was trained on. Feature values not
# seen during fitting are ignored by DictVectorizer.transform.
X_val_feb = dv.transform(df_feb_filtered[categorical].to_dict(orient='records'))
y_val_feb = df_feb_filtered['trip_duration']

# Compute RMSE on validation data
y_pred_val = model.predict(X_val_feb)
rmse_val = np.sqrt(mean_squared_error(y_val_feb, y_pred_val))
print(f"RMSE on validation: {rmse_val:.2f}")