<a href="https://colab.research.google.com/github/yemnaing/2005-stock-markets-analytics-zoomcamp/blob/main/HW1_MLOP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install if needed (optional, Colab already has pandas and pyarrow)
# !pip install pandas pyarrow

import pandas as pd

# Download the data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet

# Read the Parquet file
df = pd.read_parquet("yellow_tripdata_2023-01.parquet")

# Show number of columns
print("Number of columns:", len(df.columns))

# Optionally display column names
df.columns

--2025-05-21 15:58:32--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.7, 3.163.157.72, 3.163.157.133, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-01.parquet’


2025-05-21 15:58:36 (16.2 MB/s) - ‘yellow_tripdata_2023-01.parquet’ saved [47673370/47673370]

Number of columns: 19


Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [2]:
import pandas as pd

# Start from a fresh copy of the January file so this cell can be re-run on its own
# (the reload is unconditional).
df = pd.read_parquet("yellow_tripdata_2023-01.parquet")

# Ensure both timestamp columns are datetimes
for col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
    df[col] = pd.to_datetime(df[col])

# Trip duration = dropoff - pickup, expressed in minutes
trip_length = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df['duration'] = trip_length.dt.total_seconds() / 60

# Report the spread of the durations
duration_std = df['duration'].std()
print("Standard deviation (in minutes):", round(duration_std, 2))

Standard deviation (in minutes): 42.59


In [3]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive)
in_range = df['duration'].between(1, 60)
filtered_df = df[in_range]

# Share of the original rows that survive the filter
fraction = len(filtered_df) / len(df)
print(f"Fraction of records remaining: {fraction:.2%}")


Fraction of records remaining: 98.12%


In [4]:
from sklearn.feature_extraction import DictVectorizer

# Features: pickup and dropoff location IDs
categorical = ['PULocationID', 'DOLocationID']

# Work on a copy so the filtered frame itself is not modified,
# and cast the IDs to strings
df_filtered = filtered_df.copy()
for col in categorical:
    df_filtered[col] = df_filtered[col].astype(str)

# One dict per trip: {'PULocationID': ..., 'DOLocationID': ...}
dicts = df_filtered[categorical].to_dict(orient='records')

# Fit the vectorizer on the dicts and build the feature matrix
dv = DictVectorizer()
X = dv.fit_transform(dicts)

# Dimensionality of the feature matrix
print("Number of columns (features):", X.shape[1])


Number of columns (features): 515


In [6]:
!pip install -U scikit-learn




In [7]:
from math import sqrt

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Regression target: trip duration in minutes
y = df_filtered['duration'].values

# Fit an ordinary least-squares model on the training matrix
lr = LinearRegression().fit(X, y)

# Predictions on the same rows the model was trained on
y_pred = lr.predict(X)

# RMSE is the square root of the mean squared error
mse = mean_squared_error(y, y_pred)
rmse = sqrt(mse)
print("RMSE on training data:", round(rmse, 2))


RMSE on training data: 7.65


In [10]:
# Download February data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

# Load February data
df_val = pd.read_parquet("yellow_tripdata_2023-02.parquet")

# Convert to datetime
df_val['tpep_pickup_datetime'] = pd.to_datetime(df_val['tpep_pickup_datetime'])
df_val['tpep_dropoff_datetime'] = pd.to_datetime(df_val['tpep_dropoff_datetime'])

# Compute duration in minutes
df_val['duration'] = (df_val['tpep_dropoff_datetime'] - df_val['tpep_pickup_datetime']).dt.total_seconds() / 60

# Filter duration between 1 and 60
df_val_filtered = df_val[(df_val['duration'] >= 1) & (df_val['duration'] <= 60)].copy()

# Convert pickup and dropoff IDs to strings
df_val_filtered['PULocationID'] = df_val_filtered['PULocationID'].astype(str)
df_val_filtered['DOLocationID'] = df_val_filtered['DOLocationID'].astype(str)

# Create dicts for vectorizer
dicts_val = df_val_filtered[['PULocationID', 'DOLocationID']].to_dict(orient='records')

# Transform using January's DictVectorizer (dv)
X_val = dv.transform(dicts_val)

# Target variable
y_val = df_val_filtered['duration'].values

# Predict on validation data
y_val_pred = lr.predict(X_val)

# Calculate RMSE
#rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)
mse = mean_squared_error(y_val, y_val_pred)
rmse_val = sqrt(mse)
print("RMSE on validation data:", round(rmse_val, 2))


--2025-05-21 16:09:52--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.163.157.7, 3.163.157.133, 3.163.157.96, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.163.157.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-02.parquet.2’


2025-05-21 16:09:55 (19.7 MB/s) - ‘yellow_tripdata_2023-02.parquet.2’ saved [47748012/47748012]

RMSE on validation data: 7.81
