<a href="https://colab.research.google.com/github/sushkbura/MiniProject_ML_ModelEvaluation/blob/main/Sush_MLE_MiniProject_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer

In [5]:
# Remote location of the trip-record parquet file (hosted in the project's GitHub repo).
url = "https://raw.githubusercontent.com/sushkbura/MiniProject_ML_ModelEvaluation/main/yellow_tripdata_2022-01.parquet"

# Read the parquet file straight from the URL into a DataFrame, using the pyarrow engine.
df = pd.read_parquet(path=url, engine="pyarrow")

In [6]:
# Preview the first five rows of the dataset. Ending the cell with the bare
# expression (instead of print) lets the notebook render the frame as a table,
# so the wide frame is not wrapped into backslash-continued text chunks.
df.head()

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         1  2022-01-01 00:35:40   2022-01-01 00:53:29              2.0   
1         1  2022-01-01 00:33:43   2022-01-01 00:42:07              1.0   
2         2  2022-01-01 00:53:21   2022-01-01 01:02:19              1.0   
3         2  2022-01-01 00:25:21   2022-01-01 00:35:23              1.0   
4         2  2022-01-01 00:36:48   2022-01-01 01:14:20              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           3.80         1.0                  N           142           236   
1           2.10         1.0                  N           236            42   
2           0.97         1.0                  N           166           166   
3           1.09         1.0                  N           114            68   
4           4.30         1.0                  N            68           163   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [7]:
# Keep only complete records: discard every row that has at least one missing value.
df = df.dropna(axis="index", how="any")

In [8]:
# Derive 'trip_duration': minutes elapsed between pickup and dropoff.
df["trip_duration"] = (
    df["tpep_dropoff_datetime"]
    .sub(df["tpep_pickup_datetime"])
    .dt.total_seconds()
    .div(60.0)
)

In [9]:
# Name of the column the models are trained to predict.
target_variable: str = "total_amount"

In [10]:
# feature_cols: names of the columns used as model inputs.
feature_cols = [
    "VendorID", "trip_distance", "payment_type",
    "PULocationID", "DOLocationID", "trip_duration",
]

In [11]:
# Separate the model inputs (X) from the target (y), then hold out 20% of the rows
# as a test set; the fixed random_state keeps the split reproducible.
X, y = df[feature_cols], df[target_variable]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [12]:
# Baseline: a constant model that always predicts the mean total amount.

# The constant is computed from the training targets only.
baseline_pred = y_train.mean()

# One copy of that constant per test example.
y_pred_baseline = np.full(shape=y_test.shape, fill_value=baseline_pred, dtype=float)

# Score the constant prediction with Mean Absolute Error.
baseline_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_baseline)

print("Baseline MAE (predicting the mean):", baseline_mae)

Baseline MAE (predicting the mean): 9.198227928516678


In [13]:
# Preprocess continuous and categorical features independently with a
# ColumnTransformer: continuous columns are standardized, categorical columns
# are one-hot encoded.

numeric_features = ["trip_distance", "trip_duration"]
categorical_features = ["VendorID", "payment_type", "PULocationID", "DOLocationID"]

# handle_unknown="ignore": categories not seen during fit do not raise at transform time.
scale_numeric = ("num", StandardScaler(), numeric_features)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)

preprocessor = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

In [14]:
# Create a pipeline object containing the column transformations and regression
# model.
#
# The pipeline gets its own copy of `preprocessor`: Pipeline.fit fits its steps
# in place, so pipelines that share one transformer object overwrite each
# other's fitted state (e.g. the learned one-hot categories) whenever any of
# them is (re)fit.

linreg_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", LinearRegression())
])

In [15]:
# Train the linear-regression pipeline on the training split, then predict the
# test split (fit returns the fitted pipeline, so the calls can be chained).
y_pred_linreg = linreg_pipeline.fit(X_train, y_train).predict(X_test)

# Score the predictions with MAE and report them next to the baseline.
linreg_mae = mean_absolute_error(y_true=y_test, y_pred=y_pred_linreg)
print("Baseline MAE (predicting the mean):", baseline_mae)
print("Linear Regression MAE:", linreg_mae)

Baseline MAE (predicting the mean): 9.198227928516678
Linear Regression MAE: 3.3854544988959763


In [26]:
train_sample_size = 50_000  # to speed up the process

# Train on a reproducible random subset when the training set is larger than
# the cap; otherwise use the training set as is.
if len(X_train) > train_sample_size:
    X_train_small = X_train.sample(train_sample_size, random_state=42)
    # Select the targets by index label so they stay aligned with the sampled rows.
    y_train_small = y_train.loc[X_train_small.index]
else:
    X_train_small = X_train
    y_train_small = y_train

# Build random forest regressor model
rf_model = RandomForestRegressor(
    n_estimators=100,   # fewer trees than 200 → faster
    max_depth=20,      # shallower trees → faster
    n_jobs=-1,
    random_state=42
)

# Use a private copy of `preprocessor`. Pipeline.fit fits its steps in place,
# so fitting on the subsample with the shared object would also refit the
# transformer held by any other pipeline built from it (e.g. the
# linear-regression pipeline, whose model was trained on the full training
# set's encoding).
rf_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", rf_model)
])

# Fit the pipeline on the training data.
rf_pipeline.fit(X_train_small, y_train_small)

# Make predictions on the test data
y_pred_rf = rf_pipeline.predict(X_test)

# Evaluate using MAE
rf_mae = mean_absolute_error(y_test, y_pred_rf)
print("Baseline MAE (predicting the mean):", baseline_mae)
print("Linear Regression MAE:", linreg_mae)
print("Random Forest MAE:", rf_mae)

Baseline MAE (predicting the mean): 9.198227928516678
Linear Regression MAE: 3.3854544988959763
Random Forest MAE: 1.6246708511752004


In [19]:
# Use a small sample for Grid Search so it completes on free Colab
train_sample_size = 5_000
sample_for_gs = len(X_train) > train_sample_size
X_train_gs = X_train.sample(train_sample_size, random_state=42) if sample_for_gs else X_train
# Targets are looked up by index label so they stay aligned with the sampled rows.
y_train_gs = y_train.loc[X_train_gs.index] if sample_for_gs else y_train

# Hyperparameter candidates; the "model__" prefix addresses the pipeline's "model" step.
param_grid = {
    "model__n_estimators": [50, 100],   # number of trees
    "model__max_depth": [10, 20],       # tree depth
    "model__min_samples_split": [2],    # min samples to split (kept fixed but included)
}

rf_base = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", rf_base)])

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative MAE (higher is better). This could take a while.
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_gs, y_train_gs)

# Report the winning combination and its cross-validated score.
print("Best parameters found:", grid_search.best_params_)
print("Best CV score (negative MAE):", grid_search.best_score_)


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best parameters found: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best CV score (negative MAE): -1.7717111165844397


In [22]:

# Get the best hyperparameters found by the grid search.
best_params = grid_search.best_params_

# Individual values, kept as separate names for inspection.
best_n_estimators = best_params["model__n_estimators"]
best_max_depth = best_params["model__max_depth"]
best_min_samples_split = best_params["model__min_samples_split"]

# Sample training data for faster training in free Colab
train_sample_size = 50_000
if len(X_train) > train_sample_size:
    X_train_final = X_train.sample(train_sample_size, random_state=42)
    # Select the targets by index label so they stay aligned with the sampled rows.
    y_train_final = y_train.loc[X_train_final.index]
else:
    X_train_final = X_train
    y_train_final = y_train

# Build the final random forest regressor; tuned values are applied below.
rf_best = RandomForestRegressor(
    n_jobs=-1,
    random_state=42
)

# Use a private copy of `preprocessor`: Pipeline.fit fits its steps in place,
# so sharing one transformer object between pipelines would let this fit
# overwrite the fitted state the other pipelines rely on.
rf_best_pipeline = Pipeline(steps=[
    ("preprocessor", clone(preprocessor)),
    ("model", rf_best)
])

# Apply every tuned hyperparameter in one step. The keys in best_params use the
# same "<step>__<param>" names as this pipeline, so parameters added to the
# grid later are picked up automatically instead of being silently ignored.
rf_best_pipeline.set_params(**best_params)

# Fit the tuned regressor on the training data.
rf_best_pipeline.fit(X_train_final, y_train_final)

# Make predictions on the test data
y_pred_best = rf_best_pipeline.predict(X_test)

tuned_rf_mae = mean_absolute_error(y_test, y_pred_best)

print("Baseline MAE (predicting the mean):", baseline_mae)
print("Linear Regression MAE:", linreg_mae)
print("Tuned Random Forest MAE:", tuned_rf_mae)

Baseline MAE (predicting the mean): 9.198227928516678
Linear Regression MAE: 3.3854544988959763
Tuned Random Forest MAE: 1.6246708511752004


In [25]:
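# The baseline model (always predicting the mean) had an MAE of about $9.20.
# Linear Regression improved this to about $3.39.
# The Random Forest reduced the error further, to about $1.62. Grid search selected the same
# hyperparameters as the initial forest (n_estimators=100, max_depth=20), so the tuned model's
# MAE is identical to the untuned one; tuning confirmed the settings rather than improving them.
# The Random Forest did the best job, likely because it can learn more complex patterns in the data.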