In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
import pickle

# Restore the cached preprocessing output. The pickle must unpack into exactly
# four objects: the cleaned frame plus (presumably) three feature-name groupings.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created.
with open("cleaned_df_features.pkl", mode="rb") as pkl_in:
    df, numerics, curated_cat, other_cat = pickle.load(pkl_in)

In [3]:
# Partition the rows into train / validation / test sets.

from sklearn.model_selection import train_test_split, KFold

# Hold out 10% of all rows as the final test set.
df_full_train, df_test = train_test_split(df, test_size=0.1, random_state=42)
# Split the remaining 90% 80/20, i.e. 72% train and 18% validation overall.
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)
# One shape per line: train, validation, test.
print(df_train.shape, df_val.shape, df_test.shape, sep="\n")

(36000, 14)
(9000, 14)
(5000, 14)


In [4]:

# Pull the regression target out of the training frame, then remove it from the
# feature columns. Not re-runnable as-is: after the drop the column no longer exists.
y_train = df_train["base_passenger_fare"].values
df_train = df_train.drop("base_passenger_fare", axis=1)


In [5]:
# one-hot encoding

from sklearn.feature_extraction import DictVectorizer
import time

# Reload the vectorizer saved in dv_full.pkl (presumably a fitted DictVectorizer).
# NOTE(review): pickle.load can execute arbitrary code -- only load files you created.
with open("dv_full.pkl", mode="rb") as dv_in:
    dv_full = pickle.load(dv_in)

# Encode every training row (one dict per row) using all features; the
# vectorizer is only applied here, not re-fitted.
X_full_train = dv_full.transform(df_train.to_dict("records"))



In [6]:

# Same treatment for the validation split: take the target, drop it from the
# features, then encode with the same vectorizer used for the training rows.
# Not re-runnable as-is: after the drop the target column no longer exists.
y_val = df_val["base_passenger_fare"].values
df_val = df_val.drop("base_passenger_fare", axis=1)

X_full_val = dv_full.transform(df_val.to_dict("records"))


In [7]:
# Column names of the encoded matrix, in column order.
ftns_full = dv_full.get_feature_names_out()
# Show the names at positions 562..566 -- the columns treated as numeric later on.
ftns_full[562:567]

array(['trip_miles', 'trip_miles_log1p', 'trip_time', 'trip_time_log1p',
       'wait_time_sec_log1p'], dtype=object)

In [8]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Optional: progress bar
from tqdm.auto import tqdm

print("\n||||||| All features |||||||\n")

# ------------------------------------
# 1. Linear models: L2 (Ridge) search
# ------------------------------------

# Note: with one-hot/sparse features, use with_mean=False
# If X_* are sparse, this will keep them sparse where possible.


# Scale only the 5 numeric columns (raw and log1p versions); every other
# column is passed through unchanged.
numeric_idx = [562, 563, 564, 565, 566]
# Take the column count from the matrix itself instead of hard-coding it:
# ColumnTransformer drops any column not listed (remainder defaults to "drop"),
# so a stale constant would silently discard features.
n_features = X_full_train.shape[1]
numeric_set = set(numeric_idx)
all_idx = list(range(n_features))
cat_idx = [j for j in all_idx if j not in numeric_set]

# Output column order is: scaled numeric columns first, then the passthrough ones.
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), numeric_idx),
        ("cat", "passthrough", cat_idx),
    ]
)


# NOTE(review): alpha=0.0 is plain least squares; the scikit-learn docs advise
# LinearRegression for that case -- kept here as the unregularised baseline.
ridge_alphas = [0.0, 0.1, 1.0, 10.0, 100.0]

print("\n=== Ridge (L2) hyperparameter search ===")

# One tuple per alpha: (alpha, train RMSE, validation RMSE, fit seconds, fitted pipeline)
ridge_results = []
for alpha in tqdm(ridge_alphas, desc="Ridge alphas"):
    model_ridge = Pipeline([
        ("scaler", preprocess),
        ("reg", Ridge(alpha=alpha, random_state=0))
    ])

    # Time the fit only, so the reported "training time" excludes prediction.
    t0 = time.time()
    model_ridge.fit(X_full_train, y_train)
    t1 = time.time()
    train_time = t1 - t0

    rmse_train = root_mean_squared_error(y_train, model_ridge.predict(X_full_train))

    y_pred = model_ridge.predict(X_full_val)
    rmse_val = root_mean_squared_error(y_val, y_pred)

    print(f"alpha={alpha:8.4f} | train_RMSE={rmse_train:8.4f} | val_RMSE={rmse_val:8.4f} | time={train_time/60:5.2f} min")
    ridge_results.append((alpha, rmse_train, rmse_val, train_time, model_ridge))

# Pick best Ridge by validation RMSE (tuple position 2)
best_ridge_alpha, best_ridge_rmse_train, best_ridge_rmse_val, best_ridge_time, best_ridge_model = min(
    ridge_results,
    key=lambda x: x[2]
)

print(f"\nBest Ridge alpha: {best_ridge_alpha}")
print(f"Best Ridge training RMSE: {best_ridge_rmse_train:.4f}")
print(f"Best Ridge validation RMSE: {best_ridge_rmse_val:.4f}")
print(f"Best Ridge training time: {best_ridge_time/60:.2f} minutes")


||||||| All features |||||||


=== Ridge (L2) hyperparameter search ===


Ridge alphas:   0%|          | 0/5 [00:00<?, ?it/s]

alpha=  0.0000 | train_RMSE=  7.1372 | val_RMSE=  6.5283 | time= 0.00 min
alpha=  0.1000 | train_RMSE=  7.1373 | val_RMSE=  6.5277 | time= 0.00 min
alpha=  1.0000 | train_RMSE=  7.1379 | val_RMSE=  6.5243 | time= 0.00 min
alpha= 10.0000 | train_RMSE=  7.1442 | val_RMSE=  6.5162 | time= 0.00 min
alpha=100.0000 | train_RMSE=  7.1866 | val_RMSE=  6.5077 | time= 0.00 min

Best Ridge alpha: 100.0
Best Ridge training RMSE: 7.1866
Best Ridge validation RMSE: 6.5077
Best Ridge training time: 0.00 minutes


Ridge regression on all features matches XGBoost's performance on the limited feature set (a 32% improvement), but the model is still far from overfitting.

In [9]:
from sklearn.metrics import root_mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Optional: progress bar
from tqdm.auto import tqdm

# This run drops the raw trip_miles / trip_time columns, so label it accordingly.
print("\n||||||| All features except raw trip_miles / trip_time |||||||\n")

# ------------------------------------
# 1. Linear models: L2 (Ridge) search
# ------------------------------------

# Note: with one-hot/sparse features, use with_mean=False
# If X_* are sparse, this will keep them sparse where possible.


# Positions of all 5 numeric columns (raw and log1p versions).
all_numeric_idx = [562, 563, 564, 565, 566]
# Take the column count from the matrix itself instead of hard-coding it.
n_features = X_full_train.shape[1]
all_idx = list(range(n_features))
cat_idx = [j for j in all_idx if j not in set(all_numeric_idx)]

# Keep and scale only the 3 log1p numerics; the raw columns 562 (trip_miles)
# and 564 (trip_time) appear in neither list and are therefore dropped.
numeric_idx = [563, 565, 566]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), numeric_idx),
        ("cat", "passthrough", cat_idx),
    ],
    remainder="drop"
)


# NOTE(review): alpha=0.0 is plain least squares; the scikit-learn docs advise
# LinearRegression for that case -- kept here as the unregularised baseline.
ridge_alphas = [0.0, 0.1, 1.0, 10.0, 100.0]

print("\n=== Ridge (L2) hyperparameter search ===")

# One tuple per alpha: (alpha, train RMSE, validation RMSE, fit seconds, fitted pipeline)
ridge_results = []
for alpha in tqdm(ridge_alphas, desc="Ridge alphas"):
    model_ridge = Pipeline([
        ("scaler", preprocess),
        ("reg", Ridge(alpha=alpha, random_state=0))
    ])

    # Time the fit only, so the reported "training time" excludes prediction.
    t0 = time.time()
    model_ridge.fit(X_full_train, y_train)
    t1 = time.time()
    train_time = t1 - t0

    rmse_train = root_mean_squared_error(y_train, model_ridge.predict(X_full_train))

    y_pred = model_ridge.predict(X_full_val)
    rmse_val = root_mean_squared_error(y_val, y_pred)

    print(f"alpha={alpha:8.4f} | train_RMSE={rmse_train:8.4f} | val_RMSE={rmse_val:8.4f} | time={train_time/60:5.2f} min")
    ridge_results.append((alpha, rmse_train, rmse_val, train_time, model_ridge))

# Pick best Ridge by validation RMSE (tuple position 2)
best_ridge_alpha, best_ridge_rmse_train, best_ridge_rmse_val, best_ridge_time, best_ridge_model = min(
    ridge_results,
    key=lambda x: x[2]
)

print(f"\nBest Ridge alpha: {best_ridge_alpha}")
print(f"Best Ridge training RMSE: {best_ridge_rmse_train:.4f}")
print(f"Best Ridge validation RMSE: {best_ridge_rmse_val:.4f}")
print(f"Best Ridge training time: {best_ridge_time/60:.2f} minutes")


||||||| All features |||||||


=== Ridge (L2) hyperparameter search ===


Ridge alphas:   0%|          | 0/5 [00:00<?, ?it/s]

alpha=  0.0000 | train_RMSE=  8.6908 | val_RMSE=  8.6861 | time= 0.00 min
alpha=  0.1000 | train_RMSE=  8.6908 | val_RMSE=  8.6858 | time= 0.00 min
alpha=  1.0000 | train_RMSE=  8.6913 | val_RMSE=  8.6841 | time= 0.00 min
alpha= 10.0000 | train_RMSE=  8.6985 | val_RMSE=  8.6776 | time= 0.00 min
alpha=100.0000 | train_RMSE=  8.7758 | val_RMSE=  8.7190 | time= 0.00 min

Best Ridge alpha: 10.0
Best Ridge training RMSE: 8.6985
Best Ridge validation RMSE: 8.6776
Best Ridge training time: 0.00 minutes


I realized that the first ridge regression includes the raw "trip_miles" and "trip_time" features (in addition to their log1p versions).  
Of the 32% improvement of ridge regression V2 (RMSE=6.72 compared to 10), 22% comes from these two features, and 10% comes from the pickup and dropoff locations.  
This actually makes sense, since the raw features share the same skewness as the target y.  
I will keep them for SGD and XGBoost.

In [10]:
# GridSearchCV version of SGD ElasticNet

from sklearn.linear_model import SGDRegressor
from tqdm.auto import tqdm
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


# Scale only the 5 numeric columns (raw and log1p versions); every other
# column is passed through unchanged.
numeric_idx = [562, 563, 564, 565, 566]
# Take the column count from the matrix itself instead of hard-coding it:
# ColumnTransformer drops any column not listed (remainder defaults to "drop").
all_idx = list(range(X_full_train.shape[1]))
cat_idx = [j for j in all_idx if j not in set(numeric_idx)]

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(with_mean=False), numeric_idx),
        ("cat", "passthrough", cat_idx),
    ]
)


# Grid: 4 regularisation strengths x 5 L1/L2 mixes = 20 candidates.
SGDalphas = [1e-5, 1e-4, 1e-3, 1e-2]
l1_ratios = [0.0, 0.05, 0.1, 0.2, 0.5]

model_SGD = Pipeline([
    ("scaler", preprocess),
    ("reg", SGDRegressor(
        loss="squared_error",
        penalty="elasticnet",
        # NOTE(review): a very small initial step size; together with early
        # stopping this may leave the model under-trained -- worth tuning.
        eta0=1e-5,
        max_iter=500,
        shuffle=True,
        early_stopping=True,
        n_iter_no_change=5,
        validation_fraction=0.1,
        tol=1e-3,
        # Seed the epoch shuffling and the early-stopping hold-out split so
        # the search is reproducible from run to run.
        random_state=42,
        verbose=0,
    )),
])

param_grid = {
    "reg__alpha": SGDalphas,
    "reg__l1_ratio": l1_ratios,
}

search = GridSearchCV(
    estimator=model_SGD,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=3,               # 20 candidates x 3 folds = 60 fits; cost grows with the row count
    n_jobs=2,           # run two fits in parallel
    verbose=2,
    return_train_score=True,
)

t0 = time.time()
search.fit(X_full_train, y_train)
t1 = time.time()
print(f"GridSearchCV done in {(t1 - t0)/60:.2f} minutes")


# The scorer is negated RMSE, so flip the sign back for reporting.
print("SGDEN Best params:", search.best_params_)
print("SGDEN Best CV score (RMSE):", -search.best_score_)
print(f"SGDEN search time: {(t1 - t0)/60:.2f} minutes")

best_sgd = search.best_estimator_


# Evaluate the best SGD model on the training and validation sets explicitly:
y_pred_sgd = best_sgd.predict(X_full_train)
sgd_train_rmse = root_mean_squared_error(y_train, y_pred_sgd)

y_pred_sgd = best_sgd.predict(X_full_val)
sgd_val_rmse = root_mean_squared_error(y_val, y_pred_sgd)
print(f"SGDEN training RMSE (best model): {sgd_train_rmse:.4f}")
print(f"SGDEN validation RMSE (best model): {sgd_val_rmse:.4f}")




Fitting 3 folds for each of 20 candidates, totalling 60 fits
GridSearchCV done in 0.24 minutes
SGDEN Best params: {'reg__alpha': 0.001, 'reg__l1_ratio': 0.5}
SGDEN Best CV score (RMSE): 9.151588832521895
SGDEN search time: 0.24 minutes
SGDEN training RMSE (best model): 9.2400
SGDEN validation RMSE (best model): 9.0919


SGD elastic net is worse than ridge regression again.

In [11]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint


# -------------------------------
# 3. XGBoost: randomized search
# -------------------------------

# Hold out 1% of the training rows as the early-stopping evaluation set.
X_temp, X_stop_xgb, y_temp, y_stop_xgb = train_test_split(
    X_full_train, y_train, test_size=0.01, random_state=42)
# Tune on a small subsample for speed: the 2% "test" side of this second split
# is the tuning set; X_temp / y_temp keep the remaining 98%.
X_temp, X_train_xgb, y_temp, y_train_xgb = train_test_split(
    X_temp, y_temp, test_size=0.02, random_state=42)

print(f"eval (early stopping) on {y_stop_xgb.shape[0]:d} rows")
print(f"train (CV tuning) on {y_train_xgb.shape[0]:d} rows")

# Discrete search space: 4 * 3 * 4 * 2 * 3 = 288 combinations, sampled below.
param_dist = {
    'max_depth': [5,8,11,16],
    'learning_rate': [0.3, 0.1, 0.05],
    'min_child_weight': [8, 25, 50, 100],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.3, 0.6, 0.8]
}

# Create the XGBRegressor shared by every candidate
xgb = XGBRegressor(
    tree_method="hist",
    enable_categorical=True,  # only relevant for pandas categorical dtypes; presumably a no-op for the encoded matrix
    n_estimators=2000,        # large, rely on early stopping
    objective="reg:squarederror",
    eval_metric="rmse",
    early_stopping_rounds=10,
    random_state=42,          # makes row/column subsampling reproducible
    n_jobs=-1
)

# Forwarded to every fit call: the same held-out set drives early stopping
# in each CV fold and in the final refit.
fit_params = {
    "eval_set": [(X_stop_xgb, y_stop_xgb)],
    "verbose": False,
}


search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=50,          # e.g. 50 random trials
    scoring="neg_root_mean_squared_error",
    verbose=4,          # shows progress of the search
    n_jobs=1,           # the estimator already uses all cores (n_jobs=-1)
    cv=3,
    random_state=42,    # same 50 candidates on every run
)

print("\n=== XGBoost hyperparameter search ===")
t0 = time.time()
search.fit(X_train_xgb, y_train_xgb, **fit_params)
t1 = time.time()




# The scorer is negated RMSE, so flip the sign back for reporting.
print("XGB Best params:", search.best_params_)
print("XGB Best CV score (RMSE):", -search.best_score_)
print(f"XGB search time: {(t1 - t0)/60:.2f} minutes")

best_xgb = search.best_estimator_


# Evaluate best XGB on the training and validation sets explicitly.
# NOTE(review): best_xgb was refit on X_train_xgb only, so the "training RMSE"
# below is measured on all of X_full_train, most of which the model never saw
# (it also includes the early-stopping rows).
y_pred_xgb = best_xgb.predict(X_full_train)
xgb_train_rmse = root_mean_squared_error(y_train, y_pred_xgb)

y_pred_xgb = best_xgb.predict(X_full_val)
xgb_val_rmse = root_mean_squared_error(y_val, y_pred_xgb)
print(f"XGB training RMSE (best model): {xgb_train_rmse:.4f}")
print(f"XGB validation RMSE (best model): {xgb_val_rmse:.4f}")



# ------------------------------------
# 4. Summary of model comparison
# ------------------------------------

#print("\n=== Summary (validation RMSE) ===")
#print(f"Ridge (L2)   : {best_ridge_rmse:.4f}  (alpha={best_ridge_alpha})")
#print(f"XGBoost      : {xgb_val_rmse:.4f}")

eval (early stopping) on 360 rows
train (CV tuning) on 713 rows

=== XGBoost hyperparameter search ===
Fitting 3 folds for each of 50 candidates, totalling 150 fits
[CV 1/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=5, min_child_weight=100, subsample=1.0;, score=-7.876 total time=   0.1s
[CV 2/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=5, min_child_weight=100, subsample=1.0;, score=-7.858 total time=   0.0s
[CV 3/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=5, min_child_weight=100, subsample=1.0;, score=-8.509 total time=   0.1s
[CV 1/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=16, min_child_weight=100, subsample=0.8;, score=-8.185 total time=   0.0s
[CV 2/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=16, min_child_weight=100, subsample=0.8;, score=-8.078 total time=   0.1s
[CV 3/3] END colsample_bytree=0.3, learning_rate=0.3, max_depth=16, min_child_weight=100, subsample=0.8;, score=-8.988 total time=   0.0s
[CV 1/3] E

In [12]:
# Summarise the randomized search: one row per sampled candidate, best rank first.
results = search.cv_results_
df_results = (
    pd.DataFrame({
        key: results[key]
        for key in (
            "mean_fit_time",
            "param_subsample",
            "param_min_child_weight",
            "param_max_depth",
            "param_learning_rate",
            "param_colsample_bytree",
        )
    })
    .assign(
        # Scores are negated RMSE; flip the sign so smaller is better.
        mean_CV_RMSE=-results["mean_test_score"],
        std_test_score=results["std_test_score"],
        rank=results["rank_test_score"],
    )
    .sort_values("rank")
)

# Distribution of the mean CV RMSE over all candidates.
print(df_results["mean_CV_RMSE"].describe())

count    50.000000
mean      7.044493
std       0.703876
min       6.204531
25%       6.460469
50%       6.751465
75%       7.877572
max       8.417053
Name: mean_CV_RMSE, dtype: float64


In [13]:
# Show only the ten best-ranked candidates rather than dumping the whole frame.
df_results.head(10)

Unnamed: 0,mean_fit_time,param_subsample,param_min_child_weight,param_max_depth,param_learning_rate,param_colsample_bytree,mean_CV_RMSE,std_test_score,rank
47,0.17505,0.8,8,5,0.1,0.8,6.204531,0.384654,1
28,0.174067,1.0,8,5,0.1,0.6,6.234218,0.406984,2
27,0.22588,0.8,8,8,0.1,0.8,6.237001,0.411708,3
35,0.258309,0.8,8,11,0.1,0.6,6.250593,0.37073,4
15,0.419726,0.8,8,11,0.05,0.6,6.300675,0.369596,5
31,0.203627,1.0,8,5,0.1,0.3,6.305102,0.438442,6
6,0.398088,1.0,8,8,0.05,0.8,6.323062,0.325216,7
49,0.352745,0.8,8,8,0.05,0.8,6.336359,0.373349,8
37,0.523685,1.0,8,8,0.05,0.3,6.416644,0.399045,9
10,0.121226,0.8,8,16,0.3,0.8,6.436916,0.445556,10
