In [34]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Baseline Regression Model: Mercedes Test Bench

## 1. New DF according to EDA results

In [9]:
# Load datasets
train_df = pd.read_csv("data/train.csv")
test_df  = pd.read_csv("data/test.csv")

# Drop columns with no variance (from EDA)
drop_cols = ['X11','X93','X107','X233','X235','X268','X289','X290','X293','X297','X330','X347']
train_df = train_df.drop(columns=drop_cols, errors="ignore")
test_df  = test_df.drop(columns=drop_cols, errors="ignore")

# 3) One-Hot Encode categorical columns
categorical_cols = ['X0','X1','X2','X3','X4','X5','X6','X8']
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

# Ohe on train categoricals
ohe_train = ohe.fit_transform(train_df[categorical_cols])
# Ohe on test categoricals
ohe_test  = ohe.transform(test_df[categorical_cols])

# Transformed dataframes (drop original categoricals, join OHE)
train_transformed = train_df.drop(columns=categorical_cols).join(ohe_train)
test_transformed  = test_df.drop(columns=categorical_cols).join(ohe_test)

# Quick checks
print("Shapes")
print("Original train_df shape:    ", train_df.shape)
print("Transformed train_df shape: ", train_transformed.shape)

print("\n Dropped Categorical Columns")
print(categorical_cols)

print("\n New OHE Columns Names")
print(ohe.get_feature_names_out(categorical_cols)[:20])  # show first 20 OHE columns
print(f"Total new OHE columns: {len(ohe.get_feature_names_out(categorical_cols))}")


Shapes
Original train_df shape:     (4209, 366)
Transformed train_df shape:  (4209, 553)

 Dropped Categorical Columns
['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']

 New OHE Columns Names
['X0_a' 'X0_aa' 'X0_ab' 'X0_ac' 'X0_ad' 'X0_af' 'X0_ai' 'X0_aj' 'X0_ak'
 'X0_al' 'X0_am' 'X0_ao' 'X0_ap' 'X0_aq' 'X0_as' 'X0_at' 'X0_au' 'X0_aw'
 'X0_ax' 'X0_ay']
Total new OHE columns: 195


In [14]:
# review from EDA: find the most correlated features 
num_cols = train_transformed.select_dtypes(include=np.number).columns
corr_series = (
    train_transformed[num_cols]
    .corr(numeric_only=True)['y']
    .drop(labels=['y','ID'], errors='ignore')
    .dropna()
    .abs()
    .sort_values(ascending=False)
)
print(corr_series.index[:12])  # shows top dozen names in order


Index(['X314', 'X261', 'X127', 'X279', 'X263', 'X232', 'X29', 'X0_az', 'X54',
       'X76', 'X136', 'X328'],
      dtype='object')


## 2. Basline model 3 features from the most correlated with Y 

### Define X and Y

In [15]:
# Define X, y from transformed training DF
feature_cols_all = [c for c in train_transformed.columns if c not in ['ID', 'y']] # exclude ID and target
X_all = train_transformed[feature_cols_all]
y_all = train_transformed['y']


Select only the top 3 most correlated features


In [16]:
# 1) Use the features you found by EDA correlation
top_by_corr = ['X314','X261','X127','X279','X263','X232','X29','X0_az','X54','X76','X136','X328']

# # exclude any missing columns
top_by_corr = [c for c in top_by_corr if c in X_all.columns] 

Select top 3 features

In [17]:
top3_feats  = top_by_corr[:3]
print("Top 3 features:", top3_feats)

Top 3 features: ['X314', 'X261', 'X127']


### Train-Test split

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42
)

### Fit Regression model (train)

In [20]:
lin_reg_top3 = LinearRegression()
lin_reg_top3.fit(X_train[top3_feats], y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Evaluate

In [26]:
# Predict on validation set
y_pred_top3 = lin_reg_top3.predict(X_valid[top3_feats])

# Metrics
rmse_top3 = np.sqrt(mean_squared_error(y_valid, y_pred_top3))
r2_top3   = r2_score(y_valid, y_pred_top3) 

print("Top 3 features:", top3_feats)

print("Evaluation: Linear Regression (Top-3 Features)")
print(f"RMSE: {rmse_top3:.3f}")
print(f"R²:   {r2_top3:.3f}")

Top 3 features: ['X314', 'X261', 'X127']
Evaluation: Linear Regression (Top-3 Features)
RMSE: 9.637
R²:   0.403


## 3. Baseline Model with 10 top features correlated with Y 

In [24]:
# 0) Define X, y
feature_cols_all = [c for c in train_transformed.columns if c not in ['ID', 'y']]
X_all = train_transformed[feature_cols_all]
y_all = train_transformed['y']

# features found by EDA correlation
top_by_corr = ['X314','X261','X127','X279','X263','X232','X29','X0_az','X54','X76','X136','X328']

# keep only existing columns
top_by_corr = [c for c in top_by_corr if c in X_all.columns]

#Select top 10
top10_feats = top_by_corr[:10]

print("Top 10 features:", top10_feats)

# 2) Train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42)

# 3) Fit & evaluate the Top-10 model
lin_reg_top10 = LinearRegression()
lin_reg_top10.fit(X_train[top10_feats], y_train)

y_pred_top10 = lin_reg_top10.predict(X_valid[top10_feats])
rmse_top10 = np.sqrt(mean_squared_error(y_valid, y_pred_top10))
r2_top10   = r2_score(y_valid, y_pred_top10)

print("\nEvaluation: Linear Regression (Top-10 Features)")
print(f"RMSE: {rmse_top10:.3f}")
print(f"R²:   {r2_top10:.3f}")


Top 10 features: ['X314', 'X261', 'X127', 'X279', 'X263', 'X232', 'X29', 'X0_az', 'X54', 'X76']

Evaluation: Linear Regression (Top-10 Features)
RMSE: 8.306
R²:   0.557


## 4. Evaluation Top 3 vs Top 10 


### Evaluation: Linear Regression (Top-3 Features)
- RMSE: 9.637 
model is off about 9.6 secounds (avg error)
- R²:   0.403
- Top 3 features: ['X314', 'X261', 'X127'] 

- Since most y values are around 90–110 seconds (from EDA), RMSE shows roughly a 10% error margin. 
- R2 shows that the model explains about 40% of the variance in the target variable y

### Evaluation: Linear Regression (Top-10 Features)
- RMSE: 8.306
model is off about 8.3 secounds (avg error)
- R²:   0.557
-Top 10 features: ['X314', 'X261', 'X127', 'X279', 'X263', 'X232', 'X29', 'X0_az', 'X54', 'X76'] 

**conclusions Model Top 10**
- Improvement of ~1.3 seconds over the top-3 model (RMSE)
- The model explains about 56% of the variance in y (R2) 

**Conclussion:**  Model improved with 10 features. Proceed to regularize in order to improve R2 and RMSE. Lower RMSE shows that Top 10 features model is the best alternative for a baseline. 

# 5. Regularization features: Ridge regression with Grid Search

### Scale (regularize) and perform Ridge regression with dafault tunning
Since it seems that several fatures are correlated (Ex= features transformed by OHE) we decided to go 1st with a Ridge Regression to handle multicollinearity better. Depending on results we can try Lasso to identify what features are important. 

Note: We'll use all features on Ridge and add alpha parameter for tuning grid search

In [None]:
# 1) Define X, y (using ALL features)
# use train_transformed df we created above (after dropping constants + OHE) 

feature_cols_all = [c for c in train_transformed.columns if c not in ['ID', 'y']]
X = train_transformed[feature_cols_all]
y = train_transformed['y']

# 2) Train/validation split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

## 3) Scale features (important for Ridge/Lasso)

scaler = StandardScaler() # scale features to mean=0, std=1 
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


# 4) Fit Ridge regression
ridge = Ridge(alpha=1.0)   # alpha is the how we control the grid search (can tune later). 
ridge.fit(X_train_scaled, y_train)

# 5) Predictions
y_pred = ridge.predict(X_test_scaled)

# 6) Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred) 

print("Ridge Regression (all features)")
print(f"RMSE: {rmse:.3f}")   # average prediction error in seconds
print(f"R²:   {r2:.3f}")     # proportion of variance explained

Ridge Regression (all features)
RMSE: 8.370
R²:   0.550


No much improvement over Top-10 linear regression, but at least no overfitting

## Ridge Regression with hyperparametr tunning (alpha: Grid Search)

In [None]:
# 1) Define X, y from transformed DataFrame (all features)
feature_cols_all = [c for c in train_transformed.columns if c not in ['ID', 'y']]
X = train_transformed[feature_cols_all]
y = train_transformed['y']

# 2) Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# 3) Scale features (fit scaler on TRAIN, then transform both) Note: there is a better method using Pipelines (future learn: reduce leaks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# 4) Set up Ridge and the alpha grid to search
ridge = Ridge()  # alpha will be done by GridSearchCV
param_grid = {
    'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

# 5) Grid search (3-fold CV on the TRAIN split)
ridge_cv = GridSearchCV(
    estimator=ridge,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    n_jobs=-1
)

# Fit on TRAIN ONLY (already scaled)
ridge_cv.fit(X_train_scaled, y_train)

print("Best params:", ridge_cv.best_params_)
print("Best CV R²:", ridge_cv.best_score_)

# 6) Evaluate the best model on the TEST split
y_pred = ridge_cv.predict(X_test_scaled)

mae  = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2   = r2_score(y_test, y_pred)

print("\nRidge (all features) — Test Performance")
print(f"MAE:  {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²:   {r2:.3f}")

# 7) What is the best stimator? best alpha?
best_ridge = ridge_cv.best_estimator_
print("\nChosen alpha:", best_ridge.alpha)

# (optional) intercept and coefficients:
print("Intercept:", best_ridge.intercept_)
coefs = best_ridge.coef_
print("Number of coefficients:", len(coefs))

Best params: {'alpha': 100.0}
Best CV R²: 0.5099806747308392

Ridge (all features) — Test Performance
MAE:  5.592
RMSE: 8.221
R²:   0.566

Chosen alpha: 100.0
Intercept: 100.68579447579448
Number of coefficients: 551


### What are the best parameters after grid search? 
- **Best Alpha = 100** Out of the grid [0.001, 0.01, 0.1, 1, 10, 100], the best performing alpha was 100 (very high alpha = strong regularization to control overfitting. Strong penalties) 
- **Best CV R2** = 0.509 Ridge explained around 51% of the variance - maybe: good generalization? 

## Performance
- **MAE: 5.592** model prediction are off by 5.6 seconds 
- **RMSE: 8.221** Average error is 8,2 seconds (not to different from baseline) 
- **R²: 0.566** Model explains 57% of the data variance 

#Conclusions
- Despite there is not a big improvement in R2 when compared with the baseline model, Ridge regression can help to avoid overfiting

** Ridge wih all features gave the best performance so far:**

- RMSE improved to 8.2 vs 9.6 (Top-3) and 8.3 (Top-10).

- R² improved to 0.566, explaining more variance than simpler baseline models