# Model Training

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score


In [44]:
df = pd.read_csv("../data/cleaned_vehicle_data.csv")

In [45]:
df.head(20)

Unnamed: 0,make,model,price,fuel,transmission,trim,drivetrain,car_age
0,Jeep,Wagoneer,74600.0,Gasoline,8-Speed Automatic,Series II,Four-wheel Drive,1
1,Jeep,Grand Cherokee,50170.0,Gasoline,8-Speed Automatic,Laredo,Four-wheel Drive,1
2,GMC,Yukon XL,96410.0,Gasoline,Automatic,Denali,Four-wheel Drive,1
3,Dodge,Durango,46835.0,Gasoline,8-Speed Automatic,Pursuit,All-wheel Drive,2
4,RAM,3500,81663.0,Diesel,6-Speed Automatic,Laramie,Four-wheel Drive,1
5,Nissan,Murano,46000.0,Gasoline,Automatic CVT,Platinum,All-wheel Drive,1
6,Jeep,Wagoneer,63862.0,Gasoline,8-Speed Automatic,Base,Rear-wheel Drive,1
7,Ford,F-350,89978.0,Diesel,1-Speed Automatic,Lariat Super Duty,Four-wheel Drive,1
8,Hyundai,Tucson Hybrid,42230.0,Hybrid,6-Speed Automatic,Limited,All-wheel Drive,1
9,Jeep,Grand Cherokee,42773.0,Gasoline,Automatic,Altitude,Four-wheel Drive,1


In [47]:
# Separate features and target
X = df.drop("price", axis=1)
y = df["price"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Identify column types
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(exclude=['object']).columns.tolist()

# Preprocessor for pipeline
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numerical_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
])

# ----------------------------- #
# 🔧 1. Linear Regression
# ----------------------------- #
linreg_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression())
])

linreg_pipeline.fit(X_train, y_train)
y_pred_linreg = linreg_pipeline.predict(X_test)
print("Linear Regression R²:", r2_score(y_test, y_pred_linreg))


# ----------------------------- #
# 🔧 2. Ridge Regression with Hyperparameter Tuning
# ----------------------------- #
ridge_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Ridge())
])

ridge_params = {
    "regressor__alpha": [0.01, 0.1, 1, 10, 100]
}

ridge_grid = GridSearchCV(ridge_pipeline, ridge_params, cv=5, scoring="r2", n_jobs=-1)
ridge_grid.fit(X_train, y_train)

print("Best Ridge R²:", ridge_grid.best_score_)
print("Best Ridge Params:", ridge_grid.best_params_)


# ----------------------------- #
# 🔧 3. Lasso Regression with Hyperparameter Tuning
# ----------------------------- #
lasso_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", Lasso(max_iter=10000))
])

lasso_params = {
    "regressor__alpha": [0.01, 0.1, 1, 10, 100]
}

lasso_grid = GridSearchCV(lasso_pipeline, lasso_params, cv=5, scoring="r2", n_jobs=-1)
lasso_grid.fit(X_train, y_train)

print("Best Lasso R²:", lasso_grid.best_score_)
print("Best Lasso Params:", lasso_grid.best_params_)


# ----------------------------- #
# 🔧 4. ElasticNet with Hyperparameter Tuning
# ----------------------------- #
elastic_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", ElasticNet(max_iter=10000))
])

elastic_params = {
    "regressor__alpha": [0.01, 0.1, 1, 10],
    "regressor__l1_ratio": [0.1, 0.5, 0.9]
}

elastic_grid = GridSearchCV(elastic_pipeline, elastic_params, cv=5, scoring="r2", n_jobs=-1)
elastic_grid.fit(X_train, y_train)

print("Best ElasticNet R²:", elastic_grid.best_score_)
print("Best ElasticNet Params:", elastic_grid.best_params_)


Linear Regression R²: 0.8569565545437482
Best Ridge R²: 0.8090977022982118
Best Ridge Params: {'regressor__alpha': 0.1}
Best Lasso R²: 0.7943814937539072
Best Lasso Params: {'regressor__alpha': 1}
Best ElasticNet R²: 0.8054078760777355
Best ElasticNet Params: {'regressor__alpha': 0.01, 'regressor__l1_ratio': 0.9}


In [50]:
from sklearn.model_selection import cross_val_score

# Use the same pipeline you already trained: linreg_pipeline
scores = cross_val_score(linreg_pipeline, X, y, cv=5, scoring='r2')

print("Cross-validated R² scores:", scores)
print("Mean R²:", scores.mean())


Cross-validated R² scores: [0.74833934 0.81808601 0.87618222 0.85274481 0.77899187]
Mean R²: 0.8148688495564278


In [52]:
import joblib
joblib.dump(linreg_pipeline, '../model/linear_regression_model.pkl')


['../model/linear_regression_model.pkl']