In [1]:
%load_ext kedro

In [2]:
df = catalog.load('feature_data')

In [34]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, RANSACRegressor, BayesianRidge, ARDRegression, SGDRegressor, PassiveAggressiveRegressor
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [13]:
reg_features = [
    'num__milage',
    'num__model_year', 
    'num__eng_hp',
    'num__premium_score',
    'num__col_enc',
    'num__eng_volume',
    'num__clean_title',
    'num__trans_mordernity',
    'cat__eng_class_Basic',
    'cat__trans_tech_High_Gear_Automatic'
]

selected_features = [
    'num__model_year',
    'num__milage',  # Note: your actual column is 'milage' not 'mileage'
    'num__accident',
    'num__clean_title',
    'num__premium_score',
    'num__eng_cylinders',
    'num__eng_volume',
    'num__eng_hp',
    'num__trans_mordernity',  # Note: your actual column is 'mordernity' not 'modernity'
    'num__col_enc',
    'cat__eng_class_Basic',
    'cat__eng_class_High_Performance',
    'cat__eng_class_Modern_Efficient',
    'cat__eng_class_Turbocharged',
    'cat__trans_tech_High_Gear_Automatic',
    'cat__trans_tech_Standard_Automatic',
    'cat__trans_gear_count_category_10_Speed',
    'cat__trans_gear_count_category_4-5_Speed',
    'cat__trans_gear_count_category_8-9_Speed',
    'cat__trans_gear_count_category_Unknown_Gears'
]

In [14]:
X = df[reg_features]
y = df['num__price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'HuberRegressor': HuberRegressor(),
    'RANSACRegressor': RANSACRegressor(),
    'BayesianRidge': BayesianRidge(),
    'ARDRegression': ARDRegression(),
    'SGDRegressor': SGDRegressor(),
    'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor()
}


In [26]:
results = []

for name, model in models.items():
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)
        results.append({
            'Model': name,
            'MAE': mae,
            'RMSE': rmse, 
            'R2_Score': r2
        })
        
    except Exception as e:
        print(f"Error with {name}: {e}")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('RMSE')

In [27]:
results_df

Unnamed: 0,Model,MAE,RMSE,R2_Score
10,GradientBoostingRegressor,0.24903,0.858228,0.1771752
11,HistGradientBoostingRegressor,0.249502,0.860659,0.1725076
0,LinearRegression,0.276643,0.865505,0.1631633
1,Ridge,0.276643,0.865505,0.1631632
6,BayesianRidge,0.276609,0.865506,0.1631603
7,ARDRegression,0.277141,0.865653,0.1628773
8,SGDRegressor,0.267753,0.866303,0.1616192
4,HuberRegressor,0.239889,0.887393,0.1203009
5,RANSACRegressor,0.259233,0.913291,0.06820603
3,ElasticNet,0.369032,0.946127,-7.329772e-07
