In [1]:
# %% [markdown]
# # MODEL BUILDING

# %% [markdown]
# ## import the relevant libraries

# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import pickle


In [2]:

# %%
data = pd.read_csv('../dataSet/cleaned_car_data2.csv')
data.head()



Unnamed: 0,name,manufacturer,year,age,kilometerage,engine,transmission,price
0,Ford Fiesta,FORD,2003,22,175418.06,Petrol,Automatic,1500
1,Vauxhall Corsa,VAUXHALL,2003,22,175418.06,Petrol,Automatic,1500
2,Vauxhall Zafira,VAUXHALL,2003,22,175418.06,Petrol,Automatic,1500
3,Peugeot 107,PEUGEOT,2003,22,175418.06,Petrol,Automatic,1500
4,Vauxhall Corsa,VAUXHALL,2003,22,175418.06,Petrol,Automatic,1500


In [3]:

# %% [markdown]
# ## drop the name and year columns because it is irrelevant in our model building

# %%
data = data.drop(['year'], axis=1)

# %%
data.head()

# %%
data['name'].unique()

# %% [markdown]
# # label encode the categorical values

# %%
le_name = LabelEncoder()
le_manufacturer = LabelEncoder()
le_engine = LabelEncoder()
le_transmission = LabelEncoder()
data['name'] = le_name.fit_transform(data['name'])
data['manufacturer'] = le_manufacturer.fit_transform(data['manufacturer'])
data['engine'] = le_engine.fit_transform(data['engine'])
data['transmission'] = le_transmission.fit_transform(data['transmission'])

# %%
data.tail(
)

# %%




Unnamed: 0,name,manufacturer,age,kilometerage,engine,transmission,price
63968,63,4,5,50404.5288,3,0,84995
63969,466,41,2,4184.284,2,0,84995
63970,572,43,2,3057.746,0,0,84995
63971,398,34,3,12824.83046,3,0,85000
63972,79,5,1,7903.46874,2,0,85000


In [4]:
print(data['name'].unique()) 
print(data[data['name'] == 1].shape[0])

# %%
# creating X and y variables
X = data.drop(['price'], axis=1)
y = data['price'] * 10
X.head()
y.head()

# %%
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# %%
X_train.shape, X_test.shape, y_train.shape, y_test.shape


[149 506 523 374  98 178 153 266 378 217  70 363 411 135  19 179 104 501
 579 157 458 382 142 514 224 108 161 494 555 365 469 184 426 128 105 347
 141 145 164 433 273 380 174 371 228 442 291 459 419  65 170 272 285 330
 529 226 240  10 114 537  76 416 368 391 427 189 468 107 231 491 279 467
 392 553   9   3 581 379 592 188 513  13 521 341 276  59 452 229 274 447
 372 448 251 421 385 116 438 129 476 499  95 196 485 110 323 301 275 136
 299 559 126 503 342 177 351 197 449 570 429 361 190 584 199  21 286 302
 242 300 577 267  15 102 113 456 393 483 264 413 292 163 358 144 373 278
 386 348 210 227  20 571 139 500 534 515 357 356 451 362 213 241 152  93
 353 159 530 281 289 345 256  78 453 103 284 326 115 344 200  16 502   2
 375 290 158 282 394 211 441 590 558 376 431 460 566 520 531 335 444 493
 524 137 130 585 578 223 498 270 583 446 245 420 370 109 321 387 191 162
 132 214 410 322 203 143 324 295 173   7 280 352 146  17 455 327 192 138
 225 366  39 383 415 175 422 450 517 388 461 484 28

((51178, 6), (12795, 6), (51178,), (12795,))

In [5]:

# %%
# feature scale the X_train and X_test values

norm = MinMaxScaler().fit(X_train)

# transform training data
X_train = norm.transform(X_train)

# transform testing data
X_test = norm.transform(X_test)

print(X_train)
print('\n')
print(X_test)


# %%


[[0.55405405 0.61363636 0.0877193  0.19286702 0.         0.        ]
 [0.85472973 0.95454545 0.07017544 0.06140988 0.75       0.5       ]
 [0.51013514 0.61363636 0.01754386 0.05043235 0.         1.        ]
 ...
 [0.41385135 0.47727273 0.07017544 0.06427079 0.5        0.        ]
 [0.80405405 0.93181818 0.         0.012588   0.5        0.        ]
 [0.9847973  1.         0.38596491 0.25139992 0.75       0.        ]]


[[0.85472973 0.95454545 0.14035088 0.1499341  0.         0.5       ]
 [0.69425676 0.79545455 0.12280702 0.23787007 0.75       0.5       ]
 [0.1097973  0.11363636 0.03508772 0.0459674  0.75       0.5       ]
 ...
 [0.07263514 0.06818182 0.05263158 0.07407503 0.75       1.        ]
 [0.62668919 0.70454545 0.         0.0028802  0.75       0.        ]
 [0.82939189 0.93181818 0.03508772 0.06728278 0.5        0.        ]]


In [6]:
# models = {
#     'linear_model': LinearRegression(),
#     'lgbm_model':LGBMRegressor(random_state = 123),
#     'rf_model': RandomForestRegressor(
#         random_state=123,
#         n_estimators=200,
#         max_depth=12,
#         min_samples_leaf=5 
#     ),
    
#     'xgboost_model': XGBRegressor(
#         random_state=123,
#         n_estimators=200,
#         max_depth=8,
#         learning_rate=0.05,
#         tree_method='hist' # Faster training
#     ),
    
#     'ridge_model': Ridge(
#         random_state=123,
#         alpha=1.0          # Default regularization strength
#     )
#  }

# # %%
# def train_model(models: dict) -> pd.DataFrame:
#     """
#     It takes in a dictionary containing a key-pair of model name and estimators.
#     It returns a data frame containing the metrics of the trained model.
#     """
#     my_dict = {}
#     name_list, train_score_list, r_sqd_list, mae_list, rmse_list = [], [], [], [], []
#     for name, estimator in models.items():
#         # fit
#         estimator.fit(X_train, y_train)

#         # make predictions
#         y_pred = estimator.predict(X_test)

#         # metrics
#         train_score = estimator.score(X_train, y_train)
#         r_sqd = metrics.r2_score(y_test, y_pred)
#         mae = metrics.mean_absolute_error(y_test, y_pred)
#         mse = metrics.mean_squared_error(y_test, y_pred)
#         rmse = np.sqrt(mse)

#         # add the metrics to the empty list
#         name_list.append(name)
#         train_score_list.append(train_score)
#         r_sqd_list.append(r_sqd)
#         mae_list.append(mae)
#         rmse_list.append(rmse)

#     my_dict["Name"] = name_list
#     my_dict["Train_Score"] = train_score_list
#     my_dict["R_squared"] = r_sqd_list
#     my_dict["Mean_absolute_error"] = mae_list
#     my_dict["Root_mean_sqd_error"] = rmse_list

#     my_dataframe = pd.DataFrame(my_dict)
#     my_dataframe = my_dataframe.sort_values("Root_mean_sqd_error")
#     return my_dataframe

# # %%
# train_model(models)



KeyboardInterrupt: 

In [None]:
# grid = {
#     'colsample_bytree': [0.6,0.7,0.8],
#     'learning_rate': [0.01,0.05,0.1],
#     'max_depth': [9,12,14],
#     'min_child_weight': [0.01,0.05,0.1],
#     'n_estimators': [150,250 ,350],
#     'subsample': [0.8,0.7,0.9],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# best 0.7 , 0.1, 9, 0.1 , 350, 0.8 
# grid = {
#     'colsample_bytree': [0.7],
#     'learning_rate': [0.1],
#     'max_depth': [9],
#     'min_child_weight': [0.1],
#     'n_estimators': [350],
#     'subsample': [0.8],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# grid = {
#     'colsample_bytree': [0.6,0.7,0.8],
#     'learning_rate': [0.01,0.05,0.1],
#     'max_depth': [9,12,14],
#     'min_child_weight': [0.01,0.05,0.1],
#     'n_estimators': [150,250 ,350],
#     'subsample': [0.8,0.7,0.9],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# grid = {
#     'colsample_bytree': [0.6, 0.7, 0.8],  # Fraction of features to be used for each tree
#     'learning_rate': [0.01, 0.05, 0.1],   # Step size shrinkage used to prevent overfitting
#     'max_depth': [9, 12, 14],             # Maximum depth of a tree
#     'min_child_weight': [1, 5, 10],       # Minimum sum of instance weight (hessian) needed in a child
#     'n_estimators': [150, 250, 350],      # Number of boosting rounds
#     'subsample': [0.7, 0.8, 0.9],         # Fraction of samples to be used for each tree
#     'gamma': [0, 0.1, 0.2],               # Minimum loss reduction required to make a further partition on a leaf node
#     'reg_alpha': [0, 0.01, 0.1],          # L1 regularization term on weights
#     'reg_lambda': [1, 1.5, 2]             # L2 regularization term on weights
# }

# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1 
# )
grid = {
    'colsample_bytree': [0.7],  # Fraction of features to be used for each tree
    'learning_rate': [0.1],   # Step size shrinkage used to prevent overfitting
    'max_depth': [9],             # Maximum depth of a tree
    'min_child_weight': [1],       # Minimum sum of instance weight (hessian) needed in a child
    'n_estimators': [350],      # Number of boosting rounds
    'subsample': [0.8],         # Fraction of samples to be used for each tree
    'gamma': [0],               # Minimum loss reduction required to make a further partition on a leaf node
    'reg_alpha': [0.1],          # L1 regularization term on weights
    'reg_lambda': [1.5]             # L2 regularization term on weights
}

model = GridSearchCV(
    estimator=XGBRegressor(
        random_state=123,
        tree_method='hist'  # Keep the fast training method
    ),
    param_grid=grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1 
)
# Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 350, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.8

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [21]:
# %%
y_pred = model.predict(X_test)
print("Best parameters:", model.best_params_)
print("\nPrediction Metrics:")
print("R-squared Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, y_pred)))

# %%
grid_model = pd.DataFrame({
    'model': ['XGboost'],
    'r_squared': [metrics.r2_score(y_test, y_pred)],
    'mae': [mean_absolute_error(y_test, y_pred)],
    'rmse': [np.sqrt(metrics.mean_squared_error(y_test, y_pred))]
    })
grid_model


# # %%
# data = {"model": model, "normalization": norm}
# with open('../models/regressor3.pkl', 'wb') as file:
#     pickle.dump(data, file)



Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 350, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.8}

Prediction Metrics:
R-squared Score: 0.8978823187678526
Mean Absolute Error: 27792.53282807005
Root Mean Squared Error: 45230.05728960372


Unnamed: 0,model,r_squared,mae,rmse
0,XGboost,0.897882,27792.532828,45230.05729


In [10]:
# %% [markdown]
# #### Make Predictions on new data.

# %%
# Create test configurations using exact names from dataset
data = pd.read_csv('../dataSet/cleaned_car_data2.csv')

test_configurations = [
    ['Mercedes-Benz C Class', 'MERCEDES-BENZ', 5, 50000.0, 'Petrol', 'Automatic'],  # Luxury newer car
    ['Toyota Corolla', 'TOYOTA', 5, 50000.0, 'Petrol', 'Manual'],     # Economy newer car
    ['Bmw X5', 'BMW', 5, 50000.0, 'Diesel', 'Automatic'],            # Luxury SUV
    ['Dacia Sandero', 'DACIA', 5, 50000.0, 'Petrol', 'Manual'],      # Budget car
    ['Ford Fiesta', 'FORD', 5, 50000.0, 'Petrol', 'Manual'],         # Popular economy car
    ['Audi A5', 'AUDI', 5, 50000.0, 'Diesel', 'Automatic']           # Premium car
]

# Use the rest of the corrected code I provided earlier with these test configurations
# Fit the label encoders once with the training data
le_name.fit(data['name'])
le_manufacturer.fit(data['manufacturer'])
le_engine.fit(data['engine'])
le_transmission.fit(data['transmission'])

results = []
for config in test_configurations:
    try:
        # Create numpy array with the right shape
        new_data = np.zeros((1, 6))
        
        try:
            # Transform each feature using the appropriate encoder
            new_data[0, 0] = le_name.transform([config[0]])[0]  # Changed from fit_transform
            new_data[0, 1] = le_manufacturer.transform([config[1]])[0]
            new_data[0, 2] = float(config[2])  # age
            new_data[0, 3] = float(config[3])  # kilometerage
            new_data[0, 4] = le_engine.transform([config[4]])[0]
            new_data[0, 5] = le_transmission.transform([config[5]])[0]
        except ValueError as e:
            print(f"Warning: Unknown category in {config[0]}: {str(e)}")
            continue
            
        # Normalize using the same scaler used during training
        normalized_data = norm.transform(new_data)
        
        # Predict (removed the *10 multiplication)
        price = model.predict(normalized_data)
        
        results.append({
            'Car': config[0],
            'Manufacturer': config[1],
            'Age': config[2],
            'Mileage': f"{config[3]:,.0f}",
            'Engine': config[4],
            'Transmission': config[5],
            'Estimated Price': f"{price[0]:,.0f} MAD"  # Removed *10
        })
    except Exception as e:
        print(f"Error processing configuration {config}: {str(e)}")
        continue

# Display results
print("\nFeature Importance:")
feature_importance = pd.DataFrame({
    'feature': ['name', 'manufacturer', 'age', 'kilometerage', 'engine', 'transmission'],
    'importance': model.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)
print("\nPredictions:")
result_df = pd.DataFrame(results)
print(result_df)


Feature Importance:
        feature  importance
2           age    0.300223
5  transmission    0.292134
1  manufacturer    0.133431
0          name    0.122963
4        engine    0.090465
3  kilometerage    0.060784

Predictions:
                     Car   Manufacturer  Age Mileage  Engine Transmission  \
0  Mercedes-Benz C Class  MERCEDES-BENZ    5  50,000  Petrol    Automatic   
1         Toyota Corolla         TOYOTA    5  50,000  Petrol       Manual   
2                 Bmw X5            BMW    5  50,000  Diesel    Automatic   
3          Dacia Sandero          DACIA    5  50,000  Petrol       Manual   
4            Ford Fiesta           FORD    5  50,000  Petrol       Manual   
5                Audi A5           AUDI    5  50,000  Diesel    Automatic   

  Estimated Price  
0     329,492 MAD  
1     138,938 MAD  
2     463,164 MAD  
3     118,575 MAD  
4     120,329 MAD  
5     235,272 MAD  


In [17]:
# Combine training and test data
X_combined = np.vstack((X_train, X_test))
y_combined = np.concatenate((y_train, y_test))

# Fit the model on the combined dataset
model.fit(X_combined, y_combined, verbose=1)

# Make predictions on the combined dataset
y_pred_combined = model.predict(X_combined)

# Evaluate the model on the combined dataset
print("R-squared Score:", metrics.r2_score(y_combined, y_pred_combined))
print("Mean Absolute Error:", mean_absolute_error(y_combined, y_pred_combined))
print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_combined, y_pred_combined)))

R-squared Score: 0.9048598751839635
Mean Absolute Error: 28507.76036970255
Root Mean Squared Error: 43862.93864369047


In [19]:
# %% [markdown]
# #### Make Predictions on new data.

# %%
# Create test configurations using exact names from dataset
data = pd.read_csv('../dataSet/cleaned_car_data2.csv')

test_configurations = [
    ['Mercedes-Benz C Class', 'MERCEDES-BENZ', 5, 50000.0, 'Petrol', 'Automatic'],  # Luxury newer car
    ['Toyota Corolla', 'TOYOTA', 5, 50000.0, 'Petrol', 'Manual'],     # Economy newer car
    ['Bmw X5', 'BMW', 5, 50000.0, 'Diesel', 'Automatic'],            # Luxury SUV
    ['Dacia Sandero', 'DACIA', 5, 50000.0, 'Petrol', 'Manual'],      # Budget car
    ['Ford Fiesta', 'FORD', 5, 50000.0, 'Petrol', 'Manual'],         # Popular economy car
    ['Audi A5', 'AUDI', 5, 50000.0, 'Diesel', 'Automatic']           # Premium car
]

# Use the rest of the corrected code I provided earlier with these test configurations
# Fit the label encoders once with the training data
le_name.fit(data['name'])
le_manufacturer.fit(data['manufacturer'])
le_engine.fit(data['engine'])
le_transmission.fit(data['transmission'])

results = []
for config in test_configurations:
    try:
        # Create numpy array with the right shape
        new_data = np.zeros((1, 6))
        
        try:
            # Transform each feature using the appropriate encoder
            new_data[0, 0] = le_name.transform([config[0]])[0]  # Changed from fit_transform
            new_data[0, 1] = le_manufacturer.transform([config[1]])[0]
            new_data[0, 2] = float(config[2])  # age
            new_data[0, 3] = float(config[3])  # kilometerage
            new_data[0, 4] = le_engine.transform([config[4]])[0]
            new_data[0, 5] = le_transmission.transform([config[5]])[0]
        except ValueError as e:
            print(f"Warning: Unknown category in {config[0]}: {str(e)}")
            continue
            
        # Normalize using the same scaler used during training
        normalized_data = norm.transform(new_data)
        
        # Predict (removed the *10 multiplication)
        price = model.predict(normalized_data)
        
        results.append({
            'Car': config[0],
            'Manufacturer': config[1],
            'Age': config[2],
            'Mileage': f"{config[3]:,.0f}",
            'Engine': config[4],
            'Transmission': config[5],
            'Estimated Price': f"{price[0]:,.0f} MAD"  # Removed *10
        })
    except Exception as e:
        print(f"Error processing configuration {config}: {str(e)}")
        continue

# Display results
print("\nFeature Importance:")
feature_importance = pd.DataFrame({
    'feature': ['name', 'manufacturer', 'age', 'kilometerage', 'engine', 'transmission'],
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)
print("\nPredictions:")
result_df = pd.DataFrame(results)
print(result_df)


Feature Importance:
        feature  importance
2           age    0.329431
5  transmission    0.241670
1  manufacturer    0.179594
0          name    0.112557
4        engine    0.075778
3  kilometerage    0.060969

Predictions:
                     Car   Manufacturer  Age Mileage  Engine Transmission  \
0  Mercedes-Benz C Class  MERCEDES-BENZ    5  50,000  Petrol    Automatic   
1         Toyota Corolla         TOYOTA    5  50,000  Petrol       Manual   
2                 Bmw X5            BMW    5  50,000  Diesel    Automatic   
3          Dacia Sandero          DACIA    5  50,000  Petrol       Manual   
4            Ford Fiesta           FORD    5  50,000  Petrol       Manual   
5                Audi A5           AUDI    5  50,000  Diesel    Automatic   

  Estimated Price  
0     295,195 MAD  
1     128,416 MAD  
2     437,590 MAD  
3     119,119 MAD  
4     123,987 MAD  
5     225,432 MAD  
