In [1]:
# %% [markdown]
# # MODEL BUILDING

# %% [markdown]
# ## import the relevant libraries

# %%
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import ExtraTreesRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error
from sklearn import metrics
import warnings
warnings.filterwarnings('ignore')
import pickle


In [2]:

# %%
# Load the cleaned car data CSV (the path is relative to the notebook's working directory)
# and preview the first rows.
data = pd.read_csv('../dataSet/cleaned_car_data2.csv')
data.head()



Unnamed: 0,name,manufacturer,year,age,kilometerage,engine,transmission,price
0,Ford Fiesta,FORD,2003,22,175418,Petrol,Automatic,1500
1,Vauxhall Corsa,VAUXHALL,2003,22,175418,Petrol,Automatic,1500
2,Vauxhall Zafira,VAUXHALL,2003,22,175418,Petrol,Automatic,1500
3,Peugeot 107,PEUGEOT,2003,22,175418,Petrol,Automatic,1500
4,Vauxhall Corsa,VAUXHALL,2003,22,175418,Petrol,Automatic,1500


In [3]:

# %% [markdown]
# ## Drop the `year` column (only `year` is removed here; `name` is kept and label-encoded below)

# %%
data = data.drop(columns=['year'])

# %%
data.head()

# %%
data['name'].unique()

# %% [markdown]
# # Label-encode the categorical columns

# %%
# One encoder per categorical column; each is kept under its own name so that later
# cells can encode new inputs with the same label -> integer mapping.
le_name, le_manufacturer, le_engine, le_transmission = (LabelEncoder() for _ in range(4))
for column, encoder in (
    ('name', le_name),
    ('manufacturer', le_manufacturer),
    ('engine', le_engine),
    ('transmission', le_transmission),
):
    # Replace the string labels in place with their integer codes.
    data[column] = encoder.fit_transform(data[column])

# %%
data.tail()

# %%




Unnamed: 0,name,manufacturer,age,kilometerage,engine,transmission,price
63968,63,4,5,50404,3,0,84995
63969,466,41,2,4184,2,0,84995
63970,572,43,2,3057,0,0,84995
63971,398,34,3,12824,3,0,85000
63972,79,5,1,7903,2,0,85000


In [4]:
# Sanity checks on the encoded `name` column: list the distinct codes and count the rows
# that carry code 1.
print(data['name'].unique())
print(data[data['name'] == 1].shape[0])

# %%
# Split the frame into the feature matrix X and the regression target y.
# NOTE(review): the target is the `price` column multiplied by PRICE_SCALE. The write-up at
# the end of this notebook describes a GBP -> MAD conversion factor of 12.8, which does not
# match the factor of 10 applied here -- confirm which one is intended.
PRICE_SCALE = 10
X = data.drop(columns=['price'])
y = data['price'] * PRICE_SCALE
X.head()
y.head()

# %%
# Hold out a fixed fraction of the rows for testing; the fixed seed keeps the split reproducible.
TEST_SIZE = 0.2
SPLIT_SEED = 12
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED
)

# %%
X_train.shape, X_test.shape, y_train.shape, y_test.shape


[149 506 523 374  98 178 153 266 378 217  70 363 411 135  19 179 104 501
 579 157 458 382 142 514 224 108 161 494 555 365 469 184 426 128 105 347
 141 145 164 433 273 380 174 371 228 442 291 459 419  65 170 272 285 330
 529 226 240  10 114 537  76 416 368 391 427 189 468 107 231 491 279 467
 392 553   9   3 581 379 592 188 513  13 521 341 276  59 452 229 274 447
 372 448 251 421 385 116 438 129 476 499  95 196 485 110 323 301 275 136
 299 559 126 503 342 177 351 197 449 570 429 361 190 584 199  21 286 302
 242 300 577 267  15 102 113 456 393 483 264 413 292 163 358 144 373 278
 386 348 210 227  20 571 139 500 534 515 357 356 451 362 213 241 152  93
 353 159 530 281 289 345 256  78 453 103 284 326 115 344 200  16 502   2
 375 290 158 282 394 211 441 590 558 376 431 460 566 520 531 335 444 493
 524 137 130 585 578 223 498 270 583 446 245 420 370 109 321 387 191 162
 132 214 410 322 203 143 324 295 173   7 280 352 146  17 455 327 192 138
 225 366  39 383 415 175 422 450 517 388 461 484 28

((51178, 6), (12795, 6), (51178,), (12795,))

In [5]:

# %%
# Min-max scale the features. The scaler learns its ranges from the training split only;
# the same fitted scaler is then applied to the test split.
norm = MinMaxScaler()
X_train = norm.fit_transform(X_train)
X_test = norm.transform(X_test)

# Show both scaled arrays, separated by blank lines.
print(X_train, '\n', X_test, sep='\n')


# %%


[[0.55405405 0.61363636 0.0877193  0.19286727 0.         0.        ]
 [0.85472973 0.95454545 0.07017544 0.06141017 0.75       0.5       ]
 [0.51013514 0.61363636 0.01754386 0.05043244 0.         1.        ]
 ...
 [0.41385135 0.47727273 0.07017544 0.06427045 0.5        0.        ]
 [0.80405405 0.93181818 0.         0.01258764 0.5        0.        ]
 [0.9847973  1.         0.38596491 0.25139918 0.75       0.        ]]


[[0.85472973 0.95454545 0.14035088 0.14993508 0.         0.5       ]
 [0.69425676 0.79545455 0.12280702 0.23787077 0.75       0.5       ]
 [0.1097973  0.11363636 0.03508772 0.04596824 0.75       0.5       ]
 ...
 [0.07263514 0.06818182 0.05263158 0.0740757  0.75       1.        ]
 [0.62668919 0.70454545 0.         0.00288026 0.75       0.        ]
 [0.82939189 0.93181818 0.03508772 0.06728253 0.5        0.        ]]


In [6]:
# Candidate regressors keyed by the label shown in the comparison table.
# Every estimator that accepts a seed uses random_state=123.
models = dict(
    linear_model=LinearRegression(),
    lgbm_model=LGBMRegressor(random_state=123),
    rf_model=RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        min_samples_leaf=5,
        random_state=123,
    ),
    xgboost_model=XGBRegressor(
        n_estimators=200,
        max_depth=8,
        learning_rate=0.05,
        tree_method='hist',  # histogram-based tree construction (faster training)
        random_state=123,
    ),
    ridge_model=Ridge(
        alpha=1.0,  # default regularization strength
        random_state=123,
    ),
)

def train_model(models: dict) -> pd.DataFrame:
    """
    Fit every estimator on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit``, ``predict`` and ``score``.
        Each estimator is fitted in place.

    Returns
    -------
    pd.DataFrame
        One row per model with the columns ``Name``, ``Train_Score``, ``R_squared``,
        ``Mean_absolute_error`` and ``Root_mean_sqd_error``, sorted by ascending RMSE.

    Notes
    -----
    Reads ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the notebook's
    global namespace rather than taking them as arguments.
    """
    columns = [
        "Name",
        "Train_Score",
        "R_squared",
        "Mean_absolute_error",
        "Root_mean_sqd_error",
    ]

    rows = []
    for label, regressor in models.items():
        regressor.fit(X_train, y_train)
        predictions = regressor.predict(X_test)

        rows.append({
            "Name": label,
            # score() on the training split: how well the model fits the data it has seen
            "Train_Score": regressor.score(X_train, y_train),
            # the remaining metrics are computed on the held-out test split
            "R_squared": metrics.r2_score(y_test, predictions),
            "Mean_absolute_error": metrics.mean_absolute_error(y_test, predictions),
            "Root_mean_sqd_error": np.sqrt(metrics.mean_squared_error(y_test, predictions)),
        })

    # Passing the column list explicitly keeps the frame well-formed even with no rows.
    return pd.DataFrame(rows, columns=columns).sort_values("Root_mean_sqd_error")

# Fit every candidate in `models` and display the comparison table (lowest RMSE first).
train_model(models)



Unnamed: 0,Name,Train_Score,R_squared,Mean_absolute_error,Root_mean_sqd_error
3,xgboost_model,0.90824,0.880619,30993.313013,48903.996996
2,rf_model,0.885372,0.856261,32447.815176,53661.730446
1,lgbm_model,0.85437,0.844469,36318.839399,55819.325641
4,ridge_model,0.450127,0.433038,74466.377762,106574.598803
0,linear_model,0.45013,0.432941,74456.266741,106583.705446


In [7]:
# grid = {
#     'colsample_bytree': [0.6,0.7,0.8],
#     'learning_rate': [0.01,0.05,0.1],
#     'max_depth': [9,12,14],
#     'min_child_weight': [0.01,0.05,0.1],
#     'n_estimators': [150,250 ,350],
#     'subsample': [0.8,0.7,0.9],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# best 0.7 , 0.1, 9, 0.1 , 350, 0.8 
# grid = {
#     'colsample_bytree': [0.7],
#     'learning_rate': [0.1],
#     'max_depth': [9],
#     'min_child_weight': [0.1],
#     'n_estimators': [350],
#     'subsample': [0.8],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# grid = {
#     'colsample_bytree': [0.6,0.7,0.8],
#     'learning_rate': [0.01,0.05,0.1],
#     'max_depth': [9,12,14],
#     'min_child_weight': [0.01,0.05,0.1],
#     'n_estimators': [150,250 ,350],
#     'subsample': [0.8,0.7,0.9],
# }


# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1  # Use all available cores
# )

# grid = {
#     'colsample_bytree': [0.6, 0.7, 0.8],  # Fraction of features to be used for each tree
#     'learning_rate': [0.01, 0.05, 0.1],   # Step size shrinkage used to prevent overfitting
#     'max_depth': [9, 12, 14],             # Maximum depth of a tree
#     'min_child_weight': [1, 5, 10],       # Minimum sum of instance weight (hessian) needed in a child
#     'n_estimators': [150, 250, 350],      # Number of boosting rounds
#     'subsample': [0.7, 0.8, 0.9],         # Fraction of samples to be used for each tree
#     'gamma': [0, 0.1, 0.2],               # Minimum loss reduction required to make a further partition on a leaf node
#     'reg_alpha': [0, 0.01, 0.1],          # L1 regularization term on weights
#     'reg_lambda': [1, 1.5, 2]             # L2 regularization term on weights
# }

# model = GridSearchCV(
#     estimator=XGBRegressor(
#         random_state=123,
#         tree_method='hist'  # Keep the fast training method
#     ),
#     param_grid=grid,
#     scoring='neg_root_mean_squared_error',
#     cv=5,
#     verbose=1,
#     n_jobs=-1 
# )
# Hyperparameter values retained from the earlier, wider searches (kept above as comments).
tuned_params = {
    'colsample_bytree': 0.7,  # Fraction of features to be used for each tree
    'learning_rate': 0.1,     # Step size shrinkage used to prevent overfitting
    'max_depth': 9,           # Maximum depth of a tree
    'min_child_weight': 1,    # Minimum sum of instance weight (hessian) needed in a child
    'n_estimators': 350,      # Number of boosting rounds
    'subsample': 0.8,         # Fraction of samples to be used for each tree
    'gamma': 0,               # Minimum loss reduction required to make a further partition on a leaf node
    'reg_alpha': 0.1,         # L1 regularization term on weights
    'reg_lambda': 1.5,        # L2 regularization term on weights
}

# GridSearchCV expects a list of candidates per parameter; every list here holds a single
# value, so the "search" cross-validates exactly one configuration.
grid = {param: [value] for param, value in tuned_params.items()}

model = GridSearchCV(
    estimator=XGBRegressor(
        tree_method='hist',  # Keep the fast training method
        random_state=123,
    ),
    param_grid=grid,
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
# Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 350, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.8}

In [9]:
# Run the cross-validated search on the scaled training split. `verbose = 1` is handed to
# GridSearchCV.fit as an extra fit parameter, which is forwarded to the estimator's own fit.
model.fit(X_train,y_train, verbose = 1)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [10]:
# %%
# Score the tuned model on the held-out test split.
y_pred = model.predict(X_test)
print("Best parameters:", model.best_params_)
print("\nPrediction Metrics:")

# Each metric is computed once and reused for both the printout and the summary table.
test_r2 = metrics.r2_score(y_test, y_pred)
print("R-squared Score:", test_r2)
test_mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", test_mae)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error:", test_rmse)

# %%
# One-row summary table of the same test-set metrics.
grid_model = pd.DataFrame(
    [{'model': 'XGboost', 'r_squared': test_r2, 'mae': test_mae, 'rmse': test_rmse}]
)
grid_model


# # %%
# data = {"model": model, "normalization": norm}
# with open('../models/regressor3.pkl', 'wb') as file:
#     pickle.dump(data, file)



Best parameters: {'colsample_bytree': 0.7, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 350, 'reg_alpha': 0.1, 'reg_lambda': 1.5, 'subsample': 0.8}

Prediction Metrics:
R-squared Score: 0.8978823187678526
Mean Absolute Error: 27792.53282807005
Root Mean Squared Error: 45230.05728960372


Unnamed: 0,model,r_squared,mae,rmse
0,XGboost,0.897882,27792.532828,45230.05729


In [10]:
# %% [markdown]
# #### Make Predictions on new data.

# %%
# Hand-picked listings to price. Each entry is
# [name, manufacturer, age, kilometerage, engine, transmission]; the category spellings
# must match labels the encoders were fitted on.
test_configurations = [
    ['Mercedes-Benz C Class', 'MERCEDES-BENZ', 5, 50000.0, 'Petrol', 'Automatic'],  # Luxury newer car
    ['Toyota Corolla', 'TOYOTA', 5, 50000.0, 'Petrol', 'Manual'],     # Economy newer car
    ['Bmw X5', 'BMW', 5, 50000.0, 'Diesel', 'Automatic'],            # Luxury SUV
    ['Dacia Sandero', 'DACIA', 5, 50000.0, 'Petrol', 'Manual'],      # Budget car
    ['Ford Fiesta', 'FORD', 5, 50000.0, 'Petrol', 'Manual'],         # Popular economy car
    ['Audi A5', 'AUDI', 5, 50000.0, 'Diesel', 'Automatic']           # Premium car
]

# The label encoders (le_name, le_manufacturer, le_engine, le_transmission) were already
# fitted on these same CSV columns in the label-encoding cell, so they are reused as-is.
# Re-reading the CSV into `data` and refitting them here was a no-op for the encoders and
# silently replaced the encoded `data` frame with the raw one.

results = []
for config in test_configurations:
    name, manufacturer, age, kilometerage, engine, transmission = config

    try:
        # Same column order as the training features. transform() raises ValueError for a
        # label the encoder has never seen; that configuration is reported and skipped.
        new_data = np.array([[
            le_name.transform([name])[0],
            le_manufacturer.transform([manufacturer])[0],
            float(age),
            float(kilometerage),
            le_engine.transform([engine])[0],
            le_transmission.transform([transmission])[0],
        ]], dtype=float)
    except ValueError as e:
        print(f"Warning: Unknown category in {name}: {str(e)}")
        continue

    # Scale with the scaler fitted on the training split, then predict. Failures here are
    # not specific to one configuration (e.g. an unfitted scaler or model), so they are
    # allowed to propagate instead of being swallowed by a catch-all handler.
    normalized_data = norm.transform(new_data)

    # The target was already scaled when `y` was built, so the prediction is used as-is.
    price = model.predict(normalized_data)

    results.append({
        'Car': name,
        'Manufacturer': manufacturer,
        'Age': age,
        'Mileage': f"{kilometerage:,.0f}",
        'Engine': engine,
        'Transmission': transmission,
        'Estimated Price': f"{price[0]:,.0f} MAD"
    })

# Display results
print("\nFeature Importance:")
feature_importance = pd.DataFrame({
    'feature': ['name', 'manufacturer', 'age', 'kilometerage', 'engine', 'transmission'],
    # `model` is the GridSearchCV wrapper; the importances live on the refitted estimator.
    'importance': model.best_estimator_.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)
print("\nPredictions:")
result_df = pd.DataFrame(results)
print(result_df)


Feature Importance:
        feature  importance
2           age    0.300223
5  transmission    0.292134
1  manufacturer    0.133431
0          name    0.122963
4        engine    0.090465
3  kilometerage    0.060784

Predictions:
                     Car   Manufacturer  Age Mileage  Engine Transmission  \
0  Mercedes-Benz C Class  MERCEDES-BENZ    5  50,000  Petrol    Automatic   
1         Toyota Corolla         TOYOTA    5  50,000  Petrol       Manual   
2                 Bmw X5            BMW    5  50,000  Diesel    Automatic   
3          Dacia Sandero          DACIA    5  50,000  Petrol       Manual   
4            Ford Fiesta           FORD    5  50,000  Petrol       Manual   
5                Audi A5           AUDI    5  50,000  Diesel    Automatic   

  Estimated Price  
0     329,492 MAD  
1     138,938 MAD  
2     463,164 MAD  
3     118,575 MAD  
4     120,329 MAD  
5     235,272 MAD  


In [11]:
# Stack the (already scaled) training and test features, and their targets, into one set.
X_combined = np.vstack((X_train, X_test))
y_combined = np.concatenate((y_train, y_test))

# Refit the search on every labelled row.
model.fit(X_combined, y_combined, verbose=1)

# Predict the same rows the model was just fitted on.
y_pred_combined = model.predict(X_combined)

# These numbers are IN-SAMPLE: the model has seen every row it is scored on, so they measure
# fit rather than generalization and are not comparable with the earlier held-out test
# metrics. An apparent "improvement" here is expected and is not evidence of a better model.
combined_r2 = metrics.r2_score(y_combined, y_pred_combined)
combined_mae = mean_absolute_error(y_combined, y_pred_combined)
combined_rmse = np.sqrt(mean_squared_error(y_combined, y_pred_combined))

print("In-sample metrics (model scored on the rows it was trained on):")
print("R-squared Score:", combined_r2)
print("Mean Absolute Error:", combined_mae)
print("Root Mean Squared Error:", combined_rmse)

# The search scores with 'neg_root_mean_squared_error', so negating best_score_ gives the
# mean RMSE over the held-out cross-validation folds -- an out-of-sample estimate.
print("Cross-validated RMSE (mean over held-out folds):", -model.best_score_)

# Same one-row layout as the earlier summary table, now holding the in-sample values.
grid_model = pd.DataFrame({
    'model': ['XGboost'],
    'r_squared': [combined_r2],
    'mae': [combined_mae],
    'rmse': [combined_rmse]
    })
grid_model

Fitting 5 folds for each of 1 candidates, totalling 5 fits
R-squared Score: 0.9445242869575412
Mean Absolute Error: 21615.355905816406
Root Mean Squared Error: 33494.02286236115


Unnamed: 0,model,r_squared,mae,rmse
0,XGboost,0.944524,21615.355906,33494.022862


In [19]:
# %% [markdown]
# #### Make Predictions on new data.

# %%
# Hand-picked listings to price. Each entry is
# [name, manufacturer, age, kilometerage, engine, transmission]; the category spellings
# must match labels the encoders were fitted on.
test_configurations = [
    ['Mercedes-Benz C Class', 'MERCEDES-BENZ', 5, 50000.0, 'Petrol', 'Automatic'],  # Luxury newer car
    ['Toyota Corolla', 'TOYOTA', 5, 50000.0, 'Petrol', 'Manual'],     # Economy newer car
    ['Bmw X5', 'BMW', 5, 50000.0, 'Diesel', 'Automatic'],            # Luxury SUV
    ['Dacia Sandero', 'DACIA', 5, 50000.0, 'Petrol', 'Manual'],      # Budget car
    ['Ford Fiesta', 'FORD', 5, 50000.0, 'Petrol', 'Manual'],         # Popular economy car
    ['Audi A5', 'AUDI', 5, 50000.0, 'Diesel', 'Automatic']           # Premium car
]

# The label encoders (le_name, le_manufacturer, le_engine, le_transmission) were already
# fitted on these same CSV columns in the label-encoding cell, so they are reused as-is.
# Re-reading the CSV into `data` and refitting them here was a no-op for the encoders and
# silently replaced the encoded `data` frame with the raw one.

results = []
for config in test_configurations:
    name, manufacturer, age, kilometerage, engine, transmission = config

    try:
        # Same column order as the training features. transform() raises ValueError for a
        # label the encoder has never seen; that configuration is reported and skipped.
        new_data = np.array([[
            le_name.transform([name])[0],
            le_manufacturer.transform([manufacturer])[0],
            float(age),
            float(kilometerage),
            le_engine.transform([engine])[0],
            le_transmission.transform([transmission])[0],
        ]], dtype=float)
    except ValueError as e:
        print(f"Warning: Unknown category in {name}: {str(e)}")
        continue

    # Scale with the scaler fitted on the training split, then predict. Failures here are
    # not specific to one configuration (e.g. an unfitted scaler or model), so they are
    # allowed to propagate instead of being swallowed by a catch-all handler.
    normalized_data = norm.transform(new_data)

    # The target was already scaled when `y` was built, so the prediction is used as-is.
    price = model.predict(normalized_data)

    results.append({
        'Car': name,
        'Manufacturer': manufacturer,
        'Age': age,
        'Mileage': f"{kilometerage:,.0f}",
        'Engine': engine,
        'Transmission': transmission,
        'Estimated Price': f"{price[0]:,.0f} MAD"
    })

# GridSearchCV does not expose `feature_importances_`; they live on the refitted
# `best_estimator_`. Fall back to `model` itself in case it has been rebound to a plain
# regressor, so the cell works in both situations.
fitted_regressor = getattr(model, 'best_estimator_', model)

# Display results
print("\nFeature Importance:")
feature_importance = pd.DataFrame({
    'feature': ['name', 'manufacturer', 'age', 'kilometerage', 'engine', 'transmission'],
    'importance': fitted_regressor.feature_importances_
}).sort_values('importance', ascending=False)
print(feature_importance)
print("\nPredictions:")
result_df = pd.DataFrame(results)
print(result_df)


Feature Importance:
        feature  importance
2           age    0.329431
5  transmission    0.241670
1  manufacturer    0.179594
0          name    0.112557
4        engine    0.075778
3  kilometerage    0.060969

Predictions:
                     Car   Manufacturer  Age Mileage  Engine Transmission  \
0  Mercedes-Benz C Class  MERCEDES-BENZ    5  50,000  Petrol    Automatic   
1         Toyota Corolla         TOYOTA    5  50,000  Petrol       Manual   
2                 Bmw X5            BMW    5  50,000  Diesel    Automatic   
3          Dacia Sandero          DACIA    5  50,000  Petrol       Manual   
4            Ford Fiesta           FORD    5  50,000  Petrol       Manual   
5                Audi A5           AUDI    5  50,000  Diesel    Automatic   

  Estimated Price  
0     295,195 MAD  
1     128,416 MAD  
2     437,590 MAD  
3     119,119 MAD  
4     123,987 MAD  
5     225,432 MAD  


### Abstract
The Car Price Prediction Project aims to develop a machine learning application that accurately predicts the prices of used cars based on various features such as manufacturer, engine type, transmission, mileage, and age. This project involves data scraping from the AA Cars website, comprehensive data cleaning and preprocessing, exploratory data analysis, and the implementation of multiple machine learning models. The final model, trained on a combined dataset, demonstrates a slight performance improvement over individual datasets. The project also includes an interactive web interface built with Flask, allowing users to input car details and receive price predictions and recommendations for similar cars. This paper presents the methodology, model evaluation, and results, highlighting the effectiveness of the chosen approach in predicting car prices and providing recommendations.

### Introduction
The used car market is a dynamic and complex industry where accurate pricing is crucial for both buyers and sellers. Predicting the price of a used car involves considering various factors such as the car's manufacturer, engine type, transmission, mileage, and age. Traditional methods of car price estimation often rely on expert knowledge and manual assessments, which can be subjective and inconsistent. With the advent of machine learning, it is possible to develop models that can predict car prices more accurately and consistently.

This research paper presents the Car Price Prediction Project, which aims to leverage machine learning techniques to predict the prices of used cars and recommend similar cars based on input features. The project involves several key steps: data scraping from the AA Cars website, data cleaning and preprocessing, exploratory data analysis, and the implementation and evaluation of multiple machine learning models. The final model, trained on a combined dataset, shows a slight performance improvement, indicating the effectiveness of the chosen approach.

The paper is structured as follows: Section 2 reviews related work in the field of car price prediction and machine learning models. Section 3 describes the methodology, including data collection, preprocessing, feature engineering, model training, and the recommendation system. Section 4 presents the results, including model evaluation and comparison. Section 5 discusses the limitations of the approach and potential future work. Finally, Section 6 concludes the paper by summarizing the key findings and contributions.

By developing an accurate and reliable car price prediction model and a recommendation system, this project aims to provide valuable insights and tools for both buyers and sellers in the used car market, ultimately contributing to more informed decision-making and fairer pricing.

### Related Works

Research on estimating the price of used cars is relatively recent and not extensively covered. In her MSc thesis, Listiani [3] demonstrated that a regression model using support vector machines (SVM) can predict the residual price of leased cars more accurately than simple multiple regression or multivariate regression. SVMs are particularly effective in handling high-dimensional data (numerous features used to predict the price) and can avoid both overfitting and underfitting. She employed a genetic algorithm to optimize the SVM parameters efficiently. However, the study did not express the improvement of SVM regression over simple regression in straightforward measures like mean deviation or variance.

In another university thesis, Richardson [4] explored the hypothesis that car manufacturers aim to produce vehicles that do not depreciate quickly. Using multiple regression analysis, he found that hybrid cars (vehicles with both an internal combustion engine and an electric motor) retain their value better than traditional vehicles. This is likely due to increased environmental concerns and higher fuel efficiency. The study also considered other factors such as age, mileage, make, and MPG (miles per gallon). Data for this study was collected from various websites.

Wu et al. [5] utilized a neuro-fuzzy knowledge-based system to predict the price of used cars, considering only three factors: the make of the car, the year of manufacture, and the engine style. The proposed system produced results comparable to simple regression methods. In the USA, car dealers sell hundreds of thousands of cars annually through leasing. Most of these cars are returned at the end of the leasing period and must be resold. Accurately pricing these cars is crucial for economic success. To address this, Du et al. [6] developed the ODAV (Optimal Distribution of Auction Vehicles) system, which not only estimates the best resale price but also advises on the optimal location to sell the car. Given the vast size of the United States, the selling location significantly impacts the price of used cars. A k-nearest neighbor regression model was used for price forecasting. Since its inception in 2003, the system has distributed over two million vehicles.

Gonggi [7] proposed a model based on artificial neural networks to forecast the residual value of private used cars. The study focused on features such as mileage, manufacturer, and estimated useful life. The model was optimized to handle nonlinear relationships, which simple linear regression methods cannot manage. The model proved to be reasonably accurate in predicting the residual value of used cars.

In a study by Sameerchand Pudaruth [8], supervised machine learning techniques were applied to predict the price of used cars in Mauritius. The predictions were based on historical data collected from daily newspapers. Various techniques, including multiple linear regression analysis, k-nearest neighbors, naïve Bayes, and decision trees, were used to make the predictions. The predictions were evaluated and compared to identify the best-performing methods. The study concluded that predicting the price of used cars is a challenging problem that requires sophisticated algorithms for high accuracy. All four methods provided comparable performance.

Inspired by the study conducted by Sameerchand Pudaruth [8], this research explores the application of machine learning techniques to predict the price of used cars. The studies reviewed above illustrate the diverse approaches and techniques used in predicting the price of used cars, such as support vector machines, multiple regression analysis, neuro-fuzzy systems, k-nearest neighbor regression, and artificial neural networks. Each method has its advantages and limitations, and the choice of method depends on the specific requirements and constraints of the problem.

### Limitations

Despite the promising results obtained from the Car Price Prediction Project, several limitations need to be acknowledged:

1. **Data Quality and Availability**:
   - The accuracy of the predictions heavily depends on the quality and completeness of the data. The data scraped from the AA Cars website may contain inaccuracies, missing values, or inconsistencies that could affect the model's performance.
   - The dataset used in this project may not be representative of the entire used car market, as it is limited to the listings available on a single website. This could lead to biased predictions that do not generalize well to other datasets or markets.

2. **Geographical and Currency Limitations**:
   - The data was scraped from the AA Cars website, which primarily serves the UK market. However, the project aims to predict car prices for Moroccan users. Although we converted prices from GBP to MAD by multiplying by 12.8, other regional factors influencing car prices in Morocco may not be captured.
   - Differences in market dynamics, consumer preferences, and economic conditions between the UK and Morocco could impact the accuracy and relevance of the predictions.

3. **Feature Limitations**:
   - The features used in the model, such as manufacturer, engine type, transmission, mileage, and age, may not capture all the factors influencing a car's price. Other important factors, such as the car's condition, service history, and market demand, were not included due to data unavailability.
   - The model does not account for external factors such as economic conditions, seasonal trends, or regional variations, which can significantly impact car prices.

4. **Model Complexity and Interpretability**:
   - While complex models like XGBoost and CatBoost can achieve high accuracy, they are often less interpretable than simpler models like linear regression. This can make it challenging to understand the model's decision-making process and explain the predictions to end-users.
   - The recommendation system based on K-Nearest Neighbors (KNN) may not scale well with large datasets, as the computational complexity increases with the number of data points.

5. **Hyperparameter Tuning**:
   - Hyperparameter tuning was performed using GridSearchCV, which can be computationally expensive and time-consuming. More advanced techniques like Bayesian optimization or random search could potentially yield better results with less computational effort.
   - The optimal hyperparameters found during tuning may not be universally applicable to all datasets or markets, limiting the model's generalizability.

6. **Deployment and Maintenance**:
   - The deployment of the web application on Azure Web App ensures accessibility, but it also introduces challenges related to scalability, security, and maintenance. Ensuring the application can handle high traffic and remains secure against potential threats requires ongoing effort.
   - Regular updates to the model and data are necessary to maintain the accuracy and relevance of the predictions. This involves continuous monitoring, retraining, and validation of the model as new data becomes available.

7. **User Interaction and Experience**:
   - The web interface built with Flask provides a user-friendly way to input car details and receive predictions, but it may not cater to all user needs or preferences. Enhancing the user experience with additional features, such as interactive visualizations or personalized recommendations, could improve user satisfaction.
   - The recommendation system suggests similar cars based on input features, but it may not always align with user preferences or expectations. Incorporating user feedback and preferences into the recommendation algorithm could enhance its relevance and usefulness.

By acknowledging these limitations, we can identify areas for improvement and future work to enhance the accuracy, robustness, and usability of the Car Price Prediction Project.

### Limitations (Summary)

Despite the promising results, several limitations need to be acknowledged:

1. **Data Quality and Availability**:
   - The accuracy of predictions depends on the quality of data scraped from the AA Cars website, which may contain inaccuracies or inconsistencies.
   - The dataset may not represent the entire used car market, leading to biased predictions.

2. **Geographical and Currency Limitations**:
   - The data is from the UK market (AA Cars website), but the project targets Moroccan users. Prices were converted from GBP to MAD by multiplying by 12.8, but regional factors affecting car prices in Morocco may not be captured.
   - Differences in market dynamics and economic conditions between the UK and Morocco could impact prediction accuracy.

3. **Feature Limitations**:
   - Important factors like car condition, service history, and market demand were not included due to data unavailability.
   - External factors such as economic conditions and seasonal trends were not considered.

4. **Model Complexity and Interpretability**:
   - Complex models like XGBoost and CatBoost are less interpretable than simpler models, making it challenging to explain predictions.
   - The KNN-based recommendation system may not scale well with large datasets.

5. **Hyperparameter Tuning**:
   - Hyperparameter tuning using GridSearchCV is computationally expensive and time-consuming. More advanced techniques could yield better results.
   - Optimal hyperparameters may not be universally applicable, limiting generalizability.

6. **Deployment and Maintenance**:
   - Deployment on Azure Web App introduces challenges related to scalability, security, and maintenance.
   - Regular updates to the model and data are necessary to maintain accuracy and relevance.

7. **User Interaction and Experience**:
   - The Flask web interface may not cater to all user needs. Enhancing the user experience with additional features could improve satisfaction.
   - The recommendation system may not always align with user preferences. Incorporating user feedback could enhance its relevance.

By acknowledging these limitations, we can identify areas for improvement and future work to enhance the Car Price Prediction Project's accuracy, robustness, and usability.

### Conclusion

The Car Price Prediction Project successfully demonstrates the application of machine learning techniques to predict the prices of used cars and recommend similar cars based on input features. By leveraging data scraped from the AA Cars website, comprehensive data cleaning and preprocessing, and the implementation of multiple machine learning models, the project achieved a slight performance improvement with the final model trained on a combined dataset.

The project highlights the effectiveness of advanced machine learning models such as XGBoost and CatBoost in handling high-dimensional data and providing accurate predictions. Additionally, the K-Nearest Neighbors (KNN) algorithm proved useful in developing a recommendation system that suggests similar cars based on user input.

Despite the promising results, several limitations were identified, including data quality and availability, geographical and currency differences, feature limitations, model complexity, and deployment challenges. Addressing these limitations in future work could further enhance the accuracy, robustness, and usability of the prediction and recommendation systems.

The deployment of the web application using Flask and Azure Web App ensures accessibility and provides a user-friendly interface for users to input car details and receive predictions. However, ongoing maintenance and updates are necessary to keep the application relevant and accurate.

In conclusion, this project provides valuable insights and tools for both buyers and sellers in the used car market, contributing to more informed decision-making and fairer pricing. Future work should focus on addressing the identified limitations, incorporating additional features, and exploring more sophisticated algorithms to further improve the system's performance and user experience.