In [83]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [84]:
# reading the csv file
df=pd.read_csv('./data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [85]:
# finding the number of missing values in each column
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [86]:
# droping the irrelevant columns
df.drop(columns=['car_name','brand'],axis=1,inplace=True)

In [87]:
# creating the dependent and independent variables
X=df.drop('selling_price',axis=1)
y=df[['selling_price']]


In [88]:
from sklearn.preprocessing import LabelEncoder,StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

le = LabelEncoder()
X['model']=le.fit_transform(X['model'])

num_features=X.select_dtypes(exclude='object').columns
one_hot_columns=['seller_type',	'fuel_type',	'transmission_type']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(drop='first'), one_hot_columns)
    ],
    remainder='passthrough')

X = preprocessor.fit_transform(X)

In [89]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [90]:
from sklearn.linear_model import LinearRegression, Lasso,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,root_mean_squared_error


models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Decision Tree': DecisionTreeRegressor(),
    'KNN Regressor': KNeighborsRegressor(),
    'Ridge Regression': Ridge(),
    'Random Forest': RandomForestRegressor()
}

for name, model in models.items():
    print(f"Training {name}...")

    model.fit(X_train, y_train)    
    # Predicting on the test set
    y_pred = model.predict(X_test)

    # Evaluating the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)  
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - MSE: {mse} , MAE: {mae}, RMSE: {rmse}, R2: {r2}")

    print("-"*50)

Training Linear Regression...
Linear Regression - MSE: 252588750547.2279 , MAE: 279686.6479172666, RMSE: 502582.08339258167, R2: 0.664459536959453
--------------------------------------------------
Training Lasso Regression...
Lasso Regression - MSE: 252587811816.28342 , MAE: 279682.7921036617, RMSE: 502581.1494836266, R2: 0.6644607839754627
--------------------------------------------------
Training Decision Tree...
Decision Tree - MSE: 104676022142.5965 , MAE: 128846.86182289977, RMSE: 323536.7400197333, R2: 0.860947722878093
--------------------------------------------------
Training KNN Regressor...
KNN Regressor - MSE: 91424152395.59682 , MAE: 124343.66688290625, RMSE: 302364.27103015466, R2: 0.8785515888516486
--------------------------------------------------
Training Ridge Regression...
Ridge Regression - MSE: 252578974672.51627 , MAE: 279625.15757643053, RMSE: 502572.3576486437, R2: 0.66447252329206
--------------------------------------------------
Training Random Forest...
R

In [91]:
# intialize parameters for the hyperparameter tuning
# selecting models that are performing best 
rf_param = {'n_estimators': [100, 200, 500,1000],
                'max_depth': [None, 10, 20,5],
                'min_samples_split': [2, 5, 10, 20],
                'max_features': [5,'auto',7,8]}  
knn_param = {'n_neighbors': [2,3, 5, 10,20,40]}

In [92]:
# list of models to be tuned
randomcv_models=[('Random Forest Regressor',RandomForestRegressor(),rf_param),
                 ('KNN Regressor',KNeighborsRegressor(),knn_param)]

In [93]:
from sklearn.model_selection import RandomizedSearchCV

best_models = {}
for name, model, param in randomcv_models:
    print(f"Tuning {name}...")
    # create the randomized search cv object
    random_search = RandomizedSearchCV(model, param_distributions=param, n_iter=10, cv=3, verbose=2, n_jobs=-1)
    random_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {random_search.best_params_}")

    best_models[name] = random_search.best_params_

    print('='*50)
    print('\n')

Tuning Random Forest Regressor...
Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters for Random Forest Regressor: {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 7, 'max_depth': None}


Tuning KNN Regressor...
Fitting 3 folds for each of 6 candidates, totalling 18 fits
Best parameters for KNN Regressor: {'n_neighbors': 5}




In [94]:
best_models

{'Random Forest Regressor': {'n_estimators': 200,
  'min_samples_split': 5,
  'max_features': 7,
  'max_depth': None},
 'KNN Regressor': {'n_neighbors': 5}}

In [95]:
# retraining teh models with the best parameters
for name, model, param in randomcv_models:
    print(f"Retraining {name} with best parameters...")
    model = model.set_params(**best_models[name])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluating the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)  
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name} - MSE: {mse} , MAE: {mae}, RMSE: {rmse}, R2: {r2}")

    print("="*50)
    print('\n')

Retraining Random Forest Regressor with best parameters...
Random Forest Regressor - MSE: 47162097453.61174 , MAE: 95609.94660074444, RMSE: 217168.36199965165, R2: 0.9373495771950879


Retraining KNN Regressor with best parameters...
KNN Regressor - MSE: 91424152395.59682 , MAE: 124343.66688290625, RMSE: 302364.27103015466, R2: 0.8785515888516486


