**MODEL BUILDING ON USED BIKE PRICE PREDICTION**

Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

Reading the data

In [None]:
# Load the used-bike listings; the first CSV column becomes the row index.
project_data = pd.read_csv(
    r"/content/drive/MyDrive/Project/30-09-2023.csv",
    index_col=0,
)
project_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69218 entries, TVS Star City Plus Dual Tone 110cc to Bajaj Pulsar
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   price      69218 non-null  int64 
 1   city       69218 non-null  object
 2   km_driven  69218 non-null  int64 
 3   owner      69218 non-null  object
 4   brand      69218 non-null  object
dtypes: int64(2), object(3)
memory usage: 3.2+ MB


Converting categorical variables into dummy variables


In [None]:
# One-hot encode the categorical (object) columns. drop_first=True removes the
# first level of each categorical so the indicator columns are not redundant.
project_data = pd.get_dummies(data=project_data, drop_first=True)
project_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 69218 entries, TVS Star City Plus Dual Tone 110cc to Bajaj Pulsar
Columns: 878 entries, price to brand_yamaha
dtypes: int64(2), uint8(876)
memory usage: 59.4+ MB


Separating input and output variables

In [None]:
project_data.info()

# The target is the listing price; every remaining column is a predictor.
y = project_data["price"]
X = project_data.drop(columns=["price"])

<class 'pandas.core.frame.DataFrame'>
Index: 69218 entries, TVS Star City Plus Dual Tone 110cc to Bajaj Pulsar
Columns: 878 entries, price to brand_yamaha
dtypes: int64(2), uint8(876)
memory usage: 59.4+ MB


Splitting the data for training and testing

In [None]:
# Hold out 20% of the rows for testing. random_state is pinned so the split,
# and every metric computed from it, is identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(55374, 877) (13844, 877) (55374,) (13844,)


Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline fitted on the training split.
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=y_train)

# Predict prices for the held-out rows and score them with R-squared.
lin_predictions = lin_model.predict(X=X_test)
r2_linear_value = r2_score(y_true=y_test, y_pred=lin_predictions)

# NOTE(review): the recorded run of this cell printed an extremely negative R2,
# while Lasso/Ridge on the same features reach about 0.75 in cross-validation.
# Unregularised OLS looks numerically unstable on the wide dummy-encoded
# matrix; confirm before relying on this baseline.
print(f"R2 Score: {r2_linear_value:.4f}")



R2 Score: -104915933133.8538


Cross validation - Linear Regression


In [None]:
# k-fold cross-validation of the linear model, using the training split only.
k = 10
linear_scores = cross_val_score(
    estimator=lin_model, X=X_train, y=y_train, scoring='r2', cv=k
)

# Per-fold R2, numbered from 1.
for fold_no, linear_r2 in enumerate(linear_scores, start=1):
    print(f"Fold {fold_no} r2 score:{linear_r2:.4f}")

linear_r2_mean = linear_scores.mean()
print("Mean of r2_score for Linear Regression is %f" % linear_r2_mean)



Fold 1 r2 score:-276600718547488.0000
Fold 2 r2 score:0.7576
Fold 3 r2 score:0.7487
Fold 4 r2 score:0.7548
Fold 5 r2 score:0.7456
Fold 6 r2 score:0.7485
Fold 7 r2 score:0.7520
Fold 8 r2 score:0.7509
Fold 9 r2 score:-407854043174.2244
Fold 10 r2 score:0.7521
Mean of r2_score for Linear Regression is -27700857259065.621094


Evaluation

In [None]:
# Root-mean-squared error of the linear model's predictions on the test split.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=lin_predictions))
print(rmse)



19143.88852055391


Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with default hyper-parameters. random_state is pinned because
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can otherwise be broken differently on each run.
decision_model = DecisionTreeRegressor(random_state=42)

decision_model.fit(X_train, y_train)

# Predictions for the held-out rows, used by the evaluation cell that follows.
predicted_decisions = decision_model.predict(X_test)

In [None]:
# Test-set metrics for the decision tree: R-squared first, then RMSE.
# These two prints correspond to the two values recorded as this cell's output.
r2_value = r2_score(y_test, predicted_decisions)
print(r2_value)
print(np.sqrt(mean_squared_error(y_test, predicted_decisions)))


0.9329322785258679
9935.21872168699


In [None]:
# k-fold cross-validation of the decision tree, using the training split only.
k = 10
decision_scores = cross_val_score(
    estimator=decision_model, X=X_train, y=y_train, scoring='r2', cv=k
)

# Report each fold's R2 (folds numbered from 1), then the average.
for fold_no, decision_r2 in enumerate(decision_scores, start=1):
    print(f"Fold {fold_no} r2 score: {decision_r2:.2f}")

r2_mean = decision_scores.mean()
print(f"Mean r2 score: {r2_mean:.2f}")

Fold 1 r2 score: 0.91
Fold 2 r2 score: 0.92
Fold 3 r2 score: 0.91
Fold 4 r2 score: 0.93
Fold 5 r2 score: 0.92
Fold 6 r2 score: 0.93
Fold 7 r2 score: 0.92
Fold 8 r2 score: 0.93
Fold 9 r2 score: 0.92
Fold 10 r2 score: 0.91
Mean r2 score: 0.92


Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with default hyper-parameters. The bootstrap samples and the
# per-split feature draws are random, so random_state is pinned to make the
# fitted forest and its scores reproducible.
random_forest = RandomForestRegressor(random_state=42)

random_forest.fit(X_train, y_train)

# Predictions for the held-out rows.
random_forest_predictions = random_forest.predict(X_test)

Random Forest Cross Validation

In [None]:
# k-fold cross-validation of the random forest, using the training split only.
k = 10
random_forest_scores = cross_val_score(
    estimator=random_forest, X=X_train, y=y_train, scoring="r2", cv=k
)

# Per-fold R2, numbered from 1.
for fold_no, random_forest_r2 in enumerate(random_forest_scores, start=1):
    print(f"Fold {fold_no} r2 score:{random_forest_r2:.2f}")

random_forest_r2_mean = random_forest_scores.mean()
print("Mean of r2 scores in random forest model is:",random_forest_r2_mean)

Fold 1 r2 score:0.92
Fold 2 r2 score:0.93
Fold 3 r2 score:0.92
Fold 4 r2 score:0.93
Fold 5 r2 score:0.93
Fold 6 r2 score:0.93
Fold 7 r2 score:0.93
Fold 8 r2 score:0.94
Fold 9 r2 score:0.93
Fold 10 r2 score:0.92
Mean of r2 scores in random forest model is: 0.9276234065825635


XG Boost Algorithm

In [None]:
import xgboost as xgb

# Gradient-boosted trees with the library's default hyper-parameters.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X=X_train, y=y_train)

# Predictions for the held-out rows.
xgb_predictions = xgb_model.predict(X=X_test)



XG BOOST - CROSS VALIDATION

In [None]:
# 10-fold cross-validation of the XGBoost model, using the training split only.
xgboost_scores = cross_val_score(
    estimator=xgb_model, X=X_train, y=y_train, scoring="r2", cv=10
)

# Per-fold R2, numbered from 1.
for fold_no, xgboost_r2 in enumerate(xgboost_scores, start=1):
    print(f"Fold {fold_no} r2 score:{xgboost_r2:.2f}")

xgboost_r2_mean = xgboost_scores.mean()
print("Mean of r2 scores in xgboost model is:",xgboost_r2_mean)


Fold 1 r2 score:0.87
Fold 2 r2 score:0.87
Fold 3 r2 score:0.87
Fold 4 r2 score:0.88
Fold 5 r2 score:0.88
Fold 6 r2 score:0.87
Fold 7 r2 score:0.88
Fold 8 r2 score:0.88
Fold 9 r2 score:0.87
Fold 10 r2 score:0.87
Mean of r2 scores in xgboost model is: 0.8752565707416711


Support Vector Machine

In [None]:
from sklearn.svm import SVR

# Support-vector regressor with default kernel and hyper-parameters.
# NOTE(review): the features are passed in unscaled, and the cross-validation
# cell for this model has no recorded output. Confirm this model was actually
# trained and evaluated to completion.
svr_model = SVR()
svr_model.fit(X=X_train, y=y_train)

# Predictions for the held-out rows.
svr_predictions = svr_model.predict(X=X_test)


Support Vector Machine - CROSS VALIDATION

In [None]:
# 10-fold cross-validation of the SVR model, using the training split only.
svr_scores = cross_val_score(
    estimator=svr_model, X=X_train, y=y_train, scoring="r2", cv=10
)

# Per-fold R2, numbered from 1.
for fold_no, svr_r2 in enumerate(svr_scores, start=1):
    print(f"Fold {fold_no} r2 score:{svr_r2:.2f}")

svr_r2_mean = svr_scores.mean()
print("Mean of r2 scores in svr model is:",svr_r2_mean)

Neural Networks -- Multi-Layer-Perceptron

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with default architecture. Weight initialisation and
# mini-batch shuffling are random, so random_state is pinned to make training
# and the resulting scores reproducible.
MLP_model = MLPRegressor(random_state=42)

MLP_model.fit(X_train, y_train)

# Predictions for the held-out rows.
MLP_predictions = MLP_model.predict(X_test)

Neural Networks - CROSS VALIDATION

In [None]:
# 10-fold cross-validation of the MLP model, using the training split only.
mlp_scores = cross_val_score(
    estimator=MLP_model, X=X_train, y=y_train, scoring="r2", cv=10
)

# Per-fold R2, numbered from 1.
for fold_no, mlp_r2 in enumerate(mlp_scores, start=1):
    print(f"Fold {fold_no} r2 score:{mlp_r2:.2f}")

mlp_r2_mean = mlp_scores.mean()
print("Mean of r2 scores in mlp model is:",mlp_r2_mean)



Fold 1 r2 score:0.6374
Fold 2 r2 score:0.6505
Fold 3 r2 score:0.6057
Fold 4 r2 score:0.6469
Fold 5 r2 score:0.6760
Fold 6 r2 score:0.6557
Fold 7 r2 score:0.6619
Fold 8 r2 score:0.6620
Fold 9 r2 score:0.6569
Fold 10 r2 score:0.6273
Mean of r2 scores in mlp model is: 0.6480142594430036




KNN - Regressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Candidate neighbourhood sizes to compare.
k_values = [2,3,4,5,6,7,8,9]

# The loop variable is deliberately not named `k`: the cross-validation cells
# use `k` for the number of folds, and reusing it here would overwrite that.
# NOTE(review): the candidates are compared on the test split itself; consider
# choosing k by cross-validation on the training split instead.
for n_neighbors in k_values:

    KNN = KNeighborsRegressor(n_neighbors=n_neighbors)

    KNN.fit(X_train, y_train)

    knn_predictions = KNN.predict(X_test)

    knn_r2 = r2_score(y_test, knn_predictions)

    # Label each score with its k so the printed lines can be told apart.
    print(f"R2 - score of KNN (k={n_neighbors}):", knn_r2)


R2 - score of KNN: 0.8447106001257477
R2 - score of KNN: 0.8119050484359053
R2 - score of KNN: 0.7923234201749609
R2 - score of KNN: 0.7745305868173631
R2 - score of KNN: 0.7647618113351289
R2 - score of KNN: 0.7578472239390879
R2 - score of KNN: 0.7517294777664966
R2 - score of KNN: 0.7473209497987706


Lasso Regression

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression with the default penalty strength.
lasso_model = Lasso()
lasso_model.fit(X=X_train, y=y_train)

# Predictions for the held-out rows.
lasso_predictions = lasso_model.predict(X=X_test)

Cross Validation

In [None]:
# 10-fold cross-validation of the Lasso model, using the training split only.
lasso_scores = cross_val_score(
    estimator=lasso_model, X=X_train, y=y_train, scoring="r2", cv=10
)

# Per-fold R2, numbered from 1.
for fold_no, lasso_r2 in enumerate(lasso_scores, start=1):
    print(f"Fold {fold_no} r2 score:{lasso_r2:.2f}")

lasso_r2_mean = lasso_scores.mean()
print("Mean of r2 score values in Lasso Regression:",lasso_r2_mean)

Fold 1 r2 score:0.74
Fold 2 r2 score:0.76
Fold 3 r2 score:0.75
Fold 4 r2 score:0.76
Fold 5 r2 score:0.75
Fold 6 r2 score:0.75
Fold 7 r2 score:0.75
Fold 8 r2 score:0.75
Fold 9 r2 score:0.75
Fold 10 r2 score:0.75
Mean of r2 score values in Lasso Regression: 0.7500832503793315


Ridge

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression with the default penalty strength.
ridge_model = Ridge()
ridge_model.fit(X=X_train, y=y_train)

# Predictions for the held-out rows.
ridge_predictions = ridge_model.predict(X=X_test)

Cross Validation

In [None]:
# 10-fold cross-validation of the Ridge model, using the training split only.
ridge_scores = cross_val_score(ridge_model, X_train, y_train, cv=10, scoring="r2")

# Per-fold R2, numbered from 1.
for i, ridge_r2 in enumerate(ridge_scores):
    print(f"Fold {i+1} r2 score:{ridge_r2:.2f}")

ridge_r2_mean = ridge_scores.mean()
# These are Ridge scores; the summary label previously said "Lasso Regression".
print("Mean of r2 score values in Ridge Regression:", ridge_r2_mean)

Fold 1 r2 score:0.75
Fold 2 r2 score:0.76
Fold 3 r2 score:0.75
Fold 4 r2 score:0.76
Fold 5 r2 score:0.75
Fold 6 r2 score:0.75
Fold 7 r2 score:0.75
Fold 8 r2 score:0.75
Fold 9 r2 score:0.75
Fold 10 r2 score:0.75
Mean of r2 score values in Lasso Regression: 0.7530282620058311
