# Used car price prediction

In [107]:
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# preprocessing
from sklearn.model_selection import train_test_split,GridSearchCV
# metrics and models
from sklearn.metrics import r2_score,mean_squared_error
import xgboost  as xgb
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")

# Read datasets

In [111]:
# Location of the raw vehicle listings CSV (raw string keeps the backslash literal).
DATA_PATH = r"D:\vehicles_data_students.csv"

# Load the listings and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0.1,Unnamed: 0,id,price,year,manufacturer,model,condition,cylinders,fuel,odometer,title_status,transmission,drive,size,type,paint_color,county,state,lat,long
0,55483,7315914053,0,2018.0,ram,promaster 2500,excellent,,gas,44244.0,clean,automatic,,,van,,,ca,32.7928,-116.9665
1,162368,7310885048,13995,2017.0,mazda,cx-3,,4 cylinders,gas,7037.0,rebuilt,automatic,,,SUV,white,,ia,41.207382,-96.023096
2,234393,7308243856,19990,2019.0,mitsubishi,eclipse cross sp,good,,gas,35313.0,clean,other,4wd,,hatchback,white,,nc,35.19,-80.83
3,276110,7315817729,0,2019.0,honda,cr-v,,,gas,25626.0,clean,automatic,,,SUV,orange,,ny,40.854573,-74.120219
4,349033,7301620999,42900,2015.0,chevrolet,corvette,excellent,8 cylinders,gas,29000.0,clean,automatic,,,convertible,black,,sc,34.755562,-82.906419


In [6]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-1.7.3-py3-none-win_amd64.whl (89.1 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.7.3


# 😒 Based on my analysis, I am dropping the features from the dataset that are not useful for the prediction

# NOTE(review): this looks like an earlier draft of the drop step (it has no
# execution marker and omits 'county'); the executed cell that follows drops the
# same columns plus 'county'. Presumably only one of the two is meant to run —
# confirm and remove the stale copy.
drop_columns=['Unnamed: 0','id','title_status','size','lat','long']
df.drop(columns=drop_columns,inplace=True)

In [112]:
# Columns excluded from modelling.
drop_columns = [
    'Unnamed: 0',
    'id',
    'title_status',
    'size',
    'lat',
    'long',
    'county',
]
# `columns=` already identifies the axis; the result is rebound to `df`.
df = df.drop(columns=drop_columns)

# The columns that are not useful have been dropped; let's proceed further

In [113]:
# Preview the first rows to confirm which columns remain after the drop.
df.head()

Unnamed: 0,price,year,manufacturer,model,condition,cylinders,fuel,odometer,transmission,drive,type,paint_color,state
0,0,2018.0,ram,promaster 2500,excellent,,gas,44244.0,automatic,,van,,ca
1,13995,2017.0,mazda,cx-3,,4 cylinders,gas,7037.0,automatic,,SUV,white,ia
2,19990,2019.0,mitsubishi,eclipse cross sp,good,,gas,35313.0,other,4wd,hatchback,white,nc
3,0,2019.0,honda,cr-v,,,gas,25626.0,automatic,,SUV,orange,ny
4,42900,2015.0,chevrolet,corvette,excellent,8 cylinders,gas,29000.0,automatic,,convertible,black,sc


In [114]:
# (rows, columns) of the frame after dropping the unused columns.
df.shape


(64032, 13)

In [115]:
# Count the missing values in each column.
df.isna().sum()

price               0
year              158
manufacturer     2569
model             802
condition       26097
cylinders       26511
fuel              424
odometer          669
transmission      353
drive           19471
type            13785
paint_color     19505
state               0
dtype: int64

# The missing values in these columns cannot be filled in reliably through analysis, so I drop every row that contains a NaN


In [116]:
# Keep only the rows that have no missing value in any column.
df = df.dropna(axis=0, how="any")

In [117]:
df.shape # Rows and columns remaining after dropping the rows that contain null values


(17491, 13)

# From domain knowledge, the same ad may have been posted repeatedly, which hints at duplicated data. So we delete the duplicate entries and then check the shape of the data again

In [118]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [119]:
df.shape # About 1,000 duplicate rows were removed (17491 -> 16399), which supports the duplication assumption



(16399, 13)

# Now let's check how the continuous columns are distributed

In [120]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,price,year,odometer
count,16399.0,16399.0,16399.0
mean,16115.49,2009.294469,113906.6
std,133631.4,9.782876,221782.0
min,0.0,1918.0,0.0
25%,5500.0,2006.0,57510.0
50%,10500.0,2011.0,103860.0
75%,21590.0,2015.0,149076.0
max,17000000.0,2022.0,10000000.0


# FILTER CATEGORICAL FEATURES

In [121]:
# Collect the names of the non-numeric (categorical) columns that need encoding.
# select_dtypes(exclude="number") replaces the hand-written list of dtype names,
# which omitted unsigned and other numeric dtypes (e.g. uint8) and would have
# reported such columns as categorical.
features = df.columns.values.tolist()  # full list of column names, kept for reference
categorical_columns = df.select_dtypes(exclude="number").columns.tolist()

In [122]:
# Display the column names collected as categorical.
categorical_columns

['manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'transmission',
 'drive',
 'type',
 'paint_color',
 'state']

# Let's encode the categorical columns as numeric data, since the model only understands numbers, using the get_dummies function that pandas provides

In [123]:
# One-hot encode the categorical columns; drop_first=False keeps an indicator
# column for every category level.
categorical_frame = df[categorical_columns]
df_dummies = pd.get_dummies(categorical_frame, drop_first=False)

In [124]:
# Preview the first rows of the one-hot encoded frame.
df_dummies.head()

Unnamed: 0,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_datsun,manufacturer_dodge,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
# (rows, columns) of the one-hot encoded frame.
df_dummies.shape

(16399, 4327)

In [None]:
# Why has the number of columns increased?
# get_dummies is a one-hot encoder: it turns each categorical column into one new column
# per distinct category in it, and puts a 1 in the column of the category present in that row.

# Now let's join the df_dummies columns to the original dataset and then remove the original categorical columns

In [126]:
# Attach the one-hot columns to the main frame, aligned on the row index.
df = df.join(df_dummies, how="left")

In [127]:
# (rows, columns) after joining the one-hot encoded columns.
df.shape

(16399, 4340)

In [128]:
# Drop the original string columns now that their one-hot versions are joined.
# `columns=` already names the axis, so the redundant axis=1 is removed, and the
# frame is rebound instead of mutated in place, matching the other cleaning cells.
df = df.drop(columns=categorical_columns)

In [129]:
# Preview the first two rows of the fully numeric frame.
df.head(2)

Unnamed: 0,price,year,odometer,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
5,0,2006.0,149000.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,20995,2011.0,92001.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [130]:
# Keep listings priced strictly between the two bounds (both ends exclusive).
MIN_PRICE = 1000
MAX_PRICE = 40000
price_in_range = (df['price'] > MIN_PRICE) & (df['price'] < MAX_PRICE)
df = df[price_in_range]

In [131]:
# Print the index range, column count, dtypes and memory usage after filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14742 entries, 9 to 64031
Columns: 4330 entries, price to state_wy
dtypes: float64(2), int64(1), uint8(4327)
memory usage: 61.3 MB


# We are done with the feature engineering phase; let's start the model building phase.

In [None]:
# Let's divide the remaining dataset into features and labels

In [132]:
# The target is the listing price; every remaining column is a feature.
y = df['price']
X = df.drop(columns=['price'])

In [134]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,year,odometer,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
9,2011.0,92001.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
22,2014.0,96007.0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
29,2016.0,34425.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,2012.0,49200.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,2017.0,73057.0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [137]:
# Split the data into train and test sets: 25% of the rows are held out for
# testing, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=10,
)

In [138]:
# Build the regressor from the directly imported class. The original
# `xgb = xgb.XGBRegressor()` overwrote the `xgb` module alias with the model,
# so running this cell a second time raised AttributeError.
# The model is still bound to `xgb` because the following cells call
# xgb.fit / xgb.predict.
xgb = XGBRegressor()

In [139]:
# Fit the regressor on the training split; the cell output is the fitted model's repr.
xgb.fit(X_train,y_train)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)

In [140]:
# Predict on the training data (used below to measure how well the model fits it).
pred=xgb.predict(X_train)

In [141]:
# R² of the predictions on the training split.
r2_score(y_train,pred)

0.912127131091649

In [None]:
# The model explains about 91% of the variance of the training prices (R² ≈ 0.91), i.e. its squared error is about 91% lower than that of always predicting the mean price

In [142]:
# Let's predict on the testing data

In [143]:
# Predict prices for the held-out test split.
y_pred=xgb.predict(X_test)

In [144]:
# Display the predicted prices for the test split.
y_pred

array([20277.29  , 15951.761 , 12970.122 , ...,  5815.5103,  8331.498 ,
       24479.371 ], dtype=float32)

In [145]:
# R² of the predictions on the held-out test split.
r2_score(y_test,y_pred)

0.8534549321629015

# The test R² is about 0.85 (compared with 0.91 on the training data), which is good enough

# Now let's do the hyperparameter tuning and try to improve the score

In [None]:
# `grid` was never created in this notebook, so `grid.best_params_` raised
# NameError. Build and fit the grid search before reading its best parameters.
# The values below are a small starting grid, not tuned choices; widen as needed.
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.1, 0.3],
}
# GridSearchCV clones the estimator for every fit, so the already-fitted `xgb`
# model is left untouched. scoring="r2" matches the r2_score metric used above.
grid = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring="r2", cv=3)
grid.fit(X_train, y_train)
grid.best_params_