In [108]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import math
from sklearn.linear_model import LinearRegression
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import sklearn
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.metrics import r2_score
from math import sqrt

In [8]:
df = pd.read_csv("cleaned2.csv")

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,state
0,0,35990,2010.0,chevrolet,good,8 cylinders,gas,32742.0,clean,other,2,other,south
1,1,7500,2014.0,hyundai,excellent,4 cylinders,gas,93600.0,clean,automatic,2,sedan,south
2,2,4900,2006.0,bmw,good,6 cylinders,gas,87046.0,clean,automatic,2,SUV,south
3,5,29590,2016.0,toyota,good,6 cylinders,gas,33290.0,clean,other,4,pickup,south
4,6,39990,2012.0,ford,good,8 cylinders,gas,9692.0,clean,other,2,coupe,south


In [10]:
df["price"]=np.log1p(df["price"])

## Encoding

### For Numerical Data

#### year

In [11]:
year_min = df["year"].min()
year_max = df["year"].max()
df["year"] = ((df["year"] - year_min) / (year_max - year_min))

#### odometer

In [12]:
odo_min = df["odometer"].min()
odo_max = df["odometer"].max()
df["odometer"] = ((df["odometer"] - odo_min) / (odo_max - odo_min))

### For Ordinal Data

#### cylinders

In [13]:
label_encoder = LabelEncoder()
label_encoder.fit_transform(list(df['cylinders'].astype(str).values))
df['cylinders'] = label_encoder.transform(list(df['cylinders'].astype(str).values))

In [14]:
#Normalizing Cylinder
cyl_min = df["cylinders"].min()
cyl_max = df["cylinders"].max()
df["cylinders"] = ((df["cylinders"] - cyl_min) / (cyl_max - cyl_min))

### OneHotEncoding

In [15]:
train_df = pd.get_dummies(df)

In [16]:
len(train_df)

318824

In [17]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,price,year,cylinders,odometer,drive,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,...,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon,state_midwest,state_northeast,state_south,state_west
0,0,10.491024,0.583333,0.857143,0.13095,2,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,8.922792,0.75,0.428571,0.374431,2,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
2,2,8.497195,0.416667,0.714286,0.34821,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,5,10.295226,0.833333,0.714286,0.133143,4,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
4,6,10.59641,0.666667,0.857143,0.038732,2,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Splitting the Data into test and train

In [18]:
y = train_df["price"]
x = train_df.drop(['price', 'Unnamed: 0'], axis=1)
x_train,x_test,y_train,y_test = sklearn.model_selection.train_test_split(x,y,test_size=0.2,random_state=42)

In [19]:
x_train.head()

Unnamed: 0,year,cylinders,odometer,drive,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,...,type_other,type_pickup,type_sedan,type_truck,type_van,type_wagon,state_midwest,state_northeast,state_south,state_west
220052,0.958333,0.714286,0.040324,2,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
10665,0.333333,0.428571,0.273475,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
102772,0.75,0.428571,0.308018,2,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
90016,0.791667,0.428571,0.528394,4,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
294161,0.708333,1.0,0.386033,4,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0


### GradientBoosting Model

In [112]:
#GradientBoosting model
gb_model = GradientBoostingRegressor(
    n_estimators = 500, 
    max_depth = 8, 
    learning_rate = 0.3)

In [None]:
#Training the Gradient boost model
gb_model = gb_model.fit(x_train, y_train)

In [None]:
#Predict test
y_pred=gb_model.predict(x_test)
#Predict train
train_pred = gb.model(x_train)

In [None]:
#train RMSE - Gradient Boosting
mse1 = mean_squared_error(y_train, train_pred)
rmse1 = sqrt(mse1)
print("rmse(Train):", rmse1)
#test RMSE - Gradient Boosting
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print("rmse(Test):", rmse)

In [None]:
#train R^2 - Gradient Boosting
r_sqr1 = r2_score(y_train, train_pred)
print("r_squared(Train):",r_sqr1)
#test R^2 - Gradient Boosting
r_sqr = r2_score(y_test, y_pred)
print("r_squared(Test):",r_sqr)

In [105]:
#feature importance for Gradient Boost Model
cat_columns=['manufacturer','condition','cylinders','fuel','title_status','transmission','type','state']
feature_imp = pd.DataFrame(list(zip(gb_model.feature_importances_,x_train)),columns = ("importance","features"))

feature_split = pd.DataFrame(feature_imp["features"].str.split("_",n = 1, expand = True))
feature_cat = pd.concat([feature_imp["importance"], feature_split], axis=1, join='inner')
feature_cat.columns = ["importance","feature","split2"]
feature_score = feature_cat.groupby('feature', as_index=False)['importance'].sum()
feature_score

Unnamed: 0,feature,importance
0,condition,0.009664
1,cylinders,0.092697
2,drive,0.059916
3,fuel,0.051545
4,manufacturer,0.062843
5,odometer,0.142817
6,state,0.010585
7,title,0.005514
8,transmission,0.005672
9,type,0.06402


### XGBoost

In [64]:
#XGBoost Model
regressor = xgb.XGBRegressor(
    n_estimators=100,
    reg_lambda=1,
    gamma=0,
    max_depth=5
)

In [65]:
#Training the XGBoost Model
regressor.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
#Predict test
y_pred = regressor.predict(x_test)
#Predict train
train_pred = regressor.predict(x_train)

In [67]:
#train RMSE - XGBoost
mse_train = mean_squared_error(y_train, train_pred)
rmse = math.sqrt(mse_train)
print("rmse(Train):",rmse)
#test RMSE - XGBoost
mse = mean_squared_error(y_test,y_pred)
rmse=math.sqrt(mse)
rmse
print("rmse(Test):",rmse)

rmse(Train): 0.2754892285733874
rmse(Test): 0.28333842752516675


In [68]:
#train R^2 - XGBoost
r_sqr1 = r2_score(y_train, train_pred)
print("r_squared(Train):",r_sqr1)
#test R^2 - XGBoost
r_sqr = r2_score(y_test, y_pred)
print("r_squared(Test):",r_sqr)

r_squared(Train): 0.8577193686904432
r_squared(Test): 0.8485268276098412


In [106]:
#feature importance for XGBoost Model
cat_columns=['manufacturer','condition','cylinders','fuel','title_status','transmission','type','state']
feature_imp = pd.DataFrame(list(zip(regressor.feature_importances_,x_train)),columns = ("importance","features"))

feature_split = pd.DataFrame(feature_imp["features"].str.split("_",n = 1, expand = True))
feature_cat = pd.concat([feature_imp["importance"], feature_split], axis=1, join='inner')
feature_cat.columns = ["importance","feature","split2"]
feature_score = feature_cat.groupby('feature', as_index=False)['importance'].sum()
feature_score

Unnamed: 0,feature,importance
0,condition,0.01647
1,cylinders,0.028663
2,drive,0.045809
3,fuel,0.121036
4,manufacturer,0.404269
5,odometer,0.027604
6,state,0.02196
7,title,0.021583
8,transmission,0.018939
9,type,0.178855
