## Gradient Boosting Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

In [2]:
# Load the car listings from the CSV file; the path is relative to the
# notebook's working directory.
df = pd.read_csv('cardekho_imputated.csv')

In [3]:
# Preview the first rows to check which columns were read.
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


## Data Cleaning

In [4]:
## Checking null values
# Count the missing entries in every column.
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [5]:
# Remove the columns that are not used as features: the two name columns
# ('model' is kept) and 'Unnamed: 0' (presumably a row index written out when
# the CSV was saved — confirm).
# The result is reassigned instead of dropping in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell does not raise
# a KeyError.
df = df.drop(columns=['car_name', 'brand', 'Unnamed: 0'], errors='ignore')

''

In [6]:
# Confirm that the dropped columns no longer appear.
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [7]:
# List the distinct values of the 'model' column.
df['model'].unique()

array(['Alto', 'Grand', 'i20', 'Ecosport', 'Wagon R', 'i10', 'Venue',
       'Swift', 'Verna', 'Duster', 'Cooper', 'Ciaz', 'C-Class', 'Innova',
       'Baleno', 'Swift Dzire', 'Vento', 'Creta', 'City', 'Bolero',
       'Fortuner', 'KWID', 'Amaze', 'Santro', 'XUV500', 'KUV100', 'Ignis',
       'RediGO', 'Scorpio', 'Marazzo', 'Aspire', 'Figo', 'Vitara',
       'Tiago', 'Polo', 'Seltos', 'Celerio', 'GO', '5', 'CR-V',
       'Endeavour', 'KUV', 'Jazz', '3', 'A4', 'Tigor', 'Ertiga', 'Safari',
       'Thar', 'Hexa', 'Rover', 'Eeco', 'A6', 'E-Class', 'Q7', 'Z4', '6',
       'XF', 'X5', 'Hector', 'Civic', 'D-Max', 'Cayenne', 'X1', 'Rapid',
       'Freestyle', 'Superb', 'Nexon', 'XUV300', 'Dzire VXI', 'S90',
       'WR-V', 'XL6', 'Triber', 'ES', 'Wrangler', 'Camry', 'Elantra',
       'Yaris', 'GL-Class', '7', 'S-Presso', 'Dzire LXI', 'Aura', 'XC',
       'Ghibli', 'Continental', 'CR', 'Kicks', 'S-Class', 'Tucson',
       'Harrier', 'X3', 'Octavia', 'Compass', 'CLS', 'redi-GO', 'Glanza',
       

In [8]:
## Split the columns by dtype: object columns are categorical, the rest numerical
numerical_features = []
categorical_features = []
for column in df.columns:
    if df[column].dtype == 'O':
        categorical_features.append(column)
    else:
        numerical_features.append(column)
print('Num of numerical Features : -' , len(numerical_features))
print('Num of categorical Features : -' , len(categorical_features))

## Split the numerical columns by cardinality: at most 25 distinct values
## counts as discrete, anything above that as continuous
descrite_features = []
continues_features = []
for column in numerical_features:
    if len(df[column].unique()) <= 25:
        descrite_features.append(column)
    else:
        continues_features.append(column)
print('Num of descrite Features : -' , len(descrite_features))
print('Num of continuous Features : -' , len(continues_features))

Num of numerical Features : - 7
Num of categorical Features : - 4
Num of descrite Features : - 2
Num of continuous Features : - 5


In [9]:
# Show the frame again before separating features and target.
df.head()


Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [10]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictor columns.
target_column = 'selling_price'
y = df[target_column]
X = df.drop(columns=[target_column])

In [11]:
# Preview the predictor columns; 'selling_price' should be absent.
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


## Feature Scaling and Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

# Replace every car model name in X['model'] with an integer code.
# NOTE(review): LabelEncoder is documented as a target encoder; the codes it
# produces carry an arbitrary ordering that the downstream models will treat
# as numeric — confirm this is intended.
label_encoder = LabelEncoder()
label_encoder.fit(X['model'])
X['model'] = label_encoder.transform(X['model'])

In [13]:
# Check that 'model' now holds integer codes instead of names.
X.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats
0,7,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5
1,54,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5
2,118,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5
3,7,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5
4,38,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5


In [14]:


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

## Creating the column transformer
# The three string-valued columns are one-hot encoded (first level dropped);
# every non-object column of X is standardised.
cat_features = ['seller_type','fuel_type','transmission_type']
num_features = X.select_dtypes(exclude='object').columns

ohe_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', ohe_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features),
    ],
    # Any column not named above is passed through unchanged.
    remainder='passthrough',
)

In [15]:
# Fit the column transformer on the whole feature frame and replace X with the
# transformed matrix.
# NOTE(review): this happens before train_test_split, so the encoder and the
# scaler learn their statistics from rows that later end up in the test set;
# consider fitting on the training split only.
# NOTE(review): X is overwritten with the transformer output, so re-running
# this cell on its own looks likely to fail (the string column selections
# need a DataFrame) — confirm.
X = preprocessor.fit_transform(X)

In [16]:
# Wrap the transformed matrix in a DataFrame to inspect the encoded and scaled columns.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [17]:
# Hold out 20% of the rows for testing; random_state fixes which rows are held out.
X_train , X_test  , y_train , y_test  = train_test_split(X,y,test_size=0.2,random_state=42)
# Display the shapes of the training features and training target.
X_train.shape , y_train.shape

((12328, 14), (12328,))

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression ,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [19]:
def evaluate_model(true,predicted):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    true
        Reference target values.
    predicted
        Model outputs to compare against ``true``.

    Returns
    -------
    None
        The four metrics are written to stdout, one per line.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # RMSE is the square root of the MSE computed above.
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)

    report = (
        ('MAE:-', mae),
        ('MSE:-', mse),
        ('RMSE:-', rmse),
        ('R2 SCORE:-', r2),
    )
    for label, value in report:
        print(label, value)
    

In [20]:
# Baseline candidates, all with default hyperparameters.
# The tree-based and ensemble estimators accept a random_state; it is fixed
# here (same seed as the train/test split) so that fitting them prints the
# same metrics on every run.
models = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
}

In [21]:
# Fit every estimator in `models` on the training split and print its metrics
# for the training data and for the held-out data.
separator = '------------------------------------------------------'

for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Predictions for both splits
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Three separator lines open each model's report
    print(*[separator] * 3, sep='\n')

    print(model_name)

    # Training performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print(separator)

    # Testing performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
LinearRegression
Traning Perfomance
MAE:- 268101.6070829937
MSE:- 306756099359.7596
RMSE:- 553855.6665411663
R2 SCORE:- 0.6217719576765959
------------------------------------------------------
Testing Perfomance
MAE:- 279618.5794158429
MSE:- 252550062888.5656
RMSE:- 502543.5930230985
R2 SCORE:- 0.6645109298852004
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
Ridge
Traning Perfomance
MAE:- 268059.8014688271
MSE:- 306756818740.9266
RMSE:- 553856.3159709624
R2 SCORE:- 0.6217710706848424
------------------------------------------------------
Testing Perfomance
MAE:- 279557.21689302375
MSE:- 252540243247.9703
RMSE:- 502533.82298903057
R2 SCORE:- 0.6645239743566786
-----------------------------------------------

## Hyperparameter Tuning

In [22]:
# Search spaces for the randomized search, one per estimator. Every value is
# a list of candidate settings for the keyword argument of the same name.

# LinearRegression: nothing to tune
lr_params = dict()

# Ridge: regularisation strength and solver
ridge_params = dict(
    alpha=[0.1, 1.0, 10.0, 100.0],
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sag'],
)

# Lasso: regularisation strength and coordinate selection order
lasso_params = dict(
    alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
    selection=['cyclic', 'random'],
)

# k-nearest neighbours: neighbourhood size, weighting and search algorithm
knn_params = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

# Decision tree: split criterion and size limits
dt_params = dict(
    criterion=['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Random forest: ensemble size plus the per-tree settings
rf_params = dict(
    n_estimators=[100, 200, 300],
    criterion=['squared_error', 'absolute_error', 'poisson'],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# AdaBoost: ensemble size, learning rate and loss
ada_params = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.5, 1.0],
    loss=['linear', 'square', 'exponential'],
)

# Gradient boosting: loss, learning rate, ensemble size, row subsampling and
# the per-tree settings
gb_params = dict(
    loss=['squared_error', 'absolute_error', 'huber', 'quantile'],
    learning_rate=[0.20, 0.40, 0.60, 0.80, 1.0],
    n_estimators=[100, 200, 300, 400, 500],
    subsample=[0.20, 0.40, 0.60, 0.80, 1.0],
    criterion=['friedman_mse', 'squared_error'],
    min_samples_split=[2, 4, 6, 8, 10],
    max_features=['sqrt', 'log2'],
)


In [23]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [24]:
# (label, estimator, search space) triples for the randomized search.
# Each label is the estimator's class name.
randomcv_models = [
    (type(estimator).__name__, estimator, search_space)
    for estimator, search_space in (
        (LinearRegression(), lr_params),
        (Ridge(), ridge_params),
        (Lasso(), lasso_params),
        (KNeighborsRegressor(), knn_params),
        (DecisionTreeRegressor(), dt_params),
        (RandomForestRegressor(), rf_params),
        (AdaBoostRegressor(), ada_params),
        (GradientBoostingRegressor(), gb_params),
    )
]

In [None]:
# Run a 5-fold randomized search for every (name, estimator, search space)
# entry and keep the best parameter set found for each estimator.
model_params = {}

for name, model, params in randomcv_models:
    # random_state fixes which parameter combinations are sampled, so the
    # search can be reproduced. The searcher is named `random_search` rather
    # than `random` to avoid shadowing the standard-library module of that
    # name.
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=5,
        verbose=2,
        n_jobs=-1,
        random_state=42,
    )
    random_search.fit(X_train, y_train)
    model_params[name] = random_search.best_params_

# Report the winning parameters per estimator.
for model_name in model_params:
    print(f'------Best Params--for {model_name}_-------')
    print(model_params[model_name])

Fitting 5 folds for each of 1 candidates, totalling 5 fits




[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
[CV] END .................................................... total time=   0.0s
Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV] END ..............................alpha=0.1, solver=svd; total time=   0.0s
[CV] END .........................alpha=0.1, solver=cholesky; total time=   0.0s
[CV] END .........................alpha=0.1, solver=cholesky; total time=   0.0s
[CV] END .........................alpha=0.1, solver=cholesky; total time=   0.1s
[CV] END ..............................alpha=0.1, solver=svd; total time=   0.1s
[CV] END ..............................alpha=0.1, solver=svd; total time=   0.1s
[CV] END .........................alpha=0.1, solver=cholesky; total time=   0.0s
[CV] END .........................alpha=0.1, solver=cholesky; total time=   0.1s
[CV] END .............................alpha=0.1, solver=lsqr; total time=   0.0s
[CV] END .............................alpha=0.1, solver=lsqr; total time=   0.0s
[CV] END .............................alpha=0.1, solver=lsqr; total time=   0.0s
[CV] END .............................alpha=0.1, solver=lsqr; total time=   0.0s
[CV] END ...................



[CV] END ......................alpha=0.001, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=cyclic; total time=   0.1s
[CV] END .......................alpha=0.01, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=random; total time=   0.2s
[CV] END .......................alpha=0.01, selection=cyclic; total time=   0.1s
[CV] END .......................alpha=0.01, selection=cyclic; total time=   0.1s
[CV] END ......................alpha=0.001, selection=random; total time=   0.2s
[CV] END .......................alpha=0.01, selection=cyclic; total time=   0.1s
[CV] END .......................alpha=0.01, selection=cyclic; total time=   0.1s
[CV] END ...................



Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.2s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.2s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.3s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END .....algorithm=auto, n_neighbors=5, weights=uniform; total time=   0.3s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.2s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END ....algorithm=auto, n_neighbors=3, weights=distance; total time=   0.3s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END .....algorithm=auto, n_neighbors=3, weights=uniform; total time=   0.3s
[CV] END .....algorithm=auto, n_neighbors=5, we

  _data = np.array(data, dtype=dtype, copy=copy,


[CV] END criterion=poisson, max_depth=30, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=poisson, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=poisson, max_depth=30, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=poisson, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=poisson, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=poisson, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=poisson, max_depth=None, min_samples_leaf=1, min_samples_split=2; total time=   0.1s
[CV] END criterion=poisson, max_depth=30, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=poisson, max_depth=30, min_samples_leaf=1, min_samples_split=10; total time=   0.1s
[CV] END criterion=poisson, max_depth=30, min_samples_leaf=1, min_sa

In [37]:
# Final gradient-boosting model with hand-set hyperparameters.
# NOTE(review): presumably these are the best parameters reported by the
# randomized search — confirm against its output.
# random_state is fixed because max_features='log2' makes each split consider
# a random subset of the features; without a seed the metrics change from run
# to run. subsample is written as 1.0 to match the float grid it was searched
# over; it still means "use all rows".
models = {
    'GradientBoostingRegressor': GradientBoostingRegressor(
        subsample=1.0,
        n_estimators=300,
        min_samples_split=6,
        max_features='log2',
        loss='squared_error',
        learning_rate=0.8,
        criterion='friedman_mse',
        random_state=42,
    )
}

In [38]:
# Fit every estimator in `models` on the training split and print its metrics
# for the training data and for the held-out data.
separator = '------------------------------------------------------'

for model_name, model in models.items():
    model.fit(X_train,y_train)

    # Predictions for both splits
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Three separator lines open each model's report
    print(*[separator] * 3, sep='\n')

    print(model_name)

    # Training performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print(separator)

    # Testing performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
GradientBoostingRegressor
Traning Perfomance
MAE:- 82587.10561525948
MSE:- 15651757559.12726
RMSE:- 125106.9844538156
R2 SCORE:- 0.9807014966194154
------------------------------------------------------
Testing Perfomance
MAE:- 111899.97392259422
MSE:- 52176749154.69182
RMSE:- 228422.30441594755
R2 SCORE:- 0.9306880827693778
