## GradientBoost  Regression

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

In [45]:
df = pd.read_csv('cars.csv')

In [46]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


## Data Cleaning

In [47]:
## Checking null values
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [48]:
df.drop(['car_name','brand','Unnamed: 0'],inplace=True,axis=1)
""

KeyError: "['car_name', 'brand', 'Unnamed: 0'] not found in axis"

In [49]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [50]:
df['name'].unique()

array(['Maruti 800 AC', 'Maruti Wagon R LXI Minor',
       'Hyundai Verna 1.6 SX', ..., 'Mahindra Verito 1.5 D6 BSIII',
       'Toyota Innova 2.5 VX (Diesel) 8 Seater BS IV',
       'Hyundai i20 Magna 1.4 CRDi'], dtype=object)

In [51]:
## Numerical Features 
numerical_features = [ feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of numerical Features : -' , len(numerical_features))
## Categorical features
categorical_features = [ feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of categorical Features : -' , len(categorical_features))
## Discrete Features
descrite_features = [ feature for feature in numerical_features if len(df[feature].unique())<=25]
print('Num of descrite Features : -' , len(descrite_features))
## continues Features
continues_features = [ feature for feature  in numerical_features if feature not in descrite_features ]
print('Num of continuous Features : -' , len(continues_features))

Num of numerical Features : - 3
Num of categorical Features : - 5
Num of descrite Features : - 0
Num of continuous Features : - 3


In [52]:
df.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [53]:
from sklearn.model_selection import train_test_split
X = df.drop('selling_price',axis=1)
y = df['selling_price']

In [54]:
X.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,141000,Diesel,Individual,Manual,Second Owner


## Feature Scaling and Encoding

In [55]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X['name']= label_encoder.fit_transform(X['name'])

In [56]:
X.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
0,775,2007,70000,Petrol,Individual,Manual,First Owner
1,1041,2007,50000,Petrol,Individual,Manual,First Owner
2,505,2012,100000,Diesel,Individual,Manual,First Owner
3,118,2017,46000,Petrol,Individual,Manual,First Owner
4,279,2014,141000,Diesel,Individual,Manual,Second Owner


In [57]:


## Creating the coulmn Transformer
num_features = X.select_dtypes(exclude='object').columns
cat_features = ['seller_type','fuel','transmission']

from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


numeric_transformer = StandardScaler()
ohe_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder',ohe_transformer,cat_features),
        ('StandardScaler',numeric_transformer,num_features)
    ],remainder='passthrough'

)

In [58]:
X = preprocessor.fit_transform(X)

In [16]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,1.247335,-0.000276,-1.324259,-1.263352,-0.403022
1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.225693,-0.343933,-0.690016,-0.192071,-0.554718,-0.432571,-0.403022
2,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.536377,1.647309,0.084924,-0.647583,-0.554718,-0.479113,-0.403022
3,1.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.519714,0.983562,-0.360667,0.292211,-0.936610,-0.779312,-0.403022
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.666211,-0.012060,-0.496281,0.735736,0.022918,-0.046502,-0.403022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15406,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.508844,0.983562,-0.869744,0.026096,-0.767733,-0.757204,-0.403022
15407,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-0.556082,-1.339555,-0.728763,-0.527711,-0.216964,-0.220803,2.073444
15408,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.407551,-0.012060,0.220539,0.344954,0.022918,0.068225,-0.403022
15409,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.426247,-0.343933,72.541850,-0.887326,1.329794,0.917158,2.073444


In [17]:
X_train , X_test  , y_train , y_test  = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape , y_train.shape

((12328, 14), (12328,))

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression ,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [19]:
def evaluate_model(true,predicted):
    print('MAE:-', mean_absolute_error(true,predicted))
    print('MSE:-', mean_squared_error(true,predicted))
    print('RMSE:-', np.sqrt(mean_squared_error(true,predicted)))
    print('R2 SCORE:-', r2_score(true,predicted))
    

In [20]:
models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'AdaBoostRegressor':AdaBoostRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor()
}

In [60]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Prediction
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)    

    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('------------------------------------------------------')

    print(list(models.keys())[i])

     #Training Performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print('------------------------------------------------------')

    #Testing Performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
LinearRegression
Traning Perfomance
MAE:- 268101.6070829937
MSE:- 306756099359.7596
RMSE:- 553855.6665411663
R2 SCORE:- 0.6217719576765959
------------------------------------------------------
Testing Perfomance
MAE:- 279618.5794158429
MSE:- 252550062888.5656
RMSE:- 502543.5930230985
R2 SCORE:- 0.6645109298852004
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
Ridge
Traning Perfomance
MAE:- 268059.8014688271
MSE:- 306756818740.9266
RMSE:- 553856.3159709624
R2 SCORE:- 0.6217710706848424
------------------------------------------------------
Testing Perfomance
MAE:- 279557.21689302375
MSE:- 252540243247.9703
RMSE:- 502533.82298903057
R2 SCORE:- 0.6645239743566786
-----------------------------------------------

## HyperParameter Tunning

In [62]:
# No hyperparameters for LinearRegression

rf_params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['squared_error', 'absolute_error', 'poisson'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}


gb_params = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.20, 0.40, 0.60, 0.80, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'subsample': [0.20, 0.40, 0.60, 0.80, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_features': ['sqrt', 'log2']
}


In [63]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [64]:
randomcv_models = [
    ('RandomForestRegressor', RandomForestRegressor(), rf_params),
    ('GradientBoostingRegressor', GradientBoostingRegressor(), gb_params),
]

In [65]:
model_params = {}

for name , model , params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=5,verbose=2,n_jobs=-1)
    random.fit(X_train,y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(f'------Best Params--for {model_name}_-------')
    print(model_params[model_name])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.1s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END criterion=poisson, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   7.7s
[CV] END criterion=poisson, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 

In [66]:
models = {

    'RandomForestRegressor':RandomForestRegressor(
       n_estimators=200,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
    max_depth=None,
    criterion='poisson'
    ),
    "GradientBoostingRegressor":GradientBoostingRegressor(
        subsample=0.6,
        n_estimators=500,
        min_samples_split=6,
        max_features='sqrt',
        loss='squared_error',
            learning_rate=0.4,
                criterion='friedman_mse'    )
}

In [67]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Prediction
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)    

    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('------------------------------------------------------')

    print(list(models.keys())[i])

     #Training Performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print('------------------------------------------------------')

    #Testing Performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
RandomForestRegressor
Traning Perfomance
MAE:- 39542.4186405866
MSE:- 16404498182.485992
RMSE:- 128080.04599657978
R2 SCORE:- 0.9797733728985033
------------------------------------------------------
Testing Perfomance
MAE:- 98046.71419825175
MSE:- 42593627348.90844
RMSE:- 206382.2360304017
R2 SCORE:- 0.9434183612204969
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
GradientBoostingRegressor
Traning Perfomance
MAE:- 84198.90108767946
MSE:- 17186511022.211758
RMSE:- 131097.3341537186
R2 SCORE:- 0.9788091567474355
------------------------------------------------------
Testing Perfomance
MAE:- 106282.03876720375
MSE:- 44101460429.968636
RMSE:- 210003.47718542337
R2 SCORE:- 0.9414153464024947
