## GradientBoost  Regression

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

In [2]:
df = pd.read_csv('cars.csv')

In [3]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


## Data Cleaning

In [4]:
## Checking null values
df.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [5]:
df.drop(['car_name','brand','Unnamed: 0'],inplace=True,axis=1)
""

KeyError: "['car_name', 'brand', 'Unnamed: 0'] not found in axis"

In [49]:
df.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [None]:
df['name'].unique()

AttributeError: 'numpy.ndarray' object has no attribute 'value_count'

In [36]:
df['name'].value_counts()


name
Maruti Swift Dzire VDI                     69
Maruti Alto 800 LXI                        59
Maruti Alto LXi                            47
Maruti Alto LX                             35
Hyundai EON Era Plus                       35
                                           ..
Hyundai Verna Transform CRDi VGT SX ABS     1
Maruti S-Presso VXI Plus                    1
Toyota Etios Liva 1.2 VX                    1
Toyota Yaris G                              1
Hyundai i20 Magna 1.4 CRDi                  1
Name: count, Length: 1491, dtype: int64

In [7]:
## Numerical Features 
numerical_features = [ feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of numerical Features : -' , len(numerical_features))
## Categorical features
categorical_features = [ feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of categorical Features : -' , len(categorical_features))
## Discrete Features
descrite_features = [ feature for feature in numerical_features if len(df[feature].unique())<=25]
print('Num of descrite Features : -' , len(descrite_features))
## continues Features
continues_features = [ feature for feature  in numerical_features if feature not in descrite_features ]
print('Num of continuous Features : -' , len(continues_features))

Num of numerical Features : - 3
Num of categorical Features : - 5
Num of descrite Features : - 0
Num of continuous Features : - 3


In [8]:
df.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [18]:
from sklearn.model_selection import train_test_split
X = df.drop('selling_price',axis=1)
y = df['selling_price']

In [19]:
X.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,141000,Diesel,Individual,Manual,Second Owner


## Feature Scaling and Encoding

In [55]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
X['name']= label_encoder.fit_transform(X['name'])

In [20]:
X.head()

Unnamed: 0,name,year,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,141000,Diesel,Individual,Manual,Second Owner


In [25]:
# Separate column names
num_features = X.select_dtypes(exclude='object').columns
cat_features = ['seller_type', 'fuel', 'transmission', 'name', 'owner']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define transformers
numeric_transformer = StandardScaler()
ohe_transformer = OneHotEncoder(drop='first')

# Combine into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', ohe_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features)
    ],
    remainder='passthrough'
)


In [26]:
X = preprocessor.fit_transform(X)

In [27]:
pd.DataFrame(X)

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 5)\t1.0\n (0, 6)\t1.0\n ..."
1,"(0, 0)\t1.0\n (0, 5)\t1.0\n (0, 6)\t1.0\n ..."
2,"(0, 0)\t1.0\n (0, 2)\t1.0\n (0, 6)\t1.0\n ..."
3,"(0, 0)\t1.0\n (0, 5)\t1.0\n (0, 6)\t1.0\n ..."
4,"(0, 0)\t1.0\n (0, 2)\t1.0\n (0, 6)\t1.0\n ..."
...,...
4335,"(0, 0)\t1.0\n (0, 2)\t1.0\n (0, 6)\t1.0\n ..."
4336,"(0, 0)\t1.0\n (0, 2)\t1.0\n (0, 6)\t1.0\n ..."
4337,"(0, 0)\t1.0\n (0, 5)\t1.0\n (0, 6)\t1.0\n ..."
4338,"(0, 0)\t1.0\n (0, 2)\t1.0\n (0, 6)\t1.0\n ..."


In [28]:
X_train , X_test  , y_train , y_test  = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape , y_train.shape

((3472, 1503), (3472,))

In [29]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression ,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [30]:
def evaluate_model(true,predicted):
    print('MAE:-', mean_absolute_error(true,predicted))
    print('MSE:-', mean_squared_error(true,predicted))
    print('RMSE:-', np.sqrt(mean_squared_error(true,predicted)))
    print('R2 SCORE:-', r2_score(true,predicted))
    

In [31]:
models = {
    'LinearRegression':LinearRegression(),
    'Ridge':Ridge(),
    'Lasso':Lasso(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor':DecisionTreeRegressor(),
    'RandomForestRegressor':RandomForestRegressor(),
    'AdaBoostRegressor':AdaBoostRegressor(),
    'GradientBoostingRegressor':GradientBoostingRegressor()
}

In [32]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Prediction
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)    

    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('------------------------------------------------------')

    print(list(models.keys())[i])

     #Training Performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print('------------------------------------------------------')

    #Testing Performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
LinearRegression
Traning Perfomance
MAE:- 40437.83952742518
MSE:- 6186664187.428801
RMSE:- 78655.35065988073
R2 SCORE:- 0.9819073268777512
------------------------------------------------------
Testing Perfomance
MAE:- 123591.85496537713
MSE:- 121717237537.67645
RMSE:- 348879.97583363316
R2 SCORE:- 0.6011497741167633
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
Ridge
Traning Perfomance
MAE:- 87471.0275141369
MSE:- 25221138122.61987
RMSE:- 158811.64353604516
R2 SCORE:- 0.9262417040913777
------------------------------------------------------
Testing Perfomance
MAE:- 136091.71688655837
MSE:- 119298687834.69463
RMSE:- 345396.42128240794
R2 SCORE:- 0.6090750204899036


  model = cd_fast.sparse_enet_coordinate_descent(


------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
Lasso
Traning Perfomance
MAE:- 41302.24277867144
MSE:- 6196006068.049499
RMSE:- 78714.71316119687
R2 SCORE:- 0.9818800068895803
------------------------------------------------------
Testing Perfomance
MAE:- 116970.55512358867
MSE:- 126261886515.39632
RMSE:- 355333.4863412064
R2 SCORE:- 0.5862576001897752
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
KNeighborsRegressor
Traning Perfomance
MAE:- 113280.02701612904
MSE:- 64582920381.37787
RMSE:- 254131.69889129902
R2 SCORE:- 0.8111296116387215
------------------------------------------------------
Testing Perfomance
MAE:- 142159.61728110598
MSE:- 141129594841.8287
RMSE:- 375672.1906687115
R2 SCORE:- 0.537538216277303
------------------------------------------

## HyperParameter Tunning

In [62]:
# No hyperparameters for LinearRegression

rf_params = {
    'n_estimators': [100, 200, 300],
    'criterion': ['squared_error', 'absolute_error', 'poisson'],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}


gb_params = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': [0.20, 0.40, 0.60, 0.80, 1.0],
    'n_estimators': [100, 200, 300, 400, 500],
    'subsample': [0.20, 0.40, 0.60, 0.80, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_features': ['sqrt', 'log2']
}


In [63]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [64]:
randomcv_models = [
    ('RandomForestRegressor', RandomForestRegressor(), rf_params),
    ('GradientBoostingRegressor', GradientBoostingRegressor(), gb_params),
]

In [65]:
model_params = {}

for name , model , params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,param_distributions=params,n_iter=100,cv=5,verbose=2,n_jobs=-1)
    random.fit(X_train,y_train)
    model_params[name] = random.best_params_

for model_name in model_params:
    print(f'------Best Params--for {model_name}_-------')
    print(model_params[model_name])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   1.9s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.0s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.1s
[CV] END criterion=squared_error, max_depth=20, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators=100; total time=   2.2s
[CV] END criterion=poisson, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=   7.7s
[CV] END criterion=poisson, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time= 

In [66]:
models = {

    'RandomForestRegressor':RandomForestRegressor(
       n_estimators=200,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features='sqrt',
    max_depth=None,
    criterion='poisson'
    ),
    "GradientBoostingRegressor":GradientBoostingRegressor(
        subsample=0.6,
        n_estimators=500,
        min_samples_split=6,
        max_features='sqrt',
        loss='squared_error',
            learning_rate=0.4,
                criterion='friedman_mse'    )
}

In [67]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    # Make Prediction
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)    

    print('------------------------------------------------------')
    print('------------------------------------------------------')
    print('------------------------------------------------------')

    print(list(models.keys())[i])

     #Training Performance
    print('Traning Perfomance')
    evaluate_model(y_train,y_pred_train)

    print('------------------------------------------------------')

    #Testing Performance
    print('Testing Perfomance')
    evaluate_model(y_test,y_pred_test)
    

------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
RandomForestRegressor
Traning Perfomance
MAE:- 39542.4186405866
MSE:- 16404498182.485992
RMSE:- 128080.04599657978
R2 SCORE:- 0.9797733728985033
------------------------------------------------------
Testing Perfomance
MAE:- 98046.71419825175
MSE:- 42593627348.90844
RMSE:- 206382.2360304017
R2 SCORE:- 0.9434183612204969
------------------------------------------------------
------------------------------------------------------
------------------------------------------------------
GradientBoostingRegressor
Traning Perfomance
MAE:- 84198.90108767946
MSE:- 17186511022.211758
RMSE:- 131097.3341537186
R2 SCORE:- 0.9788091567474355
------------------------------------------------------
Testing Perfomance
MAE:- 106282.03876720375
MSE:- 44101460429.968636
RMSE:- 210003.47718542337
R2 SCORE:- 0.9414153464024947
