# Gradient Boost Algorithm
## Author : Ashish Kumar Patra

In [17]:
# Filtering out the warnings and importing the required libraries

import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib notebook

## 1. Load the Dataset

In [18]:
train = pd.read_csv("C:\\Users\\ASHISH\\Desktop\\DataSets\\train.csv")
test = pd.read_csv("C:\\Users\\ASHISH\\Desktop\\DataSets\\test.csv")

In [19]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [20]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## 2. Check various DataType

In [21]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass  

(None, None)

## 3. Data Preparation

In [22]:
train.set_index("PassengerId", inplace=True)
test.set_index("PassengerId", inplace=True)

In [23]:
y_train = train["Survived"]

In [24]:
train.drop(labels="Survived", axis=1, inplace=True)

In [25]:
train.shape, test.shape

((891, 10), (418, 10))

In [26]:
train_test =  train.append(test)

In [27]:
train.isnull().sum()

Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [28]:
columns_to_drop = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
train_test.drop(labels=columns_to_drop, axis=1, inplace=True)

In [29]:
train_test_dummies = pd.get_dummies(train_test, columns=["Sex"])

In [30]:
train_test_dummies.shape

(1309, 4)

In [31]:
train_test_dummies.head()

Unnamed: 0_level_0,Pclass,Fare,Sex_female,Sex_male
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3,7.25,0,1
2,1,71.2833,1,0
3,3,7.925,1,0
4,1,53.1,1,0
5,3,8.05,0,1


In [32]:
train_test_dummies.isna().sum()

Pclass        0
Fare          1
Sex_female    0
Sex_male      0
dtype: int64

In [33]:
train_test_dummies.fillna(value=0.0, inplace=True)
train_test_dummies.isna().sum()

Pclass        0
Fare          0
Sex_female    0
Sex_male      0
dtype: int64

In [34]:
X_train = train_test_dummies.values[0:891]
X_test = train_test_dummies.values[891:]

In [35]:
X_train.shape, X_test.shape

((891, 4), (418, 4))

In [36]:
X_train

array([[ 3.    ,  7.25  ,  0.    ,  1.    ],
       [ 1.    , 71.2833,  1.    ,  0.    ],
       [ 3.    ,  7.925 ,  1.    ,  0.    ],
       ...,
       [ 3.    , 23.45  ,  1.    ,  0.    ],
       [ 1.    , 30.    ,  0.    ,  1.    ],
       [ 3.    ,  7.75  ,  0.    ,  1.    ]])

In [37]:
X_test

array([[ 3.    ,  7.8292,  0.    ,  1.    ],
       [ 3.    ,  7.    ,  1.    ,  0.    ],
       [ 2.    ,  9.6875,  0.    ,  1.    ],
       ...,
       [ 3.    ,  7.25  ,  0.    ,  1.    ],
       [ 3.    ,  8.05  ,  0.    ,  1.    ],
       [ 3.    , 22.3583,  0.    ,  1.    ]])

In [38]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scale = scaler.fit_transform(X_train)
X_test_scale = scaler.transform(X_test)

In [39]:

from sklearn.model_selection import train_test_split

X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = train_test_split(X_train_scale, y_train, random_state=0)

## 4. Implement Gradient Boost Algorithm

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

In [42]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(n_estimators=20, learning_rate = learning_rate, max_features=2, max_depth = 2, random_state = 0)
    gb.fit(X_train_sub, y_train_sub)
    print("Learning rate : ", learning_rate)
    print("Accuracy score (training) : {0:.3f}".format(gb.score(X_train_sub, y_train_sub)))
    print("Accuracy score (validation) : {0:.3f}".format(gb.score(X_validation_sub, y_validation_sub)))
    print()

Learning rate :  0.05
Accuracy score (training) : 0.789
Accuracy score (validation) : 0.780

Learning rate :  0.1
Accuracy score (training) : 0.792
Accuracy score (validation) : 0.780

Learning rate :  0.25
Accuracy score (training) : 0.816
Accuracy score (validation) : 0.803

Learning rate :  0.5
Accuracy score (training) : 0.826
Accuracy score (validation) : 0.834

Learning rate :  0.75
Accuracy score (training) : 0.831
Accuracy score (validation) : 0.789

Learning rate :  1
Accuracy score (training) : 0.831
Accuracy score (validation) : 0.789



In [44]:
gb = GradientBoostingClassifier(n_estimators=20, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train_sub, y_train_sub)
predictions = gb.predict(X_validation_sub)

print("Confusion Matrix :\n")
print(confusion_matrix(y_validation_sub, predictions))
print()
print("Classification Report :\n")
print(classification_report(y_validation_sub, predictions))

Confusion Matrix :

[[131   8]
 [ 29  55]]

Classification Report :

              precision    recall  f1-score   support

           0       0.82      0.94      0.88       139
           1       0.87      0.65      0.75        84

    accuracy                           0.83       223
   macro avg       0.85      0.80      0.81       223
weighted avg       0.84      0.83      0.83       223



In [45]:
y_scores_gb = gb.decision_function(X_validation_sub)
fpr_gb, tpr_gb, _ = roc_curve(y_validation_sub, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)

print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))

Area under ROC curve = 0.88


## 5. With Hyper-Parameter Optimization

In [46]:
from sklearn.model_selection import GridSearchCV

In [47]:
# Dictionary of Params

GB_params = {'n_estimators' : np.arange(5,50,5),
            'learning_rate' : [0.05, 0.5, 0.6, 0.7, 0.8, 0.9],
            'max_features' : np.arange(2,20,2),
            'max_depth' : np.arange(2,10,2),
            'random_state' : [0]}

In [48]:
from sklearn.ensemble import GradientBoostingClassifier

GBC = GradientBoostingClassifier()

In [49]:
GBC_HPO = GridSearchCV(GBC, GB_params, cv=5)

In [50]:
GBC_HPO.fit(X_train_sub, y_train_sub)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.05, 0.5, 0.6, 0.7, 0.8, 0.9],
                         'max_depth': array([2, 4, 6, 8]),
                         'max_features': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                         'n_estimators': array([ 5, 10, 15, 20, 25, 30, 35, 40, 45]),
                         'random_state': [0]})

In [51]:
GBC_HPO.best_score_

0.8129278419930422

In [52]:
GBC_HPO.best_params_

{'learning_rate': 0.8,
 'max_depth': 2,
 'max_features': 4,
 'n_estimators': 5,
 'random_state': 0}