# Gradient Boosting

### Importing Libraries

In [1]:
#Importing required libraries
import pandas as pd 
import numpy as np

### Loading the dataset

In [2]:
# Load the cleaned dataset from CSV into a DataFrame
data = pd.read_csv(
    'data_cleaned.csv'
)

In [3]:
# Preview the first 5 rows to check the columns and values
data.head(n=5)

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.2833,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.1,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


### Separating independent and dependent variables.

In [4]:
# Independent variables.
# 'Unnamed: 0' appears to be the row index that was saved into the CSV
# (its values mirror the DataFrame index in data.head()), not a real
# attribute, so it is removed together with the target to keep it from
# being used as a model feature.
# errors='ignore' keeps this cell working if that column is not present.
x = data.drop(['Survived', 'Unnamed: 0'], axis=1, errors='ignore')

# Dependent variable (the target the model predicts)
y = data['Survived']

### Creating the train and test dataset

In [5]:
#import the train-test split
from sklearn.model_selection import train_test_split

In [6]:
# Split features and target into train and test partitions.
# The split is stratified on y and uses a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    stratify=y,
    random_state=101,
)

## Building a GBDT Model

In [7]:
#Importing GBDT Classifier 
from sklearn.ensemble import GradientBoostingClassifier

In [8]:
# Create a gradient boosting classifier; only the seed is set explicitly
clf = GradientBoostingClassifier(
    random_state=96,
)

In [9]:
# Fit the classifier on the training split
clf.fit(X=train_x, y=train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=96,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [10]:
# Score the fitted model on the held-out test split
clf.score(X=test_x, y=test_y)

0.81165919282511212

# GBDT Hyperparameters

### Model Based Hyperparameters

1. **n_estimators:** Total number of trees.
2. **loss:** The loss function to be minimized. 
3. **subsample:** The fraction of observations to be selected for each tree. Selection is done by random sampling.
4. **random_state:** The random number seed so that same random numbers are generated every time.
5. **learning_rate:** This determines how much each tree contributes to the final outcome.

In [11]:
# Create a gradient boosting classifier with 200 trees, each built on
# a random 70% sample of the observations
clf = GradientBoostingClassifier(
    n_estimators=200,
    subsample=0.7,
    random_state=96,
)

In [12]:
# Fit the reconfigured classifier on the training split
clf.fit(X=train_x, y=train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=96,
              subsample=0.7, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [13]:
# Score the fitted model on the held-out test split
clf.score(X=test_x, y=test_y)

0.820627802690583

### Tree Based Hyperparameters

1. **max_depth:** Maximum depth to which tree can grow (stopping criteria)
2. **max_features:** The number of features to consider while searching for the best split
3. **max_leaf_nodes:** The maximum number of terminal nodes or leaves in a tree
4. **min_samples_leaf:** Minimum samples required in a terminal node or leaf (stopping criteria)
5. **min_samples_split:** Minimum number of samples required in a node for splitting (stopping criteria)

In [14]:
# Create a gradient boosting classifier with trees up to depth 4 that
# split a node only when it holds at least 100 samples
clf = GradientBoostingClassifier(
    max_depth=4,
    min_samples_split=100,
    random_state=96,
)

In [15]:
# Fit the reconfigured classifier on the training split
clf.fit(X=train_x, y=train_y)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=100,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=96,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [16]:
# Score the fitted model on the held-out test split
clf.score(X=test_x, y=test_y)

0.83856502242152464