## Gradient Boosting Classification

Using custom dataset.

In [1]:
%matplotlib inline

import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets  # imports datasets from scikit-learn
from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [3]:
glass = pd.read_csv("glassClass.csv")

glass.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


## Prepare data

In [4]:
Y = glass["Type"]  # response

X = glass.drop("Type", axis=1)  # predictors

## Split data

In [5]:
# 25% hold out for testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.25, random_state=25)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((160, 9), (54, 9), (160,), (54,))

### Number of sequential trees to be modeled

```py
(n_estimators=200, max_depth=3)
```

## Gradient Boosting Classifier

In [25]:
gbc1 = GradientBoostingClassifier(n_estimators=200, max_depth=3)

gbc1.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=200)

## Predict class labels

In [9]:
Y_pred = gbc1.predict(X_test)

Y_pred[0]

2

## Confusion matrix

In [10]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test, Y_pred)
confusion_matrix

array([[11,  4,  1,  0,  0,  0],
       [ 2, 19,  1,  1,  0,  0],
       [ 1,  0,  1,  0,  0,  0],
       [ 0,  1,  0,  1,  0,  0],
       [ 0,  1,  0,  0,  2,  0],
       [ 1,  0,  0,  0,  0,  7]])

In [15]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test, Y_pred))


0.7592592592592593


## % importance of the different features

In [16]:
print(gbc1.feature_importances_)


[0.13168625 0.05157711 0.11264008 0.19368293 0.05117067 0.0682659
 0.16316344 0.16287314 0.06494046]


## Score

In [17]:
print("\nTrain score:", gbc1.score(X_train, y_train))
print("\nTest score", gbc1.score(X_test, y_test))


Train score: 1.0

Test score 0.7592592592592593


### Change parameters and predict class labels

In [26]:
gbc2 = GradientBoostingClassifier(n_estimators=300, max_depth=5)

gbc2.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=5, n_estimators=300)

In [22]:
Y_pred = gbc2.predict(X_test)

Y_pred[0]

3

In [23]:
print("\naccuracy_score test and pred:\n", accuracy_score(y_test, Y_pred))


accuracy_score test and pred:
 0.7407407407407407
