In [1]:
# Gradient Boost

# Importing the libraries
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from matplotlib import pyplot

In [2]:
# load dataset
df = pd.read_csv('Data_without-72ROWS.csv')
array = df.values
print('Shape:', df.shape)
# set input matrix and target column
X = array[:, :-1]
y = array[:, 6]
# show first row of data
print(df.head())
print(df.describe())

Shape: (454, 7)
   Floor_Num  Total_Floor_Area  Column_Area  Masonry_Wall_AreaNS  \
0        2.0              1797         6.21                 0.00   
1        1.0               577         2.00                 0.00   
2        3.0               498         1.50                 1.35   
3        2.0               310         1.40                 1.02   
4        3.0               287         0.74                 3.74   

   Masonry_Wall_AreaEW  Captive_Columns Damage_Class  
0                 9.49                1       Severe  
1                 1.68                0        Light  
2                 0.40                0        Light  
3                 0.00                0        Light  
4                 1.74                1       Severe  
        Floor_Num  Total_Floor_Area  Column_Area  Masonry_Wall_AreaNS  \
count  454.000000        454.000000   454.000000           454.000000   
mean     3.061674        608.255507     1.956145             3.100198   
std      1.344413        5

In [3]:
# data split train/test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=1234)

In [4]:
# taking care of the missing data
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
imputer = imputer.fit(X[:, 1:6]) #upper bound is not included, but lower bound
X[:, 1:6] = imputer.transform(X[:, 1:6])

In [5]:
# encoding the dependent variable
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

In [6]:
# rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

In [7]:
# summarize the class distribution
target = df.values[:, -1]
counter = Counter(target)
for k,v in counter.items():
    per = v / len(target) * 100
    print('Class=%s, Count=%s, Percentage=%.3f%%' % (k, v, per))

Class=Severe, Count=194, Percentage=42.731%
Class=Light, Count=138, Percentage=30.396%
Class=Moderate, Count=76, Percentage=16.740%
Class=None, Count=46, Percentage=10.132%


In [8]:
# Re-summarize class distribution
print(X.shape, y.shape,Counter(y))

(454, 6) (454,) Counter({3: 194, 0: 138, 1: 76, 2: 46})


In [9]:
# Implementing SMOTE for the Imbalanced data in Multi-class classification
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)



In [10]:
print(X.shape, y.shape, Counter(y))

(602, 6) (602,) Counter({3: 194, 2: 194, 0: 138, 1: 76})


In [11]:
# To balance another minority class
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)



In [12]:
# Re-summarize class distribution
print(X.shape, y.shape,Counter(y))

(720, 6) (720,) Counter({3: 194, 1: 194, 2: 194, 0: 138})


In [13]:
# To balance another minority class
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)



In [14]:
# Re-summarize class distribution
print(X.shape, y.shape,Counter(y))

(776, 6) (776,) Counter({3: 194, 0: 194, 1: 194, 2: 194})


In [15]:
# Feature Scaling - 
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [16]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))

Learning rate:  0.05
Accuracy score (training): 0.526
Accuracy score (validation): 0.500
Learning rate:  0.075
Accuracy score (training): 0.550
Accuracy score (validation): 0.491
Learning rate:  0.1
Accuracy score (training): 0.538
Accuracy score (validation): 0.509
Learning rate:  0.25
Accuracy score (training): 0.626
Accuracy score (validation): 0.447
Learning rate:  0.5
Accuracy score (training): 0.791
Accuracy score (validation): 0.474
Learning rate:  0.75
Accuracy score (training): 0.853
Accuracy score (validation): 0.491
Learning rate:  1
Accuracy score (training): 0.859
Accuracy score (validation): 0.412


In [17]:
gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.5, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[15  5  3 14]
 [ 6  3  2  6]
 [ 1  0  2  6]
 [14  0  3 34]]
Classification Report
              precision    recall  f1-score   support

       Light       0.42      0.41      0.41        37
    Moderate       0.38      0.18      0.24        17
        None       0.20      0.22      0.21         9
      Severe       0.57      0.67      0.61        51

    accuracy                           0.47       114
   macro avg       0.39      0.37      0.37       114
weighted avg       0.46      0.47      0.46       114

