# Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [2]:
# Read the Pokemon table from a CSV file located relative to the working directory,
# then preview the first rows.
df = pd.read_csv('Pokemon.csv')
df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


In [3]:
# Column dtypes and non-null counts; a column with fewer non-null entries than rows has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
#             800 non-null int64
Name          800 non-null object
Type 1        800 non-null object
Type 2        414 non-null object
Total         800 non-null int64
HP            800 non-null int64
Attack        800 non-null int64
Defense       800 non-null int64
Sp. Atk       800 non-null int64
Sp. Def       800 non-null int64
Speed         800 non-null int64
Generation    800 non-null int64
Legendary     800 non-null bool
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation
count,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0,800.0
mean,362.81375,435.1025,69.25875,79.00125,73.8425,72.82,71.9025,68.2775,3.32375
std,208.343798,119.96304,25.534669,32.457366,31.183501,32.722294,27.828916,29.060474,1.66129
min,1.0,180.0,1.0,5.0,5.0,10.0,20.0,5.0,1.0
25%,184.75,330.0,50.0,55.0,50.0,49.75,50.0,45.0,2.0
50%,364.5,450.0,65.0,75.0,70.0,65.0,70.0,65.0,3.0
75%,539.25,515.0,80.0,100.0,90.0,95.0,90.0,90.0,5.0
max,721.0,780.0,255.0,190.0,230.0,194.0,230.0,180.0,6.0


# Encoding

In [5]:
# Fill missing 'Type 2' values with the most frequent 'Type 2' value.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['Type 2']: an inplace call on a selected column is
# chained assignment, which warns on recent pandas and leaves df unchanged when
# Copy-on-Write is active.
df['Type 2'] = df['Type 2'].fillna(df['Type 2'].mode()[0])

In [6]:
# Number of distinct values in 'Type 1'.
df['Type 1'].nunique()

18

In [7]:
# Number of distinct values in 'Type 2' (nunique() does not count NaN by default).
df['Type 2'].nunique()

18

In [8]:
import category_encoders as ce

# Binary-encode 'Type 1': the single categorical column is replaced by several
# 0/1 "bit" columns named 'Type 1_<n>'.
encoder = ce.BinaryEncoder(cols=['Type 1'])
df_encoded = encoder.fit_transform(df)

# Drop only the bit columns that hold a single constant value and therefore carry
# no information. The previous hard-coded drop of 'Type 1_0' assumed that column
# is always redundant.
# NOTE(review): whether the encoder emits a constant leading column may depend on
# the installed category_encoders version - confirm. Checking the data keeps the
# cell from discarding a meaningful bit or raising KeyError if it does not.
type1_bit_columns = [col for col in df_encoded.columns if str(col).startswith('Type 1_')]
constant_bit_columns = [col for col in type1_bit_columns if df_encoded[col].nunique() <= 1]
df = df_encoded.drop(constant_bit_columns, axis=1)

df.head()

Unnamed: 0,Type 1_1,Type 1_2,Type 1_3,Type 1_4,Type 1_5,#,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,0,0,0,0,1,1,Bulbasaur,Poison,318,45,49,49,65,65,45,1,False
1,0,0,0,0,1,2,Ivysaur,Poison,405,60,62,63,80,80,60,1,False
2,0,0,0,0,1,3,Venusaur,Poison,525,80,82,83,100,100,80,1,False
3,0,0,0,0,1,3,VenusaurMega Venusaur,Poison,625,80,100,123,122,120,80,1,False
4,0,0,0,1,0,4,Charmander,Flying,309,39,52,43,60,50,65,1,False


In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace each 'Type 2' string with an integer code. The fitted encoder is kept
# in `le` so the codes can be mapped back to labels later if needed.
le = LabelEncoder()
df['Type 2'] = le.fit_transform(df['Type 2'])

df.head()

Unnamed: 0,Type 1_1,Type 1_2,Type 1_3,Type 1_4,Type 1_5,#,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,0,0,0,0,1,1,Bulbasaur,13,318,45,49,49,65,65,45,1,False
1,0,0,0,0,1,2,Ivysaur,13,405,60,62,63,80,80,60,1,False
2,0,0,0,0,1,3,Venusaur,13,525,80,82,83,100,100,80,1,False
3,0,0,0,0,1,3,VenusaurMega Venusaur,13,625,80,100,123,122,120,80,1,False
4,0,0,0,1,0,4,Charmander,7,309,39,52,43,60,50,65,1,False


# K-Fold

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, matthews_corrcoef,f1_score,log_loss

In [11]:
# Target: the boolean 'Legendary' column.
target = df['Legendary']

# Features: every remaining column except the target itself and the
# 'Name', 'Generation' and '#' columns.
excluded_columns = ['Legendary', 'Name', 'Generation', '#']
data = df.drop(excluded_columns, axis=1)

In [12]:
def _calc_classification_metrics(X, y, model):
    '''Score an already fitted classifier on one data split.

    Shared implementation behind calc_train_error and calc_validation_error,
    which previously duplicated this code line for line.

    Parameters
    ----------
    X : features passed to ``model.predict`` and ``model.predict_proba``.
    y : true labels for ``X``.
    model : fitted estimator exposing ``predict`` and ``predict_proba``.

    Returns
    -------
    dict with keys 'report' (classification report), 'matthew' (Matthews
    correlation coefficient), 'f1' (macro-averaged F1), 'roc' (ROC AUC),
    'accuracy', 'confusion' (confusion matrix) and 'logloss' (log loss).
    '''
    predictions = model.predict(X)
    predictProba = model.predict_proba(X)
    return {
        'report': classification_report(y, predictions),
        'matthew': matthews_corrcoef(y, predictions),
        'f1': f1_score(y, predictions, average='macro'),
        # ROC AUC is computed from the second probability column only.
        # NOTE(review): this assumes a binary problem in which column 1 is the
        # positive class - confirm if the target ever changes.
        'roc': roc_auc_score(y, predictProba[:, 1]),
        'accuracy': accuracy_score(y, predictions),
        'confusion': confusion_matrix(y, predictions),
        'logloss': log_loss(y, predictProba),
    }


def calc_train_error(X_train, y_train, model):
    '''Return the in-sample metrics dict for an already fitted model.'''
    return _calc_classification_metrics(X_train, y_train, model)


def calc_validation_error(X_test, y_test, model):
    '''Return the out-of-sample metrics dict for an already fitted model.'''
    return _calc_classification_metrics(X_test, y_test, model)


def calc_metrics(X_train, y_train, X_test, y_test, model):
    '''Fit ``model`` on the training split and score it on both splits.

    Returns a ``(train_error, validation_error)`` tuple of metrics dicts:
    the first computed on the data the model was fitted on, the second on
    the held-out data.
    '''
    model.fit(X_train, y_train)
    train_error = calc_train_error(X_train, y_train, model)
    validation_error = calc_validation_error(X_test, y_test, model)
    return train_error, validation_error

In [13]:
from sklearn.model_selection import KFold

# K shuffled folds; the fixed random_state keeps the fold assignment identical across runs.
# NOTE(review): KFold does not balance class proportions across folds - consider
# whether stratified folds would suit this target better.
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

Gradient Boosting references:
- https://medium.com/mlreview/gradient-boosting-from-scratch-1e317ae4587d
- https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [14]:
# Per-fold metrics dicts, in fold order.
train_errors = []
validation_errors = []
for train_index, val_index in kf.split(data, target):

    # split data (positional indices, hence .iloc)
    X_train, X_val = data.iloc[train_index], data.iloc[val_index]
    y_train, y_val = target.iloc[train_index], target.iloc[val_index]

    # instantiate a fresh model for every fold; the fixed random_state makes the
    # fitted model, and therefore the reported metrics, reproducible across runs
    gbmodel = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

    # fit on the training split and score both splits
    train_error, val_error = calc_metrics(X_train, y_train, X_val, y_val, gbmodel)

    # append to appropriate list
    train_errors.append(train_error)
    validation_errors.append(val_error)

In [15]:
# Metric keys in table order; each one yields a (train, validation) pair of adjacent columns.
metric_keys = ['accuracy', 'roc', 'f1', 'matthew', 'logloss']
evaluate_columns = [
    'Train Accuracy',
    'Test Accuracy',
    'Train ROC AUC',
    'Test ROC AUC',
    'Train F1 Score',
    'Test F1 Score',
    'Train Matthews Corr Coef',
    'Test Matthews Corr Coef',
    'Train Log Loss',
    'Test Log Loss',
]

listItem = []
listIndex = []

# One row per fold.
for fold_no, (tr, val) in enumerate(zip(train_errors, validation_errors), start=1):
    listItem.append([scores[key] for key in metric_keys for scores in (tr, val)])
    listIndex.append('{} Iteration'.format(fold_no))

# Final row: column-wise mean over all folds.
listItem.append(list(np.mean(listItem, axis=0)))
listIndex.append('Average')

dfEvaluate = pd.DataFrame(listItem, columns=evaluate_columns, index=listIndex)
dfEvaluate

Unnamed: 0,Train Accuracy,Test Accuracy,Train ROC AUC,Test ROC AUC,Train F1 Score,Test F1 Score,Train Matthews Corr Coef,Test Matthews Corr Coef,Train Log Loss,Test Log Loss
1 Iteration,0.996875,0.96875,0.999938,0.992667,0.990054,0.891525,0.980109,0.802773,0.017071,0.06066
2 Iteration,0.996875,0.9625,1.0,0.990991,0.989887,0.874411,0.979976,0.75159,0.014044,0.061076
3 Iteration,0.998437,0.95625,1.0,0.987441,0.994813,0.848136,0.989678,0.696969,0.012817,0.076941
4 Iteration,0.998437,0.9625,1.0,0.995402,0.994526,0.864865,0.989112,0.759051,0.01421,0.08555
5 Iteration,0.998437,0.9125,1.0,0.968276,0.994526,0.756944,0.989112,0.515343,0.010884,0.17739
Average,0.997812,0.9525,0.999988,0.986955,0.992761,0.847176,0.985597,0.705145,0.013805,0.092323


In [16]:
# Print the train and validation classification report of every fold.
# enumerate(..., start=1) numbers however many folds were actually evaluated,
# instead of assuming exactly five, so no fold is silently skipped if K changes.
for i, (tr_err, val_err) in enumerate(zip(train_errors, validation_errors), start=1):
    print('Report Train ke ' + str(i) + ' : ')
    print(tr_err['report'])
    print('Report Validation ke ' + str(i) + ' : ')
    print(val_err['report'])

Report Train ke 1 : 
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       585
        True       0.98      0.98      0.98        55

   micro avg       1.00      1.00      1.00       640
   macro avg       0.99      0.99      0.99       640
weighted avg       1.00      1.00      1.00       640

Report Validation ke 1 : 
              precision    recall  f1-score   support

       False       1.00      0.97      0.98       150
        True       0.67      1.00      0.80        10

   micro avg       0.97      0.97      0.97       160
   macro avg       0.83      0.98      0.89       160
weighted avg       0.98      0.97      0.97       160

Report Train ke 2 : 
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       587
        True       0.96      1.00      0.98        53

   micro avg       1.00      1.00      1.00       640
   macro avg       0.98      1.00      0.99       640
weight