# Import the dataset

In [1]:
# Import libraries and read the csv file
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import sidetable
%matplotlib inline

df = pd.read_csv('pokemon.csv')
df.drop(columns=['japanese_name', 'name'], inplace=True)

In [2]:
# Have a look at the first five rows
df.head()

Unnamed: 0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,percentage_male,pokedex_number,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
0,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,1,65,65,45,grass,poison,6.9,1,0
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,2,80,80,60,grass,poison,13.0,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,88.1,3,122,120,80,grass,poison,100.0,1,0
3,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,4,60,50,65,fire,,8.5,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,88.1,5,80,65,80,fire,,19.0,1,0


In [3]:
# Check for the columns having missing data
df.stb.missing(style=True)

Unnamed: 0,missing,total,percent
type2,384,801,47.94%
percentage_male,98,801,12.23%
weight_kg,20,801,2.50%
height_m,20,801,2.50%
abilities,0,801,0.00%
hp,0,801,0.00%
base_total,0,801,0.00%
capture_rate,0,801,0.00%
classfication,0,801,0.00%
defense,0,801,0.00%


In [4]:
# Check the length and width of the table
df.shape

(801, 39)

In [5]:
# Have a look at columns having missing data plus type1
for i in ['height_m', 'percentage_male', 'type1', 'type2']:
    print(df[i].unique())

[ 0.7  1.   2.   0.6  1.1  1.7  0.5  1.6  0.3  1.5  nan  1.2  3.5  0.4
  0.8  1.3  0.9  1.4  1.9  1.8  8.8  2.2  6.5  2.5  2.1  4.   2.3  0.2
  9.2  5.2  3.8 14.5  2.7  6.2  4.5  7.   2.4  5.4  4.2  3.7  3.2  3.3
  0.1  2.6  2.8  2.9  3.   5.8  5.   3.9  3.4  5.5]
[ 88.1  50.    0.  100.   24.6  75.4   nan  11.2]
['grass' 'fire' 'water' 'bug' 'normal' 'poison' 'electric' 'ground'
 'fairy' 'fighting' 'psychic' 'rock' 'ghost' 'ice' 'dragon' 'dark' 'steel'
 'flying']
['poison' nan 'flying' 'dark' 'electric' 'ice' 'ground' 'fairy' 'grass'
 'fighting' 'psychic' 'steel' 'fire' 'rock' 'water' 'dragon' 'ghost' 'bug'
 'normal']


In [6]:
# Drop 'abilities' and classification since their unique values are too much
df.drop(columns=['abilities', 'classfication'], inplace=True)
# Drop 'type2' since its missing value takes nearly 50% of the data set
df.drop(columns=['type2'], inplace=True)

In [7]:
# Retrieve numeric and categorical columns
num_cols = list(df._get_numeric_data().columns)
cat_cols = list(set(df.columns) - set(num_cols))

In [8]:
cat_cols

['capture_rate', 'type1']

### Take care of column 'capture_rate'

In [9]:
# Have a lookt at the column's unique values
df.capture_rate.unique()

array(['45', '255', '120', '127', '90', '190', '75', '235', '150', '25',
       '170', '50', '200', '100', '180', '60', '225', '30', '35', '3',
       '65', '70', '125', '205', '155', '145', '130', '140', '15', '220',
       '160', '80', '55', '30 (Meteorite)255 (Core)'], dtype=object)

In [10]:
df.loc[df.capture_rate == '30 (Meteorite)255 (Core)', 'capture_rate'] = '0'
df.capture_rate = df.capture_rate.astype(int)

In [11]:
# Encode categorical features into numeric
df = pd.get_dummies(data=df, drop_first=True, columns=cat_cols)

In [12]:
# Impute numeric data
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
num_imputer = IterativeImputer()
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [13]:
X = df.drop(columns='is_legendary')
y = df.is_legendary

In [14]:
from imblearn.combine import SMOTEENN
sampler = SMOTEENN()
X, y = sampler.fit_resample(X, y)

In [15]:
# Split the dataset into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.1, random_state=1)

In [16]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Dropout
from tensorflow.keras.optimizers import Adam

# Create the keras tuner model.
def build_model(hp):
    hp_drop_out = hp.Float('dropout', 0, 0.5, step=0.1)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model = Sequential()
    for i in range(hp.Int('num_layers', 2, 20)):
        model.add(Dense(units=hp.Int('units_' + str(i), min_value=32, max_value=512, step=32), activation='relu'))
#         model.add(BatchNormalization())
        model.add(Dropout(hp_drop_out))
#     model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    opt = Adam(learning_rate=hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4]))
    model.compile(optimizer=Adam(learning_rate=hp_learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return model

In [17]:
'''
Credit source: 
    https://www.tensorflow.org/tutorials/keras/keras_tuner
    https://github.com/keras-team/keras-tuner/blob/master/examples/cifar10.py
'''
import kerastuner as kt

tuner = kt.Hyperband(build_model,
                     objective='val_accuracy', 
                     max_epochs=16,
                     overwrite=False)

INFO:tensorflow:Reloading Oracle from existing project .\untitled_project\oracle.json
INFO:tensorflow:Reloading Tuner from .\untitled_project\tuner0.json


In [18]:
es = EarlyStopping(monitor='val_accuracy', verbose=0, patience=10, min_delta=1e-3)
callbacks_list = [es]

In [19]:
tuner.search(X_train, y_train, epochs=64, batch_size=32, validation_data=(X_test, y_test), callbacks=callbacks_list)

INFO:tensorflow:Oracle triggered exit


In [20]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=64, validation_data=(X_test, y_test))

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


In [21]:
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Best epoch: 3


In [22]:
# Re-instantiate the hypermodel and train it with the optimal number of epochs from above.
hypermodel = tuner.hypermodel.build(best_hps)
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x274db6a6148>

In [23]:
eval_result = hypermodel.evaluate(X_test, y_test)
print("[test loss, test accuracy]:", eval_result)

[test loss, test accuracy]: [145.36407470703125, 0.5958904027938843]


In [24]:
# Retrain the model
hypermodel.fit(X_train, y_train, epochs=best_epoch)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x274de3d1e08>

In [25]:
# from keras.models import save_model
# hypermodel.save('best_model.h5')

In [26]:
# Load the model and predict
# from keras.models import load_model
# hypermodel = load_model('best_model.h5')
y_pred = hypermodel.predict(X_test)
y_pred = y_pred.astype(int)

In [27]:
# Get the index of the predicted table
y_pred = pd.DataFrame(y_pred, columns=['is_legendary'], index=X_test.index)

In [28]:
# Evaluate the accuracy of the prediction
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)
print('Accuracy: %.2f%%' % (accuracy * 100.0))

Accuracy: 95.21%
