# Model to Predict Pokémon Type from the stats

## Imports, constants and functions

In [133]:
## Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import functions as fn

## Load and clean-up data

In [145]:
## Load Data from original JSON dataset

TYPE_NONE = 'NONE'
features=['hp', 'atk', 'spatk', 'def', 'spdef', 'speed', 'bst', 'is_form', 'is_legendary', 'is_mythical']

numeric_features = ['hp', 'atk', 'spatk', 'def', 'spdef', 'speed', 'bst', 'is_form', 'is_legendary', 'is_mythical']
categorical_features = [] # we don't have any yet

labels=['type1', 'type2']
types = {
    # TYPE_NONE,
    'normal',
    'fire',
    'water',
    'electric',
    'grass',
    'ice',
    'fighting',
    'poison',
    'ground',
    'flying',
    'psychic',
    'bug',
    'rock',
    'ghost',
    'dragon',
    'dark',
    'steel',
    'fairy' 
}

raw_df = fn.load_json_datasource()
df = fn.cleanup_df(raw_df)
df.loc[df['type2'].isna(), ['type2']] = pd.NA
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1409 entries, bulbasaur to enamorus-therian
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type1         1409 non-null   object
 1   type2         696 non-null    object
 2   color         1409 non-null   object
 3   gen           1409 non-null   int64 
 4   region        1409 non-null   object
 5   is_form       1409 non-null   bool  
 6   is_legendary  1409 non-null   bool  
 7   is_mythical   1409 non-null   bool  
 8   hp            1409 non-null   int64 
 9   atk           1409 non-null   int64 
 10  def           1409 non-null   int64 
 11  spatk         1409 non-null   int64 
 12  spdef         1409 non-null   int64 
 13  speed         1409 non-null   int64 
 14  bst           1409 non-null   int64 
dtypes: bool(3), int64(8), object(4)
memory usage: 179.5+ KB


In [146]:
# Clean up NaN type2's by replacing them with their type1
# unfortunately, an imputer can't do that for us so we'll have to do it beforehand

from sklearn.base import TransformerMixin

def fill_missing_type2(df):
    newDf = df.copy()
    newDf.loc[newDf['type2'].isna(), 'type2'] = newDf['type1']
    return newDf

filled_df = fill_missing_type2(df)
filled_df.sample(10)

Unnamed: 0_level_0,type1,type2,color,gen,region,is_form,is_legendary,is_mythical,hp,atk,def,spatk,spdef,speed,bst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
darumaka,fire,fire,red,5,unova,False,False,False,70,90,45,15,45,50,315
dratini,dragon,dragon,blue,1,kanto,False,False,False,41,64,45,50,50,50,300
alakazam,psychic,psychic,brown,1,kanto,False,False,False,55,50,45,135,95,120,500
indeedee,psychic,normal,purple,8,galar,False,False,False,60,65,55,105,95,95,475
naganadel,poison,dragon,purple,7,alola,False,False,False,73,73,73,127,73,121,540
tyrogue,fighting,fighting,purple,2,johto,False,False,False,35,35,35,35,35,35,210
onix,rock,ground,gray,1,kanto,False,False,False,35,45,160,30,45,70,385
ambipom,normal,normal,purple,4,sinnoh,False,False,False,75,100,66,60,66,115,482
barboach,water,ground,gray,3,hoenn,False,False,False,50,48,43,46,41,60,288
meganium-f,grass,grass,green,4,johto,True,False,False,80,82,100,83,100,80,525


## Define X and y, and split them

In [147]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Encode string labels to int
encoded_df = filled_df

type1_le = preprocessing.LabelEncoder().fit(encoded_df['type1'])
encoded_df['type1'] = type1_le.transform(encoded_df['type1'])


type2_le = preprocessing.LabelEncoder().fit(encoded_df['type2'])
encoded_df['type2'] = type2_le.transform(encoded_df['type2'])


X = encoded_df[features]
y = encoded_df[labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [148]:
X.sample(15)

Unnamed: 0_level_0,hp,atk,spatk,def,spdef,speed,bst,is_form,is_legendary,is_mythical
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
suicune,100,75,90,115,115,85,580,False,True,False
scyther,70,110,55,80,80,105,500,False,False,False
vivillon-high-plains,80,52,90,50,50,89,411,True,False,False
silcoon,50,35,25,55,25,15,205,False,False,False
seedot,40,40,30,50,30,30,220,False,False,False
eldegoss,60,50,80,90,120,60,460,False,False,False
deino,52,65,45,50,50,38,300,False,False,False
skiddo,66,65,62,48,57,52,350,False,False,False
hitmonlee,50,120,35,53,110,87,455,False,False,False
grubbin,47,62,55,45,45,46,300,False,False,False


In [149]:
y.sample(15)

Unnamed: 0_level_0,type1,type2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
centiskorch-gmax,6,0
burmy-trash,0,0
vivillon-elegant,0,7
elgyem,14,14
shieldon,15,16
chespin,9,9
urshifu-rapid-strike-gmax,5,17
porygonz,12,12
pawniard,1,16
turtwig,9,9


## Create a pipeline

In [150]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
# https://www.kaggle.com/code/chimae/predicting-pokemon-types/notebook

# missing_values param = tells what value to consider a "missing value"
numeric_pipeline = make_pipeline(
    SimpleImputer(strategy="median", missing_values=np.nan)
)

categorical_pipeline = make_pipeline(
    #SimpleImputer(strategy="most_frequent", missing_values=pd.NA),
    SimpleImputer(strategy="constant", fill_value="NONE", missing_values=pd.NA),
    #OneHotEncoder(drop='first', handle_unknown = 'ignore') # https://www.roelpeters.be/found-unknown-categories-in-column-sklearn/
    OneHotEncoder(drop='first', handle_unknown = 'ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipeline, numeric_features),
        ("cat_pipe", categorical_pipeline, categorical_features),
    ]
)

"""
[{'classifier__criterion': 'gini',
  'classifier__max_depth': 9,
  'classifier__min_samples_leaf': 3,
  'classifier__min_samples_split': 3,
  'classifier__n_estimators': 14},
 max score = 0.9490377927196846]
"""

rainForestClf = RandomForestClassifier(
    n_estimators = 14, 
    max_depth=9, 
    random_state=123, 
    criterion='gini',                 
    min_samples_leaf=3, 
    min_samples_split=3,
)

clf = MultiOutputClassifier(rainForestClf)

# dtreeClf = DecisionTreeClassifier(
#     max_depth=6, 
#     min_samples_leaf=3, 
#     min_samples_split=3,
#     criterion='entropy'
# )

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier",  clf)]
)

In [151]:
# fit our model with the training data

pipe.fit(X_train, y_train)

# Calculate optimal classifier params with Grid Search

In [152]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import warnings

warnings.filterwarnings("ignore")

param_grid = {
    'classifier__n_estimators': range(2, 20),
    'classifier__max_depth': range(6, 10),
    'classifier__min_samples_leaf': range(3, 12, 2),
    'classifier__min_samples_split': range(3, 25, 5),
    'classifier__criterion':['gini', 'entropy']
}

search = GridSearchCV(
    pipe, # you have defined this beforehand
    param_grid, # your parameter grid
    cv=5, # the value for K in K-fold Cross Validation
    scoring='accuracy', # the performance metric to use, 
    verbose=1
) # we want informative outputs during the training process


# comment the following lines out when done (since the process is very slow):
#search.fit(X_train, y_train)
#[search.best_params_, search.best_score_]

## Generate predictions

In [153]:
y_pred_train = pipe.predict(X_train)
y_pred_test = pd.DataFrame(pipe.predict(X_test), columns = ['type1', 'type2'])
y_pred_test

Unnamed: 0,type1,type2
0,0,5
1,17,1
2,4,4
3,12,17
4,17,6
...,...,...
348,0,7
349,12,12
350,14,16
351,0,16


## Check how good is our model with the accuracy score

In [154]:
pipe.score(X_test, y_pred_test)

1.0

## Comparing the results

In [155]:
type1_test = type1_le.inverse_transform(y_test['type1'])
type2_test = type1_le.inverse_transform(y_test['type2'])

type1_pred = type1_le.inverse_transform(y_pred_test['type1'])
type2_pred = type1_le.inverse_transform(y_pred_test['type2'])


result_df = X_test.copy()

result_df['type1'] = type1_test
result_df['type2'] = type2_test
result_df['type1_pred'] = type1_pred
result_df['type2_pred'] = type2_pred

result_df.sample(20)

Unnamed: 0_level_0,hp,atk,spatk,def,spdef,speed,bst,is_form,is_legendary,is_mythical,type1,type2,type1_pred,type2_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
tympole,50,50,50,40,40,64,294,False,False,False,water,water,normal,water
dustox-f,60,50,50,70,90,65,385,True,False,False,bug,poison,water,grass
unown-i,48,72,72,48,48,48,336,True,False,False,psychic,psychic,psychic,psychic
alakazam,55,50,135,45,95,120,500,False,False,False,psychic,psychic,ghost,psychic
deoxys-attack,50,180,180,20,20,150,600,True,False,True,psychic,psychic,bug,dark
aron,50,70,40,100,40,30,330,False,False,False,steel,rock,water,ground
alcremie-lemon-cream-strawberry,65,60,110,75,121,64,495,True,False,False,fairy,fairy,fairy,fairy
alcremie-rainbow-swirl-love,65,60,110,75,121,64,495,True,False,False,fairy,fairy,fairy,fairy
cradily,86,81,81,97,107,43,495,False,False,False,rock,grass,water,normal
corvisquire,68,67,43,55,55,77,365,False,False,False,flying,flying,normal,normal
