# Model to Predict Pokémon stats from the type(s)

## Imports, constants and functions

In [2]:
## Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import functions as fn

## Load and clean-up data

In [3]:
## Load Data from original JSON dataset

pokemon_types = {
    'normal',
    'fire',
    'water',
    'electric',
    'grass',
    'ice',
    'fighting',
    'poison',
    'ground',
    'flying',
    'psychic',
    'bug',
    'rock',
    'ghost',
    'dragon',
    'dark',
    'steel',
    'fairy' 
}

features=['type1', 'type2', 'is_form', 'is_legendary', 'is_mythical']
numeric_features = ['is_form', 'is_legendary', 'is_mythical']
categorical_features = ['type1', 'type2']

labels=['hp', 'atk', 'spatk', 'def', 'spdef', 'speed']

raw_df = fn.load_json_datasource()
df = fn.cleanup_df(raw_df)
df.loc[df['type2'].isna(), ['type2']] = pd.NA
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1409 entries, bulbasaur to enamorus-therian
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type1         1409 non-null   object
 1   type2         696 non-null    object
 2   color         1409 non-null   object
 3   gen           1409 non-null   int64 
 4   region        1409 non-null   object
 5   is_form       1409 non-null   bool  
 6   is_legendary  1409 non-null   bool  
 7   is_mythical   1409 non-null   bool  
 8   hp            1409 non-null   int64 
 9   atk           1409 non-null   int64 
 10  def           1409 non-null   int64 
 11  spatk         1409 non-null   int64 
 12  spdef         1409 non-null   int64 
 13  speed         1409 non-null   int64 
 14  bst           1409 non-null   int64 
dtypes: bool(3), int64(8), object(4)
memory usage: 179.5+ KB


In [4]:
# Clean up NaN type2's by replacing them with their type1
# unfortunately, an imputer can't do that for us so we'll have to do it beforehand

from sklearn.base import TransformerMixin

def fill_missing_type2(df):
    newDf = df.copy()
    newDf.loc[newDf['type2'].isna(), 'type2'] = newDf['type1']
    return newDf

filled_df = fill_missing_type2(df)
filled_df.sample(10)

Unnamed: 0_level_0,type1,type2,color,gen,region,is_form,is_legendary,is_mythical,hp,atk,def,spatk,spdef,speed,bst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
burmy,bug,bug,green,4,sinnoh,False,False,False,40,29,45,29,45,36,224
dugtrio-alola,ground,steel,brown,7,alola,True,False,False,35,100,60,50,70,110,425
morpeko,electric,dark,yellow,8,galar,False,False,False,58,95,58,70,58,97,436
haunter,ghost,poison,purple,1,kanto,False,False,False,45,50,45,115,55,95,405
hawlucha,fighting,flying,green,6,kalos,False,False,False,78,92,75,74,63,118,500
meloetta-pirouette,normal,psychic,white,5,unova,True,False,True,100,128,90,77,77,128,600
gallade-mega,psychic,fighting,white,6,kalos,True,False,False,68,165,95,65,115,110,618
blastoise,water,water,blue,1,kanto,False,False,False,79,83,100,85,105,78,530
fomantis,grass,grass,pink,7,alola,False,False,False,40,55,35,50,35,35,250
drifblim,ghost,flying,purple,4,sinnoh,False,False,False,150,80,44,90,54,80,498


## Define X and y, and split them

In [5]:
from sklearn.model_selection import train_test_split

X = filled_df[features]
y = filled_df[labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [6]:
X.sample(15)

Unnamed: 0_level_0,type1,type2,is_form,is_legendary,is_mythical
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alcremie-vanilla-cream-clover,fairy,fairy,True,False,False
gligar-f,ground,flying,True,False,False
bergmite,ice,ice,False,False,False
bruxish,water,psychic,False,False,False
escavalier,bug,steel,False,False,False
quagsire,water,ground,False,False,False
torchic,fire,fire,False,False,False
axew,dragon,dragon,False,False,False
dewpider,water,bug,False,False,False
litleo,fire,normal,False,False,False


In [7]:
y.sample(15)

Unnamed: 0_level_0,hp,atk,spatk,def,spdef,speed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
urshifu-rapid-strike-gmax,100,130,63,100,60,97
grimer-alola,80,80,40,50,50,25
boltund,69,90,90,60,60,121
regirock,80,100,50,200,100,50
starmie,60,75,100,85,85,115
silcoon,50,35,25,55,25,15
zoroark-hisui,55,100,125,60,60,110
zweilous,72,85,65,70,70,58
clefairy,70,45,60,48,65,35
deoxys,50,150,150,50,50,150


## Create a pipeline

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
# https://www.kaggle.com/code/chimae/predicting-pokemon-types/notebook

# missing_values param = tells what value to consider a "missing value"
numeric_pipeline = make_pipeline(
    SimpleImputer(strategy="median", missing_values=np.nan)
)

categorical_pipeline = make_pipeline(
    #SimpleImputer(strategy="most_frequent", missing_values=pd.NA),
    SimpleImputer(strategy="constant", fill_value="NONE", missing_values=pd.NA),
    #OneHotEncoder(drop='first', handle_unknown = 'ignore') # https://www.roelpeters.be/found-unknown-categories-in-column-sklearn/
    OneHotEncoder(drop='first', handle_unknown = 'ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipeline, numeric_features),
        ("cat_pipe", categorical_pipeline, categorical_features),
    ]
)

"""
[{'classifier__criterion': 'gini',
  'classifier__max_depth': 9,
  'classifier__min_samples_leaf': 3,
  'classifier__min_samples_split': 3,
  'classifier__n_estimators': 14},
 max score = 0.9490377927196846]
"""

rainForestClf = RandomForestClassifier(
    n_estimators = 14, 
    max_depth=9, 
    random_state=123, 
    criterion='gini',                 
    min_samples_leaf=3, 
    min_samples_split=3,
)

clf = MultiOutputClassifier(rainForestClf)

# dtreeClf = DecisionTreeClassifier(
#     max_depth=6, 
#     min_samples_leaf=3, 
#     min_samples_split=3,
#     criterion='entropy'
# )

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier",  clf)]
)

In [9]:
# fit our model with the training data

pipe.fit(X_train, y_train)

# Calculate optimal classifier params with Grid Search

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import warnings

warnings.filterwarnings("ignore")

param_grid = {
    'classifier__n_estimators': range(2, 20),
    'classifier__max_depth': range(6, 10),
    'classifier__min_samples_leaf': range(3, 12, 2),
    'classifier__min_samples_split': range(3, 25, 5),
    'classifier__criterion':['gini', 'entropy']
}

search = GridSearchCV(
    pipe, # you have defined this beforehand
    param_grid, # your parameter grid
    cv=5, # the value for K in K-fold Cross Validation
    scoring='accuracy', # the performance metric to use, 
    verbose=1
) # we want informative outputs during the training process


# comment the following lines out when done (since the process is very slow):
#search.fit(X_train, y_train)
#[search.best_params_, search.best_score_]

## Generate predictions

In [11]:
y_pred_train = pipe.predict(X_train)
y_pred_test = pd.DataFrame(pipe.predict(X_test), columns = ['hp','atk','spatk','def','spdef','speed'])
y_pred_test

Unnamed: 0,hp,atk,spatk,def,spdef,speed
0,70,95,65,60,65,60
1,40,40,40,50,50,70
2,65,60,110,75,121,64
3,50,65,40,60,50,70
4,60,85,40,70,60,60
...,...,...,...,...,...,...
348,60,100,90,50,50,89
349,60,100,50,45,45,50
350,50,50,65,65,60,50
351,60,100,90,50,50,89


## Check how good is our model with the accuracy score

In [12]:
pipe.score(X_test, y_pred_test)

1.0

## Comparing the results

In [19]:
result_df = X_test.copy()
#result_df = pd.concat([result_df, y_test, y_pred_test], axis=0)
result_df.sample(20)

Unnamed: 0_level_0,type1,type2,is_form,is_legendary,is_mythical
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
alakazam-mega,psychic,psychic,True,False,False
rampardos,rock,rock,False,False,False
alcremie-matcha-cream-star,fairy,fairy,True,False,False
sealeo,ice,water,False,False,False
accelgor,bug,bug,False,False,False
vivillon-sandstorm,bug,flying,True,False,False
machoke,fighting,fighting,False,False,False
swanna,water,flying,False,False,False
unown-f,psychic,psychic,True,False,False
alcremie-caramel-swirl-flower,fairy,fairy,True,False,False
