# SuperEffective.gg Pokémon Dataset - Data Notebook

## Imports, constants and functions

In [121]:
## Imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy
import functions as fn

## Load and clean-up data

In [122]:
## Load Data from original JSON dataset

TYPE_NONE = 'NONE'
features=['hp', 'atk', 'spatk', 'def', 'spdef', 'speed', 'bst', 'color', 'is_form', 'is_legendary', 'is_mythical']

numeric_features = ['hp', 'atk', 'spatk', 'def', 'spdef', 'speed', 'bst', 'is_form', 'is_legendary', 'is_mythical']
categorical_features = ['color'] # we don't have any yet

labels=['type1', 'type2']
types = {
    # TYPE_NONE,
    'normal',
    'fire',
    'water',
    'electric',
    'grass',
    'ice',
    'fighting',
    'poison',
    'ground',
    'flying',
    'psychic',
    'bug',
    'rock',
    'ghost',
    'dragon',
    'dark',
    'steel',
    'fairy' 
}

raw_df = fn.load_json_datasource()
df = fn.cleanup_df(raw_df)
df.loc[df['type2'].isna(), ['type2']] = pd.NA
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1409 entries, bulbasaur to enamorus-therian
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type1         1409 non-null   object
 1   type2         696 non-null    object
 2   color         1409 non-null   object
 3   gen           1409 non-null   int64 
 4   region        1409 non-null   object
 5   is_form       1409 non-null   bool  
 6   is_legendary  1409 non-null   bool  
 7   is_mythical   1409 non-null   bool  
 8   hp            1409 non-null   int64 
 9   atk           1409 non-null   int64 
 10  def           1409 non-null   int64 
 11  spatk         1409 non-null   int64 
 12  spdef         1409 non-null   int64 
 13  speed         1409 non-null   int64 
 14  bst           1409 non-null   int64 
dtypes: bool(3), int64(8), object(4)
memory usage: 179.5+ KB


In [123]:
# Clean up NaN type2's by replacing them with their type1
# unfortunately, an imputer can't do that for us so we'll have to do it beforehand

from sklearn.base import TransformerMixin

def fill_missing_type2(df):
    newDf = df.copy()
    newDf.loc[newDf['type2'].isna(), 'type2'] = newDf['type1']
    return newDf

filled_df = fill_missing_type2(df)
filled_df.sample(10)

Unnamed: 0_level_0,type1,type2,color,gen,region,is_form,is_legendary,is_mythical,hp,atk,def,spatk,spdef,speed,bst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
palkia,water,dragon,purple,4,sinnoh,False,True,False,90,120,100,150,120,100,680
aipom-f,normal,normal,purple,4,johto,True,False,False,55,70,55,40,55,85,360
wimpod,bug,water,gray,7,alola,False,False,False,25,35,40,20,30,80,230
pumpkaboo-small,ghost,grass,brown,6,kalos,True,False,False,44,66,70,44,55,56,335
porygon,normal,normal,pink,1,kanto,False,False,False,65,60,70,85,75,40,395
audino-mega,normal,normal,white,6,kalos,True,False,False,103,60,126,80,126,50,545
bayleef,grass,grass,green,2,johto,False,False,False,60,62,80,63,80,60,405
herdier,normal,normal,gray,5,unova,False,False,False,65,80,65,35,65,60,370
wishiwashi-school,water,water,blue,7,alola,True,False,False,45,140,130,140,135,30,620
lotad,water,grass,green,3,hoenn,False,False,False,40,30,30,40,50,30,220


## Define X and y, and split them

In [124]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Encode string labels to int
encoded_df = filled_df

type1_le = preprocessing.LabelEncoder().fit(encoded_df['type1'])
encoded_df['type1'] = type1_le.transform(encoded_df['type1'])


type2_le = preprocessing.LabelEncoder().fit(encoded_df['type2'])
encoded_df['type2'] = type2_le.transform(encoded_df['type2'])


X = encoded_df[features]
y = encoded_df[labels]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [125]:
X.sample(15)

Unnamed: 0_level_0,hp,atk,spatk,def,spdef,speed,bst,color,is_form,is_legendary,is_mythical
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
darumaka,70,90,15,45,45,50,315,red,False,False,False
floette-blue,54,45,75,47,98,52,371,white,True,False,False
silvally-ghost,95,95,95,95,95,95,570,gray,True,True,False
larvesta,55,85,50,55,55,60,360,white,False,False,False
unown-r,48,72,72,48,48,48,336,black,True,False,False
charizard,78,84,109,78,85,100,534,red,False,False,False
bidoof,59,45,35,40,40,31,250,brown,False,False,False
spearow,40,60,31,30,31,70,262,brown,False,False,False
chikorita,45,49,49,65,65,45,318,green,False,False,False
cryogonal,80,50,95,50,135,105,515,blue,False,False,False


In [126]:
y.sample(15)

Unnamed: 0_level_0,type1,type2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
bastiodon,15,16
spewpa,0,0
larvitar,15,10
dartrix,9,7
roggenrola,15,15
carnivine,9,9
heracross,0,5
ninetales,6,6
beautifly-f,0,7
phanpy,10,10


## Create a pipeline

In [127]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import make_column_selector as selector
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
# https://www.kaggle.com/code/chimae/predicting-pokemon-types/notebook

# missing_values param = tells what value to consider a "missing value"
numeric_pipeline = make_pipeline(
    SimpleImputer(strategy="median", missing_values=np.nan)
)

categorical_pipeline = make_pipeline(
    #SimpleImputer(strategy="most_frequent", missing_values=pd.NA),
    SimpleImputer(strategy="constant", fill_value="NONE", missing_values=pd.NA),
    #OneHotEncoder(drop='first', handle_unknown = 'ignore') # https://www.roelpeters.be/found-unknown-categories-in-column-sklearn/
    OneHotEncoder(drop='first', handle_unknown = 'ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipeline, numeric_features),
        ("cat_pipe", categorical_pipeline, categorical_features),
    ]
)

"""
[{'classifier__criterion': 'gini',
  'classifier__max_depth': 9,
  'classifier__min_samples_leaf': 3,
  'classifier__min_samples_split': 3,
  'classifier__n_estimators': 14},
 max score = 0.9490377927196846]
"""

rainForestClf = RandomForestClassifier(
    n_estimators = 14, 
    max_depth=9, 
    random_state=123, 
    criterion='gini',                 
    min_samples_leaf=3, 
    min_samples_split=3,
)

clf = MultiOutputClassifier(rainForestClf)

# dtreeClf = DecisionTreeClassifier(
#     max_depth=6, 
#     min_samples_leaf=3, 
#     min_samples_split=3,
#     criterion='entropy'
# )

pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier",  clf)]
)

In [128]:
# fit our model with the training data

pipe.fit(X_train, y_train)

# Calculate optimal classifier params with Grid Search

In [129]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import warnings

warnings.filterwarnings("ignore")

param_grid = {
    'classifier__n_estimators': range(2, 20),
    'classifier__max_depth': range(6, 10),
    'classifier__min_samples_leaf': range(3, 12, 2),
    'classifier__min_samples_split': range(3, 25, 5),
    'classifier__criterion':['gini', 'entropy']
}

search = GridSearchCV(
    pipe, # you have defined this beforehand
    param_grid, # your parameter grid
    cv=5, # the value for K in K-fold Cross Validation
    scoring='accuracy', # the performance metric to use, 
    verbose=1
) # we want informative outputs during the training process


# comment the following lines out when done (since the process is very slow):
#search.fit(X_train, y_train)
#[search.best_params_, search.best_score_]

## Generate predictions

In [130]:
y_pred_train = pipe.predict(X_train)
y_pred_test = pd.DataFrame(pipe.predict(X_test), columns = ['type1', 'type2'])
y_pred_test

Unnamed: 0,type1,type2
0,0,5
1,12,12
2,4,4
3,17,17
4,9,9
...,...,...
348,0,7
349,17,14
350,17,17
351,9,9


## Check how good is our model with the accuracy score

In [131]:
pipe.score(X_test, y_pred_test)

1.0

## Comparing the results

In [132]:
type1_test = type1_le.inverse_transform(y_test['type1'])
type2_test = type1_le.inverse_transform(y_test['type2'])

type1_pred = type1_le.inverse_transform(y_pred_test['type1'])
type2_pred = type1_le.inverse_transform(y_pred_test['type2'])


result_df = X_test.copy()

result_df['type1'] = type1_test
result_df['type2'] = type2_test
result_df['type1_pred'] = type1_pred
result_df['type2_pred'] = type2_pred

result_df

Unnamed: 0_level_0,hp,atk,spatk,def,spdef,speed,bst,color,is_form,is_legendary,is_mythical,type1,type2,type1_pred,type2_pred
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
heracross-f,80,125,40,75,95,85,500,blue,True,False,False,bug,fighting,bug,fighting
swanna,75,87,87,63,63,98,473,white,False,False,False,water,flying,normal,normal
alcremie-matcha-cream-star,65,60,110,75,121,64,495,white,True,False,False,fairy,fairy,fairy,fairy
seaking,80,92,65,65,80,68,450,red,False,False,False,water,water,water,water
simisage,75,98,98,63,63,101,498,green,False,False,False,grass,grass,grass,grass
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
vivillon-elegant,80,52,90,50,50,89,411,white,True,False,False,bug,flying,bug,flying
phanpy,90,60,40,60,40,40,330,blue,False,False,False,ground,ground,water,psychic
glaceon,65,60,130,110,95,65,525,blue,False,False,False,ice,ice,water,water
scyther-f,70,110,55,80,80,105,500,green,True,False,False,bug,flying,grass,grass
