In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import sklearn
import os
pd.options.plotting.backend = "plotly"

In [2]:
def save_fig(fig, filename):
    """Save a figure as a PNG file inside ``./plot_images``.

    Parameters
    ----------
    fig : object
        Figure object exposing ``write_image(path)``.
    filename : str
        Name of the output file, without the ``.png`` extension.
    """
    out_dir = "./plot_images"
    # Create the very directory the image is written to. The previous version
    # checked for "./plot_images" but created "./images", so the write failed
    # on a fresh checkout and os.mkdir raised on every later call.
    os.makedirs(out_dir, exist_ok=True)
    fig.write_image(out_dir + "/" + filename + ".png")

### Reading Data

In [3]:
# Load the dataset from the CSV file, using `pokedex_number` as the row index.
df = pd.read_csv('data/pokemon.csv', index_col='pokedex_number')
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0_level_0,abilities,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,...,name,percentage_male,sp_attack,sp_defense,speed,type1,type2,weight_kg,generation,is_legendary
pokedex_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Bulbasaur,88.1,65,65,45,grass,poison,6.9,1,0
2,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Ivysaur,88.1,80,80,60,grass,poison,13.0,1,0
3,"['Overgrow', 'Chlorophyll']",1.0,1.0,1.0,0.5,0.5,0.5,2.0,2.0,1.0,...,Venusaur,88.1,122,120,80,grass,poison,100.0,1,0
4,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmander,88.1,60,50,65,fire,,8.5,1,0
5,"['Blaze', 'Solar Power']",0.5,1.0,1.0,1.0,0.5,1.0,0.5,1.0,1.0,...,Charmeleon,88.1,80,65,80,fire,,19.0,1,0


In [4]:
# Dataset dimensions as (rows, columns).
df.shape

(801, 40)

# Exploratory Data Analysis (EDA)

In [5]:
# Per-column overview: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 801 entries, 1 to 801
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   abilities          801 non-null    object 
 1   against_bug        801 non-null    float64
 2   against_dark       801 non-null    float64
 3   against_dragon     801 non-null    float64
 4   against_electric   801 non-null    float64
 5   against_fairy      801 non-null    float64
 6   against_fight      801 non-null    float64
 7   against_fire       801 non-null    float64
 8   against_flying     801 non-null    float64
 9   against_ghost      801 non-null    float64
 10  against_grass      801 non-null    float64
 11  against_ground     801 non-null    float64
 12  against_ice        801 non-null    float64
 13  against_normal     801 non-null    float64
 14  against_poison     801 non-null    float64
 15  against_psychic    801 non-null    float64
 16  against_rock       801 non

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,against_bug,against_dark,against_dragon,against_electric,against_fairy,against_fight,against_fire,against_flying,against_ghost,against_grass,...,experience_growth,height_m,hp,percentage_male,sp_attack,sp_defense,speed,weight_kg,generation,is_legendary
count,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,801.0,...,801.0,781.0,801.0,703.0,801.0,801.0,801.0,781.0,801.0,801.0
mean,0.996255,1.057116,0.968789,1.07397,1.068976,1.065543,1.135456,1.192884,0.985019,1.03402,...,1054996.0,1.163892,68.958801,55.155761,71.305868,70.911361,66.334582,61.378105,3.690387,0.087391
std,0.597248,0.438142,0.353058,0.654962,0.522167,0.717251,0.691853,0.604488,0.558256,0.788896,...,160255.8,1.080326,26.576015,20.261623,32.353826,27.942501,28.907662,109.354766,1.93042,0.282583
min,0.25,0.25,0.0,0.0,0.25,0.0,0.25,0.25,0.0,0.25,...,600000.0,0.1,1.0,0.0,10.0,20.0,5.0,0.1,1.0,0.0
25%,0.5,1.0,1.0,0.5,1.0,0.5,0.5,1.0,1.0,0.5,...,1000000.0,0.6,50.0,50.0,45.0,50.0,45.0,9.0,2.0,0.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1000000.0,1.0,65.0,50.0,65.0,66.0,65.0,27.3,4.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,...,1059860.0,1.5,80.0,50.0,91.0,90.0,85.0,64.8,5.0,0.0
max,4.0,4.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,1640000.0,14.5,255.0,100.0,194.0,230.0,180.0,999.9,7.0,1.0


In [7]:
# List every column name in the dataset.
df.columns

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'sp_attack', 'sp_defense',
       'speed', 'type1', 'type2', 'weight_kg', 'generation', 'is_legendary'],
      dtype='object')

In [8]:
# Partition the columns by dtype, and collect the ones that contain at least
# one missing value. These names are reused by later cells.
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()
obj_cols = df.select_dtypes(include=['object']).columns.tolist()
null_cols = df.columns[df.isna().any()]

for label, cols in (("numeric", num_cols), ("object", obj_cols), ("null", null_cols)):
    print(f"Number of {label} columns:", len(cols))

Number of numeric columns: 33
Number of object columns: 7
Number of null columns: 4


## Null Columns

In [9]:
# Preview only the columns that contain missing values.
df.loc[:, null_cols].head()

Unnamed: 0_level_0,height_m,percentage_male,type2,weight_kg
pokedex_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.7,88.1,poison,6.9
2,1.0,88.1,poison,13.0
3,2.0,88.1,poison,100.0
4,0.6,88.1,,8.5
5,1.1,88.1,,19.0


In [10]:
# Number of missing values in each column that has any.
df.loc[:, null_cols].isna().sum()

height_m            20
percentage_male     98
type2              384
weight_kg           20
dtype: int64

In [11]:
# Replace missing `type2` values with the literal string "None".
# Assign the result back explicitly: `df.type2.fillna(..., inplace=True)` is a
# chained in-place update on a column accessor, which emits a warning on recent
# pandas and does not modify `df` at all when Copy-on-Write is enabled.
df["type2"] = df["type2"].fillna("None")

For the other columns with missing values, we will check correlations to find the best values to impute for the NaNs.

## Object Columns

In [12]:
# Preview only the object-typed (non-numeric) columns.
df.loc[:, obj_cols].head()

Unnamed: 0_level_0,abilities,capture_rate,classfication,japanese_name,name,type1,type2
pokedex_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"['Overgrow', 'Chlorophyll']",45,Seed Pokémon,Fushigidaneフシギダネ,Bulbasaur,grass,poison
2,"['Overgrow', 'Chlorophyll']",45,Seed Pokémon,Fushigisouフシギソウ,Ivysaur,grass,poison
3,"['Overgrow', 'Chlorophyll']",45,Seed Pokémon,Fushigibanaフシギバナ,Venusaur,grass,poison
4,"['Blaze', 'Solar Power']",45,Lizard Pokémon,Hitokageヒトカゲ,Charmander,fire,
5,"['Blaze', 'Solar Power']",45,Flame Pokémon,Lizardoリザード,Charmeleon,fire,


In [13]:
# `capture_rate` is loaded as an object column; the entry
# '30 (Meteorite)255 (Core)' is mapped to 30 so that the whole column
# can be cast to integers.
capture_rate_fixed = df["capture_rate"].replace({'30 (Meteorite)255 (Core)': 30})
df["capture_rate"] = capture_rate_fixed.astype(int)

### Type

In [14]:
# Bar color for each capitalized type label used in the type charts.
# NOTE(review): "Dark" and "Ghost" share the same color, so the two types are
# visually indistinguishable — confirm whether this is intended.
dict_color_types = {"Water": "rgb(51, 153, 255)",
                    "Normal": "#e0e0d1",
                    "Grass": "#47d147",
                    "Bug": "#669900",
                    "Psychic": "#ff33cc",
                    "Fire": "#ff0000",
                    "Rock": "#ffd24d",
                    "Electric": "#ffff00",
                    "Poison": "#990099",
                    "Ground": "#cc9900",
                    "Dark": "#1a001a",
                    "Fighting": "#ff4000",
                    'Ghost': "#1a001a",
                    'Dragon': "#0033cc",
                    'Steel': "#999966",
                    'Ice': "#00ffff",
                    'Fairy': "#ffccee",
                    'Flying': "#d1e0e0",
                    # Missing `type2` values are filled with the string "None";
                    # without this entry that label maps to NaN instead of a color.
                    "None": "#bfbfbf",
                    }

In [15]:
from plotly.subplots import make_subplots

# Extra room to the right of the longest bar so the "outside" text labels are
# not clipped. Derived from the data instead of hard-coded axis limits.
X_HEADROOM = 1.3
# Neutral color for any label without an entry in `dict_color_types`
# (e.g. the "None" bucket of `type2`), so no bar receives a NaN color.
FALLBACK_COLOR = "#bfbfbf"

# Count Pokémon per primary / secondary type, smallest first, so that the
# largest bar ends up at the top of each horizontal chart.
count_type1 = df.groupby('type1').size().sort_values(ascending=True)
count_type1.index = count_type1.index.str.capitalize()
count_type2 = df.groupby('type2').size().sort_values(ascending=True)
count_type2.index = count_type2.index.str.capitalize()

fig = make_subplots(rows=1, cols=2, subplot_titles=("Type 1", "Type 2"))

for col, counts in enumerate([count_type1, count_type2], start=1):
    fig.add_trace(
        go.Bar(
            y=counts.index,
            x=counts.values,
            text=counts.values,
            textposition='outside',
            orientation='h',
            marker_color=[dict_color_types.get(label, FALLBACK_COLOR)
                          for label in counts.index],
            name="Type " + str(col),
        ),
        row=1, col=col,
    )
    # Hide the axis but widen its range so the count labels fit next to the bars.
    fig.update_xaxes(visible=False,
                     range=[0, float(counts.max()) * X_HEADROOM],
                     row=1, col=col)

fig.update_layout(title='Nº of Pokemon Types',
                  template='plotly_white', width=1000, height=600,
                  showlegend=False,
                  margin=dict(l=40, r=10, t=100, b=20),
                  yaxis_title="Type",
                  )

fig.show()

In [16]:
# Pokémon whose primary type is "flying", with their name and both types.
df.loc[df["type1"] == "flying", ["name", "type1", "type2"]]

Unnamed: 0_level_0,name,type1,type2
pokedex_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
641,Tornadus,flying,
714,Noibat,flying,dragon
715,Noivern,flying,dragon


### Classification

In [17]:
# Frequency of each classification label. The column is spelled
# "classfication" in the dataset itself, so that spelling must be used here.
df["classfication"].value_counts()

Dragon Pokémon        8
Mouse Pokémon         6
Mushroom Pokémon      6
Flame Pokémon         5
Balloon Pokémon       5
                     ..
Marionette Pokémon    1
Puppet Pokémon        1
Color Swap Pokémon    1
Weather Pokémon       1
Artificial Pokémon    1
Name: classfication, Length: 588, dtype: int64

# Training our Clustering Algorithm

In [21]:
from sklearn.preprocessing import StandardScaler

# Standardize the numeric columns that have no missing values.
# The column selection is materialized as a list: pandas deprecated set
# indexers in 1.5 and raises TypeError for them from 2.0 onwards.
scal = StandardScaler()
X_scaled = scal.fit_transform(df[list(set(num_cols) - set(null_cols))])

In [22]:
from sklearn.mixture import GaussianMixture

# Fit a 3-component Gaussian mixture on the standardized features; the fixed
# random_state keeps the fit repeatable.
gm = GaussianMixture(n_components=3, random_state=42).fit(X_scaled)
gm

GaussianMixture(n_components=3, random_state=42)

In [28]:
# Standardized feature vector of Mewtwo, using the same column selection that
# the scaler was fitted on. The selection is passed as a list because pandas
# no longer accepts a set as a column indexer (TypeError from 2.0 onwards).
mewtwo_row = df.loc[df["name"] == "Mewtwo", list(set(num_cols) - set(null_cols))]
scal.transform(mewtwo_row)

array([[ 1.7578709 , -0.35927587,  0.42484693, -0.78897983,  1.81926319,
         3.58995097,  2.24471534,  2.549893  , -1.39455066, -0.28327409,
        -0.19590922,  1.2175902 , -0.11300854,  2.95161166, -0.13217878,
         3.79462976,  2.1533519 , -1.02108029,  0.08845721,  1.68166846,
        -3.33706107, -0.31928584,  3.23154099,  0.0330966 , -0.09784534,
        -0.04315047, -0.13273054,  0.04490935, -0.09628202,  1.39465389]])

In [29]:
# Cluster assigned to Mewtwo by the fitted mixture model. The column selection
# is passed as a list because pandas no longer accepts a set as a column
# indexer (TypeError from 2.0 onwards).
mewtwo_row = df.loc[df["name"] == "Mewtwo", list(set(num_cols) - set(null_cols))]
gm.predict(scal.transform(mewtwo_row))

array([2], dtype=int64)

GaussianMixture(n_components=3, random_state=42)

# Notes

- check the correlation between type and both weight and height
- create a column that flags whether a name is unique or has similar ones