In [1]:
import pandas as pd

In [2]:
df = pd.read_csv(".data/Pokemon.csv")
df.shape

(800, 13)

In [3]:
df.set_index('Name', inplace=True)

In [4]:
df

Unnamed: 0_level_0,#,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Bulbasaur,1,Grass,Poison,318,45,49,49,65,65,45,1,False
Ivysaur,2,Grass,Poison,405,60,62,63,80,80,60,1,False
Venusaur,3,Grass,Poison,525,80,82,83,100,100,80,1,False
VenusaurMega Venusaur,3,Grass,Poison,625,80,100,123,122,120,80,1,False
Charmander,4,Fire,,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...
Diancie,719,Rock,Fairy,600,50,100,150,100,150,50,6,True
DiancieMega Diancie,719,Rock,Fairy,700,50,160,110,160,110,110,6,True
HoopaHoopa Confined,720,Psychic,Ghost,600,80,110,60,150,130,70,6,True
HoopaHoopa Unbound,720,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [5]:
df[["HP", "Attack", "Speed"]].corr()

Unnamed: 0,HP,Attack,Speed
HP,1.0,0.422386,0.175952
Attack,0.422386,1.0,0.38124
Speed,0.175952,0.38124,1.0


In [6]:
def correlation_ratio(category_col: str, numerical_col: str, data: pd.DataFrame) -> float:
    x_cat = data[category_col]
    x_num = data[numerical_col]

    m_num = x_num.mean()
    all_variance = ((x_num-m_num)**2).sum()

    categories = x_cat.unique()
    categorical_num = [x_num[x_cat==category] for category in categories]
    categorical_var = [x.shape[0]*(x.mean()-m_num)**2 for x in categorical_num]

    r = sum(categorical_var) / all_variance
    return r

In [7]:
results = []
category_cols = ["Generation"]
numerical_cols = ["HP", "Speed"]
for category_col in category_cols:
    result = []
    for numerical_col in numerical_cols:
        r = correlation_ratio(category_col, numerical_col, df)
        result.append(r)
    results.append(result)
result_df = pd.DataFrame(results, index=category_cols, columns=numerical_cols)

In [8]:
result_df

Unnamed: 0,HP,Speed
Generation,0.012383,0.013666


In [9]:
import numpy as np
import scipy.stats as st

In [10]:
def cramerV(col1: str, col2: str, data: pd.DataFrame) -> float:
    confusion_matrix = pd.crosstab(data[col1], data[col2])
    x2, p, dof, e = st.chi2_contingency(confusion_matrix, False)

    n = confusion_matrix.sum().sum()
    r = np.sqrt(x2/(n*(np.min(confusion_matrix.shape)-1)))

    return r

In [11]:
def cramers_v(col1: str, col2: str, df: pd.DataFrame) -> float:
    x = df[col1]
    y = df[col2]
    confusion_matrix = pd.crosstab(x,y)
    chi2 = st.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2-((k-1)*(r-1))/(n-1))
    rcorr = r-((r-1)**2)/(n-1)
    kcorr = k-((k-1)**2)/(n-1)
    return np.sqrt(phi2corr/min((kcorr-1),(rcorr-1)))

In [12]:
print(cramerV("Generation", "Legendary", df))
cramers_v("Generation", "Legendary", df)

0.11111234624561267


0.07807535108767247

In [13]:
cramerV("Generation", "Type 2", df)

0.34628835219141385

In [14]:
results = []
cols = ["Generation", "Type 1", "Type 2", "Legendary"]
for col1 in cols:
    result = []
    for col2 in cols:
        r = cramerV(col1, col2, df)
        result.append(r)
    results.append(result)
result_df = pd.DataFrame(results, index=cols, columns=cols)

In [15]:
result_df

Unnamed: 0,Generation,Type 1,Type 2,Legendary
Generation,1.0,0.214855,0.346288,0.111112
Type 1,0.214855,1.0,0.31341,0.336193
Type 2,0.346288,0.31341,1.0,0.243339
Legendary,0.111112,0.336193,0.243339,1.0


In [16]:
results = []
cols = ["Generation", "Type 1", "Type 2", "Legendary"]
for col1 in cols:
    result = []
    for col2 in cols:
        r = cramers_v(col1, col2, df)
        result.append(r)
    results.append(result)
result_df = pd.DataFrame(results, index=cols, columns=cols)

In [17]:
result_df

Unnamed: 0,Generation,Type 1,Type 2,Legendary
Generation,1.0,0.158249,0.282345,0.078075
Type 1,0.158249,1.0,0.243953,0.303091
Type 2,0.282345,0.243953,1.0,0.134519
Legendary,0.078075,0.303091,0.134519,0.991617
