In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import random
import math

from utils.explore import *

## Chi-squared goodness-of-fit test

In [3]:
# Build one row per person so category frequencies can be tabulated.
# `us` is the reference population whose proportions define the expected
# counts; `mn` is the sample that is compared against it.
us = pd.DataFrame([
    race
    for race, n_people in (
        ("white", 100000),
        ("hispn", 60000),
        ("black", 50000),
        ("asian", 15000),
        ("other", 35000),
    )
    for _ in range(n_people)
])
mn = pd.DataFrame([
    race
    for race, n_people in (
        ("white", 600),
        ("hispn", 300),
        ("black", 250),
        ("asian", 75),
        ("other", 150),
    )
    for _ in range(n_people)
])

# One-column frequency tables (column label "count"), indexed by race.
ct_us, ct_mn = (
    pd.crosstab(index=frame[0], columns="count") for frame in (us, mn)
)

In [13]:
# Observed counts come straight from the `mn` sample.
observed = ct_mn
# Expected counts: the `us` proportions scaled to the size of the `mn` sample.
expected = ct_us.div(len(us)).mul(len(mn))
# Chi-squared statistic: sum over categories of (O - E)^2 / E.
# The result is a one-element Series (indexed by the "count" column), not a scalar.
chi_sqrd = observed.sub(expected).pow(2).div(expected).sum()

In [14]:
# Step 1: raw difference between observed and expected counts, per category.
observed.sub(expected)

col_0,count
0,Unnamed: 1_level_1
asian,-4.326923
black,-14.423077
hispn,-17.307692
other,-35.096154
white,71.153846


In [15]:
# Step 2: square the differences so positive and negative deviations do not cancel.
observed.sub(expected).pow(2)

col_0,count
0,Unnamed: 1_level_1
asian,18.722263
black,208.025148
hispn,299.556213
other,1231.740015
white,5062.869822


In [16]:
# Step 3: scale each squared difference by its expected count.
observed.sub(expected).pow(2).div(expected)

col_0,count
0,Unnamed: 1_level_1
asian,0.236014
black,0.786713
hispn,0.944056
other,6.654595
white,9.573427


In [17]:
# Step 4: add up the per-category contributions to get the chi-squared statistic.
observed.sub(expected).pow(2).div(expected).sum()

col_0
count    18.194805
dtype: float64

In [18]:
# Critical value: the 95th percentile of the chi-squared distribution with
# 4 degrees of freedom (5 categories - 1).
c_crit = stats.chi2(df=4).ppf(0.95)
c_crit

9.487729036781154

In [20]:
# p-value: probability, under the null hypothesis, of a chi-squared statistic
# at least as large as the one observed (upper tail, 4 degrees of freedom).
# `sf` (survival function) evaluates the upper tail directly; `1 - cdf` loses
# precision through cancellation when the p-value is very small.
p_value = stats.chi2.sf(x=chi_sqrd, df=4)
p_value

array([ 0.00113047])

In [22]:
# Cross-check: scipy's built-in test should reproduce the statistic and
# p-value computed by hand above.
stats.chisquare(observed, f_exp=expected)

Power_divergenceResult(statistic=array([ 18.19480519]), pvalue=array([ 0.00113047]))

## Chi-squared test of independence

In [23]:
# Fixed seed so the simulated sample is the same on every run.
np.random.seed(10)

# Race and party are drawn separately, each from its own fixed probabilities,
# giving 1000 simulated voters.
voter_race = np.random.choice(
    ["asian", "black", "hispanic", "other", "white"],
    1000,
    p=[0.05, 0.15, 0.25, 0.05, 0.5],
)
voter_party = np.random.choice(
    ["democrat", "independent", "republican"],
    1000,
    p=[0.4, 0.2, 0.4],
)

In [26]:
# One row per simulated voter, pairing each race draw with its party draw.
df_voters = pd.DataFrame(dict(race=voter_race, party=voter_party))
df_voters.head(5)

Unnamed: 0,party,race
0,democrat,white
1,republican,asian
2,independent,white
3,republican,white
4,democrat,other


In [42]:
# Contingency table of race x party. margins=True appends an extra row and an
# extra column holding the marginal totals.
# The frame is `df_voters`; the previous name `voters` is never defined, so the
# cell failed with NameError on a fresh kernel.
ct_voters = pd.crosstab(index=df_voters["race"], columns=df_voters["party"], margins=True)
# Relabel the axes. "row_totals" is the margin row (total per party) and
# "column_totals" is the margin column (total per race).
ct_voters.index = ["asian", "black", "hispanic", "other", "white", "row_totals"]
ct_voters.columns = ["democrat", "independent", "republican", "column_totals"]
ct_voters

Unnamed: 0,democrat,independent,republican,column_totals
asian,21,7,32,60
black,65,25,64,154
hispanic,107,50,94,251
other,15,8,15,38
white,189,96,212,497
row_totals,397,186,417,1000


In [30]:
# Keep only the first 5 rows and first 3 columns: the race x party counts
# without the margin row and margin column.
observed = ct_voters.iloc[:5, :3]
observed

party,democrat,independent,republican
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asian,21,7,32
black,65,25,64
hispanic,107,50,94
other,15,8,15
white,189,96,212


In [50]:
# Per-race totals: the first five entries of the margin column (grand total excluded).
ct_voters["column_totals"].head(5)

asian        60
black       154
hispanic    251
other        38
white       497
Name: column_totals, dtype: int64

In [53]:
# Per-party totals: the first three entries of the margin row (grand total excluded).
ct_voters.loc["row_totals"].head(3)

democrat       397
independent    186
republican     417
Name: row_totals, dtype: int64

In [72]:
# Same margin column selected by position (column 3), grand total included.
ct_voters[ct_voters.columns[3]]

asian           60
black          154
hispanic       251
other           38
white          497
row_totals    1000
Name: column_totals, dtype: int64

In [74]:
# Same margin row selected by position (row 5), grand total included.
ct_voters.iloc[5]

democrat          397
independent       186
republican        417
column_totals    1000
Name: row_totals, dtype: int64

In [63]:
# Expected counts under the null hypothesis that race and party are independent:
# E[i, j] = (total for race i) * (total for party j) / (number of voters).
expected = pd.DataFrame(
    np.outer(
        ct_voters["column_totals"].iloc[:5],
        ct_voters.loc["row_totals"].iloc[:3],
    ) / float(len(df_voters)),
    index=observed.index,
    columns=observed.columns,
)

expected

party,democrat,independent,republican
race,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
asian,23.82,11.16,25.02
black,61.138,28.644,64.218
hispanic,99.647,46.686,104.667
other,15.086,7.068,15.846
white,197.309,92.442,207.249


In [64]:
# Chi-squared statistic: squared deviations scaled by the expected count,
# summed first down each column and then across columns to a single number.
chi_sqrd = observed.sub(expected).pow(2).div(expected).sum().sum()
chi_sqrd

7.169321280162059

In [65]:
# Critical value at the 5% significance level: the 95th percentile of the
# chi-squared distribution with df = (5 - 1) * (3 - 1) = 8.
c_crit = stats.chi2(df=8).ppf(0.95)
c_crit
# The statistic computed above (7.1693) is well below this critical value.

15.507313055865453

In [68]:
# p-value: probability, assuming race and party are independent, of seeing a
# chi-squared statistic at least as large as the observed one (8 degrees of freedom).
# `sf` (survival function) evaluates the upper tail directly; `1 - cdf` loses
# precision through cancellation when the p-value is very small.
p_value = stats.chi2.sf(x=chi_sqrd, df=8)
p_value
# p is about 0.52, far above 0.05, so we fail to reject the null hypothesis of
# independence. Note this is the probability of data at least this extreme
# *given* independence, not the probability that the differences are due to chance.

0.51847939294884204

In [70]:
# Cross-check with scipy: returns the statistic, the p-value, the degrees of
# freedom and the table of expected counts, which should match the manual values.
stats.chi2_contingency(observed)

(7.1693212801620589,
 0.51847939294884204,
 8,
 array([[  23.82 ,   11.16 ,   25.02 ],
        [  61.138,   28.644,   64.218],
        [  99.647,   46.686,  104.667],
        [  15.086,    7.068,   15.846],
        [ 197.309,   92.442,  207.249]]))