In [2]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [8]:
# define the data
df = pd.DataFrame(data={"searched": [False, True], "black": [36244, 1219], "white": [239241, 3108]})
df.set_index(keys=["searched"], inplace=True)
df

Unnamed: 0_level_0,black,white
searched,Unnamed: 1_level_1,Unnamed: 2_level_1
False,36244,239241
True,1219,3108


In [9]:
total_count = df["white"].values.sum() + df["black"].values.sum()
# normalize to get probabilities and to calculate indepedence
df["black_relative"] = df["black"] / total_count
df["white_relative"] = df["white"] / total_count
df

Unnamed: 0_level_0,black,white,black_relative,white_relative
searched,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
False,36244,239241,0.12953,0.855006
True,1219,3108,0.004356,0.011107


The Pearson chi-squared test allows us to test whether observed frequencies are different from expected frequencies, so we need to determine what frequencies we would expect in each cell if searches and race were unrelated – which we can define as being _independent_.=> P(Race intersection Searched) = P(Race) * P(Searched)

In [16]:
p_searched = df.loc[True].black_relative + df.loc[True].white_relative
p_no_searched = df.loc[False].black_relative + df.loc[False].white_relative
p_black = df.black_relative.sum()
p_white = df.white_relative.sum()
print(p_searched, p_no_searched, p_black, p_white)

0.015463954369362286 0.9845360456306378 0.13388632367446715 0.8661136763255329


In [23]:
# null hypothesis that race and searched are independent
df_null = pd.DataFrame(data={
    "searched": [False, True],
    "black_relative": [p_black * p_no_searched, p_black * p_searched],
    "white_relative": [p_white*p_no_searched, p_white*p_searched]
    }
)
df_null.set_index(keys=["searched"], inplace=True)
df_null

Unnamed: 0_level_0,black_relative,white_relative
searched,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.131816,0.85272
True,0.00207,0.013394


In [32]:
ddof = (2 - 1) * (2 - 1)  # nRows - 1 * nCols - 1
observed_vals = np.concatenate([df.black_relative.values, df.white_relative.values]) * total_count
expected_vals = np.concatenate([df_null.black_relative, df_null.white_relative.values]) * total_count
# denormalize to get real vals
chi2 = np.power(observed_vals - expected_vals, 2)
chi2 = (chi2 / expected_vals).sum()
p_val = stats.chi2.pdf(chi2, df=ddof)
print(chi2, p_val)

828.2998869866643 1.9001204194034058e-182
