# Chi Squared Test

### 1. Goodness of Fit test
used to determine whether a sample of categorical data fits a specific probability distribution or theoretical model.

In [2]:
import pandas as pd

In [3]:
# create two dataframes called national and state.
national = pd.DataFrame(["white"]*100000 + ["hispanic"]*60000 +\
                        ["black"]*50000 + ["other"]*35000)

state = pd.DataFrame(["white"]*600 + ["hispanic"]*300 + \
                         ["black"]*250 + ["other"]*150)

# perform cross-tabulation to create frequency tables
national_table = pd.crosstab(index=national[0], columns="count")
state_table = pd.crosstab(index=state[0], columns="count")

# print the frequency tables
print( "National")
print(national_table)
print( "Minnesota")
print(state_table)

National
col_0      count
0               
black      50000
hispanic   60000
other      35000
white     100000
Minnesota
col_0     count
0              
black       250
hispanic    300
other       150
white       600


In [4]:
observed = state_table
national_ratios = national_table/len(national)
expected = national_ratios * len(state)

# chisquared test
chi_squared_stat = (((observed-expected)**2)/expected).sum()
print(chi_squared_stat)

col_0
count    17.884615
dtype: float64


### 2. Test of Independence
The chi-squared test of independence tests whether two categorical variables are independent.

In [6]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
import scipy.stats as sp
def compute_freq_chi2(x,y):
    freqtab = pd.crosstab(x,y)
    print("Frequency table")
    print("============================")
    print(freqtab)
    print("============================")
    chi2, pval, dof, expected = sp.chi2_contingency(freqtab)
    print("ChiSquare test statistic: ",chi2)
    print("p-value: ",pval)
    return

# chi squared test
price = pd.qcut(data['SalePrice'], 3, labels = ['High', 'Medium', 'Low'])
compute_freq_chi2(data.LandContour, price)

Frequency table
SalePrice    High  Medium  Low
LandContour                   
Bnk            32      20   11
HLS            10      12   28
Low             8      11   17
Lvl           437     447  427
ChiSquare test statistic:  26.252544346201447
p-value:  0.00019976918050008282
