### Chi-Square Test Implementation on Tips Dataset

#### This test is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables; the null hypothesis is that the variables are independent.

In [1]:
import pandas as pd
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [2]:
# Load the 'tips' example dataset through seaborn; the preview below shows
# the columns total_bill, tip, sex, smoker, day, time and size.
df = sns.load_dataset('tips')

In [3]:
# Preview the first rows to check the column names and the kind of values they hold.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# Contingency table of observed counts: one row per 'sex' category,
# one column per 'smoker' category.
df_table = pd.crosstab(index=df['sex'], columns=df['smoker'])

In [5]:
# Display the sex x smoker contingency table of observed counts.
df_table

smoker,Yes,No
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
Male,60,97
Female,33,54


In [6]:
# Observed counts as a plain NumPy array (rows = sex, columns = smoker).
observed_reln = df_table.to_numpy()

In [7]:
# Display the observed counts as a NumPy array.
observed_reln

array([[60, 97],
       [33, 54]], dtype=int64)

In [8]:
# Chi-square test of independence on the sex x smoker contingency table.
# NOTE: for a 2x2 table (dof == 1) SciPy applies Yates' continuity correction
# by default (correction=True), so the statistic returned here (~0.0088) is not
# the plain Pearson statistic that is computed by hand further down (~0.0019).
val = stats.chi2_contingency(df_table)

In [9]:
# Display the result: (chi-square statistic, p-value, degrees of freedom,
# expected frequencies).
val

(0.008763290531773594,
 0.925417020494423,
 1,
 array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [10]:
# Element 3 of the chi2_contingency result is the array of expected
# frequencies, i.e. the counts implied by independence of the two variables.
expected_reln = val[3]

In [11]:
# Take the table dimensions from the crosstab itself rather than from fixed
# 0:2 slices, so the counts (and the degrees of freedom derived from them)
# stay correct for contingency tables larger than 2x2.
no_of_rows, no_of_cols = df_table.shape

#### The formula for the degrees of freedom is $(\text{number of rows} - 1) \times (\text{number of columns} - 1)$.

In [12]:
# Significance level used for the decision at the end of the notebook.
alpha = 0.05
# Degrees of freedom of an r x c contingency table: (r - 1) * (c - 1).
degree_of_freedom = (no_of_cols - 1) * (no_of_rows - 1)

In [13]:
from scipy.stats import chi2
# Per-cell contribution (O - E)^2 / E, evaluated on the whole arrays at once
# and then added up down each column: the result holds one partial sum per
# column of the contingency table.
chi_square = ((observed_reln - expected_reln) ** 2 / expected_reln).sum(axis=0)

In [14]:
# Total the per-column partial sums to obtain the chi-square statistic.
# .sum() covers every column, whereas indexing [0] and [1] would silently
# drop the contributions of any column beyond the second.
chi_square_stats = chi_square.sum()

In [18]:
# Display the hand-computed statistic. The manual formula applies no
# continuity correction, so this value is smaller than the SciPy statistic
# shown earlier in the notebook.
chi_square_stats

0.001934818536627623

In [32]:
# Upper-tail probability of the chi-square distribution at the observed
# statistic. The survival function sf(x) equals 1 - cdf(x) but is evaluated
# directly, so a very small p-value is not rounded to 0.0 by the subtraction.
p_value = chi2.sf(chi_square_stats, df = degree_of_freedom)
p_value

0.964915107315732

In [33]:
# Decision rule: a p-value at or below alpha rejects the hypothesis that the
# two categorical variables are independent; otherwise it is not rejected.
print(
    'There is a relationship between 2 Categorical Feature'
    if p_value <= alpha
    else 'There is no relationship between 2 Categorical Feature.'
)

There is no relationship between 2 Categorical Feature.
