<a href="https://colab.research.google.com/github/swarnava-96/Hypothesis-Testing/blob/main/Chi_Square_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Chi-Square Test**
The test is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.

In [5]:
import scipy.stats as stats

In [6]:
import seaborn as sns
import numpy as np
import pandas as pd

# Loading the tips dataset from seaborn
dataset = sns.load_dataset('tips')

dataset.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
# Creating a crosstab with features "sex" and "smoker"

dataset_table = pd.crosstab(dataset["sex"], dataset["smoker"])
print(dataset_table)

smoker  Yes  No
sex            
Male     60  97
Female   33  54


In [8]:
# Checking the crosstab values

dataset_table.values

array([[60, 97],
       [33, 54]])

In [9]:
# Observed values

observed_values = dataset_table.values
print("Observed values: \n", observed_values)

Observed values: 
 [[60 97]
 [33 54]]


In [10]:
# Chi square contingency

val = stats.chi2_contingency(dataset_table)

In [11]:
val

(0.008763290531773594, 0.925417020494423, 1, array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

In [12]:
expected_values = val[3]

In [13]:
# Degrees of freedom

no_of_rows = len(dataset_table.iloc[0:2,0])
no_of_columns = len(dataset_table.iloc[0,0:2])
ddof = (no_of_rows - 1) * (no_of_columns - 1)
print("Degrees of Freedom: ",ddof )
alpha = 0.05

Degrees of Freedom:  1


In [17]:
# Chi square statistic

from scipy.stats import chi2
chi_square = sum([(o-e)**2./e for o,e in zip(observed_values,expected_values)])
chi_square_statistic = chi_square[0]+chi_square[1]
print("Chi Square Statistic: ", chi_square_statistic)

Chi Square Statistic:  0.001934818536627623


In [18]:
# Critical value

critical_value = chi2.ppf(q = 1 - alpha, df = ddof)
print("Critical Value: ", critical_value)

Critical Value:  3.841458820694124


In [19]:
# Pvalue

pvalue = 1 - chi2.cdf(x = chi_square_statistic, df = ddof)
print("Pvalue: ", pvalue)
print("Significance Level: ", alpha)
print("Degrees of Freedom", ddof)

Pvalue:  0.964915107315732
Significance Level:  0.05
Degrees of Freedom 1


In [20]:
# Hypothesis testing

## For a good test chi_square_statistic >= critical_value <always>
if chi_square_statistic >= critical_value:
  print("Reject null hypothesis, there is a relationship between the two categorical variables.")
else:
  print("Accept the null hypothesis, there is no relatonship between the two categorical variables.")

  # For a good test pvalue >= alpha

  if pvalue >= alpha:
    print("Accept the null hypothesis, there is a relationship between the two categorical variables.")
  else:
    print("Reject null hypothesis, there is a relationship between the two categorical variables.")

Accept the null hypothesis, there is no relatonship between the two categorical variables.
Accept the null hypothesis, there is a relationship between the two categorical variables.
