# Chi-square (χ²) test implementation

The test is applied when you have two categorical variables from a single population. 
It is used to determine whether there is a significant association between the two variables.

In [37]:
import pandas as pd
import numpy as np
import scipy.stats as stats 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load seaborn's 'tips' example dataset into a DataFrame.
df = sns.load_dataset('tips')

In [3]:
# Preview the first rows to see which columns are categorical (sex, smoker, day, time).
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


Now we will check the relationship between two categorical columns, i.e. `sex` and `smoker`.

In [4]:
# Contingency table: number of observations for every (sex, smoker) combination.
datas = pd.crosstab(index=df['sex'], columns=df['smoker'])
datas.values

array([[60, 97],
       [33, 54]], dtype=int64)

In [5]:
# Observed frequencies as a plain NumPy array (rows: sex, columns: smoker).
obs_values = datas.values #observe values
obs_values

array([[60, 97],
       [33, 54]], dtype=int64)

In [6]:
# Run SciPy's chi-square test of independence on the observed contingency table.
# NOTE: for a 2x2 table (1 degree of freedom) `chi2_contingency` applies Yates'
# continuity correction by default (`correction=True`), so the statistic and
# p-value returned here differ from the uncorrected Pearson values computed by
# hand further down; pass `correction=False` to reproduce those.
chi_val = stats.chi2_contingency(obs_values)
chi_val

(0.008763290531773594, 0.925417020494423, 1, array([[59.84016393, 97.15983607],
        [33.15983607, 53.84016393]]))

`chi2_contingency` returns: the chi-square statistic, the p-value, the degrees of freedom, and the array of expected frequencies.
 

The expected frequencies are very close to the observed frequencies, which already suggests that the two variables are independent.

In [7]:
# The fourth element of the result is the array of expected frequencies
# under the independence hypothesis.
exp_values = chi_val[3]
exp_values

array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]])

In [8]:
# Degrees of freedom of an r x c contingency table: (r - 1) * (c - 1).
rows, columns = datas.shape
ddof = (rows - 1) * (columns - 1)
alpha = 0.05  # significance level
ddof

1

In [9]:
from scipy.stats import chi2

In [10]:
# Pearson contributions (O - E)^2 / E for every cell, summed over the rows,
# which leaves one partial chi-square value per column.
chi_sq_val = ((obs_values - exp_values) ** 2 / exp_values).sum(axis=0)

In [11]:
# Per-column chi-square contributions.
chi_sq_val

array([0.00119737, 0.00073745])

In [12]:
# Total (uncorrected) Pearson chi-square statistic: add up the contributions of
# every column, however many columns the contingency table has.
chi_sq_tot = np.sum(chi_sq_val)
chi_sq_tot

0.001934818536627623

In [13]:
# Critical value: the (1 - alpha) quantile of the chi-square distribution
# with `ddof` degrees of freedom.
critical_value = chi2.ppf(1 - alpha, ddof)
critical_value

3.841458820694124

In [14]:
# p-value = P(X >= chi_sq_tot) for X ~ chi-square(ddof).
# Use the survival function instead of `1 - cdf`: it is the same quantity but
# does not lose precision when the p-value is very small.
p_value = chi2.sf(x=chi_sq_tot, df=ddof)
p_value

0.964915107315732

In [15]:
# Summarise the inputs of the decision rule.
print(f'p-value: {p_value!s}')
print(f'Significance level:  {alpha!s}')
print(f'Degree of Freedom:  {ddof!s}')

p-value: 0.964915107315732
Significance level:  0.05
Degree of Freedom:  1


In [16]:
# Decision rule 1: compare the test statistic with the critical value.
# A non-significant result never proves H0, so the correct wording is
# "fail to reject H0" rather than "accept H0".
if chi_sq_tot >= critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Fail to reject H0,There is no evidence of a relationship between 2 categorical variables")

# Decision rule 2: compare the p-value with the significance level.
# This is equivalent to rule 1 and must lead to the same conclusion.
if p_value <= alpha:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Fail to reject H0,There is no evidence of a relationship between 2 categorical variables")

Accept H0,There is no relationship between 2 categorical variables
Accept H0,There is no relationship between 2 categorical variables


# T Test

In [17]:
# Ages of the whole group, treated as the population in the t-test example.
ages = [
    10, 20, 35, 50, 28, 40, 55, 18,
    16, 55, 30, 25, 43, 18, 30, 28,
    14, 24, 16, 17, 32, 35, 26, 27,
    65, 18, 43, 23, 21, 20, 19, 70,
]

In [18]:
# Number of values in `ages`.
len(ages)

32

In [19]:
# Arithmetic mean of all values in `ages`.
ages_mean = np.mean(ages)

In [20]:
# Show the mean of `ages`.
ages_mean

30.34375

In [21]:
# Presumably the number of observations to draw from `ages` for the sample.
sample_size = 10

In [22]:
# Draw `sample_size` ages at random (np.random.choice samples with replacement
# by default). Use the `sample_size` variable instead of repeating the literal
# so the sample really follows the configured size.
age_sample = np.random.choice(ages, size=sample_size)

In [23]:
# Show the drawn sample.
age_sample

array([30, 14, 35, 30, 16, 30, 21, 23, 20, 40])

In [24]:
from scipy.stats import ttest_1samp  # 1 sample test

In [25]:
# One-sample t-test of H0: "the mean age equals 30".
# Keep the result and store its p-value in `p_value`; otherwise the decision
# in the following cell would still read the p-value left over from the
# chi-square test above.
ttest_result = ttest_1samp(age_sample, 30)
p_value = ttest_result.pvalue
ttest_result

Ttest_1sampResult(statistic=-1.538822994729871, pvalue=0.15823030429870413)

In [26]:
# Compare the current `p_value` with the 5% significance level and report the decision.
print('Reject the null hypothesis' if p_value < 0.05 else 'Fail to Reject the null hypothesis')

Fail to Reject the null hypothesis


The t-test checks whether there is a significant difference between samples (or between a sample mean and a population mean). Below we apply it to synthetic age data.

In [27]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import math
# Seed NumPy's global random number generator so the random samples drawn
# below are reproducible.
np.random.seed(6)

In [28]:
# Synthetic ages: Poisson draws shifted by 18 (so no value is below 18).
# The whole school has 1500 members, class A has 60 and a lower Poisson mean.
school_ages = stats.poisson.rvs(mu=35, loc=18, size=1500)
classA_ages = stats.poisson.rvs(mu=30, loc=18, size=60)

In [29]:
# Sample mean of class A.
classA_ages.mean()

46.9

In [30]:
# One-sample t-test of H0: "class A has the same mean age as the school".
# Read the p-value from the result object instead of unpacking into `_`:
# once `_` is assigned by hand, IPython stops updating it with the last output.
p_value = stats.ttest_1samp(a=classA_ages, popmean=school_ages.mean()).pvalue

In [31]:
# Show the p-value of the one-sample t-test.
p_value

1.139027071016194e-13

In [32]:
# Decision at the 5% significance level. A non-significant result does not
# prove H0, so the else branch reports "failing to reject" rather than "accepting".
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print(" we are rejecting null hypothesis")
else:
    print("we are failing to reject null hypothesis")

 we are rejecting null hypothesis


In [33]:
# Re-seed the global RNG, then draw two independent Poisson samples shifted by 18.
# NOTE(review): `classA_height` is later compared with `ClassB_ages` in a
# two-sample t-test, so it presumably holds ages rather than heights - confirm.
np.random.seed(12)
classA_height = stats.poisson.rvs(mu=36, loc=18, size=600)
ClassB_ages = stats.poisson.rvs(mu=33, loc=18, size=60)
ClassB_ages.mean()

50.06666666666667

In [34]:
# Two-sample t-test with `equal_var=False` (Welch's test, which does not assume
# equal variances). Read the p-value from the result object instead of unpacking
# into `_`: once `_` is assigned by hand, IPython stops updating it with the last output.
p_value = stats.ttest_ind(a=classA_height, b=ClassB_ages, equal_var=False).pvalue

In [35]:
# Decision at the 5% significance level. A non-significant result does not
# prove H0, so the else branch reports "failing to reject" rather than "accepting".
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print(" we are rejecting null hypothesis")
else:
    print("we are failing to reject null hypothesis")

 we are rejecting null hypothesis
