In [6]:
#Problem Statement 1: 

#Is gender independent of education level? A random sample of 395 people were surveyed and each person was asked to report 
#the highest education level they obtained. The data that resulted from the survey is summarized in the following table: 
#      High School    Bachelors   Masters   Ph.d.   Total 
#Female 60             54          46        41      201 
#Male   40             44          53        57      194 
# 
#Total  100            98          99        98      395 

#Question:  Are gender and education level dependent at 5% level of significance?  
#In other words, given the data collected above, is there a relationship between the gender of an individual and the level 
#of education that they have obtained?  

import numpy as np
import scipy.stats as sci
import pandas as pd
import matplotlib.pyplot as plt
import math as mth

#1st Solution using ZScore and PValue

#array
female_array=np.array([60,54,46,41])
male_array=np.array([40,44,53,57])
education_level=['High School','Bachelors','Masters','PHD']

#data set
df=pd.DataFrame({'Edcuation_Level':education_level,'Male_Count':male_array,'Female_Count':female_array})

#Calculate mean, std deviation, z score, and P Values of the data frame.
df['male_mean']=male_array.mean()
df['female_mean']=female_array.mean()
df['male_std_dev']=male_array.std()
df['female_std_dev']=female_array.std()
df['male_zscore']=sci.zscore(male_array)
df['female_zscore']=sci.zscore(female_array)
df['female_pvalue']=[sci.norm.cdf(pval) for pval in sci.zscore(female_array)]
df['male_pvalue']=[sci.norm.cdf(pval) for pval in sci.zscore(male_array )]

In [7]:
#Print data frame
df

Unnamed: 0,Edcuation_Level,Male_Count,Female_Count,male_mean,female_mean,male_std_dev,female_std_dev,male_zscore,female_zscore,female_pvalue,male_pvalue
0,High School,40,60,48.5,50.25,6.800735,7.292976,-1.249865,1.336903,0.909373,0.105674
1,Bachelors,44,54,48.5,50.25,6.800735,7.292976,-0.661693,0.514193,0.696442,0.254084
2,Masters,53,46,48.5,50.25,6.800735,7.292976,0.661693,-0.582752,0.28003,0.745916
3,PHD,57,41,48.5,50.25,6.800735,7.292976,1.249865,-1.268344,0.102338,0.894326


In [10]:
print('1.) Clearly Male and Female P values shows there is a relationship among Gender and Education level which is visible more than 5%')
print('2.) Female populations is more at lower education like High School and Bachelors')
print('3.) Female populations is less at higher education like Masters and PHD')
print('4.) Male populations is less at lower education like High School and Bachelors')
print('3.) Male populations is more at higher education like Masters and PHD')

df.loc[(df.Edcuation_Level==df.Edcuation_Level),['Edcuation_Level','female_zscore','female_pvalue','male_zscore','male_pvalue']]

1.) Clearly Male and Female P values shows there is a relationship among Gender and Education level which is visible more than 5%
2.) Female populations is more at lower education like High School and Bachelors
3.) Female populations is less at higher education like Masters and PHD
4.) Male populations is less at lower education like High School and Bachelors
3.) Male populations is more at higher education like Masters and PHD


Unnamed: 0,Edcuation_Level,female_zscore,female_pvalue,male_zscore,male_pvalue
0,High School,1.336903,0.909373,-1.249865,0.105674
1,Bachelors,0.514193,0.696442,-0.661693,0.254084
2,Masters,-0.582752,0.28003,0.661693,0.745916
3,PHD,-1.268344,0.102338,1.249865,0.894326


In [11]:
#2nd Solution using "Chi-Square Test of Independence"

# Calcuate Expected Frequency using E=row total * column total sample size
# Calcuate CHi Sqare statistics using χ2=∑(O−E)2/E 
# Find Chi Square  value from CHI Square table for 3 degree freedom with 0.05
# Compare with hypothies
#Ho=Null Hypothesis - Gender is independant of education level
#H1=Alternative Hypothesis - Gender is dependant of education Level

#Array
female_array=np.array([60,54,46,41])
male_array=np.array([40,44,53,57])

#Total Populations
total_population=female_array.sum()+male_array.sum()

education_level=['High School','Bachelors','Masters','PHD']
chi_df=pd.DataFrame({'Edcuation_level':education_level,'Female':female_array,'Male':male_array})
chi_df['tot_male_female']=chi_df.Female+chi_df.Male

#Use E=E is the expected frequency under the null hypothesis and computed by ((row total×column)/total sample size)
chi_df['E_Female']=(chi_df.Female.sum()*chi_df.tot_male_female)/total_population
chi_df['E_Male']=chi_df.tot_male_female-chi_df.E_Female

#Chi Sqare test statistics using χ2=∑(O−E)2/E for each frequency where O represent Obeserved Frequency and E is Expected Frequency.
chi_df['chi_sq_stat_female']=[(mth.pow((chi_df.Female.values[i]-chi_df.E_Female.values[i]),2))/chi_df.E_Female.values[i] for i in range(chi_df.Female.count())]
chi_df['chi_sq_stat_male']=[(mth.pow((chi_df.Male.values[i]-chi_df.E_Male.values[i]),2))/chi_df.E_Male.values[i] for i in range(chi_df.Male.count()) ]

#total of chi_sq_stat
chi_sq_stat=chi_df.chi_sq_stat_female.sum() +chi_df.chi_sq_stat_male.sum()

#Total Chi Sqare Test statistics
chi_sq_stat
print('The critical value(from Chi Sqare table) of χ2 with 3 degree of freedom is :', 7.815)
print(' ')
print('Since 8.006 > 7.815, therefore we reject the null hypothesis and conclude that the education level depends on gender at a 5% level of significance')
chi_df.head(5)

The critical value(from Chi Sqare table) of χ2 with 3 degree of freedom is : 7.815
 
Since 8.006 > 7.815, therefore we reject the null hypothesis and conclude that the education level depends on gender at a 5% level of significance


Unnamed: 0,Edcuation_level,Female,Male,tot_male_female,E_Female,E_Male,chi_sq_stat_female,chi_sq_stat_male
0,High School,60,40,100,50.886076,49.113924,1.632345,1.691244
1,Bachelors,54,44,98,49.868354,48.131646,0.342311,0.354663
2,Masters,46,53,99,50.377215,48.622785,0.380331,0.394054
3,PHD,41,57,98,49.868354,48.131646,1.577107,1.634012


In [12]:
#Problem Statement 2: 

#Using the following data, perform a oneway analysis of variance using α=.05. Write up the results in APA format. 
#[Group1: 51, 45, 33, 45, 67] 
#[Group2: 23, 43, 23, 43, 45]
#[Group3: 56, 76, 74, 87, 56] 
 
group1=np.array([51, 45, 33, 45, 67])
group2=np.array([23, 43, 23, 43, 45])
group3=np.array([56, 76, 74, 87, 56])
alpha=0.05

print ('Group 1',group1)
print ('Group 2',group2)
print ('Group 3',group3)
print(' ')

#One way test
#sci.f_oneway(group1,group2,group3) function
print(sci.f_oneway(group1,group2,group3)) 

#P Value and F Test value is different as Papulation Mean are different
pvalues=sci.f_oneway(group1,group2,group3).pvalue
Fvalues=sci.f_oneway(group1,group2,group3).statistic
print('Pvalues :{} and Fvalue: {}'.format(pvalues,Fvalues))

print(' ')  

#Hypothesis Test
print('H0 Null Hypothesis Group1=Group2=Group3')
print('H1 Alternative Hypothesis Group1!=Group2!=Group3')
print('Hypothesis testing with 5% significance')
print(' ')

if pvalues > alpha:
    print('Reject Null Hypothesis that Group1=Group2=Group3')
else:    
    print('As per P Values which is less than 0.05, Accept the Null Hypothesis that Group1=Group2=Group3')
    print('APA writeup', 'F value =9.75, p value <0.05')

print(' ')

Group 1 [51 45 33 45 67]
Group 2 [23 43 23 43 45]
Group 3 [56 76 74 87 56]
 
F_onewayResult(statistic=9.747205503009463, pvalue=0.0030597541434430556)
Pvalues :0.0030597541434430556 and Fvalue: 9.747205503009463
 
H0 Null Hypothesis Group1=Group2=Group3
H1 Alternative Hypothesis Group1!=Group2!=Group3
Hypothesis testing with 5% significance
 
As per P Values which is less than 0.05, Accept the Null Hypothesis that Group1=Group2=Group3
APA writeup F value =9.75, p value <0.05
 


In [14]:
#Problem Statement 3: 

#Calculate F Test for given 10, 20, 30, 40, 50 and 5,10,15, 20, 25.  
#For 10, 20, 30, 40, 50: 

first_set=np.array([10,20,30,40,50])

#First Set
first_set

array([10, 20, 30, 40, 50])

In [15]:
#Mean First Set 
first_set.mean()

30.0

In [16]:
#Variance of First Set
first_set.var()

200.0

In [21]:
#Standard deviation of First set
first_set.std()

14.142135623730951

In [22]:
second_Set=np.array([5,10,15, 20, 25])
second_Set

array([ 5, 10, 15, 20, 25])

In [23]:
#Mean Second Set
second_Set.mean()

15.0

In [24]:
#Variance of Second Set
second_Set.var()

50.0

In [25]:
#Standard deviation of Second set
second_Set.std()

7.0710678118654755

In [27]:
#F-test
ftest=first_set.var()/second_Set.var()
#F Test Result
ftest

4.0

In [28]:
#P Values, hypothesis testing 5% significance
#Hypothesis Test
print('H0 Null Hypothesis first_set = Second_Set')
print('H1 Alternative Hypothesis first_set != Second_Set')
print('Assumed hypothesis testing with 5% significance')
print(' ')
alpha = 0.05 

#P Value using sci.f.cdf(ftest, len(first_set)-1,len(second_Set)-1)
p_value = sci.f.cdf(ftest, len(first_set)-1,len(second_Set)-1)

print('P Values is :',p_value)
print(' ')
if p_value > alpha:
    print ('P_Value is more than 5% significance and So')
    print ('H0 Null Hypothesis first_set = Second_Set is Rejected')
else:
    print ('H0 Null Hypothesis first_set = Second_Set is Accepted')

H0 Null Hypothesis first_set = Second_Set
H1 Alternative Hypothesis first_set != Second_Set
Assumed hypothesis testing with 5% significance
 
P Values is : 0.896
 
P_Value is more than 5% significance and So
H0 Null Hypothesis first_set = Second_Set is Rejected
