# Hypothesis Testing

Performing a hypothesis check is a way to verify information about any parameter. In this method, we use two different hypotheses i.e. H0, referred to as the null hypothesis and alternate hypothesis which is the opposite as our null hypothesis. Our main objective is to verify whether our null hypothesis is true or false based on the data. We try to change the null hypothesis in such a way that the resulting null hypothesis is true

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from random import sample
data = pd.read_csv('C:/Users/michael.thabane/Documents/Springboard/Capstone1/online_shoppers_intention.csv')
Purchase = data[data.Revenue == True]
Browse = data[data.Revenue == False]
data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


#### Hypothesis Testing for Categorical Variables

In [2]:
data_crosstabs = pd.crosstab(data['Weekend'],data['Revenue'])
print(data_crosstabs)

Revenue  False  True 
Weekend              
False     8053   1409
True      2369    499


In [3]:
# perform chi-square hypothesis test
stat, p, dof, expected = stats.chi2_contingency(data_crosstabs)
critical = stats.chi2.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistics = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 1
p value = 0
Test Statistics = 10
Critical Value = 3


In [4]:
visitor_type = {'Returning_Visitor' : 'Returning_Visitor', 'Other' : 'Other', 'New_Visitor' : 'Other'}
data['VisitorType2'] = data['VisitorType'].map(visitor_type)

In [5]:
data_crosstabs2 = pd.crosstab(data['VisitorType2'],data['Revenue'])
print(data_crosstabs2)

Revenue            False  True 
VisitorType2                   
Other               1341    438
Returning_Visitor   9081   1470


In [6]:
# perform chi-square hypothesis test
stat, p, dof, expected = stats.chi2_contingency(data_crosstabs2)
critical = stats.chi2.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 1
p value = 0
Test Statistic = 132
Critical Value = 3


In [7]:
Special_Day = {0.0 : 'Special Day', 0.2 : 'Regular Day', 0.4 : 'Regular Day', 0.6 : 'Regular Day', 0.8 : 'Regular Day',
              1.0 : 'Regular Day'}
data['SpecialDay2'] = data['SpecialDay'].map(Special_Day)

In [8]:
data_crosstabs3 = pd.crosstab(data['SpecialDay2'],data['Revenue'])
print(data_crosstabs3)

Revenue      False  True 
SpecialDay2              
Regular Day   1174     77
Special Day   9248   1831


In [9]:
# perform chi-square hypothesis test
stat, p, dof, expected = stats.chi2_contingency(data_crosstabs3)
critical = stats.chi2.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 1
p value = 0
Test Statistic = 91
Critical Value = 3


#### Hypothesis Testing for Continuous Variable

Create Random Samples from the subset of Browsing Customers with the same size as the subset of Purchasing Customers

In [10]:
rs_browse_1 = np.random.choice(Browse['Informational_Duration'],len(Purchase), replace=False)
rs_browse_2 = np.random.choice(Browse['Informational_Duration'],len(Purchase), replace=False)
rs_browse_3 = np.random.choice(Browse['Informational_Duration'],len(Purchase), replace=False)
rs_browse_4 = np.random.choice(Browse['Informational_Duration'],len(Purchase), replace=False)
rs_browse_5 = np.random.choice(Browse['Informational_Duration'],len(Purchase), replace=False)

In [11]:
# perform t-test on informational duration between buyers and browsers sample 1
stat, p = stats.ttest_ind(rs_browse_1, Purchase['Informational_Duration'])
dof = data.count()[0] - 1
critical = stats.t.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 12329
p value = 0
Test Statistic = -6
Critical Value = 1


In [12]:
# perform t-test on informational duration between buyers and browsers sample 2
stat, p = stats.ttest_ind(rs_browse_2, Purchase['Informational_Duration'])
dof = data.count()[0] - 1
critical = stats.t.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 12329
p value = 0
Test Statistic = -6
Critical Value = 1


In [13]:
# perform t-test on informational duration between buyers and browsers sample 3
stat, p = stats.ttest_ind(rs_browse_3, Purchase['Informational_Duration'])
dof = data.count()[0] - 1
critical = stats.t.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 12329
p value = 0
Test Statistic = -5
Critical Value = 1


In [14]:
# perform t-test on informational duration between buyers and browsers sample 4
stat, p = stats.ttest_ind(rs_browse_4, Purchase['Informational_Duration'])
dof = data.count()[0] - 1
critical = stats.t.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 12329
p value = 0
Test Statistic = -4
Critical Value = 1


In [15]:
# perform t-test on informational duration between buyers and browsers sample 5
stat, p = stats.ttest_ind(rs_browse_5, Purchase['Informational_Duration'])
dof = data.count()[0] - 1
critical = stats.t.ppf(0.95,dof)
print('Degress of Freedom = %d' % dof)
print('p value = %d' % p)
print('Test Statistic = %d' % stat)
print('Critical Value = %d' % critical)

Degress of Freedom = 12329
p value = 0
Test Statistic = -7
Critical Value = 1


Since all of the t-test show there is a significant difference between the Informational Duration of the Purchasing Customer group and Informational Duration of the Browsing Customer group we conclude the null hypothesis (The information duration of the Purcahsing group is not the same as the Browsing group) is true 