# Case Study #04 - Hypothesis Testing

In [2]:
# importing necessary modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [3]:
# Load the sales records from the CSV file into a DataFrame
sales_data = pd.read_csv(filepath_or_buffer='Dataset/Sales_add.csv')
# Preview the first five rows
sales_data.head(n=5)

Unnamed: 0,Month,Region,Manager,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
0,Month-1,Region - A,Manager - A,132921,270390
1,Month-2,Region - A,Manager - C,149559,223334
2,Month-3,Region - B,Manager - A,146278,244243
3,Month-4,Region - B,Manager - B,152167,231808
4,Month-5,Region - C,Manager - B,159525,258402


In [5]:
# Dimensions of the loaded DataFrame as (rows, columns)
sales_data.shape

(22, 5)

In [6]:
# Column names, non-null counts and dtypes of the DataFrame
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Month                           22 non-null     object
 1   Region                          22 non-null     object
 2   Manager                         22 non-null     object
 3   Sales_before_digital_add(in $)  22 non-null     int64 
 4   Sales_After_digital_add(in $)   22 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1008.0+ bytes


In [7]:
# Summary statistics (count, mean, std, quartiles) of the numeric sales columns
sales_data.describe()

Unnamed: 0,Sales_before_digital_add(in $),Sales_After_digital_add(in $)
count,22.0,22.0
mean,149239.954545,231123.727273
std,14844.042921,25556.777061
min,130263.0,187305.0
25%,138087.75,214960.75
50%,147444.0,229986.5
75%,157627.5,250909.0
max,178939.0,276279.0


### 1. Impact on Shifting to Digital Marketing

First of all, let's form the hypotheses:\
Null hypothesis, **Ho** = _The sales are the same even after stepping into digital marketing._ \
Alternate hypothesis, **Ha** = _The sales increased after stepping into digital marketing._

Since the sample size is less than 30 and there are two independent samples, let's perform a two-sample t-test on the following data.

Let's set the significance level as 5%. Now we can find the degrees of freedom as below:

In [14]:
# Significance level (alpha) for the test: 5%
significance_level = 0.05

# Degrees of freedom for a two-sample t-test: n1 + n2 - 2
n_before = len(sales_data['Sales_before_digital_add(in $)'])
n_after = len(sales_data['Sales_After_digital_add(in $)'])
degrees_of_freedom = n_before + n_after - 2
print(f'Degrees of freedom = {degrees_of_freedom}')

Degrees of freedom = 42


Now finding the critical value:

In [18]:
# Right-tailed test: the critical value is the t-distribution quantile at (1 - alpha)
upper_quantile = 1 - significance_level
t_critical = stats.t.ppf(upper_quantile, df=degrees_of_freedom)
print(f'Critical value = {t_critical}')

Critical value = 1.6819523559426


So the decision rule can be set as:\
**Decision rule:** _If the absolute value of the t-value is greater than the critical value, reject the null hypothesis._


Now, let's find the _t-value(test statistic)_:

In [26]:
# Two-sample t-test comparing sales before vs. after the digital campaign.
# NOTE(review): both columns come from the same 22 rows of sales_data, so the
# samples look paired rather than independent - confirm whether
# stats.ttest_rel would be the more appropriate test.
sales_before = sales_data['Sales_before_digital_add(in $)']
sales_after = sales_data['Sales_After_digital_add(in $)']

# Sample order is (before, after)
t_value, p_value = stats.ttest_ind(sales_before, sales_after)
print(f'Test statistic or t-value = {t_value}')

Test statistic or t-value = -12.995084451110877


In [27]:
# Decision check: compare |t| with the critical value.
# The absolute value is printed so the number shown matches its "abs(t-value)" label
# (previously the raw, signed t-value was displayed under that label).
# NOTE(review): abs() discards the sign of t; for a one-tailed hypothesis the
# direction of the difference should also be confirmed.
print(f'Whether abs(t-value) {abs(t_value)} > critical value {t_critical}: {abs(t_value) > t_critical}')

Whether abs(t-value) -12.995084451110877 > critical value 1.6819523559426: True



Here the absolute value of the test statistic is greater than the critical value. So, let's reject the null hypothesis Ho.

Hence, we can conclude that the company's sales have increased after stepping into digital marketing.
***

### 2. Dependency between the Features “Region” and “Manager”.

In [29]:
# Re-inspect the column names and dtypes of the DataFrame
sales_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 5 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Month                           22 non-null     object
 1   Region                          22 non-null     object
 2   Manager                         22 non-null     object
 3   Sales_before_digital_add(in $)  22 non-null     int64 
 4   Sales_After_digital_add(in $)   22 non-null     int64 
dtypes: int64(2), object(3)
memory usage: 1008.0+ bytes


In [31]:
# Distinct values present in the "Region" column
sales_data['Region'].unique()

array(['Region - A', 'Region - B', 'Region - C'], dtype=object)

In [32]:
# Distinct values present in the "Manager" column
sales_data['Manager'].unique()

array(['Manager - A', 'Manager - C', 'Manager - B'], dtype=object)

Since the features "Region" and "Manager" are categorical, we'll perform a chi-square test to find out whether there is a dependency between them.

First of all let's formulate the hypotheses:\
Null hypothesis, **Ho:** _There are no dependencies between the features "Region" and "Manager"_ \
Alternate hypothesis, **Ha:** _There is dependency between the features "Region" and "Manager"_

In [40]:
# Null hypothesis statement (Ho), reused when reporting the test decision
ho = (
    'There are no dependencies between the features '
    '"Region" and "Manager"'
)
# Alternate hypothesis statement (Ha), reused when reporting the test decision
ha = (
    'There is dependency between the features '
    '"Region" and "Manager"'
)

Now, let's create a cross table for the features _"Region"_ and _"Manager"_

In [34]:
# Contingency table of observed counts: rows = Region, columns = Manager
cross_table = pd.crosstab(
    index=sales_data["Region"],
    columns=sales_data["Manager"],
    margins=False,  # no row/column totals
)
cross_table

Manager,Manager - A,Manager - B,Manager - C
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Region - A,4,3,3
Region - B,4,1,2
Region - C,1,3,1


Let's choose the significance level as 0.05.

In [36]:
# Significance level (alpha) against which the p-value is compared
significance_level = 0.05

In [38]:
# Chi-square test of independence on the Region x Manager contingency table.
# NOTE(review): the table holds only 22 observations spread over 9 cells, so the
# expected counts are small - inspect `expected` before relying on the
# chi-square approximation.
chi2_result = stats.chi2_contingency(cross_table)
# Unpack: test statistic, p-value, degrees of freedom, expected frequencies
stat, p, dof, expected = chi2_result

In [39]:
# Report the p-value returned by the chi-square test
print(f'p-value = {p}')

p-value = 0.5493991051158094


In [43]:
# Decision: a p-value above the significance level keeps the null hypothesis,
# otherwise the null hypothesis is rejected in favour of the alternate one.
if p > significance_level:
    print(f"Accept null hypothesis: {ho}")
else:
    print(f'Reject null hypothesis and accept alternate hypothesis: {ha}')

Accept null hypothesisi: There are no dependencies between the features "Region" and "Manager"


In [None]:
Hence, we can conclude that there is no significant dependency between the features *"Region"* and *"Manager"*.