In [1]:
import numpy as np
np.random.seed(45)
import pandas as pd
pd.set_option('display.max_columns', 0)
import scipy.stats as sta
import cc

In [2]:
data_path = r'C:\Users\DucTRung\Desktop\data_set\statistics'

# Hypothesis Testing for One Population
-----------
## Task 1: Test the null hypothesis ('average salary of DS is 113.000') at 10% significance
### Reject the null if:
* |test-statistic| > |critical-value|
### test-statistic(Z) = (xbar - H0) / standard error
### critical-value = t-score

In [3]:
df = pd.read_excel(data_path+'\\hypothesis_population_variance_known.xlsx', header=6, usecols='B', nrows=37-7)

In [4]:
xbar = df['Dataset'].mean()

In [5]:
n = len(df['Dataset'])

In [6]:
h0 = 113000

In [7]:
se = df['Dataset'].sem()

In [8]:
t_score = sta.t.ppf((1 + .9)/ 2, len(df)-1)

In [9]:
test_statistic = cc.t_test(xbar=xbar, h0=h0, standard_error=se)

In [10]:
cc.decision_the_null(xbar, h0, se, .95, n, n_side=2)

T-test: -6.0
t-score: -0.0
Decision: REJECT the null hypothesis with level of confidence 95.0%


## Task 2: What if the question was: is the competitor open rate Exactly 40%. What would be the decision then?
### Task 2.1: Test at 5% significance. Comment on the decision with the appropriate statistical jargon.

In [11]:
df = pd.read_excel(data_path+'\\hypothesis_population_variance_unknown.xlsx', header=9, usecols='B', nrows=20-9)

In [12]:
df

Unnamed: 0,Open rate
0,0.26
1,0.23
2,0.42
3,0.49
4,0.23
5,0.59
6,0.29
7,0.29
8,0.57
9,0.4


In [13]:
xbar = df['Open rate'].mean()
xbar

0.37699999999999995

In [14]:
n = len(df['Open rate'])

In [15]:
se = df['Open rate'].sem()
se

0.04343705535343962

In [16]:
h0 = .4

In [17]:
cc.decision_the_null(xbar, h0, se, .95, n, n_side=2)

T-test: -1.0
t-score: -0.0
Decision: REJECT the null hypothesis with level of confidence 95.0%


### Task 2.2: Test at 1% significance. Comment on the decision with the appropriate statistical jargon.

In [18]:
cc.decision_the_null(xbar, h0, se, .99, n, n_side=2)

T-test: -1.0
t-score: -0.0
Decision: REJECT the null hypothesis with level of confidence 99.0%


# Hypothesis Testing for Two Dependent Population
-----------
## Backgroud: a weight-loss program, wondering if it is working!
## Task 1: Calculate the difference between before and after

In [19]:
df = pd.read_excel(data_path+'\\hypothesis_dependent_samples.xlsx', header=10, usecols='B, C', nrows=21-10, sheet_name='Weight-loss data, kg')

In [20]:
df

Unnamed: 0,Before (kg),After (kg)
0,103.68,103.668536
1,110.68,108.383885
2,119.05,115.947282
3,101.75,101.704481
4,91.69,90.586932
5,112.03,112.703885
6,88.84,87.363885
7,105.18,103.803885
8,110.37,106.073885
9,120.99,122.651377


In [21]:
df['difference'] = df['After (kg)'] -  df['Before (kg)'] 

## Task 2: State the null hypothesis
* The question: If the weight-loss program is working

### => The null hypothesis: It's not working
### => H0: difference >= 0
---
## Task 3: Decide if this is a one-side or two-side test
### => Only care negative values, so this is a one_side test
### Hint: The null doesn't contain equality or inequality sign (<, >, <=, >=)
---

## Task 4: Answer the question

In [22]:
xbar = df['difference'].mean()

In [23]:
h0 = 0

In [24]:
se = df['difference'].sem()

In [25]:
n = len(df['difference'])

In [26]:
cc.decision_the_null(xbar, h0, se, confidence=.95, n=n, n_side=1)

T-test: -2.0
t-score: 2.0
Decision: REJECT the null hypothesis with level of confidence 95.0%


In [27]:
cc.decision_the_null(xbar, h0, se, confidence=.90, n=n, n_side=1)

T-test: -2.0
t-score: 1.0
Decision: REJECT the null hypothesis with level of confidence 90.0%


In [28]:
cc.decision_the_null(xbar, h0, se, confidence=.99, n=n, n_side=1)

T-test: -2.0
t-score: 3.0
Decision: ACCEPT the null hypothesis with level of confidence 99.0%


In [29]:
cc.decision_the_null(xbar, h0, se, confidence=1, n=n, n_side=1)

T-test: -2.0
t-score: inf
Decision: ACCEPT the null hypothesis with level of confidence 100%


# Hypothesis Testing for Two Independent Population with Assumed Equal
---
## Task 1: Ajax releases a new dish detergent, Extreme Ajax+, which they advertise to be able to clean more dishes with a single one. You want to ses if that is true and have bought 25 bottles of each type and data on how many dirty dishes you can wash with a single bottle.

In [30]:
df = pd.read_excel(data_path+'\\hypothesis_independent_samples.xlsx', header=13, usecols='B:E', nrows=3).set_index('Unnamed: 1')

In [31]:
df

Unnamed: 0_level_0,Extreme Ajax+,Ajax,Difference
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Size,25,25,?
Mean,115,100,15
Variance,625,400,6.40312


## State the Null Hypothesis
#### What do you want to verify: 
*. Extreme Ajax+ can clean more dishes with a single bottle == mean(A+) - mean(A) > 0 

#### The Null Hypothesis:
*. H0: mean(A+) - mean(A) <= 0

#### Number of side test:
*. This is one-side test

In [32]:
xbar = df.loc['Mean', 'Extreme Ajax+']

In [33]:
ybar = df.loc['Mean', 'Ajax']

In [34]:
h0 = 0

In [35]:
x_var = df.loc['Variance', 'Extreme Ajax+']

In [36]:
y_var = df.loc['Variance', 'Ajax']

In [37]:
confidence = .95

In [38]:
n_x = n_y = 25

In [39]:
n_side = 1

In [40]:
cc.decision_the_null_2_indep_sample(xbar=xbar,
                                    ybar=ybar,
                                    h0=0,
                                    x_var=x_var,
                                    y_var=y_var,
                                    confidence=.95,
                                    n_x=25,
                                    n_y=25,
                                    n_side=1)

Test statistics: 2.0
Critical value: 2.0
Decision: REJECT the null hypothesis with level of confidence 95.0%


In [41]:
cc.decision_the_null_2_indep_sample(xbar=xbar,
                                    ybar=ybar,
                                    h0=0,
                                    x_var=x_var,
                                    y_var=y_var,
                                    confidence=.90,
                                    n_x=25,
                                    n_y=25,
                                    n_side=1)

Test statistics: 2.0
Critical value: 1.0
Decision: REJECT the null hypothesis with level of confidence 90.0%


In [42]:
cc.decision_the_null_2_indep_sample(xbar=xbar,
                                    ybar=ybar,
                                    h0=0,
                                    x_var=x_var,
                                    y_var=y_var,
                                    confidence=.99,
                                    n_x=25,
                                    n_y=25,
                                    n_side=1)

Test statistics: 2.0
Critical value: 2.0
Decision: ACCEPT the null hypothesis with level of confidence 99.0%


## Task 2: You have data on the amount of times people click on a pop-up add on 24 Mondays and 21 Saturdays on an e-learning platform for several years. The samples are drawn independently. Is there strong evidence that number of clicks the add record on Monday is higher than the number of clicks on Saturdays?

In [46]:
df = pd.read_excel(data_path+'\\hypothesis_independent-samples_equally.xlsx', header=7, usecols='B:D', nrows=3).set_index('Unnamed: 1')
df

Unnamed: 0_level_0,Monday,Saturday
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1
Mean,1078,908.2
Std. deviation,633,469.8
Sample size,24,21.0


### State the Null Hypothesis
#### Verification:
* Number of clicks the add on Monday is higher than the one on Saturday == mean(Mon) - mean(Sat) > 0

#### The Null Hypthesis
* H0: mean(Mon) - mean(Sat) <= 0

In [48]:
xbar = df.loc['Mean', 'Monday']

In [49]:
ybar = df.loc['Mean', 'Saturday']

In [52]:
x_var = pow(df.loc['Std. deviation', 'Monday'], 2)

In [53]:
y_var = pow(df.loc['Std. deviation', 'Saturday'], 2)

In [55]:
n_x = df.loc['Sample size', 'Monday']

In [56]:
n_y = df.loc['Sample size', 'Saturday']

In [61]:
cc.decision_the_null_2_indep_sample(xbar, ybar,
                                    h0=0,
                                    x_var=x_var, y_var=y_var,
                                    n_x=n_x, n_y=n_y,
                                    confidence=.95,
                                    n_side=1)

Test statistics: 1.0
Critical value: 2.0
Decision: ACCEPT the null hypothesis with level of confidence 95.0%


In [62]:
cc.decision_the_null_2_indep_sample(xbar, ybar,
                                    h0=0,
                                    x_var=x_var, y_var=y_var,
                                    n_x=n_x, n_y=n_y,
                                    confidence=.99,
                                    n_side=1)

Test statistics: 1.0
Critical value: 2.0
Decision: ACCEPT the null hypothesis with level of confidence 99.0%


In [63]:
cc.decision_the_null_2_indep_sample(xbar, ybar,
                                    h0=0,
                                    x_var=x_var, y_var=y_var,
                                    n_x=n_x, n_y=n_y,
                                    confidence=.90,
                                    n_side=1)

Test statistics: 1.0
Critical value: 1.0
Decision: ACCEPT the null hypothesis with level of confidence 90.0%


### Result: There is no reason that add clicking on Monday is more than on Saturday
----
# Final Exam

In [66]:
df = pd.read_excel(data_path+'\\Hypothesis_statistics_final_exam.xlsx', header=3, usecols='B:K')
df

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Sweetwater,Alex,51,Male,United States,White,2011-08-15,Software Engineering,Software Engineering Manager,56160.0
1,Carabbio,Judith,30,Female,United States,White,2013-11-11,Software Engineering,Software Engineer,116480.0
2,Saada,Adell,31,Female,United States,White,2012-11-05,Software Engineering,Software Engineer,102440.0
3,Szabo,Andrew,34,Male,United States,White,2014-07-07,Software Engineering,Software Engineer,99840.0
4,Andreola,Colby,38,Female,United States,White,2014-11-10,Software Engineering,Software Engineer,99008.0
...,...,...,...,...,...,...,...,...,...,...
169,Zima,Colleen,39,Female,United States,Asian,2014-09-29,Production,Production Technician I,31200.0
170,Sutwell,Barbara,49,Female,Australia,Asian,2012-05-14,Production,Production Technician I,29120.0
171,Warfield,Sarah,39,Female,United States,Asian,2015-03-30,IT/IS,Sr. Network Engineer,114816.0
172,Petrowsky,Thelma,33,Female,United States,Asian,2014-11-10,IT/IS,Database Administrator,88920.0


## Task: Find if there is pay gap based on race between White and Nonwhite employees.

### The Null Hypothesis:
*. There is no gap == mean(wh) - mean(nwh) = 0

In [119]:
white_df = pd.read_excel(data_path+'\\Hypothesis_statistics_final_exam.xlsx', sheet_name='White',  header=3, usecols='B:K', nrows=116-4)

In [120]:
nonwhite_df = pd.read_excel(data_path+'\\Hypothesis_statistics_final_exam.xlsx', sheet_name='Nonwhite',  header=3, usecols='B:K')

In [121]:
nonwhite_df

Unnamed: 0,Surname,Name,Age,Gender,Country,Ethnicity,Start_date,Department,Position,Salary
0,Friedman,Gerry,48,Male,United States,Two or more races,2011-03-07,Sales,Area Sales Manager,115440.0
1,Mullaney,Howard,42,Male,United States,Two or more races,2014-09-29,Sales,Area Sales Manager,114400.0
2,Nguyen,Dheepa,28,Female,United States,Two or more races,2013-07-08,Sales,Area Sales Manager,114400.0
3,Valentin,Jackie,26,Female,United States,Two or more races,2011-07-05,Sales,Area Sales Manager,114400.0
4,Davis,Daniel,38,Male,Australia,Two or more races,2011-11-07,Production,Production Technician II,52000.0
...,...,...,...,...,...,...,...,...,...,...
57,Zima,Colleen,39,Female,United States,Asian,2014-09-29,Production,Production Technician I,31200.0
58,Sutwell,Barbara,49,Female,Australia,Asian,2012-05-14,Production,Production Technician I,29120.0
59,Warfield,Sarah,39,Female,United States,Asian,2015-03-30,IT/IS,Sr. Network Engineer,114816.0
60,Petrowsky,Thelma,33,Female,United States,Asian,2014-11-10,IT/IS,Database Administrator,88920.0


In [122]:
d= {'mean': [xbar, ybar],
    'n_sample': [n_x, n_y],
    'variance': [x_var, y_var],
    'type':['White', 'Nonwhite']}

In [123]:
df_ = pd.DataFrame(data=d).set_index(['type'])

In [125]:
df_.loc['White', 'mean'] = white_df['Salary'].mean()

In [126]:
df_.loc['Nonwhite', 'mean'] = nonwhite_df['Salary'].mean()

In [127]:
df_.loc['White', 'n_sample'] = len(white_df['Salary'])

In [128]:
df_.loc['Nonwhite', 'n_sample'] = len(nonwhite_df['Salary'])

In [129]:
df_.loc['White', 'variance'] = white_df['Salary'].var()

In [130]:
df_.loc['Nonwhite', 'variance'] = nonwhite_df['Salary'].var()

In [131]:
df_

Unnamed: 0_level_0,mean,n_sample,variance
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
White,67323.1,112.0,1136728000.0
Nonwhite,70917.264516,62.0,1225050000.0


In [133]:
xbar = df_.iloc[0, 0]
n_x = df_.iloc[0, 1]
x_var = df_.iloc[0, 2]

ybar = df_.iloc[1, 0]
n_y = df_.iloc[1, 1]
y_var = df_.iloc[1, 2]

In [135]:
cc.decision_the_null_2_indep_sample(xbar=xbar, ybar=ybar,
                                    n_x=n_x, n_y=n_y,
                                    x_var=x_var, y_var=y_var,
                                    confidence=.95,
                                    n_side=2,
                                    h0=0)

Test statistics: -1.0
Critical value: -0.0
Decision: REJECT the null hypothesis with level of confidence 95.0%


In [136]:
cc.decision_the_null_2_indep_sample(xbar=xbar, ybar=ybar,
                                    n_x=n_x, n_y=n_y,
                                    x_var=x_var, y_var=y_var,
                                    confidence=.99,
                                    n_side=2,
                                    h0=0)

Test statistics: -1.0
Critical value: -0.0
Decision: REJECT the null hypothesis with level of confidence 99.0%


## Results: There is a gap between white and nonwhite