In [1]:
import numpy as np
np.random.seed(45)
import pandas as pd
pd.set_option('display.max_columns', 0)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import math

In [2]:
data_path = r'C:\Users\DucTRung\Desktop\data_set\statistics'

In [3]:
df = pd.read_excel(data_path+'\\Two_means_dependent_samples.xlsx',sheet_name='Data in kg', header=13).drop(['Unnamed: 0'], axis=1).set_index('Subject')

# Confidence interval for difference of two means: *dependent samples*
## Background: A program for losing weight and we are interested in how much weight are we  likely to lose. Dataset have a sample of 10 people who have already completed the 12-week program.
## Task 1: Calculate the mean and standard deviation of the dataset.

In [4]:
df

Unnamed: 0_level_0,Weight before (kg),Weight after (kg),Difference
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,103.68,92.87,-10.81
2,110.68,101.58,-9.1
3,119.05,105.66,-13.39
4,101.75,96.18,-5.57
5,91.69,86.97,-4.72
6,112.03,105.9,-6.13
7,88.84,80.56,-8.28
8,105.18,97.0,-8.18
9,110.37,99.27,-11.1
10,120.99,107.44,-13.55


In [5]:
mean = df['Difference'].mean()
print('Mean: ', mean)

Mean:  -9.082999992383083


In [6]:
std = df['Difference'].std()
print('Standard deviation:', std)

Standard deviation: 3.111141445655812


## Task 2: Determine the appropriate  statistic to use
1. Poplation variance is unknown.
2. The dataset has 2 populations.
3. Assuming that the population is normally distributed.
### => The appropriate statistic to use is the t-statistic

## Task 3: Calculate the 95% confidence interval

In [7]:
def confidence_interval(data, confidence=.95):
    a = np.array(data)
    n = len(a)
    m = np.mean(a)
    se = st.sem(a)
    h = st.t.ppf((1 + confidence)/2, n-1)*se
    return round(m, 2), round(m-h, 2), round(m+h,2)

In [8]:
confidence_interval(df['Difference'], confidence=.95)

(-9.08, -11.31, -6.86)

## Task 4: Interpret the result
* We are 95% confident that you will lose between 11.31 and 6.86 if we follow the program as strict as the sample.

# Confidence interval for difference of two means: *Independent samples and known variance*

In [9]:
df = pd.read_excel(data_path+'\\Two_means_independent_samples_know_variance.xlsx', header=8, index_col=0)[['Unnamed: 1', 'Engineering', 'Management', 'Difference']].set_index('Unnamed: 1')
df

Unnamed: 0_level_0,Engineering,Management,Difference
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Size,100,70,?
Sample mean,58,65,-7
Population std,10,5,1.16496


## Task 1: Calculate the 99% confidence interval

In [10]:
def CI_independent(n_1, n_2, std_1, std_2, m_1, m_2, confidence=.95):
    # Calulate Variance:
    var_1 = pow(std_1, 2)
    var_2 = pow(std_2, 2)
    # Calculate t_score:
    t_score = st.t.ppf((1+ confidence)/2, n_1+n_2-2)
    # Calculate Standard Error:
    se = math.sqrt((var_1/n_1) + (var_2/n_2))
    # Calculate Margin of Error:
    me = t_score * se
    
    return round((m_1 - m_2)+me, 2), round((m_1 - m_2)-me, 2)

In [11]:
ci_low, ci_high = CI_independent(100, 70, 10, 5, 58, 65, confidence=.99)
print('Confidence interval of 99%: [{}, {}]' .format(ci_low, ci_high))

Confidence interval of 99%: [-3.96, -10.04]


## Task 2: Compare with the 95% CI
* A higher confidence leads to a broader interval

In [12]:
ci_low, ci_high = CI_independent(100, 70, 10, 5, 58, 66)
print('Confidence interval of 95%: [{}, {}]' .format(ci_low, ci_high))

Confidence interval of 95%: [-5.7, -10.3]


# Confidence interval for difference of two means: *Independent samples and unknown variance but assumed equal*
## E.g.: Dataset of NY apples and LA apples

In [13]:
df = pd.read_excel(data_path+'\\Two_means_independent_samples_unknown_equal.xlsx', header=8).drop('Unnamed: 0', axis=1)
df

Unnamed: 0,NY apples,LA apples
0,3.8,3.02
1,3.76,3.22
2,3.87,3.24
3,3.99,3.02
4,4.02,3.06
5,4.25,3.15
6,4.13,3.81
7,3.98,3.44
8,3.99,
9,3.62,


In [14]:
np.array(df['LA apples'])

array([3.02, 3.22, 3.24, 3.02, 3.06, 3.15, 3.81, 3.44,  nan,  nan])

## Task 1: Calculate the 90% confidence interval

In [15]:
def CI_independent_assumed_equal(n_1, n_2, std_1, std_2, m_1, m_2, confidence=.95):
    # Calulate variance:
    var = ((n_1 -1)*pow(std_1, 2) + (n_2 -1)*pow(std_2, 2))/(n_1 + n_2 -2)
    # Calculate t_score:
    t_score = st.t.ppf((1+ confidence)/2, n_1+n_2-2)
    # Calculate Standard Error:
    se = math.sqrt((var/n_1) + (var/n_2))
    # Calculate Margin of Error:
    me = t_score * se
    
    return round((m_1 - m_2)+me, 2), round((m_1 - m_2)-me, 2)

In [16]:
def pooled(a_1, a_2, confidence=.95):
    n_1 = len(a_1)
    n_2 = len(a_2)
    std_1 = np.std(a_1)
    std_2 = np.std(a_2)
    m_1 = np.mean(a_1)
    m_2 = np.mean(a_2)
    return CI_independent_assumed_equal(n_1, n_2, std_1, std_2, m_1, m_2, confidence)

In [17]:
pooled(df['NY apples'], df['LA apples'], confidence=.9)

(0.86, 0.53)

## Task 2: Compare with the 95% CI
* A lower confidence leads to a narrower interval

In [18]:
pooled(df['NY apples'], df['LA apples'])

(0.9, 0.49)

# Final Exam
## Task 1: calculate the confidence intervals for men shoes sales in the USA, this time based on a bigger sample (2015-2016)
### CI = mean + t_score * SE

In [23]:
AI_Bundy_f = pd.read_excel(data_path+'\\inferential_statistics_final_exam.xlsx', header=3).drop(['Unnamed: 0', 'Unnamed: 12'], axis=1)

In [92]:
fre_dis_f = pd.read_excel(data_path+'\\inferential_statistics_final_exam.xlsx', sheet_name='Tasks 1,2', usecols='B, C:N, P:AA', header=9, nrows=18).set_index('US')

In [105]:
def CI_1sample(data):
    confidence = .95
    m = data.mean()
    se = data.sem()
    t_ = st.t.ppf((1 + confidence)/2, len(data)/2)
    me = t_ * se
    return round(m-me), round(m+me)

In [107]:
fre_dis_f['CI_95%'] = fre_dis_f.apply(CI_1sample, axis=1)

## Task 2: Compare the results with the confidence intervals when the sample was based just on 2016

In [112]:
fre_dis_16_f = fre_dis_f[fre_dis_f.columns[12: 24]]

In [114]:
fre_dis_f['CI_16_95%'] = fre_dis_16_f.apply(CI_1sample, axis=1)

In [115]:
fre_dis_f

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1,12.1,CI_95%,CI_16_95%
US,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1
6,0,0,0,0,3,1,1,3,5,4,0,0,4,1,3,1,3,3,3,4,3,7,3,0,"(1.0, 3.0)","(2.0, 4.0)"
6.5,4,1,0,1,0,0,2,3,0,1,3,3,3,2,0,1,0,0,1,7,2,1,2,1,"(1.0, 2.0)","(0.0, 3.0)"
7,0,0,0,0,1,2,1,0,1,2,2,3,0,0,1,0,6,4,4,2,3,0,0,0,"(1.0, 2.0)","(0.0, 3.0)"
7.5,0,1,2,1,0,0,3,2,3,2,2,2,3,2,3,1,7,0,7,3,4,6,1,1,"(1.0, 3.0)","(1.0, 5.0)"
8,5,3,1,0,6,6,4,0,5,6,3,3,7,9,7,3,12,2,9,4,7,5,2,6,"(3.0, 6.0)","(4.0, 8.0)"
8.5,4,5,3,1,2,11,6,6,4,12,4,2,12,12,8,8,15,9,17,17,6,9,10,6,"(6.0, 10.0)","(8.0, 13.0)"
9,10,14,7,9,17,16,20,21,13,17,10,12,17,13,13,11,21,22,25,30,26,25,13,10,"(14.0, 19.0)","(14.0, 24.0)"
9.5,18,16,23,15,16,16,19,26,25,17,35,24,19,25,27,24,26,33,25,47,31,44,37,26,"(22.0, 29.0)","(24.0, 36.0)"
10,8,13,13,15,7,20,22,18,22,15,11,14,17,26,26,19,16,31,25,24,23,31,15,20,"(16.0, 22.0)","(19.0, 27.0)"
10.5,7,13,8,8,15,10,16,8,16,16,17,14,13,16,22,14,28,19,18,15,19,21,16,10,"(13.0, 17.0)","(14.0, 21.0)"


### Overall: 
* The numbers are close, however most shoe sizes seem overestimated when we base the prediction only on one year. This may be because 2016 was a very good year for sales, while 2015 wasn't.
-----------
## Task 3: Estimate the 90% CI for Germany store #1 and #2
* store #1, #2 are independent with assumed equal

In [133]:
ger_st_f = pd.read_excel(data_path+'\\inferential_statistics_final_exam.xlsx', sheet_name='Task 3', header=11, usecols='B, C:N, P:AA', nrows=16).set_index('US')

In [126]:
ger_st_f.columns.names = ['month']

In [139]:
ger_st_f['ger1_mean'] = ger_st_f[ger_st_f.columns[0:12]].mean(axis=1)

In [142]:
ger_st_f['ger2_mean'] = ger_st_f[ger_st_f.columns[12:24]].mean(axis=1)

In [145]:
ger_st_f['ger1_var'] = ger_st_f[ger_st_f.columns[0:12]].var(axis=1)
ger_st_f['ger2_var'] = ger_st_f[ger_st_f.columns[12:24]].var(axis=1)

In [152]:
n_1 = n_2 = 12
ger_st_f['pooled_var'] = ((n_1 -1) * ger_st_f['ger1_var'] + (n_2 -1) * ger_st_f['ger2_var'])/(n_1 + n_2 -2)

In [172]:
confidence = .95
t_score = st.t.ppf((1 + confidence)/2, n_1+n_1-2)

In [173]:
ger_st_f['Margin_of_error'] = np.sqrt((ger_st_f['pooled_var']/n_1) + (ger_st_f['pooled_var']/n_2)) * t_score

In [174]:
ger_st_f['CI_95%_low'] = round((ger_st_f['ger1_mean'] - ger_st_f['ger2_mean']) - ger_st_f['Margin_of_error'], 2)
ger_st_f['CI_95%_high'] = round((ger_st_f['ger1_mean'] - ger_st_f['ger2_mean']) + ger_st_f['Margin_of_error'], 2)

In [175]:
ger_st_f

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1,12.1,ger1_mean,ger2_mean,ger1_var,ger2_var,pooled_var,Margin_of_error,CI_90%_low,CI_90%_high,CI_95%_low,CI_95%_high
US,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.5,0,0,0,0,1,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0.416667,0.083333,0.810606,0.083333,0.44697,0.566038,-0.14,0.8,-0.23,0.9
5,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0.166667,0.166667,0.333333,0.333333,0.333333,0.488817,-0.4,0.4,-0.49,0.49
5.5,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0,1,0.083333,0.333333,0.083333,0.424242,0.253788,0.426523,-0.6,0.1,-0.68,0.18
6,0,2,0,0,0,0,0,0,0,0,0,0,0,1,3,1,2,0,0,0,0,0,0,0,0.166667,0.583333,0.333333,0.992424,0.662879,0.689324,-0.99,0.15,-1.11,0.27
6.5,3,3,1,2,1,0,2,0,2,1,3,4,2,0,2,1,1,2,0,1,2,1,3,0,1.833333,1.25,1.606061,0.931818,1.268939,0.953733,-0.21,1.37,-0.37,1.54
7,0,3,3,4,1,0,1,0,2,0,0,1,0,0,0,4,1,3,1,1,1,3,1,4,1.25,1.583333,2.022727,2.265152,2.143939,1.239689,-1.36,0.69,-1.57,0.91
7.5,1,2,4,1,2,6,4,3,5,8,2,1,2,1,1,3,2,7,9,8,14,8,6,3,3.25,5.333333,4.931818,16.060606,10.496212,2.742981,-4.35,0.19,-4.83,0.66
8,6,10,3,9,1,3,6,8,3,12,3,9,13,6,5,13,5,3,11,6,6,9,8,3,6.083333,7.333333,12.265152,12.242424,12.253788,2.963751,-3.7,1.2,-4.21,1.71
8.5,10,10,10,7,14,4,7,7,4,8,7,9,8,5,10,4,5,5,9,7,3,7,9,8,8.083333,6.666667,7.719697,4.969697,6.344697,2.132613,-0.35,3.18,-0.72,3.55


### Overall:
* For a single shoe size, we can clearly see that one shop outperforms the other.
* However, our confidence is lower. This shows that the level of confidence does make a difference.
* Also the intervals are narrower, so the two shops are extrmely similar in terms of sales.