In [74]:
import numpy as np
np.random.seed(45)
import pandas as pd
pd.set_option('display.max_columns', 0)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import math

In [75]:
data_path = r'C:\Users\DucTRung\Desktop\data_set\statistics'

In [76]:
df = pd.read_excel(data_path+'\\Two_means_dependent_samples.xlsx',sheet_name='Data in kg', header=13).drop(['Unnamed: 0'], axis=1).set_index('Subject')

# Confidence interval for difference of two means: *dependent samples*
## Background: A program for losing weight and we are interested in how much weight are we  likely to lose. Dataset have a sample of 10 people who have already completed the 12-week program.
## Task 1: Calculate the mean and standard deviation of the dataset.

In [77]:
df

Unnamed: 0_level_0,Weight before (kg),Weight after (kg),Difference
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,103.68,92.87,-10.81
2,110.68,101.58,-9.1
3,119.05,105.66,-13.39
4,101.75,96.18,-5.57
5,91.69,86.97,-4.72
6,112.03,105.9,-6.13
7,88.84,80.56,-8.28
8,105.18,97.0,-8.18
9,110.37,99.27,-11.1
10,120.99,107.44,-13.55


In [78]:
mean = df['Difference'].mean()
print('Mean: ', mean)

Mean:  -9.082999992383083


In [79]:
std = df['Difference'].std()
print('Standard deviation:', std)

Standard deviation: 3.111141445655812


## Task 2: Determine the appropriate  statistic to use
1. Poplation variance is unknown.
2. The dataset has 2 populations.
3. Assuming that the population is normally distributed.
### => The appropriate statistic to use is the t-statistic

## Task 3: Calculate the 95% confidence interval

In [80]:
def confidence_interval(data, confidence=.95):
    a = np.array(data)
    n = len(a)
    m = np.mean(a)
    se = st.sem(a)
    h = st.t.ppf((1 + confidence)/2, n-1)*se
    return round(m, 2), round(m-h, 2), round(m+h,2)

In [81]:
confidence_interval(df['Difference'], confidence=.95)

(-9.08, -11.31, -6.86)

## Task 4: Interpret the result
* We are 95% confident that you will lose between 11.31 and 6.86 if we follow the program as strict as the sample.

# Confidence interval for difference of two means: *Independent samples and known variance*

In [82]:
df = pd.read_excel(data_path+'\\Two_means_independent_samples_know_variance.xlsx', header=8, index_col=0)[['Unnamed: 1', 'Engineering', 'Management', 'Difference']].set_index('Unnamed: 1')
df

Unnamed: 0_level_0,Engineering,Management,Difference
Unnamed: 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Size,100,70,?
Sample mean,58,65,-7
Population std,10,5,1.16496


## Task 1: Calculate the 99% confidence interval

In [83]:
def CI_independent(n_1, n_2, std_1, std_2, m_1, m_2, confidence=.95):
    # Calulate Variance:
    var_1 = pow(std_1, 2)
    var_2 = pow(std_2, 2)
    # Calculate t_score:
    t_score = st.t.ppf((1+ confidence)/2, n_1+n_2-2)
    # Calculate Standard Error:
    se = math.sqrt((var_1/n_1) + (var_2/n_2))
    # Calculate Margin of Error:
    me = t_score * se
    
    return round((m_1 - m_2)+me, 2), round((m_1 - m_2)-me, 2)

In [84]:
ci_low, ci_high = CI_independent(100, 70, 10, 5, 58, 65, confidence=.99)
print('Confidence interval of 99%: [{}, {}]' .format(ci_low, ci_high))

Confidence interval of 99%: [-3.96, -10.04]


## Task 2: Compare with the 95% CI
* A higher confidence leads to a broader interval

In [85]:
ci_low, ci_high = CI_independent(100, 70, 10, 5, 58, 66)
print('Confidence interval of 95%: [{}, {}]' .format(ci_low, ci_high))

Confidence interval of 95%: [-5.7, -10.3]


# Confidence interval for difference of two means: *Independent samples and unknown variance but assumed equal*
## E.g.: Dataset of NY apples and LA apples

In [86]:
df = pd.read_excel(data_path+'\\Two_means_independent_samples_unknown_equal.xlsx', header=8).drop('Unnamed: 0', axis=1)
df

Unnamed: 0,NY apples,LA apples
0,3.8,3.02
1,3.76,3.22
2,3.87,3.24
3,3.99,3.02
4,4.02,3.06
5,4.25,3.15
6,4.13,3.81
7,3.98,3.44
8,3.99,
9,3.62,


In [99]:
np.array(df['LA apples'])

nan

## Task 1: Calculate the 90% confidence interval

In [87]:
def CI_independent_assumed_equal(n_1, n_2, std_1, std_2, m_1, m_2, confidence=.95):
    # Calulate variance:
    var = ((n_1 -1)*pow(std_1, 2) + (n_2 -1)*pow(std_2, 2))/(n_1 + n_2 -2)
    # Calculate t_score:
    t_score = st.t.ppf((1+ confidence)/2, n_1+n_2-2)
    # Calculate Standard Error:
    se = math.sqrt((var/n_1) + (var/n_2))
    # Calculate Margin of Error:
    me = t_score * se
    
    return round((m_1 - m_2)+me, 2), round((m_1 - m_2)-me, 2)

In [100]:
def pooled(a_1, a_2, confidence=.95):
    n_1 = len(a_1)
    n_2 = len(a_2)
    std_1 = np.std(a_1)
    std_2 = np.std(a_2)
    m_1 = np.mean(a_1)
    m_2 = np.mean(a_2)
    return CI_independent_assumed_equal(n_1, n_2, std_1, std_2, m_1, m_2, confidence)

In [102]:
pooled(df['NY apples'], df['LA apples'], confidence=.9)

(0.86, 0.53)

## Task 2: Compare with the 95% CI
* A lower confidence leads to a narrower interval

In [103]:
pooled(df['NY apples'], df['LA apples'])

(0.9, 0.49)