#### Paired T-test
#### This is a "before and after" scenario, so dependent rather than independent variables
#### The process is the same; just a different method to call

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt

In [3]:
INPUT_DIR = 'D:\\Data\Python\\'
file = 'paired T-test.csv'
df = pd.read_csv(INPUT_DIR + file)
df.head()

Unnamed: 0,Test,Score
0,Pre,3
1,Pre,0
2,Pre,6
3,Pre,7
4,Pre,4


In [4]:
var = ['Pre', 'Post']        # These are the stores to study
print("{}{:>10}{:>10}{:>12}".format("Type", "Mean", "Count", "StdDev"))

for x in var:
    print("{:<10}{:<10.2f}{:<10}{:<10.2f}".format(x,
                                              df.ix[df['Test'] == x]['Score'].mean(),
                                              df.ix[df['Test'] == x]['Score'].count(),
                                              df.ix[df['Test'] == x]['Score'].std()))

Type      Mean     Count      StdDev
Pre       3.33      9         2.24      
Post      7.00      9         3.04      


#### Check if these values (above) are ok, if so, split out the 2 sets of scores into separate arrays (for sciPy)

In [5]:
var_list = []
for x in range(len(var)):
    var_list.append(df.ix[df['Test']== var[x]]["Score"])

#### Now call ttest_rel rather than ttest_ind

In [6]:
(t, p) = stats.ttest_rel(var_list[0], var_list[1])

#### Looking for a P-value of less than .05  typically. If above that, then the two means should be considered equal

In [7]:
print("T-statistic:  {:.3}".format(t))
print("    P-value:  {:.3}".format(p))

T-statistic:  -3.14
    P-value:  0.0137


#### So there is a 1.37% chance that this result (T-statistic of -3.14) would occur if the means were equal

##### ***

#### What is the 95% confidence interval?

In [8]:
diff = np.subtract(var_list[0],var_list[1])

In [9]:
std = diff.std()
avg = diff.mean()
print("Mean of the differences is {:.2f}".format(avg))
print("StdDev of the differences is {:.2f}".format(std))
print("Sample size is {}".format(len(var_list[0])))

Mean of the differences is -3.67
StdDev of the differences is 3.50
Sample size is 9


#### T-value for 95% confidence interval and 8 degrees of freedom: 2.306

In [11]:
T = 1.86        # N = 8; confidence interval is 97.5%, not 95, 
lower = avg - (T * std) / np.sqrt(len(var_list[0]-1))
upper = avg + (T * std) / np.sqrt(len(var_list[0]-1))
print("95% confidence interval would be between {:.2} and {:.2}".format(lower, upper))

95% confidence interval would be between -5.8 and -1.5


#### This means we are 95% certain that the mean difference between the 2 samples is between -5.8 and -1.5