#### Using Rite Aid sales data, create a DF with 2 stores
#### Then use 2 sample T-test to compare the mean of sales
#### The sales figure on each record is the basket charge (single customer)

In [3]:
import pandas as pd
import numpy as np
from scipy import stats
from math import sqrt

In [2]:
INPUT_DIR = 'D:\\Data\Python\\'
file = 'Rx Sales data.csv'
df = pd.read_csv(INPUT_DIR + file)
df.shape

(216771, 12)

In [3]:
# Will not being using these fields
del df["Unnamed: 0"]
del df["Last Date"]

In [4]:
# Remove returns (negative sales)
df = df[df['Front of Store Sales']>0]

In [5]:
df.shape

(172647, 10)

In [6]:
stores = [12999, 17783]        # These are the stores to study

In [7]:
print("{}{:>25}{:>10}{:>12}{:>10}".format("Store#", "Front of Store Sales", "Mean", "Count", "StdDev"))

for x in stores:
    print("{}{:>18.2f}{:>19.2f}{:>10}{:>10.2f}".format(x,
                                              df.ix[df['Store'] == x]['Front of Store Sales'].sum(),
                                              df.ix[df['Store'] == x]['Front of Store Sales'].mean(),
                                              df.ix[df['Store'] == x]['Front of Store Sales'].count(),
                                              df.ix[df['Store'] == x]['Front of Store Sales'].std()))

Store#     Front of Store Sales      Mean       Count    StdDev
12999          15528.62              25.05       620     25.95
17783          17289.24              23.75       728     23.73


#### Check if these values (above) are ok, if so, split out the 2 sets of stores into separate arrays (for sciPy)

In [8]:
store_list = []
for x in range(len(stores)):
    store_list.append(df.ix[df['Store']== stores[x]]["Front of Store Sales"])

In [9]:
(t, p) = stats.ttest_ind(store_list[0], store_list[1])

#### Looking for a P-value of less than .05  typically. If above that, then the two means should be considered equal

In [10]:
print("T-statistic:  {:.3}".format(t))
print("    P-value:  {:.3}".format(p))

T-statistic:  0.958
    P-value:  0.338


#### For stores 12999 and 17783 P is .34 which is way above the minimum to deny the Null,
#### therefore means are considered equal

## *****

#### T-test works best when the data is normalized. Let's see how normal the sales are

In [11]:
(count, bin) = np.histogram(store_list[0], bins=6)

In [12]:
print("  {}{:>12}".format("Bin Range", "Count"))
for x in range(len(bin)-1):
    print("{:>6.2f} {} {:<10.2f}{}".format(bin[x], "-", bin[x+1], count[x]))

  Bin Range       Count
  0.07 - 38.08     477
 38.08 - 76.09     120
 76.09 - 114.09    18
114.09 - 152.10    1
152.10 - 190.11    1
190.11 - 228.12    3


#### So it's not at all Normal

#### *******

#### Long way of computing T-statistic

In [13]:
# Get difference of means
M0 = store_list[0].mean()
M1 = store_list[1].mean()

In [14]:
# Get sample variances
V0 = store_list[0].var()
V1 = store_list[1].var()
print(V0,V1)

673.342804303508 563.0763573680786


In [15]:
# Sample sizes
S0 = store_list[0].size
S1 = store_list[1].size
print(S0,S1)

620 728


In [19]:
T = (M0 - M1) / sqrt(V0/S0 + V1/S1)
print("T-statistic is {:.2f}".format(T))

T-statistic is 0.95


In [4]:
prior = [161,197,190,154,179,170,183,172,176,199,156]
current = [106]
(t, p) = stats.ttest_ind(prior, current)
print(p)

nan


