In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## A/B Testing: Comparing Two Samples

In [None]:
def difference_of_means(table, numeric_label, group_label):
    """
    Difference between the means of a numerical column in two groups.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable (the group column must
    contain exactly two distinct values)

    Returns: Difference of means of the two groups, computed as the
    mean of the group in the second row of the grouped table minus the
    mean of the group in the first row (Table.group presumably lists
    groups in sorted label order -- confirm against the datascience docs)

    Raises: ValueError if the group column does not split the rows into
    exactly two groups
    """

    # table with the two relevant columns
    reduced = table.select(numeric_label, group_label)

    # table containing group means
    means_table = reduced.group(group_label, np.average)

    # array of group means (read from column 1 of the grouped table)
    means = means_table.column(1)

    # With one group there is nothing to subtract, and with three or more
    # the result would silently ignore every group after the second.
    if len(means) != 2:
        raise ValueError(
            f"difference_of_means needs exactly two groups in "
            f"{group_label!r}, found {len(means)}"
        )

    return means.item(1) - means.item(0)

In [None]:
# Load the births table from baby.csv; the cells below use its
# 'Birth Weight' and 'Maternal Smoker' columns.
births = Table.read_table('baby.csv')

In [None]:
# Observed test statistic: the difference in mean 'Birth Weight' between the
# two 'Maternal Smoker' groups.
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Shuffle the labels for smoking and non-smoking pregnancies. Under the null hypothesis the
# labels have no bearing on birth weight, so reassigning them at random simulates that hypothesis.
# Sampling the rows without replacement puts them in a random order; we keep only the label column.
shuffled_labels = births.sample(with_replacement = False).column('Maternal Smoker')
shuffled_labels

In [None]:
# Recall the counts of smokers and non-smokers in the original table:
births.group('Maternal Smoker')

In [None]:
# Shuffling only reorders the labels, so the numbers of True and False in
# shuffled_labels match the original group counts:
np.sum(shuffled_labels == True), np.sum(shuffled_labels == False)

In [None]:
# Let's simulate the test statistic under the null hypothesis once with a function:
def one_simulated_difference(table, numeric_label, group_label):
    """
    One simulated value of the test statistic under the null hypothesis.

    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups after the group
    labels have been randomly shuffled among the rows
    """
    # rows in random order (sampled without replacement); keep only the labels
    permuted_rows = table.sample(with_replacement=False)
    new_labels = permuted_rows.column(group_label)

    # numerical values in their original order, paired with the shuffled labels
    values_only = table.select(numeric_label)
    relabeled = values_only.with_column('Shuffled Label', new_labels)

    return difference_of_means(relabeled, numeric_label, 'Shuffled Label')

In [None]:
# Let's run it once; the labels are shuffled at random, so the value changes from run to run:
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Now let's run it 2500 times to get the empirical distribution of the test
# statistic under the null hypothesis. The results are collected in a list and
# converted to an array once, because np.append would copy the whole array on
# every iteration.
num_repetitions = 2500

differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(num_repetitions)
])

In [None]:
# Histogram of the simulated differences: the empirical distribution of the
# test statistic under the null hypothesis.
Table().with_column(
    'Difference of averages between groups', differences
).hist('Difference of averages between groups')

In [None]:
# Question: What's the p-value?

# Randomized Controlled Experiment

In [None]:
# Does botox reduce pain?
# Outcome coding used in this table:
#   1 -> pain improved
#   0 -> pain did not improve
# The cells below use the 'Result' and 'Group' columns.
botox = Table.read_table('bta.csv')
botox.show()

In [None]:
# How can we see the number of people for each combination of Result and Group?
# A pivot table cross-tabulates the two columns.
botox.pivot('Result', 'Group')

In [None]:
# Average within each Group. Because the outcome is coded 0/1, the average of
# Result is the proportion of that group whose pain improved.
botox.group('Group', np.average)

# Testing the Hypothesis

In [None]:
# Observed test statistic: the difference in average Result between the two groups.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

In [None]:
# One simulated value of the statistic under the null hypothesis (Group labels shuffled).
one_simulated_difference(botox, 'Result', 'Group')

In [None]:
# Find the empirical distribution under the null by running one_simulated_difference 10000 times:
# (the p-value cell further down reads the result from simulated_diffs)
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(10000)
])

In [None]:
# And now make a histogram.  Make the label on the horizontal axis be 'Distances between groups'

In [None]:
# p-value: the proportion of simulated differences that are at least as large
# as the observed difference
np.mean(simulated_diffs >= observed_diff)