In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# A/B Testing: Comparing Two Samples

In [None]:
# Load the births data from a CSV file (path is relative to the notebook's
# working directory) and display the resulting table.
births = Table.read_table('data/baby.csv')
births

In [None]:
# Keep only the group label and the measurement being compared.
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')

In [None]:
# Group by smoking status; with no aggregation function given, `group`
# reports how many rows fall into each category.
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
# Draw the birth-weight histograms of the two smoking groups on one plot
# so the distributions can be compared visually.
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')

## Test Statistic


In [None]:
# Use the .group method to compute the average for each group.
# Passing np.average as the second argument aggregates the remaining
# column ('Birth Weight') within each 'Maternal Smoker' category.

means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
# Use the table that results from using .group
# to determine the value of the observed test statistic

# Column 0 of means_table holds the group labels; column 1 holds the averages.
means = means_table.column(1)
# Second row's mean minus first row's mean.
# NOTE(review): presumably the rows are ordered non-smoker, smoker, which makes
# this "smokers minus non-smokers" - confirm against the means_table output.
observed_difference = means.item(1) - means.item(0)
observed_difference

In [None]:
# Write a function that could calculate the statistic
# but is flexible enough to work on any table when you
# specify the column label that contains numerical values
# that you want to average, and the grouping label

def difference_of_means(table, label, group_label):
    """Return the difference between the two group means of a column.

    Parameters
    ----------
    table : Table
        Table containing both the `label` and `group_label` columns.
    label : str
        Label of the numerical column to average.
    group_label : str
        Label of the column whose values split the rows into two groups.

    Returns
    -------
    number
        Mean of the second group minus mean of the first group, in the
        order the groups appear in the grouped table.

    Raises
    ------
    ValueError
        If `group_label` does not split the rows into exactly two groups.
    """
    # create table with only the two relevant columns
    reduced = table.select(label, group_label)

    # create table containing group means
    means_table = reduced.group(group_label, np.average)

    # pull just the column/array with the group means
    # use .column(1) since predicting the label is hard
    means = means_table.column(1)

    # a difference of two means only makes sense for exactly two groups;
    # fail loudly instead of silently ignoring any additional groups
    if len(means) != 2:
        raise ValueError(
            'difference_of_means needs exactly two groups in column '
            + repr(group_label) + ', found ' + str(len(means))
        )

    # return the difference between the two elements
    return means.item(1) - means.item(0)

In [None]:
# Use the function to calculate the observed statistic
# (same computation as observed_difference above, so the values should match)

difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Since the function provides flexibility, look at a few other variables,
# each compared between the same two 'Maternal Smoker' groups

difference_of_means(births, 'Gestational Days', 'Maternal Smoker')

In [None]:
# Difference in average maternal age between the two smoking groups
difference_of_means(births, 'Maternal Age', 'Maternal Smoker')

In [None]:
# Difference in average maternal height between the two smoking groups
difference_of_means(births, 'Maternal Height', 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
# A tiny one-column table used to demonstrate what shuffling does.
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))
letters

In [None]:
# Default sampling draws rows with replacement, so letters can repeat or be missing.
letters.sample()

In [None]:
# Sampling all rows without replacement gives each letter exactly once,
# in random order - i.e. a random permutation (shuffle).
letters.sample(with_replacement = False)

In [None]:
# Extract the shuffled letters as an array, then show them alongside the
# original order in a new two-column table.
shuffled_letters = letters.sample(with_replacement = False).column(0)
letters.with_column('Shuffled', shuffled_letters)

# Simulation Under Null Hypothesis

In [None]:
# Recall the two-column table whose labels are about to be shuffled.
smoking_and_birthweight

In [None]:
# Shuffle the rows (sample all of them without replacement) and keep only
# the smoker labels, which are now in random order.
shuffled_labels = (
    smoking_and_birthweight
    .sample(with_replacement=False)
    .column('Maternal Smoker')
)

In [None]:
# Attach the shuffled labels as an extra column next to the original labels;
# the birth weights stay in their original row order.
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
# Inspect the original and shuffled labels side by side.
original_and_shuffled

In [None]:
# Statistic computed with the shuffled labels: one simulated value under the null.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')

In [None]:
# Statistic computed with the original labels: the observed value, for comparison.
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')

# Permutation Test

In [None]:
# Write a function to complete one simulated statistic

def one_simulated_difference(table, label, group_label):
    """Compute one difference of group means after shuffling the group labels.

    table: table holding both columns
    label: label of the numerical column to average
    group_label: label of the column that assigns rows to groups

    Returns the value of difference_of_means for the shuffled labels.
    """
    # sampling all rows without replacement permutes the rows;
    # only the permuted group column is needed
    permuted_rows = table.sample(with_replacement = False)
    permuted_groups = permuted_rows.column(group_label)

    # pair the numerical column (original order) with the permuted labels
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', permuted_groups)

    # statistic under the shuffled assignment
    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
# One simulated statistic; re-running this cell gives a different value
# because the labels are reshuffled on every call.
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Simulate 2500 times and store statistics in an array

# Collect the statistics in a list and convert to an array once at the end:
# calling np.append inside the loop would copy the whole array on every
# iteration.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(2500)
])

In [None]:
# Histogram of the simulated statistics under the null hypothesis.
# np.arange excludes its stop value, so stop at 4.5 to make 4 the last bin
# edge; with a stop of 4 the top edge would be 3.5 and larger values dropped.
Table().with_column('Difference Between Group Means', differences).hist(bins=np.arange(-10, 4.5, 0.5))
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');

## What About Gestational Days?

In [None]:
# NOTE: this rebinds observed_difference and differences, which earlier held
# the 'Birth Weight' results; cells above will show stale values if re-run
# after this one.
observed_difference = difference_of_means(births, 'Gestational Days', 'Maternal Smoker')

# Collect the 2500 simulated statistics in a list and convert once, instead
# of re-copying the array with np.append on every iteration.
differences = np.array([
    one_simulated_difference(births, 'Gestational Days', 'Maternal Smoker')
    for _ in range(2500)
])

In [None]:
# Histogram of the simulated statistics under the null hypothesis.
# np.arange excludes its stop value, so stop at 4.5 to get bins that are
# symmetric around 0 (edges from -4 through 4) rather than ending at 3.5.
Table().with_column('Difference Between Group Means', differences).hist(bins=np.arange(-4, 4.5, 0.5))
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');

In [None]:
# Empirical p-value: the proportion of simulated differences that are less
# than or equal to the observed difference (left tail).
# Divide by the actual number of simulations instead of a hard-coded 2500 so
# the result stays correct if the repetition count changes.
np.count_nonzero(differences <= observed_difference) / len(differences)