In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Review: Comparing Two Samples

In [3]:
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the gap between the average values of two groups.

    Takes:
        table: table holding both columns
        numeric_label: label of the column whose values are averaged
        group_label: label of the column that assigns each row to a group

    Returns: the mean of the group in the second row of the grouped table
    minus the mean of the group in the first row (for example,
    Treatment minus Control when the grouped rows are Control, Treatment)
    """

    # Keep only the two relevant columns, average the numeric column within
    # each group, and pull out the resulting column of averages
    # (column 0 holds the group labels, column 1 the averages).
    group_averages = (
        table.select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    second_group_mean = group_averages.item(1)
    first_group_mean = group_averages.item(0)
    return second_group_mean - first_group_mean

In [4]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one value of the test statistic with the group labels shuffled.

    Takes:
        table: table holding both columns
        numeric_label: label of the numerical column
        group_label: label of the column of group labels

    Returns: difference of the two group means once the group labels
    have been randomly reassigned to the rows
    """

    shuffled_column = 'Shuffled Label'

    # Draw the rows in random order without replacement and keep only their
    # group labels, giving a shuffled copy of the label column.
    # NOTE(review): relies on sample() defaulting to the table's full row count.
    permuted_labels = table.sample(with_replacement = False).column(group_label)

    # Pair the original numeric values with the shuffled labels.
    values_only = table.select(numeric_label)
    relabeled = values_only.with_column(shuffled_column, permuted_labels)

    return difference_of_means(relabeled, numeric_label, shuffled_column)

In [5]:
# Load the births data set from a CSV file located relative to the notebook.
births = Table.read_table('data/baby.csv')

In [6]:
# Average of every other column within each 'Maternal Smoker' group
# (one row for False, one row for True).
births.group('Maternal Smoker', np.average)

Maternal Smoker,Birth Weight average,Gestational Days average,Maternal Age average,Maternal Height average,Maternal Pregnancy Weight average
False,123.085,279.874,27.5441,64.014,129.48
True,113.819,277.898,26.7364,64.1046,126.919


# Randomized Controlled Experiment

In [7]:
# Load the experiment data: each row has a Group (Control or Treatment)
# and a Result recorded as 0 or 1.
botox = Table.read_table('data/bta.csv')
botox.show()

Group,Result
Control,1
Control,1
Control,0
Control,0
Control,0
Control,0
Control,0
Control,0
Control,0
Control,0


In [8]:
# Cross-tabulate the data: one row per Group, one column per Result value,
# each cell counting the rows with that combination.
botox.pivot('Result', 'Group')

  values = np.array(tuple(values))


Group,0.0,1.0
Control,14,2
Treatment,6,9


In [10]:
sum(make_array(True, True))
# In arithmetic, True counts as 1 and False counts as 0,
# so summing an array of booleans counts how many are True
# (and averaging a column of 0s and 1s gives the proportion of 1s).

2

In [11]:
# Result is 0 or 1, so each group's average Result is the proportion
# of that group with Result equal to 1.
botox.group('Group', np.average)

Group,Result average
Control,0.125
Treatment,0.6


# Testing the Hypothesis

In [12]:
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff
# The observed difference of 0.475 is the Treatment group's average Result
# minus the Control group's average Result (0.6 - 0.125).
# Result is recorded as 0 or 1 (presumably 1 = reported improvement in chronic
# back pain), so this is a difference in proportions: the proportion with
# Result 1 was 0.475 higher in the group that received botox treatment.

# Null hypothesis: the difference between the two groups should be 0.
# Values far above 0 (large positive values) make us lean toward the alternative,
# so the p-value is measured in the right tail of the simulated differences.

0.475

In [13]:
# One simulated value of the statistic with shuffled group labels;
# the shuffle is random, so re-running this cell gives a different value.
one_simulated_difference(botox, 'Result', 'Group')

-0.041666666666666685

In [None]:
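# Now let's simulate the difference between the groups 10,000 times.

# Hint: write a for loop that calls one_simulated_difference(botox, 'Result', 'Group')
# on each iteration and collects the results in an array named simulated_diffs
# (the histogram cell below plots that array).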






In [None]:
# Histogram of the simulated differences.
# NOTE: `simulated_diffs` must already be defined (the array of simulated
# differences); it is not assigned in any cell visible above, so this cell
# fails until the simulation has been written and run.
col_name = 'Distances between groups'
Table().with_column(col_name, simulated_diffs).hist(col_name)

In [None]:
# p-value
# refer to lecture slides or textbook to compute the p-value


