In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Review: Comparing Two Samples

In [None]:
def difference_of_means(table, numeric_label, group_label):
    """
    Compare the average of a numerical column between two groups.

    Takes: table, column label of the numerical variable,
    column label of the group-label variable

    Returns: mean of the group in the second row of the grouped table
    minus the mean of the group in the first row
    """

    # keep the two relevant columns, average the numerical one per group,
    # and pull out the array of group means (column 0 holds the group labels)
    group_means = (
        table
        .select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    first_mean = group_means.item(0)
    second_mean = group_means.item(1)
    return second_mean - first_mean

In [None]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Produce one simulated value of the difference of group means.

    Takes: table, column label of the numerical variable,
    column label of the group-label variable

    Returns: difference of means of the two groups once the group labels
    have been shuffled among the rows
    """

    # draw the rows without replacement and keep only their group labels
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # numerical values stay in their original order; attach the shuffled labels
    values_only = table.select(numeric_label)
    relabeled = values_only.with_column('Shuffled Label', permuted_labels)

    return difference_of_means(relabeled, numeric_label, 'Shuffled Label')

In [None]:
# Read the births data from the CSV file into a Table.
births = Table.read_table('data/baby.csv')

In [None]:
# Group the rows by 'Maternal Smoker' and aggregate with np.average.
births.group('Maternal Smoker', np.average)

# Randomized Controlled Experiment

In [None]:
# Read the experiment data from the CSV file into a Table and display it.
botox = Table.read_table('data/bta.csv')
botox.show()

# Task:
- Summarize the dataset: show the unique combinations
    - How many people got treatment and experienced improvements?
    - How many people got placebo and experienced improvements?
    - How many people got treatment and did not experience improvements?
    - How many people got placebo and did not experience improvements?
- Calculate the share of improvements per group

## Tickets
Is the distribution of the

- 31 “treatment” values in Column 1 (including the unknown ones) 
different from the distribution of the 
- 31 “control” values in Column 2 (again including the unknown ones)?

In [None]:
# Build the two "potential outcome" columns: each keeps a row's observed
# Result only when the row belongs to that group, and NaN (unknown) otherwise.
botox_group = botox.column('Group')
# Cast to float so NaN can be stored even if 'Result' was read as integers.
botox_result = botox.column('Result').astype(float)

# np.where builds fresh arrays, so nothing is written into an array owned by
# the table and the 'Result' column can never be overwritten by accident.
botox = botox.with_column(
    'Outcome if assigned treatment',
    np.where(botox_group != 'Treatment', np.nan, botox_result)
)
botox = botox.with_column(
    'Outcome if assigned control',
    np.where(botox_group != 'Control', np.nan, botox_result)
)

botox.show()

# Testing the Hypothesis

In [None]:
# Observed test statistic: difference between the two groups' mean 'Result',
# computed as the second row of the grouped table minus the first.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

In [None]:
# One simulated value of the statistic: shuffle the 'Group' labels and
# recompute the difference of mean 'Result' between the shuffled groups.
one_simulated_difference(botox, 'Result', 'Group')

# Task: 
1. Simulate the difference 10k times
1. Create a table containing the simulations
1. Plot the histogram
    - Question: in which direction is the alternative?
    - Where on the histogram is the *observed* difference?
    - Bonus: Add the observed difference as a dot into the histogram (use `plots.scatter()`).
1. Calculate the p-value ([hint](https://inferentialthinking.com/chapters/12/1/AB_Testing.html#conclusion-of-the-test))
1. Choose a p-value cutoff