In [None]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

# Comparing Two Samples

In [None]:
births = Table.read_table('data/baby.csv')

In [None]:
births
# birth weight is the baby's weight at birth, in ounces
# gestational days is how long the baby was carried before birth, in days
# maternal height is the mother's height, in inches

In [None]:
smoking_and_birthweight = births.select('Maternal Smoker', 'Birth Weight')
smoking_and_birthweight

In [None]:
smoking_and_birthweight.group('Maternal Smoker')

In [None]:
smoking_and_birthweight.hist('Birth Weight', group='Maternal Smoker')
# interpretation of the graph below:


# there is a lot of overlap between the two groups (smoking/nonsmoking) with regard to the baby's weight
# the Maternal Smoker = False group tends to have slightly heavier babies

# Test Statistic

[Question] What values of our statistic are in favor of the alternative: positive or negative?

In [None]:
means_table = smoking_and_birthweight.group('Maternal Smoker', np.average)
means_table

In [None]:
means = means_table.column(1)
observed_difference = means.item(1) - means.item(0) # Group B average - Group A average
# smokers avg baby birth weight - nonsmoker avg baby birth weight
observed_difference
# negative values (smokers' babies lighter on average) are evidence for the alternative hypothesis;
# they fall toward the LEFT of our histogram

# values close to 0 are consistent with the null hypothesis (there is no difference between the two groups);
# they lie to the RIGHT of the negative values on our histogram

In [None]:
def difference_of_means(table, label, group_label):
    """Compute the gap between two group averages of a numerical column.

    Takes: a table, the column label of the numerical variable,
    and the column label of the group-label variable
    Returns: the second group's mean minus the first group's mean,
    with the groups taken in the row order produced by table.group
    """
    # Keep just the numerical column and the grouping column, then
    # collapse to one row per group holding that group's average
    group_averages = (
        table
        .select(label, group_label)
        .group(group_label, np.average)
    )

    # After grouping, column 0 holds the group labels and column 1 the averages
    averages = group_averages.column(1)

    # Group B average - Group A average
    first_mean, second_mean = averages.item(0), averages.item(1)
    return second_mean - first_mean

In [None]:
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')

# Random Permutation (Shuffling)

In [None]:
letters = Table().with_column('Letter', make_array('a', 'b', 'c', 'd', 'e'))
letters

In [None]:
letters.sample() # returns same size table, samples WITH replacement
# duplicates can appear, and some letters may be missing entirely
# e.g. one run gave 2 a's and 2 d's, no b's, no e's (the result changes on every run)

In [None]:
shuffled = letters.sample(with_replacement = False) # returns same size table, samples WITHOUT replacement
# shuffles my existing table, different order
shuffled

In [None]:
shuffled_array = shuffled.column(0)
shuffled_array

In [None]:
#letters.with_column('Shuffled', letters.sample(with_replacement = False).column(0))
letters.with_column('Shuffled', shuffled_array)

# Simulation Under Null Hypothesis

In [None]:
smoking_and_birthweight

In [None]:
# Shuffled 'Maternal Smoker' column: permute the rows, then keep only the labels
shuffled_labels = (
    smoking_and_birthweight
    .sample(with_replacement=False)  # shuffle smoking_and_birthweight
    .column('Maternal Smoker')
)
shuffled_labels

In [None]:
original_and_shuffled = smoking_and_birthweight.with_column(
    'Shuffled Label', shuffled_labels
)

In [None]:
original_and_shuffled

In [None]:
shuffled_table = original_and_shuffled.drop(0)
shuffled_table

In [None]:
shuffled_table.group('Shuffled Label', np.average)

In [None]:
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')
# simulated test statistic, under the null
# Group B average - Group A average, after we shuffled maternal smoker labels

In [None]:
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')
# observed statistic

# Permutation Test

In [None]:
def one_simulated_difference(table, label, group_label):
    """Simulate one value of the test statistic after shuffling the group labels.

    Takes: a table, the column label of the numerical variable,
    and the column label of the group-label variable
    Returns: Difference of means of the two groups after shuffling labels
    """
    # Permute the rows (sampling without replacement), then pull out the
    # group labels, which now appear in shuffled order
    permuted_rows = table.sample(with_replacement=False)
    shuffled_labels = permuted_rows.column(group_label)

    # Pair the numerical values, still in their original order,
    # with the shuffled labels
    values_only = table.select(label)
    relabeled = values_only.with_column('Shuffled Label', shuffled_labels)

    return difference_of_means(relabeled, label, 'Shuffled Label')

In [None]:
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')

In [None]:
# Number of label shuffles used to approximate the distribution of the
# statistic under the null hypothesis
repetitions = 2500

# Collect every simulated difference in one pass. Calling np.append inside
# a loop copies the whole array on each iteration, so the array is built
# once from a list instead.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(repetitions)
])

In [None]:
# Histogram of the simulated differences between group means
Table().with_column('Difference Between Group Means', differences).hist()
# Report the statistic computed from the actual data, for comparison with the histogram
print('Observed Difference:', observed_difference)
plots.title('Prediction Under the Null Hypothesis');
# Mark the observed statistic as a point at height 0, i.e. on the horizontal axis
plots.scatter(observed_difference,0)

In [None]:
# how often did a simulated statistic come out as extreme as the observed statistic?
# never


# is our data consistent with the null?
# NO!
## what is our histogram above centered on?
# a difference of 0 ounces in birth weight between the two groups

# does the above histogram provide evidence to reject or fail to reject the null?
# remember, the null is that there is no difference between the two groups (smoking/nonsmoking)
# REJECT!

# BONUS: what is the p-value? approximately 0% (the proportion of simulated differences at or below the observed one)
## direction of the alternative? LEFT



In [None]:
# Empirical p-value: the fraction of simulated differences at or below the
# observed one (the alternative points toward the left tail).
# np.count_nonzero counts the True entries without a Python-level loop.
np.count_nonzero(differences <= observed_difference) / len(differences)