In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

# Bootstrap

### Recap

In [None]:
# Load the compensation data and keep only rows whose 'Total Compensation'
# exceeds 10*40*52 = 20,800 (presumably $10/hour x 40 hours/week x 52 weeks
# -- confirm). One threshold is enough: any row above 20,800 is also above
# 10,000, so a separate above-10000 filter would not remove anything extra.
sf_pop = Table.read_table('data/san_francisco_2015.csv').where(
    'Total Compensation', are.above(10 * 40 * 52)
)
# Show the employees with the lowest compensation still in sf_pop
# (the sort is ascending, so they come first).
sf_pop.sort('Total Compensation')

In [None]:
# Display the filtered population table.
sf_pop

In [None]:
# Number of rows remaining in the population table after filtering.
sf_pop.num_rows

In [None]:
# Population parameter: the median (50th percentile) of 'Total Compensation'.
pop_median = percentile(50, sf_pop.column('Total Compensation'))
# Shared histogram bin edges: 0, 25,000, ..., 675,000.
sf_bins = np.arange(0, 700000, 25000)
sf_pop.hist('Total Compensation', bins=sf_bins)
print("Population Median = ", pop_median)
# The trailing semicolon suppresses the text repr of the returned title object.
plots.title('Population');

In [None]:
# What if we did not have access to the entire census data?
# Let's estimate the median from a random sample of 300 of SF's public employees
# and compare it to our "bull's eye": the population parameter.
# Random sample of size 300, drawn without replacement.
our_sample = sf_pop.sample(300, with_replacement = False)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))
# Reuse sf_bins so this histogram is directly comparable to the population one.
our_sample.hist('Total Compensation', bins=sf_bins)
print("Population Median = ", pop_median)
print("Sample Median = ", our_sample_median)
plots.title('Our sample');

In [None]:
# Scratch arithmetic for reading the sample histogram. Only the last expression
# of a cell is displayed, so this first product (0.02) is computed but not shown.
# Presumably bar height (0.000001 per dollar) x bin width (20,000) -- confirm.
# NOTE(review): sf_bins uses 25,000-wide bins, so the 20,000 width may be stale.
.000001 * 20000
# 2% of our 300-employee sample falls in the $0-20,000 total-compensation bin,
# i.e. about 6 employees.
.02*300

In [None]:
# Empirical distribution of the sample median
# assuming we can just resample from the population
def one_sample_median():
    """Return the median 'Total Compensation' of one fresh sample from sf_pop.

    Draws 300 rows from the population table without replacement and takes
    the 50th percentile of their 'Total Compensation' values.
    """
    compensation = sf_pop.sample(300, with_replacement=False).column('Total Compensation')
    return percentile(50, compensation)

# Let's generate 1000 sample medians.
# Collect them in a list and convert to an array once: calling np.append
# inside a loop copies the whole array on every iteration.
# dtype=float keeps the result a float array even if the medians are integers.
medians = np.array([one_sample_median() for _ in range(1000)], dtype=float)

In [None]:
# Bin edges for the histograms of medians: 90,000 to 125,000 in steps of 2,500.
med_bins = np.arange(90000, 125001, 2500)

# Empirical distribution of the 1,000 sample medians.
Table().with_column(
    'Sample Medians', medians
).hist('Sample Medians', bins=med_bins)

# Mark the population median with a red dot on the horizontal axis (y = 0).
plots.scatter(pop_median, 0, color="red");
plots.title('Sample Medians (1K Samples from Pop)');

In [None]:
# we took 1000 samples with sample size of 300
# how accurate were our 1,000 estimates?
# how big was our typical error?

# how does this inform you of your future decisions of making estimates of the population?



### Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 300 from our sample, WITH replacement
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('1 Bootstrap sample');

# Compare the three medians: population, original sample, and this resample.
print("Population Median =       ", pop_median)
print("Our Sample Median =       ", our_sample_median)
print("Bootstrap Sample Median = ", 
      percentile(50,boot_sample.column('Total Compensation')))

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Resamples our_sample WITH replacement; because no size is given, the
    resample has as many rows as our_sample itself.
    """
    resample = our_sample.sample(with_replacement=True)
    return percentile(50, resample.column('Total Compensation'))

In [None]:
# Look up the documentation of Table.sample (its parameters and defaults).
help(Table.sample)

In [None]:
# Bootstrap our sample 1000 times.
# Collect the medians in a list and convert to an array once: calling
# np.append inside a loop copies the whole array on every iteration.
# dtype=float keeps the result a float array even if the medians are integers.
bootstrap_medians = np.array(
    [one_bootstrap_median() for _ in range(1000)], dtype=float
)

In [None]:
# Empirical distribution of the bootstrap medians, drawn with the same bins
# (med_bins) as the histogram of sample medians.
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Red dot: population median. Blue dot: median of our original sample.
plots.scatter(pop_median, 0, color="red");
plots.scatter(our_sample_median, 0, color="blue");
plots.title('Bootstrap Medians (1K Bootstraps from our Sample)');

### 95% Confidence Interval

In [None]:
# Make an interval based on the middle 95% of the bootstrap medians:
# cut off 2.5% of the values on each side.

left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Gold line: the interval [left, right] along the horizontal axis.
# The dots get a higher zorder so they are drawn on top of the line.
plots.plot([left, right], [0,0], color="gold",lw=5, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);
plots.title('Bootstrap Medians (1K Bootstraps from our Sample)');

## Another Example: Mean Maternal Age

In [None]:
# This time we have a sample, but no population data!
# We want to define our population as all U.S. mothers at this given time;
# there is no way we have access to the population data.
births = Table.read_table('data/baby.csv')
# Preview the first 5 rows.
births.show(5)

In [None]:
# Distribution of 'Maternal Age' in the sample.
births.hist('Maternal Age')

In [None]:
# Sample statistic from the original sample: the mean maternal age.
mean_age = np.mean(births.column('Maternal Age'))
mean_age

In [None]:
# Average maternal age of one bootstrap sample: .sample() with no arguments
# gives a resample of the same size as the original, drawn WITH replacement.
np.average(births.sample().column('Maternal Age'))

In [None]:
def one_bootstrap_mean():
    """Return the mean 'Maternal Age' of one bootstrap resample of births.

    births.sample() with no arguments resamples WITH replacement and returns
    as many rows as the original table.
    """
    bootstrap_sample = births.sample()
    bootstrap_maternal_age = bootstrap_sample.column('Maternal Age')
    return np.mean(bootstrap_maternal_age)
# .sample() by default, gives you the same size back and samples WITH replacement

In [None]:
# Repeat the bootstrap 1,000 times; each call resamples the whole births table.
# Collect the means in a list and convert to an array once: calling np.append
# inside a loop copies the whole array on every iteration.
bootstrap_means = np.array(
    [one_bootstrap_mean() for _ in range(1000)], dtype=float
)

# Endpoints of the middle 95% of the bootstrap means.
left = percentile(2.5, bootstrap_means)
right = percentile(97.5, bootstrap_means)

In [None]:
# Histogram of the bootstrap means (the only column in this table).
Table().with_column('Bootstrap means', bootstrap_means).hist()

# Gold line: the interval [left, right]. Blue dot: mean age of the original sample.
plots.plot([left,right], [0,0], color="gold", lw=5, zorder=1);
plots.scatter(mean_age,0,color="blue", zorder=2);
plots.title('Bootstrap Means (1K Bootstraps from our Sample)');

In [None]:
# We are 95% confident that our bootstrapping process up above
# generated a "good" interval, which means it captures the population parameter.

# We are 95% confident that our interval of average maternal age, from 26.9 to 27.6
# years (values from one run; the endpoints vary slightly between runs), captures the
# TRUE population parameter, which is the TRUE average maternal age.