# Bootstrap

In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np

### Sampling distribution of the median

In [None]:
# Load san_francisco_2015.csv (relative path) into a Table. The rest of the
# notebook treats `sf` as the full population that samples are drawn from.
sf = Table.read_table('san_francisco_2015.csv')
# Preview the first 3 rows.
sf.show(3)

In [None]:
# Population median: the 50th percentile of 'Total Compensation' over every row of sf.
pop_median = percentile(50, sf.column('Total Compensation'))
# Bin edges 0, 25000, ..., 675000 (np.arange excludes the stop value);
# later compensation histograms reuse these same bins.
sf_bins = np.arange(0, 700000, 25000)
sf.hist('Total Compensation', bins=sf_bins)
print("Population Median = ", pop_median)

In [None]:
### Random sample of size 300

# Draw 300 rows from the population without replacement. Later cells
# resample from our_sample rather than going back to sf.
our_sample = sf.sample(300, with_replacement = False)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))
# Use the population histogram's bins so the two plots are directly comparable.
our_sample.hist('Total Compensation', bins=sf_bins)
print("Population Median = ", pop_median)
print("Sample Median = ", our_sample_median)

In [None]:
### Sampling distribution of the sample median

def one_sample_median():
    """Return the median 'Total Compensation' of one fresh 300-row sample of `sf`.

    The rows are drawn from the global table `sf` without replacement, and
    the median is taken as the 50th percentile of the sampled column.
    Each call draws a new random sample, so results vary between calls.
    """
    compensation = sf.sample(300, with_replacement=False).column('Total Compensation')
    return percentile(50, compensation)

# Empirical sampling distribution of the sample median: 1000 independent
# draws of one_sample_median().
# Built in a single pass instead of calling np.append inside a loop, which
# copies the whole array on every iteration. dtype=float keeps the float
# array that appending to an empty make_array() produced.
medians = np.array([one_sample_median() for _ in range(1000)], dtype=float)

In [None]:
# Bin edges 90000, 92500, ..., 125000 (a stop of 125001 keeps 125000 as the
# last edge). The bootstrap-median histograms in later cells reuse these bins.
med_bins = np.arange(90000, 125001, 2500)
Table().with_column('Sample Medians', medians).hist('Sample Medians', bins=med_bins)
# Red dot on the horizontal axis marks the population median.
plots.scatter(pop_median, 0, color="red");

### Bootstrap

In [None]:
### Take a bootstrap (re)sample of size 300, WITH replacement

# Resample from our_sample (not from sf); 300 matches the number of rows
# our_sample was drawn with, so the resample is the same size as the sample.
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
# Print all three medians side by side: population, original sample, resample.
print("Population Median = ", pop_median)
print("Our Sample Median = ", our_sample_median)
print("Bootstrap Sample Median = ", percentile(50,boot_sample.column('Total Compensation')))

In [None]:
### Simulate the distribution of the bootstrap sample median
### bootstrap sample : sample :: sample : population

### NOTE: this whole simulation only uses our_sample, we never need to look at sf

def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    The resample is drawn from the global table `our_sample`, which is
    looked up at call time: re-assigning `our_sample` changes what later
    calls resample from. The median is the 50th percentile of the
    resampled column.
    """
    # sample() is called with no arguments on purpose: per the datascience
    # Table.sample defaults this draws as many rows as the table has, with
    # replacement -- i.e. a bootstrap resample of whatever size our_sample is.
    resample = our_sample.sample()
    compensation = resample.column('Total Compensation')
    return percentile(50, compensation)

# Empirical distribution of the bootstrap median: 1000 independent calls to
# one_bootstrap_median(), each resampling from our_sample.
# Built in a single pass instead of calling np.append inside a loop, which
# copies the whole array on every iteration. dtype=float keeps the float
# array that appending to an empty make_array() produced.
bootstrap_medians = np.array([one_bootstrap_median() for _ in range(1000)], dtype=float)

In [None]:
# Histogram of the bootstrap medians, on the same bins as the sample-median histogram.
Table().with_column('Bootstrap Medians', bootstrap_medians).hist('Bootstrap Medians', bins=med_bins)
# Red dot = population median; blue dot = median of the sample we resampled from.
plots.scatter(pop_median, 0, color="red");
plots.scatter(our_sample_median, 0, color="blue");

### Bootstrap Confidence Interval

In [None]:
### Make an interval based on the middle 95% of bootstrap medians

# Endpoints are the 2.5th and 97.5th percentiles of the bootstrap medians,
# leaving roughly 2.5% of them in each tail.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column('Bootstrap Medians', bootstrap_medians).hist('Bootstrap Medians', bins=med_bins)
# Gold segment on the horizontal axis = the interval; zorder=1 draws it
# beneath the two dots (zorder=2).
plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
# Red dot = population median; blue dot = our sample's median.
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);

In [None]:
## Now do the whole process:
## 1. take a sample,
## 2. use it to make many bootstrap samples,
## 3. use the middle 95% of the bootstrap medians as our confidence interval

# Step 1: a fresh 300-row sample, drawn without replacement.
# one_bootstrap_median() reads the global our_sample at call time, so
# rebinding it here is what makes step 2 resample from this new sample.
our_sample = sf.sample(300, with_replacement = False)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Step 2: 201 bootstrap medians.
# Built in a single pass instead of calling np.append inside a loop, which
# copies the whole array on every iteration. dtype=float keeps the float
# array that appending to an empty make_array() produced.
bootstrap_medians = np.array([one_bootstrap_median() for _ in range(201)], dtype=float)

# Step 3: interval endpoints are the 2.5th and 97.5th percentiles.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column('Bootstrap Medians', bootstrap_medians).hist('Bootstrap Medians', bins=med_bins)
# Gold segment = the interval (drawn beneath the dots via zorder);
# red dot = population median; blue dot = this sample's median.
plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);

In [None]:
## Do the whole process with a bigger sample

# A fresh 3000-row sample, drawn without replacement.
# one_bootstrap_median() reads the global our_sample at call time and calls
# sample() with no size argument, so rebinding our_sample here is all that is
# needed for the bootstrap below to resample from the bigger sample.
our_sample = sf.sample(3000, with_replacement = False)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# 201 bootstrap medians.
# Built in a single pass instead of calling np.append inside a loop, which
# copies the whole array on every iteration. dtype=float keeps the float
# array that appending to an empty make_array() produced.
bootstrap_medians = np.array([one_bootstrap_median() for _ in range(201)], dtype=float)

# Interval endpoints are the 2.5th and 97.5th percentiles of the bootstrap medians.
left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column('Bootstrap Medians', bootstrap_medians).hist('Bootstrap Medians', bins=med_bins)
# Gold segment = the interval (drawn beneath the dots via zorder);
# red dot = population median; blue dot = this sample's median.
plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);