In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

### Percentiles

In [None]:
# Six unsorted values used to work out the 55th percentile by hand.
x = make_array(
    43, 20, 51,
    7, 28, 34,
)

In [None]:
# Step 1. Put the values in ascending order.
# np.sort returns a sorted copy, so x itself keeps its original order.
sorted_x = np.sort(x)
sorted_x

In [None]:
# Step 2. Figure out where 55th percentile would be.

In [None]:
# Shortcut: ask for the same percentile of x in a single call.
pct = 55
percentile(pct, x)

In [None]:
# If we tried to compute which element to take...
55 / 100 * 6
# 55% of the 6 values is 3.3, which we round up to 4,
# so we grab the 4th element of the SORTED array (not of x as originally entered),
# which is 34

In [None]:
# Five small values, stored in ascending order as a NumPy array.
s = np.sort([1, 7, 3, 9, 5])
s

In [None]:
# Check whether the 0th and 20th percentiles of s coincide.
p0 = percentile(0, s)
p20 = percentile(20, s)
p0 == p20

### Sample Median

In [None]:
# Read the CSV into a Table and display it.
sf_csv_path = 'data/san_francisco_2015.csv'
sf = Table.read_table(sf_csv_path)
sf

In [None]:
# Highest total compensation: sort largest-first and show the first 5 rows.
(
    sf
    .sort('Total Compensation', descending=True)
    .show(5)
)

In [None]:
# Lowest total compensation: sort smallest-first and show the first 5 rows.
(
    sf
    .sort('Total Compensation', descending=False)
    .show(5)
)

In [None]:
# Threshold used to drop very low compensation rows.
# NOTE(review): presumably $10/hour x 20 hours/week x 52 weeks -- confirm.
min_salary = 10 * 20 * 52

# Keep only the rows whose 'Total Compensation' is above the threshold.
# This rebinds sf, so every later cell works with the filtered table.
above_threshold = are.above(min_salary)
sf = sf.where('Total Compensation', above_threshold)

In [None]:
# Median (50th percentile) of 'Total Compensation' over the whole sf table.
pop_compensation = sf.column('Total Compensation')
pop_median = percentile(50, pop_compensation)
pop_median

In [None]:
# Draw 300 rows from sf without replacement; this is the one sample
# that the bootstrap cells further down resample from.
our_sample = sf.sample(
    300,
    with_replacement=False,
)
our_sample

In [None]:
# Median (50th percentile) of 'Total Compensation' within our_sample.
sample_compensation = our_sample.column('Total Compensation')
percentile(50, sample_compensation)


In [None]:
# Histogram bins of width 25,000 starting at 0 and stopping before 700,000.
sf_bins = np.arange(0, 700_000, 25_000)

# Distribution of 'Total Compensation' over the whole sf table.
sf.hist(
    'Total Compensation',
    bins=sf_bins,
)
plots.title('Population Distribution');

In [None]:
# Same histogram, drawn for our_sample with the same bins.
our_sample.hist(
    'Total Compensation',
    bins=sf_bins,
)
plots.title('Sample Distribution');

# Variability of the Estimate

In [None]:
def generate_sample_median(samp_size):
    """Return the median 'Total Compensation' of a fresh random sample of sf.

    Draws samp_size rows from the table sf without replacement and returns
    the 50th percentile of the 'Total Compensation' column of those rows.
    Each call draws a new sample; the global our_sample is not used.
    """
    sampled_rows = sf.sample(samp_size, with_replacement=False)
    compensation = sampled_rows.column('Total Compensation')
    return percentile(50, compensation)




In [None]:
# Median of one new sample of 300 rows.
# Note: generate_sample_median draws its own sample from sf, so this value
# is not the median of the our_sample table created earlier.
sample_median = generate_sample_median(samp_size=300)
sample_median

In [None]:
# Signed estimation error: positive when the sample median is larger
# than the population median, negative when it is smaller.
error = sample_median - pop_median  # estimate minus true value
error

# Quantifying Uncertainty

In [None]:
# Collect the medians of 1000 independent samples of size 300,
# appending one median per iteration to an initially empty array.
sample_medians = make_array()

for trial in range(1000):
    sample_medians = np.append(sample_medians, generate_sample_median(300))

In [None]:
# Bins of width 2,500 running from 90,000 through 125,000.
med_bins = np.arange(90_000, 125_001, 2_500)

# Histogram of the simulated sample medians.
medians_table = Table().with_column('Sample Medians', sample_medians)
medians_table.hist(bins=med_bins)

# Red dot on the horizontal axis marks the population median.
plots.scatter(pop_median, 0, color="red");

In [None]:
# Bins of width 2,500 running from -15,000 through 12,500.
err_bins = np.arange(-15_000, 12_501, 2_500)

# Histogram of the errors: each simulated sample median minus the population median.
errors_table = Table().with_column('Errors', sample_medians - pop_median)
errors_table.hist(bins=err_bins)

# Red dot on the horizontal axis marks an error of zero.
plots.scatter(0, 0, color="red");

# Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 300, WITH replacement, from our_sample
boot_sample = our_sample.sample(300, with_replacement=True)
boot_sample.hist('Total Compensation', bins=sf_bins)
plots.title('Bootstrap sample');

# The bootstrap resamples our_sample, so report the median of our_sample itself.
# (sample_median came from generate_sample_median(300), which draws a separate
# sample from sf, so it is not the median of our_sample.)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

print("Population Median =       ", pop_median)
print("Our Sample Median =       ", our_sample_median)
print("Bootstrap Sample Median = ",
      percentile(50, boot_sample.column('Total Compensation')))

In [None]:
def one_bootstrap_median():
    """Return the median 'Total Compensation' of one bootstrap resample.

    Resamples the global table our_sample by calling its sample() method
    with no arguments (relying on that method's defaults) and returns the
    50th percentile of the resample's 'Total Compensation' column.
    """
    resample = our_sample.sample()
    compensation = resample.column('Total Compensation')
    return percentile(50, compensation)

In [None]:
# Call one_bootstrap_median 1000 times and gather every resample's median
# in the array bootstrap_medians, starting from an empty array.

bootstrap_medians = make_array()
for trial in range(1000):
    bootstrap_medians = np.append(bootstrap_medians, one_bootstrap_median())



In [None]:
# Histogram of the bootstrap medians.
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The bootstrap medians come from resampling our_sample, so the blue dot must
# mark the median of our_sample itself. (sample_median was computed from a
# different sample drawn inside generate_sample_median.)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Red dot: population median. Blue dot: median of our_sample.
plots.scatter(pop_median, 0, color="red");
plots.scatter(our_sample_median, 0, color="blue");

## Confidence Intervals

In [None]:
# Make an interval based on the middle 95% of the bootstrap medians:
# cut off 2.5% in each tail.

left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# The interval is built by resampling our_sample, so the blue dot must mark
# the median of our_sample itself. (sample_median was computed from a
# different sample drawn inside generate_sample_median.)
our_sample_median = percentile(50, our_sample.column('Total Compensation'))

# Gold bar: the interval [left, right], drawn underneath the dots (zorder=1 vs 2).
plots.plot([left, right], [0, 0], color="gold", lw=6, zorder=1);
# Red dot: population median. Blue dot: median of our_sample.
plots.scatter(pop_median, 0, color="red", zorder=2);
plots.scatter(our_sample_median, 0, color="blue", zorder=2);

In [None]:
# Can you find the middle 99% of the bootstrap medians?

# try generating the histogram below






