In [None]:
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import numpy as np

## Lecture 3 ##

### Percentiles

In [None]:
# Manually compute the 55th percentile.
x = make_array(43, 20, 51, 7, 28, 34)

In [None]:
# Step 1. Sort the data
np.sort(x)

In [None]:
# Step 2. Figure out where the 55th percentile would be.

In [None]:
# OR: 1 Line of Code
percentile(55, x)

In [None]:
# If we tried to compute which element to take...
55 / 100 * 6

In [None]:
s = [1, 7, 3, 9, 5]
np.sort(s)

In [None]:
percentile(10, s) == 0

In [None]:
percentile(39, s) == percentile(40, s)

In [None]:
percentile(40, s) == percentile(41, s)

In [None]:
percentile(50, s) == 5

In [None]:
40/100 * 5

### Sample Median

In [None]:
skyscrapers = Table.read_table('data/skyscrapers.csv')
skyscrapers.sort('completed', descending = True)

In [None]:
# Which skyscrapers are the tallest
skyscrapers.sort('height', descending=True).show(5)

In [None]:
# Which skyscrapers are the shortest
# (sort the `skyscrapers` table by ascending 'height' and show the first 5 rows)
skyscrapers.sort('height', descending=False).show(5)

In [None]:
# A deliberately non-random subset: keep only the rows whose 'height' is
# above min_height, then display the resulting table.
min_height = 350
skyscrapers_sample = skyscrapers.where('height', are.above(min_height))
skyscrapers_sample

In [None]:
percentile(50, skyscrapers_sample.column('height'))

In [None]:
pop_median = percentile(50, skyscrapers.column('height'))
pop_median

In [None]:
our_sample = skyscrapers.sample(50, with_replacement=False)
our_sample.show(5)

In [None]:
help(Table.sample)

In [None]:
percentile(50, our_sample.column('height'))

In [None]:
skyscrapers_bins = np.arange(0, 550) # (0 .... 549)
skyscrapers.hist('height', bins=skyscrapers_bins)
plots.title('Population Distribution');

In [None]:
our_sample.hist('height', bins=skyscrapers_bins)
plots.title('Sample Distribution');

# Variability of the Estimate

In [None]:
def generate_sample_median(samp_size):
    """Return the median 'height' of a fresh random sample of skyscrapers.

    Draws `samp_size` rows from the `skyscrapers` table without replacement
    and returns the 50th percentile of their 'height' column.
    """
    heights = skyscrapers.sample(samp_size, with_replacement=False).column('height')
    return percentile(50, heights)

In [None]:
sample_median = generate_sample_median(50)
sample_median

In [None]:
error = abs(sample_median - pop_median)
error

# Quantifying Uncertainty

In [None]:
# Draw 1000 random samples of size 50 and record the median of each one.
sample_medians = make_array()

for i in np.arange(1000):
    new_median = generate_sample_median(50)
    sample_medians = np.append(sample_medians, new_median)

# Only the last expression of a cell is displayed, so show both extreme
# errors together: [pop_median - smallest median, pop_median - largest median].
make_array(
    pop_median - min(sample_medians),
    pop_median - max(sample_medians),
)

In [None]:
# Histogram of the simulated sample medians, with bin edges at every
# integer from 200 through 240.
med_bins = np.arange(200, 241)
Table().with_column(
    'Sample Medians', sample_medians
).hist(bins = med_bins)

# Red dot on the horizontal axis marks the population median.
plots.scatter(pop_median, 0, color="red");

In [None]:
# Histogram of the estimation errors (each sample median minus the
# population median), with bin edges at every integer from -14 through 15.
err_bins = np.arange(-14, 16)
Table().with_column(
    'Errors', sample_medians - pop_median
).hist(bins = err_bins)

# Red dot at 0 marks an error of zero.
plots.scatter(0, 0, color="red");

# Bootstrap

In [None]:
# Take a bootstrap (re)sample of size 50 (the size of our_sample), WITH replacement
boot_sample = our_sample.sample(50, with_replacement=True)
boot_sample.hist('height', bins=skyscrapers_bins)
plots.title('Bootstrap sample');

# "Our Sample Median" must be the median of our_sample itself (the sample
# being resampled), not `sample_median`, which was computed from a separate
# random draw made by generate_sample_median.
print("Population Median =       ", pop_median)
print("Our Sample Median =       ",
      percentile(50, our_sample.column('height')))
print("Bootstrap Sample Median = ",
      percentile(50, boot_sample.column('height')))

In [None]:
def one_bootstrap_median():
    """Return the median 'height' of one bootstrap resample of our_sample.

    Resamples via our_sample.sample() with its default arguments
    (per the datascience docs: same number of rows, with replacement)
    and returns the 50th percentile of the resample's 'height' column.
    """
    single_sample = our_sample.sample()
    # The skyscrapers table is analysed by its 'height' column throughout.
    return percentile(50, single_sample.column('height'))

In [None]:
# Collect the medians of 1000 bootstrap resamples into one array.
bootstrap_medians = make_array()
for i in np.arange(1000):
    new_median = one_bootstrap_median()
    bootstrap_medians = np.append(bootstrap_medians, new_median)

In [None]:
# Distribution of the bootstrap medians.
Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Red dot: population median.
# Blue dot: median of our_sample, the sample the bootstrap resamples were
# drawn from (`sample_median` came from a different random draw).
plots.scatter(pop_median, 0, color="red");
plots.scatter(percentile(50, our_sample.column('height')), 0, color="blue");

## Confidence Intervals

In [None]:
# Make an interval based on the middle 95% of bootstrap samples:
# its endpoints are the 2.5th and 97.5th percentiles of the bootstrap medians.

left = percentile(2.5, bootstrap_medians)
right = percentile(97.5, bootstrap_medians)

Table().with_column(
    'Bootstrap Medians', bootstrap_medians
).hist('Bootstrap Medians', bins=med_bins)

# Gold bar: the interval [left, right]; drawn below the dots (lower zorder).
plots.plot([left, right], [0,0], color="gold",lw=3, zorder=1);
# Red dot: population median.
plots.scatter(pop_median, 0, color="red", zorder=2);
# Blue dot: median of our_sample, the sample the bootstrap resamples were
# drawn from (`sample_median` came from a different random draw).
plots.scatter(percentile(50, our_sample.column('height')), 0, color="blue", zorder=2);