In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
from IPython.display import Image
from IPython.core.display import HTML 

In [None]:
# Two-row table of candidates: "A" with chance 0.6 and "B" with chance 0.4.
voters = Table().with_columns("Candidate", ["A", "B"], "Chance", [0.6, 0.4])

## Part 1. Confidence Intervals and Sample Size


In [None]:
# Data source: http://inferentialthinking.com/notebooks/san_francisco_2015.csv
# Keep three columns of the CSV, chosen by position (3, 11, 21).
# NOTE(review): presumably the third kept column (index 2) is 'Total Compensation',
# since later cells read that label from samples of this table -- confirm against the CSV header.
sf = Table.read_table('san_francisco_2015.csv').select(3, 11, 21)
# Apply NumberFormatter(0) to column 2 (assumed to mean zero decimal places -- TODO confirm).
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose column-2 value is above 10000.
sf = sf.where(2, are.above(10000))
sf.show(3)

In [None]:
# Bin edges every 25,000 starting at 0 and stopping before 700,000.
comp_bins = np.arange(0, 700000, 25000)
# Histogram of column 2 of sf drawn with those bins.
sf.hist(2, bins=comp_bins, unit='dollar')

In [None]:
# confidence interval methods

def bootstrap_mean(sample_from_population, label, repetitions):
    """Collect the mean of one column over repeated resamples of a table.

    Each round calls ``sample_from_population.sample()`` with no arguments
    and averages the ``label`` column of the resulting table.

    Parameters:
        sample_from_population: table-like object providing ``sample()``
            whose result provides ``column(label)``.
        label: column to average in every resample.
        repetitions: number of resamples to draw.

    Returns:
        A list holding one mean per repetition, in the order drawn.
    """
    return [
        np.mean(sample_from_population.sample().column(label))
        for _ in np.arange(repetitions)
    ]

def bootstrap_ci_mean(sample_from_population, label, repetitions):
    """Plot and print an approximate 95% bootstrap interval for a mean.

    Resampled means come from ``bootstrap_mean``; the interval runs from
    their 2.5th to their 97.5th percentile. The function draws a histogram
    of the resampled means, overlays the interval as a gold line on the
    horizontal axis, and prints the interval (rounded to 3 decimals) and
    its width. Nothing is returned.

    Parameters:
        sample_from_population: the original sample to resample from.
        label: column whose mean is being estimated.
        repetitions: number of bootstrap resamples.
    """
    means = bootstrap_mean(sample_from_population, label, repetitions)

    # Endpoints of the middle 95% of the resampled means.
    lower_end = percentile(2.5, means)
    upper_end = percentile(97.5, means)
    interval_95 = make_array(lower_end, upper_end)

    Table().with_column('Resampled mean', means).hist(0)
    plots.plot(interval_95, [0, 0], color='gold', lw=8)

    print('Approximate 95% Bootstrap Confidence Interval for Population Mean:')
    print(np.round(interval_95, 3))
    ci_low, ci_high = interval_95
    print('Interval Width: '+str(ci_high - ci_low))

In [None]:
# Draw a random sample of 200 rows from sf; the bootstrap cell below resamples from it.
# NOTE(review): the no-argument .sample() used for bootstrapping is described in this
# notebook as sampling with replacement, so this draw presumably is too -- confirm whether
# with_replacement=False is wanted when taking the original sample from the population.
sf_sample = sf.sample(200)
sf_sample.show(3)

In [None]:
# Resample 1000 times (with replacement) from the original sample, then
# display a histogram of the resampled means with the 95% confidence interval marked.

bootstrap_ci_mean(sf_sample, 'Total Compensation', 1000)

In [None]:
# Sample size was 200. Confidence interval is too wide
# Note, numbers will change because samples are random! 

In [None]:
# What do you think the sample size should be to achieve a 
# 95% confidence interval with width of $10,000 or less?

# A: 250
# B: 300
# C: 400
# D: 800
# E: 10,000

In [None]:























# Repeat the bootstrap with a larger sample of 800 rows.
# This rebinds sf_sample, so any earlier cell re-run afterwards sees the 800-row sample.
sf_sample = sf.sample(800)
bootstrap_ci_mean(sf_sample, 'Total Compensation', 1000)

back to slides to recap

## Experiment design


In [None]:
# Potential population SD: nine votes coded 1 and one vote coded 0.

votes = [1] * 9 + [0]
np.std(votes)

In [None]:
# All ten votes coded 1: no spread at all.
votes = [1] * 10
np.std(votes)

In [None]:
# Six votes coded 1 and four coded 0.
votes = [1] * 6 + [0] * 4
np.std(votes)

In [None]:
total_voters = 10


def sd_voters(n_voters_for_a, total=None):
    """Return the SD of a 0-1 population with `n_voters_for_a` ones.

    The population is an array of `n_voters_for_a` ones followed by
    `total - n_voters_for_a` zeros. The array is printed before its
    standard deviation is returned.

    Parameters:
        n_voters_for_a: number of voters coded 1; must satisfy
            0 <= n_voters_for_a <= total.
        total: population size. Defaults to the notebook-level
            `total_voters`, read at call time.

    Returns:
        np.std of the 0-1 array.

    Raises:
        ValueError: if n_voters_for_a is outside [0, total].
    """
    if total is None:
        # Look the global up at call time so that changing total_voters in
        # the notebook still affects later calls, as it did before.
        total = total_voters
    if not 0 <= n_voters_for_a <= total:
        # Without this check numpy raises a ValueError about negative
        # dimensions, which does not point at the real mistake.
        raise ValueError(
            "n_voters_for_a must be between 0 and " + str(total)
            + ", got " + str(n_voters_for_a)
        )
    votes = np.append(np.ones(n_voters_for_a), np.zeros(total - n_voters_for_a))
    print(votes)
    return np.std(votes)


sd_voters(5)  # change input

back to slides

In [None]:
# Fact: if the data contain only 0s and 1s, the SD is never above 0.5.
# So the worst case is worst_sd_pop = 0.5.
# width_in_sds = 4
# desired_width = 0.03

In [None]:
# Required sample size from:  √(sample size)  ≥  4 x (SD of 0-1 population) / 0.03
# worst_sd_pop: the largest SD used for the 0-1 population.
worst_sd_pop = 0.5
# width_in_sds: the "4" in the inequality above.
width_in_sds = 4
# desired_width: the "0.03" in the inequality above.
desired_width = 0.03
# Squaring both sides gives the minimum sample size:
# (4 * 0.5 / 0.03) ** 2 is about 4444.44, so round up to 4445.
(width_in_sds * (worst_sd_pop / desired_width)) ** 2

In [None]:
# No matter what my 0-1 population looks like, if I take a sample of
# 4445 people, my approximate 95% CI will have a width of about 0.03 or less.

## Conducting the experiment


In [None]:
# Column labels of the voters table.
voters.labels

In [None]:
# Table holding only the first column (index 0) of voters.
voters.select(0)

In [None]:
# Draw 1000 rows from the first column of voters.
# weights - array specifying the probability of each row; it must be a valid
# probability distribution. Here it is the 'Chance' column.

observed_sample = voters.select(0).sample(1000, weights=voters.column('Chance'))
observed_sample.show(5)

In [None]:
# The "means" collected here are proportions: the share of each resample that is 'A'.

n = 4445
observed_sample = voters.select(0).sample(n, weights=voters.column('Chance'))

means = []
for i in np.arange(1000):
    resample = observed_sample.sample()
    votes_for_a = np.count_nonzero(resample.column(0) == 'A')
    means.append(votes_for_a / n)

# Endpoints of the middle 95% of the resampled proportions, computed once.
ci_lower = percentile(2.5, means)
ci_upper = percentile(97.5, means)
ci_width = ci_upper - ci_lower

print("Confidence interval : [" + str(ci_lower) + ", "
      + str(ci_upper) + "] with width " + str(ci_width))

In [None]:
# Display the voters table.
voters