In [None]:
from datascience import *
import numpy as np
## Normal Distribution
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# Two-candidate population: one row per candidate with the chance of a vote for them.
voters = Table().with_columns(
    "Candidate", ["A", "B"],
    "Chance", [0.6, 0.4],
)

## Normal Distribution

In [None]:
#http://inferentialthinking.com/notebooks/baby.csv
# Read the births table from a local 'baby.csv' file and display its first 3 rows.
births = Table.read_table('baby.csv')
births.show(3)

In [None]:
# Histogram of every column of births, each on its own axes (overlay=False).
births.hist(overlay=False)

In [None]:
# Maternal height looks roughly normal (height and weight often are).
# Summarise it by its mean and standard deviation, each rounded to 1 decimal.
maternal_heights = births.column('Maternal Height')
avg_h = np.round(np.mean(maternal_heights), 1)
sd_h = np.round(np.std(maternal_heights), 1)
print("mean:  " + str(avg_h) + "     standard deviation:  " + str(sd_h))

In [None]:
# Bell-shaped curve: histogram of column 3 with one-inch bins.
# NOTE(review): column index 3 is presumably 'Maternal Height' — confirm against baby.csv.
births.hist(3, bins=np.arange(55.5, 72.5, 1), unit='inch')
# Place the x-axis ticks at the mean and at 1, 2 and 3 SDs on either side of it.
sd_offsets = np.arange(-3, 3.1, 1)
positions = sd_offsets * sd_h + avg_h
plots.xticks(positions);

In [None]:
#(back to slides)

## The standard normal curve

In [None]:
# NOTE(review): `stats` is not referenced in any cell visible here — confirm whether this import is still needed.
from scipy import stats

# NOTE(review): plot_normal_cdf is not defined in this notebook; presumably it is
# provided by `from datascience import *` — confirm.
plot_normal_cdf()

In [None]:

# Draw one million values from the standard normal distribution in a single
# vectorized call instead of a million-iteration Python append loop.
samples = np.random.normal(size=1000000)

# Histogram of the draws with bins of width 0.1 starting at -3.5.
sample_table = Table().with_column('Sample', samples)
sample_table.hist(bins=np.arange(-3.5, 3.6, .1))

In [None]:
#(back to slides)

## Average Plus or Minus a Few SDs

In [None]:
# How many samples are within 3 SDs of the mean, i.e. between -3 and 3 inclusive?
(
    sample_table
    .where('Sample', are.not_below(-3))
    .where('Sample', are.not_above(3))
    .num_rows
)

In [None]:
# As a proportion of all samples within 3 SDs.
# Divide by the table's own row count rather than a hard-coded 1000000 so the
# proportion stays correct if the number of simulated samples is changed.
(
    sample_table
    .where('Sample', are.not_below(-3))
    .where('Sample', are.not_above(3))
    .num_rows
) / sample_table.num_rows

In [None]:
#Chebyshev says within 3 SDs, at least 1-1/9 = 88.8888%

In [None]:
# Proportion of samples within 2 SDs.
# Divide by the table's own row count rather than a hard-coded 1000000 so the
# proportion stays correct if the number of simulated samples is changed.
(
    sample_table
    .where('Sample', are.not_below(-2))
    .where('Sample', are.not_above(2))
    .num_rows
) / sample_table.num_rows

In [None]:
# Proportion of samples within 1 SD.
# Divide by the table's own row count rather than a hard-coded 1000000 so the
# proportion stays correct if the number of simulated samples is changed.
(
    sample_table
    .where('Sample', are.not_below(-1))
    .where('Sample', are.not_above(1))
    .num_rows
) / sample_table.num_rows

In [None]:
#(back to slides)

## Aren't Normal Distributions Rare?

In [None]:
#maternal age, not bell shaped; has long tail
# NOTE(review): column index 2 is presumably 'Maternal Age' — confirm against baby.csv.
births.hist(2)

In [None]:
# Mean of column 2 over the full births table.
population_mean = np.mean(births.column(2))
print("population mean:  " + str(population_mean))


In [None]:
# This is the mean of a random sample of 1000 rows, not of the whole table,
# so label it as a sample mean (it changes from run to run).
print("sample mean:  "+str(np.mean(births.sample(1000).column(2))))


In [None]:
# Mean of column 2 for each of 10000 random samples of 1000 rows.
means = [
    np.mean(births.sample(1000).column(2))
    for _ in np.arange(10000)
]


In [None]:
# Histogram (50 bins) of the simulated sample means collected in `means`.
Table().with_column('Mean', means).hist(bins=50)

In [None]:
#(back to slides)

## Central Limit Theorem

In [None]:
#http://inferentialthinking.com/notebooks/united_summer2015.csv
# Read the table from a local 'united_summer2015.csv'; the bare name on the
# last line makes the notebook display it.
united = Table.read_table('united_summer2015.csv')
united

In [None]:
# Histogram of the 'Delay' column using 30 bins.
united.hist('Delay', bins=30)


In [None]:
# Mean and SD of the 'Delay' column over the whole united table.
delay_values = united.column('Delay')
mean_delay = np.mean(delay_values)
sd_delay = np.std(delay_values)

print("mean delay:  " + str(mean_delay) + "           standard deviation:  " + str(sd_delay))

In [None]:
# One-column table holding only 'Delay'; the sampling cells below draw rows from it.
delay = united.select('Delay')
delay.show(3)

In [None]:
# Side note: look at how I build my list of means and how I use the append method. There were a lot of questions on Piazza about it.

# Start from an empty list and append one sample mean per iteration.
means = []
for i in np.arange(10000):
    sample = delay.sample(400)
    means.append(np.mean(sample.column(0)))

Table().with_column('Sample mean', means).hist(bins=30, unit='minute')


In [None]:
# What will happen if we change the sample size from 400 to 10?
# A: About the same (bell curve)
# B: More jagged 
# C: Longer left tail
# D: Longer right tail
# E: More like the histogram of delays

In [None]:
# Same simulation as before, but each sample now has only 10 rows.
means = []
for _ in np.arange(10000):
    means.append(np.mean(delay.sample(10).column(0)))

sample_mean_table = Table().with_column('Sample mean', means)
sample_mean_table.hist(bins=30, unit='minute')


In [None]:
#(back to slides)

## Variability of the sample mean

In [None]:
# let's make a function
# parameter: sample size

def sample_means(sample_size, repetitions=10000):
    """Simulate sample means of the delay column.

    Repeatedly draws `sample_size` rows from the notebook-level `delay` table
    and records the mean of the first column of each draw.

    Args:
        sample_size: number of rows to draw for each sample.
        repetitions: how many samples to draw; defaults to 10000, the value
            that used to be hard-coded in the loop.

    Returns:
        A list with one sample mean per repetition.
    """
    means = []
    for _ in np.arange(repetitions):
        sample = delay.sample(sample_size)
        means.append(np.mean(sample.column(0)))
    return means
# Preview only the first few means; displaying the whole returned list would
# flood the cell output with thousands of numbers.
sample_means(400)[:10]

In [None]:
# let's compare different sample sizes
# with_columns (plural) is the Table method that takes several label/values
# pairs; with_column (singular) is meant for adding a single column.
Table().with_columns(
    '400', sample_means(400),
    '900', sample_means(900),
    '2500', sample_means(2500),
).hist(bins=30, unit='minute')

In [None]:
# how narrow do they get and why is it important

# back to slides

In [None]:
# Display the SD of the 'Delay' column computed earlier with np.std.
sd_delay

In [None]:
def variability(sample_size):
    """Plot a histogram of simulated sample means and print the sample size.

    First draft of this helper; a later cell redefines it under the same name
    with additional printed output.

    Args:
        sample_size: number of rows per sample, passed on to sample_means.
    """
    simulated_means = sample_means(sample_size)
    means_table = Table().with_column('Sample mean', simulated_means)
    means_table.hist(bins=30, unit='minute')

    print('Sample size:          ', sample_size)

In [None]:
# Histogram of simulated sample means for samples of size 400.
variability(400)

In [None]:
# according to the formula: SD = (population SD) / √sample size

In [None]:
def variability(sample_size):
    """Plot simulated sample means and compare their SD with the formula.

    Draws the histogram of the means returned by sample_means, then prints the
    sample size, its square root, the SD of the simulated means, and the
    notebook-level sd_delay divided by that square root.

    Args:
        sample_size: number of rows per sample, passed on to sample_means.
    """
    simulated_means = sample_means(sample_size)
    means_table = Table().with_column('Sample mean', simulated_means)
    means_table.hist(bins=30, unit='minute')

    root_n = np.sqrt(sample_size)
    empirical_sd = np.std(simulated_means)
    predicted_sd = sd_delay / root_n

    print('Sample size:                   ', sample_size)
    print('Square root n:                 ', root_n)
    print('Sample mean SD:                ', empirical_sd)
    print('Population SD / Square root n =', predicted_sd)

In [None]:
# Same call as before, now using the redefined variability that also prints the SD comparison.
variability(400)

In [None]:
# let's increase the sample size (from 400 to 800)
variability(800)


In [None]:
# What SD do you expect for sample size 3200?

# A: Half of SD for sample size 800
# B: Same as SD for sample size 800
# C: Double SD for sample size 800
# D: Four times SD for sample size 800
# E: I don't know

# Run the simulation for sample size 3200 and compare the printed SDs with the prediction.
variability(3200)


In [None]:
#(back to slides)

## Discussion Question 1

In [None]:
# With sample size 1, each "sample mean" is the mean of a single sampled row.
variability(1)

In [None]:
# What happens if I change the sample size to 2? (Each mean now averages two sampled rows.)
variability(2)

In [None]:
# See how the histogram becomes more normal as the sample size doubles
# from 1 up to 512 (the powers of two 2**0 ... 2**9).
for sample_size in 2 ** np.arange(10):
    variability(sample_size)

In [None]:
#(back to slides)

## Confidence Intervals and Sample Size


In [None]:
#http://inferentialthinking.com/notebooks/san_francisco_2015.csv
# Keep three columns of the raw file, chosen by position (3, 11 and 21).
sf = Table.read_table('san_francisco_2015.csv').select(3, 11, 21)
# Apply NumberFormatter(0) to the third kept column (index 2).
# NOTE(review): index 2 is presumably 'Total Compensation', which a later cell uses by label — confirm.
sf.set_format(2, NumberFormatter(0))
# Keep only the rows whose value in column index 2 is above 10000.
sf = sf.where(2, are.above(10000))
sf.show(3)

In [None]:
# Bin edges from 0 up to (but not including) 700000 in steps of 25000.
comp_bins = np.arange(0, 700000, 25000)
sf.hist(2, bins=comp_bins, unit='dollar')

In [None]:
# confidence interval methods

def bootstrap_mean(sample_from_population, label, repetitions):
    """Return the means of `repetitions` resamples of one column.

    Each repetition calls `sample_from_population.sample()` with no arguments
    and takes the mean of the `label` column of the result.

    Args:
        sample_from_population: table-like object providing `.sample()` and
            `.column(label)`.
        label: the column whose mean is computed in every resample.
        repetitions: number of resamples to draw.

    Returns:
        A list with one resampled mean per repetition.
    """
    return [
        np.mean(sample_from_population.sample().column(label))
        for _ in np.arange(repetitions)
    ]

def bootstrap_ci_mean(sample_from_population, label, repetitions):
    """Plot and print an approximate 95% bootstrap confidence interval for a mean.

    Gets the resampled means from bootstrap_mean, takes their 2.5th and 97.5th
    percentiles as the interval endpoints, draws a histogram of the resampled
    means with the interval marked in gold, and prints the interval (rounded
    to 3 decimals) together with its width.

    Args:
        sample_from_population: the sample to resample from.
        label: the column whose mean is being estimated.
        repetitions: number of bootstrap resamples.
    """
    resampled_means = bootstrap_mean(sample_from_population, label, repetitions)

    # The middle 95% of the resampled means.
    lower_bound = percentile(2.5, resampled_means)
    upper_bound = percentile(97.5, resampled_means)
    interval_95 = make_array(lower_bound, upper_bound)

    resampled_table = Table().with_column('Resampled mean', resampled_means)
    resampled_table.hist(0)
    # Draw the interval as a thick gold segment at height 0.
    plots.plot(interval_95, [0, 0], color='gold', lw=8)

    print('Approximate 95% Bootstrap Confidence Interval for Population Mean:')
    print(np.round(interval_95, 3))
    interval_width = interval_95[1] - interval_95[0]
    print('Interval Width: ' + str(interval_width))

In [None]:
# Draw a random sample of 200 rows from sf and display its first 3 rows.
sf_sample = sf.sample(200)
sf_sample.show(3)

In [None]:
# Resample 1000 times from my original sample (with replacement),
# then display a histogram of the resampled means showing the 95% confidence interval.

bootstrap_ci_mean(sf_sample, 'Total Compensation', 1000)

In [None]:
# Sample size was 200. Confidence interval is too wide
# Note, numbers will change because samples are random! 

In [None]:
# What do you think the sample size should be to achieve a 
# 95% confidence interval with width of $10,000 or less?

# A: 200
# B: 300
# C: 400
# D: 800
# E: 10,000

In [None]:
















# Repeat the bootstrap with a larger sample of 800 rows (this replaces the earlier sf_sample).
sf_sample = sf.sample(800)
bootstrap_ci_mean(sf_sample, 'Total Compensation', 1000)

In [None]:
#(back to slides to recap)

## Experiment design

In [None]:
#potential population

# Nine votes of 1 and a single vote of 0.
votes = [1] * 9 + [0]
np.std(votes)

In [None]:
# All ten votes are 1, so there is no spread at all.
votes = [1] * 10
np.std(votes)

In [None]:
# Six votes of 1 followed by four votes of 0.
votes = [1] * 6 + [0] * 4
np.std(votes)

In [None]:
total_voters = 10


def sd_voters(n_voters_for_a):
    """Return the SD of a 0-1 population of `total_voters` votes.

    Builds an array of `n_voters_for_a` ones followed by zeros for the
    remaining voters, prints it, and returns its standard deviation.

    Args:
        n_voters_for_a: how many of the `total_voters` votes are 1.

    Returns:
        np.std of the 0-1 array.
    """
    votes_for_a = np.ones(n_voters_for_a)
    votes_against = np.zeros(total_voters - n_voters_for_a)
    votes = np.concatenate([votes_for_a, votes_against])
    print(votes)
    return np.std(votes)


sd_voters(6)

In [None]:
# Fact: if data contains only 0-1 then SD will never be above 0.5
# we know that worst_sd_pop = 0.5
# width_in_sds = 4
# desired_width = 0.03


In [None]:
# √(sample size)  ≥  4 x (SD of 0-1 population) / 0.03
worst_sd_pop = 0.5     # largest SD a 0-1 population can have
width_in_sds = 4       # interval width measured in SDs of the sample mean
desired_width = 0.03   # target interval width

# Smallest allowed √(sample size); squaring it gives the required sample size.
min_sqrt_sample_size = width_in_sds * (worst_sd_pop / desired_width)
min_sqrt_sample_size ** 2

In [None]:
# No matter what my population looks like, if I take a sample of 4445 people, my CI will always have a width of 0.03 or less.

## Conducting the experiment


In [None]:
# Column labels of the voters table.
voters.labels

In [None]:
# Keep only the first column of voters.
voters.select(0)

In [None]:
# Draw 1000 rows from the first column of voters, passing the 'Chance' column as sampling weights.
observed_sample = voters.select(0).sample(1000, weights=voters.column('Chance'))
observed_sample.show(3)

In [None]:
n = 4445
observed_sample = voters.select(0).sample(n, weights=voters.column('Chance'))

# Bootstrap: resample the observed sample 1000 times and record the
# proportion of 'A' entries in each resample.
means = [
    np.count_nonzero(observed_sample.sample().column(0) == 'A') / n
    for _ in np.arange(1000)
]
print(percentile(2.5, means), percentile(97.5, means))

In [None]:
# Width of the bootstrap interval, computed from the resampled proportions in
# `means` rather than from endpoints pasted in from one earlier random run.
percentile(97.5, means) - percentile(2.5, means)

In [None]:
# Display the voters table.
voters