In [None]:
import numpy as np
from datascience import *
%matplotlib inline

np.set_printoptions(threshold=50)
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## How big are most of the values?
## Average plus or minus a few SDs

In [None]:
# Draw sample_size values from the normal distribution in a single vectorized
# call: np.random.normal(size=n) returns an array of n draws, so no Python-level
# loop or repeated list appends are needed.
sample_size = 1000000       # increase the size by adding more zeros
samples = np.random.normal(size=sample_size)
sample_table = Table().with_column('Sample', samples)
# Bins of width .1 covering -4.5 to 4.5.
sample_table.hist(bins=np.arange(-4.5, 4.6, .1))

In [None]:
# Count the samples that lie within 3 SDs, i.e. between -3 and 3 inclusive.

sample_table.where('Sample', are.between_or_equal_to(-3, 3)).num_rows

In [None]:
# Proportion of all samples that lie between -3 and 3 inclusive (within 3 SDs).
sample_table.where('Sample', are.between_or_equal_to(-3, 3)).num_rows / sample_size

Chebyshev's bound says that, for any distribution, at least $1 - 1/9 \approx 88.89\%$ of the values lie within 3 SDs of the mean.

In [None]:
# Proportion of all samples that lie between -2 and 2 inclusive (within 2 SDs).
sample_table.where('Sample', are.between_or_equal_to(-2, 2)).num_rows / sample_size

In [None]:
# Proportion of all samples that lie between -1 and 1 inclusive (within 1 SD).
sample_table.where('Sample', are.between_or_equal_to(-1, 1)).num_rows / sample_size

back to slides for Normal Proportions section

## Aren't Normal Distributions Rare?

In [None]:
# Load baby.csv and draw a histogram of its column at index 2 (maternal age):
# the distribution is not bell shaped and has a long tail.
births=Table.read_table('baby.csv')
births.hist(2)

In [None]:
np.mean(births.column(2))


In [None]:
np.mean(births.sample(1000).column(2))   #try it a few times


In [None]:
# Repeat 10,000 times: draw a sample of 1000 rows from births and record the
# mean of the column at index 2 for that sample.
means = [
    np.mean(births.sample(1000).column(2))
    for _ in np.arange(10000)
]


In [None]:
Table().with_column('Mean', means).hist(bins=50)

In [None]:
from IPython.display import Image
Image("clt.png", width=600)

## Central Limit Theorem: Another Example

In [None]:
# Data source: http://inferentialthinking.com/notebooks/united_summer2015.csv
# Read the file united_summer2015.csv into a table and display it.
united = Table.read_table('united_summer2015.csv')
united

In [None]:
united.hist('Delay', bins=30)


In [None]:
# Mean and SD of every value in the 'Delay' column; both are kept as
# module-level names so later cells can compare sample means against them.
mean_delay = np.mean(united.column('Delay'))
sd_delay = np.std(united.column('Delay'))

# sep="" joins the pieces with no extra spaces, so the spacing of the output
# comes only from the string literals themselves.
print("mean delay:  ", mean_delay, "           standard deviation:  ", sd_delay, sep="")

In [None]:
# Keep only the 'Delay' column as its own table (the simulation cells sample
# rows from it), then show 3 rows of the result.
delay = united.select('Delay')
delay.show(3)

In [None]:
# Simulate the sample mean: 10,000 samples of 400 rows each from the delay
# table, keeping the mean of column 0 of every sample.
means = [
    np.mean(delay.sample(400).column(0))
    for _ in np.arange(10000)
]

Table().with_column('Sample mean', means).hist(bins=30, unit='minute')


What will happen if we change the sample size from 400 to 10?

 A: Same shape  
 B: Same shape but more jagged   
 C: Longer left tail  
 D: Longer right tail

In [None]:
# Simulate the sample mean: 10,000 samples of 10 rows each from the delay
# table, keeping the mean of column 0 of every sample.
means = [
    np.mean(delay.sample(10).column(0))
    for _ in np.arange(10000)
]

Table().with_column('Sample mean', means).hist(bins=30, unit='minute')


back to slides for Central Limit Theorem section

## Variability of the sample mean

In [None]:
# make a function to calculate sample means, based on a given sample size
# consistently use 10,000 repetitions

def sample_means(sample_size, repetitions=10000):
    """Simulate the mean of random samples drawn from the `delay` table.

    Draws `repetitions` samples of `sample_size` rows each from the
    module-level `delay` table and records the mean of column 0 of each sample.

    sample_size: number of rows drawn for each sample.
    repetitions: number of samples to simulate; defaults to 10,000, the count
        used for every simulation in this notebook.

    Returns a list containing one sample mean per repetition.
    """
    return [
        np.mean(delay.sample(sample_size).column(0))
        for _ in np.arange(repetitions)
    ]


In [None]:
# let's compare different sample sizes
# with_columns (plural) is the Table method that accepts several label/values
# pairs in one call; with_column is meant for a single column.
# One column of simulated sample means per sample size, overlaid in one histogram.
Table().with_columns(
    '400', sample_means(400),
    '900', sample_means(900),
    '2500', sample_means(2500),
).hist(bins=30, unit='minute')

What is the same about these curves, and what is different? 

Sample mean is unbiased.

More accurate estimator when sample size is larger.

In [None]:
# How narrow/wide do the bell curves get? Measured by SD.

In [None]:
def variability(sample_size):
    """Plot and summarize the simulated sample means for one sample size.

    Draws a histogram of sample_means(sample_size) on fixed axes (x from 0 to
    35, y from 0 to .35), then prints the sample size, the population mean and
    SD (the module-level mean_delay and sd_delay), and the average and SD of
    the simulated sample means.
    """
    simulated = sample_means(sample_size)

    Table().with_column('Sample mean', simulated).hist(bins=30, unit='minute')
    # The axis limits are the same on every call, whatever the sample size.
    plots.xlim(0, 35)
    plots.ylim(0, .35)

    # Labels are padded to a common width so the printed values line up.
    summary = (
        ('Sample size:                   ', sample_size),
        ('Population mean:               ', mean_delay),
        ('Average of sample means:       ', np.average(simulated)),
        ('Population SD:                 ', sd_delay),
        ('SD of sample means:            ', np.std(simulated)),
    )
    for label, value in summary:
        print(label, value)


In [None]:
variability(100)

In [None]:
variability(400)

In [None]:
variability(900)

What SD do you expect for sample size 3600?

A. Quarter of SD for sample size 900  
B. Half of SD for sample size 900  
C. Same as SD for sample size 900  
D. Double SD for sample size 900  
E. Four times SD for sample size 900  

In [None]:
variability(3600)

back to slides for recap and discussion questions

## Discussion Question 1

In [None]:
# With sample size 1, each "sample mean" is the mean of a single sampled row,
# i.e. just one delay value.
means = sample_means(1)
Table().with_column('Sample mean', means).hist(bins=30, unit='minute')

In [None]:
# what happens if I change sample size to 2?
# Each simulated value is now the mean of 2 sampled delays instead of 1.
means = sample_means(2)
Table().with_column('Sample mean', means).hist(bins=30, unit='minute')

In [None]:
# See how the histogram of sample means becomes more normal as the sample size
# increases: one histogram per sample size 1, 2, 4, ..., 512.

for size in np.power(2, np.arange(10)):
    means = sample_means(size)
    Table().with_column('Sample mean', means).hist(bins=30, unit='minute')

back to slides for more discussion questions