In [None]:
import numpy as np
from datascience import *
%matplotlib inline

np.set_printoptions(threshold=50)
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

## Measuring Variability


In [None]:
# Four example values whose variability is measured step by step in the cells below.
values = make_array(2, 3, 3, 9)
values

# how far are these numbers from the mean? 

In [None]:
# Step 1: find the mean of the values.

average = np.mean(values)
average

In [None]:
# Step 2: the deviations from average.
# Subtracting the mean from every value shows how far each one is from the
# average; the sign says whether the value is below (-) or above (+) it.

deviations = values - average
deviations

In [None]:
# Put each value next to its deviation so they can be compared row by row.
t = Table().with_columns(
    'Value', values,
    'Deviation from Average', deviations,
)
t

Do the values tend to be far from the average? 

In [None]:
# Let's take the average of the deviations.
# Column index 1 of t is 'Deviation from Average'. The negative and positive
# deviations cancel, so this average is 0 and tells us nothing about spread.

np.average(t.column(1))

In [None]:
# Step 3: the squared deviations from average.
# Squaring makes every entry non-negative, so they can no longer cancel out.

t = t.with_column('Squared Deviations', deviations ** 2)
t

In [None]:
# Step 4: variance = the mean squared deviation from average.
# Because of the squaring, the units are now the square of the original units.

variance = np.mean(t.column('Squared Deviations'))
variance

In [None]:
# Step 5: Standard deviation: root mean squared deviation from average
# Steps of calculation:         5    4     3        2             1 
# (read the name right to left: average, deviation, squared, mean, root)
# Taking the square root brings the result back to the original units.
sd = np.sqrt(variance)
sd

In [None]:
# The bigger the standard deviation (or the bigger the variance), the more spread out the distribution.
# np.std does steps 1-5 in one call (by default it divides by n, i.e. the population SD),
# so this should match the sd computed by hand above.
np.std(values)

back to slides for recap and discussion questions

## Interpreting Standard Deviation

In [None]:
# Data source: http://inferentialthinking.com/notebooks/nba2013.csv
# The file is read by relative path, so nba2013.csv must sit in the working directory.
nba13 = Table.read_table('nba2013.csv')
nba13

In [None]:
# Histogram of the 'Height' column with bin edges at 68, 69, ..., 87.
nba13.select('Height').hist(bins=np.arange(68, 88, 1))

In [None]:
# Average of the 'Height' column.
mean_height = np.mean(nba13.column('Height'))
mean_height

In [None]:
# Median of the 'Height' column, for comparison with the mean computed above.
median_height = np.median(nba13.column('Height'))
median_height

In [None]:
# Mean and SD of the heights, displayed together as a pair.
# mean_height is recomputed here; it has the same value as in the earlier cell.
mean_height = np.mean(nba13.column('Height'))
sd_height = np.std(nba13.column('Height'))
(mean_height, sd_height)

In [None]:
# Tallest players: sort by 'Height', largest first, and show 3 rows.
nba13.sort('Height', descending=True).show(3)

In [None]:
# Shortest players: sort by 'Height' in the default order and show 3 rows.
nba13.sort('Height').show(3)

In [None]:
# For the tallest player: how many inches above the mean?
# NOTE(review): 87 is hard-coded, presumably read off the sorted table above — confirm it matches the data.
87 - mean_height

How many SDs above the mean? We calculated SD = 3.45.

In [None]:
# Look at the deviation relative to the SD: dividing the deviation by
# sd_height expresses it as a number of SDs above the mean.
(87 - mean_height)/sd_height


In [None]:
# Now for the shortest player: how many SDs below average?
# The result is negative because 69 is below the mean.
# NOTE(review): 69 is hard-coded, presumably read off the sorted table above — confirm it matches the data.

(69 - mean_height)/sd_height

# We start working with Standard Units
# They measure how many SDs a value is above (positive) or below (negative) the average.

In [None]:
#conversion to standard units

def standard_units(numbers_array):
    """Convert an array of numbers to standard units.

    Each value is re-expressed as the number of standard deviations it lies
    above (positive) or below (negative) the mean of the array. The SD is
    np.std with its default settings, i.e. the population SD.

    If every value is equal the SD is 0, so the division does not produce
    finite numbers.
    """
    center = np.mean(numbers_array)
    spread = np.std(numbers_array)
    return (numbers_array - center) / spread

We can convert heights to standard units, from the original units of inches. 

We saw that 87 inches was about 2.3 SU, and 69 inches was about -2.9 SU.

What values are possible for SU? Can they all be positive? Can there be a value like 9 SU? 

In [None]:
# Add the heights converted to standard units as a new column, then display
# the table sorted by its first column (index 0).
nba13 = nba13.with_column(
    'Height (Standard Units)', standard_units(nba13.column('Height'))
)
nba13.sort(0)

In [None]:
# Show the slide image su.png (read by relative path from the working directory).
from IPython.display import Image
Image("su.png", width=600)

## Standard Units and Chebyshev Bounds

In [None]:
# Data source: http://inferentialthinking.com/notebooks/united_summer2015.csv
# Load the table and add the 'Delay' column converted to standard units.
united = Table.read_table('united_summer2015.csv')
united = united.with_column(
    'Delay (Standard Units)', standard_units(united.column('Delay'))
)
united

In [None]:
# SD of the column at index 3 of united.
# NOTE(review): the column is selected by position; which label index 3 refers
# to depends on the CSV's column order — confirm it is the intended column.
np.std(united.column(3))

In [None]:
# Largest delays first, to see how extreme the biggest values are in standard units.
united.sort('Delay', descending=True)

In [None]:
# Proportion of rows whose delay, in standard units, is between -3 and 3,
# i.e. within 3 SDs of the mean delay.
within_3_sd = united.where('Delay (Standard Units)', are.between(-3, 3))
within_3_sd.num_rows/united.num_rows

In [None]:
# Histogram of delays in standard units: bin edges every 0.5 from -5 to 15,
# with x-axis ticks every 3 units from -6 to 15.
united.hist('Delay (Standard Units)', bins=np.arange(-5, 15.5, 0.5))
plots.xticks(np.arange(-6, 17, 3));

back to slides for Chebyshev Bound slides

## Chebyshev Bounds: Another Example

In [None]:
# Data source: http://inferentialthinking.com/notebooks/baby.csv
# Load the table and draw a histogram of the column at index 2.
births = Table.read_table('baby.csv')
births.hist(2)

In [None]:
# Mean and SD of the mothers' ages.
# Both statistics are taken from the same column, selected once by label,
# so they cannot drift apart if the table's column order ever changes.
# NOTE: this rebinds `sd`, which earlier held the SD of the toy `values` array.
maternal_ages = births.column('Maternal Age')
avg = np.mean(maternal_ages)
sd = np.std(maternal_ages)
avg, sd

In [None]:
# Proportion of rows where the age (column index 2) lies in the range
# average +/- 2 SDs, using the avg and sd computed in the previous cell.
(births
 .where(2, are.between(avg - 2*sd, avg + 2*sd))
 .num_rows
 / births.num_rows)

## Normal Distribution

In [None]:
# Show the slide image sdhist.png (read by relative path from the working directory).
from IPython.display import Image
Image("sdhist.png", width=600)

In [None]:
# All columns drawn at once with hist()'s default arguments: not helpful.
births.hist()

In [None]:
# All of them on separate histograms (overlay=False), so each column can be read on its own.
births.hist(overlay=False)

In [None]:
# Maternal height looks roughly normal (heights and weights often do).
# Mean and SD are rounded to one decimal place for use as axis tick positions.

maternal_heights = births.column('Maternal Height')
avg_h = np.round(np.mean(maternal_heights), 1)
sd_h = np.round(np.std(maternal_heights), 1)
avg_h, sd_h

In [None]:
# Bell-shaped curve: histogram of the column at index 3 with one-unit-wide
# bins, and x-axis ticks placed at avg_h + k * sd_h for k = -3, -2, ..., 3.
births.hist(3, bins=np.arange(55.5, 72.5, 1), unit='inch')
positions = np.arange(-3, 3.1, 1)* sd_h + avg_h
plots.xticks(positions);

# where is the point of inflection?

back to slides for Standard Normal Curve

## The standard normal curve

In [None]:
# NOTE(review): `stats` is imported here but not referenced in the visible cells.
from scipy import stats

# plot_normal_cdf is not defined in this notebook; presumably it is provided
# by the `from datascience import *` in the first cell — verify.
plot_normal_cdf()

In [None]:
# Draw values from the standard normal distribution (np.random.normal's
# defaults: mean 0, SD 1) and plot their histogram.
# The draw is vectorized so that raising num_samples stays fast.
num_samples = 100       # increase the size by adding more zeros
samples = np.random.normal(size=num_samples)
sample_table = Table().with_column('Sample', samples)
sample_table.hist(bins=np.arange(-4.5, 4.6, .1))

In [None]:
# Show the slide image q18.png (read by relative path from the working directory).
from IPython.display import Image
Image("q18.png", width=600)

## How big are most of the values?
## Average plus or minus a few SDs

In [None]:
# How many samples are within 3 SDs?
# The samples come from np.random.normal() with its default SD of 1, so
# "within 3 SDs of the mean" means neither below -3 nor above 3.

sample_table.where('Sample', are.not_below(-3)).where('Sample', are.not_above(3)).num_rows

In [None]:
# As a proportion of all samples.
# Divide by the table's actual row count, not a hard-coded 1000000, so the
# proportion stays correct whatever sample size the sampling cell uses.
(sample_table
 .where('Sample', are.not_below(-3))
 .where('Sample', are.not_above(3))
 .num_rows
 / sample_table.num_rows)

In [None]:
# Chebyshev's bound: for any distribution, at least 1 - 1/3**2 = 8/9 (about 88.89%) of the values lie within 3 SDs of the mean.

In [None]:
# Proportion within 2 SDs.
# Divide by the table's actual row count, not a hard-coded 1000000, so the
# proportion stays correct whatever sample size the sampling cell uses.
(sample_table
 .where('Sample', are.not_below(-2))
 .where('Sample', are.not_above(2))
 .num_rows
 / sample_table.num_rows)

In [None]:
# Proportion within 1 SD.
# Divide by the table's actual row count, not a hard-coded 1000000, so the
# proportion stays correct whatever sample size the sampling cell uses.
(sample_table
 .where('Sample', are.not_below(-1))
 .where('Sample', are.not_above(1))
 .num_rows
 / sample_table.num_rows)

back to slides for Normal Proportions section

## Aren't Normal Distributions Rare?

In [None]:
# Maternal age (column index 2): not bell-shaped; it has a long tail.
births.hist(2)

In [None]:
# Mean of column index 2 over all rows of the table.
np.mean(births.column(2))


In [None]:
# Mean of the same column in one random sample of the rows (sample() with its
# default arguments); re-running the cell gives a different value each time.
np.mean(births.sample().column(2))


In [None]:
# Repeat the experiment 10000 times: draw a random sample of 1000 rows and
# record the mean of column index 2 for each sample.
means = [
    np.mean(births.sample(1000).column(2))
    for _ in np.arange(10000)
]


In [None]:
# Histogram (50 bins) of the sample means collected in the previous cell.
Table().with_column('Mean', means).hist(bins=50)

In [None]:
# Show the slide image clt.png (read by relative path from the working directory).
from IPython.display import Image
Image("clt.png", width=600)

## Central Limit Theorem

In [None]:
# Redisplay the flight-delay table loaded earlier in the notebook.
united

In [None]:
# Mean and SD of the delays over all rows of the table.
delay_values = united.column("Delay")
mean_delay = np.mean(delay_values)
sd_delay = np.std(delay_values)
(mean_delay, sd_delay)

In [None]:
# One-column table holding only the delays, and its histogram with bin edges
# every 10 units from -30 to 590.
delay = united.select("Delay")
delay.hist(bins=np.arange(-30, 600, 10))

In [None]:
# Empirical distribution of the sample mean: draw `rep` random samples of
# `sample_size` delays each and record the mean of every sample.
sample_size = 400
rep = 10000

# Collect the means in a Python list and convert to an array once at the end;
# calling np.append inside the loop would copy the whole array on every pass.
sample_means = []
for i in np.arange(rep):
    sample = delay.sample(sample_size)
    new_mean = np.mean(sample.column("Delay"))
    sample_means.append(new_mean)
means = np.array(sample_means, dtype=float)

results = Table().with_column("Sample means", means)


In [None]:
# Histogram of the sample means, with bin edges every 0.5 from 10 to 24.5.
results.hist(bins = np.arange(10, 25, 0.5))