In [None]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')

## Standard Units

In [None]:
# Load the exam scores from CSV and preview the first five rows.
exams = Table.read_table('data/exams_fa18.csv')
exams.show(5)

In [None]:
# Histogram the table's columns with bin edges every 5 points from 0 to 100;
# overlay=False asks for separate plots rather than a single overlaid one.
exams.hist(overlay=False, bins=np.arange(0,101,5))

In [None]:
def standard_units(x):
    """Express each value of the array as its distance from the mean, in SDs."""
    center = np.average(x)
    spread = np.std(x)
    deviations = x - center
    return deviations / spread

In [None]:
# Convert both exam columns to standard units, then attach them in one step
midterm_su = standard_units(exams.column('Midterm'))
final_su = standard_units(exams.column('Final'))

exams = exams.with_columns(
    'Midterm in Standard Units', midterm_su,
    'Final in Standard Units', final_su,
)
exams.show(10)

In [None]:
# Histograms of both exams on the standard-unit scale (bin width 0.1, from -4 up to 2)
exams.select(
    'Midterm in Standard Units',
    'Final in Standard Units',
).hist(
    overlay=False,
    bins=np.arange(-4, 2, 0.1),
)

## The SD and Bell Shaped Curves

In [None]:
# Load the births data; its 'Maternal Height' column is used in the cells below.
births = Table.read_table('data/baby.csv')

In [None]:
# Bin edges at 56.5, 57.5, ..., 72.5: each bin is one unit wide and centered on a whole number.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [None]:
# Mean and SD of the maternal heights.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [None]:
# Endpoints of the interval from 1 SD below the mean to 1 SD above the mean
np.mean(heights) - np.std(heights), np.mean(heights) + np.std(heights)

--- 
back to slides

---

## The Standard Normal Curve

The standard normal curve and its first and second derivatives:
\begin{align}
\phi(z) = & \frac{1}{\sqrt{2\pi}} e^{-\frac{1}{2}z^2}\\
\frac{\delta{\phi}}{\delta{z}} = & - \frac{1}{\sqrt{2\pi}} e^{-\frac{1}{2}z^2} \, z\\
\frac{\delta^2{\phi}}{\delta{z}^2} = & \frac{1}{\sqrt{2\pi}} e^{-\frac{1}{2}z^2} \, (z^2-1)
\end{align}

In [None]:
# Create the functions and evaluate them 
def snc(x):
    """Standard normal density: exp(-x**2 / 2) / sqrt(2 * pi).

    x may be a number or a NumPy array; arrays are evaluated elementwise.
    """
    return np.exp(-0.5 * x**2) / np.sqrt(2 * np.pi)

def snc_de1(x):
    """First derivative of the standard normal density: -x * snc(x)."""
    return -x * snc(x)

def snc_de2(x):
    """Second derivative of the standard normal density: (x**2 - 1) * snc(x).

    The factor (x**2 - 1) makes it zero at x = -1 and x = 1, the inflection
    points of the curve.
    """
    return (x**2 - 1) * snc(x)

# Evaluate the curve and both derivatives on a grid from -4 up to (not including) 4
x = np.arange(-4, 4, 0.1)
y, y_p1, y_p2 = snc(x), snc_de1(x), snc_de2(x)

In [None]:
# Add the maternal heights, converted to standard units, as column 'SU'.
births = births.with_column('SU', standard_units(heights))

In [None]:
# Overlay the standard normal curve and its first two derivatives on the
# density histogram of the maternal heights in standard units.
fig, ax = plots.subplots(figsize=(16, 9), dpi=80)
ax.hist(standard_units(heights), bins=np.arange(-3, 3, 0.4), density=True)

# Raw strings keep the LaTeX backslashes literal ('\p' and '\d' are invalid
# escape sequences in ordinary string literals). Each label is attached to
# its own artist so the legend does not depend on the order of plotting calls.
ax.plot(x, y, label=r'$\phi(z)$')
ax.plot(x, y_p1, label=r'$\delta{\phi}/\delta{z}$')
ax.plot(x, y_p2, label=r'$\delta^2{\phi}/\delta{z}^2$')

# Mark z = -1 and z = 1, where the second derivative is zero.
ax.vlines([-1, 1], 0, 0.4, color='yellow', zorder=2, label='Inflection Points')

# The histogram has no label, so it stays out of the legend as before.
ax.legend()

fig.savefig('out.png')

---
back to slides

---

## Central Limit Theorem 

In [None]:
# Load the flight data; its 'Delay' column is the population studied in this section.
united = Table.read_table('data/united.csv')
united

In [None]:
# Bin edges every 10 units from -20 up to 290 (np.arange excludes the stop value 300).
united_bins = np.arange(-20, 300, 10)
united.hist('Delay', bins=united_bins)

In [None]:
# Summary statistics for the population of flight delays
delays = united.column('Delay')
delay_mean, delay_sd = np.mean(delays), np.std(delays)
delay_median = np.median(delays)
delay_mean, delay_sd, delay_median

In [None]:
# 50th percentile of the delays, to compare with delay_median from np.median.
percentile(50, delays)

In [None]:
# Number of entries in the delays array, i.e. the size of the population.
delays.size

In [None]:
def one_sample_mean(sample_size):
    """
    Draw `sample_size` rows from the `united` table and
    return the average of their 'Delay' values.
    """
    delays_in_sample = united.sample(sample_size).column('Delay')
    return np.mean(delays_in_sample)

In [None]:
# Mean delay of a single sample of 100 flights.
one_sample_mean(100)

In [None]:
def ten_thousand_sample_means(sample_size, repetitions=10_000):
    """Simulate many sample means for samples of a given size.

    Calls one_sample_mean(sample_size) once per repetition and collects
    the results.

    sample_size: number of flights drawn for each sample mean
    repetitions: how many sample means to simulate (default 10,000)

    Returns an array holding one sample mean per repetition.
    """
    # Build the array in one step: np.append copies the whole array on
    # every call, which makes an append-in-a-loop quadratic in repetitions.
    return np.array([one_sample_mean(sample_size) for _ in range(repetitions)])

In [None]:
# Simulate the sample means for samples of 100 flights
sample_means_100 = ten_thousand_sample_means(100)

# Empirical distribution of those sample means
(Table()
    .with_column('Mean of 100 flight delays', sample_means_100)
    .hist(bins=20)
)

# Population mean next to the average of the simulated sample means
delay_mean, np.mean(sample_means_100)

## Now let's take 400

In [None]:
# Simulate the sample means for samples of 400 flights
sample_means_400 = ten_thousand_sample_means(400)

# Empirical distribution of those sample means
(Table()
    .with_column('Mean of 400 flight delays', sample_means_400)
    .hist(bins=20)
)

# Population mean next to the average of the simulated sample means
delay_mean, np.mean(sample_means_400)

Question: 
- What changed? How is this histogram different from the previous?
- Hint: check the x-axis AND the y-axis

In [None]:
# Average of the simulated sample means for samples of size 400.
np.mean(sample_means_400)

## Now let's do 900
What do we expect?

In [None]:
# Simulate the sample means for samples of 900 flights
sample_means_900 = ten_thousand_sample_means(900)

# Empirical distribution of those sample means
(Table()
    .with_column('Mean of 900 flight delays', sample_means_900)
    .hist(bins=20)
)

# Population mean next to the average of the simulated sample means
delay_mean, np.mean(sample_means_900)

## ...  and 1600

In [None]:
# Simulate the sample means for samples of 1600 flights
sample_means_1600 = ten_thousand_sample_means(1600)

# Empirical distribution of those sample means
(Table()
    .with_column('Mean of 1600 flight delays', sample_means_1600)
    .hist(bins=20)
)

# Population mean next to the average of the simulated sample means
delay_mean, np.mean(sample_means_1600)

## Plot them all together

In [None]:
# Collect the four arrays of simulated sample means in one table,
# one column per sample size (the column label is the sample size).
means_tbl = Table().with_columns(
    '100', sample_means_100,
    '400', sample_means_400,
    '900', sample_means_900,
    '1600', sample_means_1600,
)
means_tbl

In [None]:
# Histogram all four columns of sample means over one shared set of bins
means_tbl.hist(
    bins=np.arange(5, 31, 0.3),
    alpha=0.4,
    histtype='step',
    linewidth=2,
    fill=True,
)

In [None]:
# Center (mean) and spread (SD) of each simulated distribution, ordered by sample size
means = make_array(*map(np.mean, (sample_means_100, sample_means_400,
                                  sample_means_900, sample_means_1600)))
stds = make_array(*map(np.std, (sample_means_100, sample_means_400,
                                sample_means_900, sample_means_1600)))

summary = Table().with_columns(
    'sample size', make_array(100, 400, 900, 1600),
    'means', means,
    'std', stds,
)
summary

In [None]:
# SD of the simulated sample means plotted against the sample size.
summary.plot('sample size', 'std', marker='x')

In [None]:
# Sample sizes expressed as multiples of the smallest one (100).
sample_size_scales = make_array(1, 4, 9, 16)
# Put 1/sqrt(multiple) next to the observed SD ratio (each SD divided by the
# SD for sample size 100) so the two columns can be compared row by row.
summary.with_columns('sample size scale', sample_size_scales,
                     'sqrt scale factor', 1/np.sqrt(sample_size_scales),
                     'std scale factor', stds / stds[0])