In [None]:
#: the usual imports
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

def normal_curve(x, mu=0, sigma=1):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Parameters
    ----------
    x : number or numpy array
        Point(s) at which to evaluate the curve.
    mu : number, default 0
        Center (mean) of the curve.
    sigma : number, default 1
        Standard deviation of the curve.

    Returns
    -------
    Same shape as ``x``: the density values.
    """
    # The 1/sigma factor is what keeps the area under the curve equal to 1
    # for every sigma; without it the peak is 1/sqrt(2*pi) no matter how
    # narrow or wide the curve is.
    return 1 / (sigma * np.sqrt(2*np.pi)) * np.exp(-(x - mu)**2/(2 * sigma**2))

def plot_sample_mean(sample_size):
    """Plot one random sample of flight delays against the CLT curve.

    Draws ``sample_size`` rows from the global ``flights`` table, histograms
    their 'Delay' values on the left axis, overlays the curve drawn by
    ``plot_clt_curve(sample_size)`` on a twin right axis, and marks the
    sample's mean with a triangle on the x-axis.
    """
    sample = flights.sample(sample_size).column('Delay')

    fig, ax1 = plt.subplots()
    ax1.set_xlim([-10, 80])
    # A string such as 'off' is truthy, so on current matplotlib it switches
    # the grid ON; pass a real boolean to turn it off.
    ax1.grid(False)
    ax1.hist(sample, density=True, bins=np.arange(0, 200, 5), label='Sample')
    ax2 = ax1.twinx()
    # plot_clt_curve draws with plt.plot, i.e. on the current axes (ax2).
    plot_clt_curve(sample_size)
    ax1.set_ylim([-.0, .055])
    ax2.set_ylim([-.01, .5])

    ax2.scatter(sample.mean(), 0, marker='^', color='C1', s=200, label='Sample Mean')
    # A legend built on ax2 alone would omit the histogram's 'Sample' label,
    # which lives on ax1; merge the entries from both axes.
    hist_handles, hist_labels = ax1.get_legend_handles_labels()
    twin_handles, twin_labels = ax2.get_legend_handles_labels()
    ax2.legend(hist_handles + twin_handles, hist_labels + twin_labels, loc='upper right')

# Global plot styling for every figure in the notebook.
plt.style.use('fivethirtyeight')

from notebook.services.config import ConfigManager

# Slide geometry stored under the "livereveal" config section
# (presumably read by the slideshow extension -- confirm).
livereveal_settings = {'width': 1200, 'height': 700, "scroll": True}

cm = ConfigManager()
cm.update("livereveal", livereveal_settings)

# Lecture 16

### Central Limit Theorem and Normal Confidence Intervals

## The Central Limit Theorem

> The distributions of sums (and averages) of large random samples (drawn with replacement) are roughly normal, regardless of the distribution of the population from which the samples were drawn.

## The shape of the distribution

- CLT: Sums and averages of random samples are roughly normal.
- Centered at population mean.
- What is the spread of the distribution?

## The spread of the distribution

Suppose we take a sample of flights from our flight delay dataset and compute the mean flight delay. The distribution of the sample mean will be bell-shaped and centered at the population mean, by the CLT.

If we increase the sample size (take larger samples), the distribution of the sample mean will have  
    A. higher SD  
    B. lower SD  
    C. the same SD

In [None]:
#: draw `sample_size` flights, compute mean, `repetitions` many times
def sample_mean_delays(sample_size, repetitions=2000):
    """Simulate the distribution of the mean delay of a random sample.

    Parameters
    ----------
    sample_size : int
        Number of rows drawn from the global ``flights`` table per sample.
    repetitions : int, default 2000
        How many independent samples to draw.

    Returns
    -------
    numpy array of length ``repetitions`` holding one sample mean per draw.
    """
    # Collect the means in a list and convert once; calling np.append inside
    # the loop copies the whole array on every iteration.
    means = [
        flights.sample(sample_size).column('Delay').mean()
        for _ in np.arange(repetitions)
    ]
    return np.array(means, dtype=float)

In [None]:
#: repeat the experiment for various sample sizes
# sample_mean_delays reads the global `flights` table, which the notebook
# otherwise only loads in a later cell; load it here so this cell runs on a
# fresh kernel (Restart & Run All).
flights = Table.read_table('./united_summer2015.csv')

sample_sizes = [1, 4, 16, 64, 100, 900, 1600]

# Map each sample size to its array of simulated sample means.
sample_means = {size: sample_mean_delays(size) for size in sample_sizes}

In [None]:
#: plot the distributions
population_delay_sd = np.std(flights.column('Delay'))
print("SD of The Population:", population_delay_sd)

# One shared set of unit-wide bins so the histograms are directly comparable.
bins = np.arange(5, 30)
for size in sample_sizes:
    means_for_size = sample_means[size]
    column_label = f'Sample Mean, size = {size}'
    Table().with_column(column_label, means_for_size).hist(bins=bins)
    print("Sample Size:", size)
    print("SD of This Distribution (Sample Mean):", np.std(means_for_size))

## CLT for the sample mean

- Sample mean of random samples is distributed normally.
- Centered at population mean.
- Standard deviation is:
$$
    \frac{
        \text{population standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$

In [None]:
#: clt's prediction
def clt_standard_deviation(n):
    """Return the CLT-predicted SD of the sample mean for sample size ``n``.

    Computed as the SD of the 'Delay' column of the global ``flights`` table
    divided by sqrt(n); ``n`` may be a number or a numpy array.
    """
    delays = flights.column('Delay')
    return delays.std() / np.sqrt(n)

In [None]:
#: plot prediction and observed
# `observed` is not defined anywhere else in the notebook: build it from the
# simulated sample means, one row per sample size.
observed = Table().with_columns(
    'Sample Size', np.array(sample_sizes),
    'SD of Sample Mean', np.array([np.std(sample_means[size]) for size in sample_sizes]),
)
observed.scatter('Sample Size', s=70, zorder=2)

# CLT prediction as a smooth curve drawn underneath the observed points.
x = np.linspace(1, 1600)
y = clt_standard_deviation(x)
plt.plot(x, y, zorder=1)

## Recap

- We want to find population mean.
- But we only have one sample.
- Hope that sample mean is close to population mean.
- But it could have been different.

## Recap

- We can run bootstrap, or (sometimes) draw more samples to approximate sampling distribution.
- But CLT says that it is normal.
- Centered at population mean.
- Standard deviation is:
$$
    \frac{
        \text{population standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$

## Three Distributions

Suppose we draw a sample of flights and find the mean flight delay.

Three distributions are involved:

- The population (we don't see this)
- The sample (we see this)
- Distribution of the sample mean (we don't see this directly)
    - a.k.a. sampling distribution of the mean
    - a.k.a. empirical distribution of the sample mean

## The population (we don't see this)

In [None]:
# Read the flight table from a CSV next to the notebook and histogram its
# 'Delay' column using 30 bins.
flights = Table.read_table('./united_summer2015.csv')
flights.hist('Delay', bins=30)

## The population

- We don't see:
    - the population distribution,
    - the population mean (fixed!),
    - the population standard deviation (fixed!).

## The sample (we see this)

In [None]:
# Histogram the 'Delay' column of a single 400-row sample of `flights`.
flights.sample(400).hist('Delay', bins=30)

## The sample

- We *do* see (or can calculate):
    - the sample itself (random!),
    - the sample's mean (random!),
    - the sample's standard deviation (random!).

## Distribution of the sample mean (we don't see this directly)

- We don't see it, but the **Central Limit Theorem** tells us what it should look like:
    - Normally distributed
    - Centered at population mean (fixed, unobserved).
    - Standard deviation (fixed, unobserved):
$$\frac{\text{population standard dev.}}{\sqrt{\text{sample size}}}$$

##  Distribution of the sample mean (we don't see this directly)

In [None]:
#:
delays = flights.column('Delay')
population_mean = delays.mean()
population_sd = delays.std()


def plot_clt_curve(sample_size):
    """Draw, in black on the current axes, the bell curve the CLT predicts.

    The curve is centered at ``population_mean`` with spread
    ``population_sd / sqrt(sample_size)`` and is evaluated at 1000 evenly
    spaced points between -10 and 100.
    """
    sd_of_sample_mean = population_sd / np.sqrt(sample_size)
    grid = np.linspace(-10, 100, 1000)
    heights = normal_curve(grid, population_mean, sd_of_sample_mean)
    plt.plot(grid, heights, color='black')


plot_clt_curve(400)

## The Central Limit Theorem

In [None]:
sample_size = 400

# Ten separate figures: each call draws a fresh sample of `sample_size`
# flights, so the histogram and the mean marker change from plot to plot.
for i in range(10):
    plot_sample_mean(sample_size)

# Normal Confidence Intervals

## Normal confidence intervals

- We used bootstrapping to construct confidence intervals.
- But we can often use the CLT.
    - Computationally cheaper!
    
![normal.PNG](attachment:normal.PNG)

## Run the bootstrap

In [None]:
#: run the bootstrap for sample mean of flight delays
n_boot = 5000
# Fixed seed so the same 400-row sample (and the same resamples) come out on
# every run.
np.random.seed(42)
sample = flights.sample(400)

# Each bootstrap replicate: resample the sample (same number of rows) and
# record the mean delay. Built as a list and converted once, because
# np.append inside a loop copies the whole array on every iteration.
boot_means = np.array([
    sample.sample().column('Delay').mean()
    for _ in np.arange(n_boot)
], dtype=float)

## Visualize the bootstrap

In [None]:
#: visualize
# Histogram of the bootstrapped sample means.
Table().with_column('Bootstrap Means', boot_means).hist()

## Construct 95% bootstrapped confidence interval

In [None]:
#: construct confidence interval
# Middle 95% of the bootstrap means: cut at the 2.5th and 97.5th percentiles.
left_boot = percentile(2.5, boot_means)
right_boot = percentile(97.5, boot_means)
# Bare last expression so the cell displays the interval endpoints.
[left_boot, right_boot]

In [None]:
# Same histogram, with the interval [left_boot, right_boot] drawn as a thick
# bar along y = 0. The trailing ';' suppresses the returned object's repr.
Table().with_column('Bootstrap Means', boot_means).hist()
plt.plot([left_boot, right_boot], [0, 0], color='lime', linewidth=10);

## What did the bootstrap give us?

- We use bootstrap to approximate distribution of sample mean.
- But CLT tells us it is approximately normal!
- We can use the CLT directly to construct CIs.

## Normal confidence intervals

- Draw a normal curve centered at the sample mean
- CLT tells us SD is
$$
    \frac{
        \text{population standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$
- Don't know population SD. Instead, use sample SD:
$$
    \frac{
        \text{sample standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$

## The normal curve

In [None]:
#: draw the normal curve
from scipy import stats
# Same seed as the bootstrap cell, so this is the same 400-row sample.
np.random.seed(42)
sample = flights.sample(400).column('Delay')

# SD of the sample mean per the CLT, using the sample SD in place of the
# unknown population SD. Derived from the sample's length instead of a
# hard-coded 20 (= sqrt(400)) so it stays right if the sample size changes.
delay_mean_sd = sample.std() / np.sqrt(len(sample))

# Curve spans +/- 3 SDs of the sample mean around the sample mean.
x = np.linspace(sample.mean() - 3*delay_mean_sd, sample.mean() + 3*delay_mean_sd, 1000)
y = stats.norm.pdf(x, sample.mean(), delay_mean_sd)
plt.plot(x, y, color='black', linestyle='--')
plt.scatter(sample.mean(), 0, marker='^', color='C1', s=80, label='Sample mean')
plt.legend(loc='upper right')

## Computing the confidence interval

- Recall: $\approx$ 95% of normal curve's area falls within $\pm$ 2 SDs of mean
- Don't confuse sample SD with SD of sample mean! (divide by $\sqrt{\text{sample size}}$)

In [None]:
#: compute confidence interval
# Two SDs of the sample mean on either side of the sample mean. The sample
# size comes from the sample itself rather than a hard-coded 400, so the
# interval stays correct if the sample is redrawn at another size.
margin = 2 * sample.std() / np.sqrt(len(sample))
left_normal = sample.mean() - margin
right_normal = sample.mean() + margin
[left_normal, right_normal]

## Visualize the confidence interval

In [None]:
#:
# Re-draw the dashed curve, lay the normal interval along y = 0, and put the
# sample-mean marker on top of the interval segment (higher zorder).
plt.plot(x, y, color='black', linestyle='--')
plt.plot([left_normal, right_normal], [0, 0], zorder=1)
plt.scatter(sample.mean(), 0, marker='^', color='C1', zorder=2, s=80)

## Comparison with bootstrap CI

In [None]:
#:
# Both intervals on one figure: the normal interval at y = 0 and the
# bootstrap interval just below it at y = -0.01 so the two do not overlap.
plt.plot(x, y, color='black', linestyle='--')
plt.plot([left_normal, right_normal], [0, 0], zorder=1, label='Normal')
plt.plot([left_boot, right_boot], [-.01, -.01], color='lime', label='Boot')
plt.scatter(sample.mean(), 0, marker='^', color='C1', zorder=2, s=80)
plt.legend()

## Recap

To compute approximate 95% confidence interval for population mean:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{sample size}}}
\right]
$$

## So why use the bootstrap?

- We can use CLT to make CIs for:
    - sample mean
    - proportions (special case of mean)
- But CLT doesn't apply to, for example, median.
- Have to use bootstrap there.

# Hypothesis testing

## Hypothesis testing for the mean

- We can use the CLT to run hypothesis tests, too.

## Example: Body temperature

- Everyone knows that the average body temperature is 98.6 F (or 37 C).
- We have a data set of body temperatures.

In [None]:
#: temperatures
# Load the body-temperature table and display the mean of its first column.
# NOTE(review): the column is selected by position here but by the label
# 'temperature' in later cells -- presumably the same column; confirm.
temperatures = Table.read_table('temp.csv')
temperatures.column(0).mean()

## Hypothesis test

- **Null hypothesis**: The population mean is 98.6 F.
- **Alternative hypothesis**: The population mean is less than 98.6 F.

## The null model

- Under null hypothesis, sample mean is distributed normally around 98.6, with SD: 
$$
    \frac{
        \text{population standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$
- Null hypothesis does not say that temperatures are distributed normally!
- We don't know population SD, use sample SD instead:
$$
    \frac{
        \text{sample standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$

In [None]:
# Estimated SD of the sample mean: SD of the 'temperature' column divided by
# the square root of the number of rows.
sample_mean_sd = temperatures.column('temperature').std() / np.sqrt(temperatures.num_rows)
sample_mean_sd

## The distribution of the sample mean

In [None]:
#: 
# Curve of the sample mean under the null: centered at 98.6 with spread
# `sample_mean_sd`, evaluated on 1000 points between 98 and 99.
x = np.linspace(98, 99, 1000)
y = normal_curve(x, 98.6, sample_mean_sd)
plt.plot(x, y, color='black')

# Mark the observed sample mean on the x-axis, drawn above the curve (zorder).
sample_mean = temperatures.column('temperature').mean()
plt.scatter(sample_mean, 0, color='C1', marker='o', s=180, zorder=3)

## The p-value

- How often would we see something so small under the null hypothesis?

In [None]:
#: standardize
# Distance of the observed mean from 98.6, measured in SDs of the sample mean.
z = (sample_mean - 98.6) / sample_mean_sd
z

In [None]:
# Normal CDF evaluated at z (default location 0, scale 1): the area to the
# left of z, used here as the p-value for the "less than" alternative.
stats.norm.cdf(z)

## Hypothesis testing

- We reject the null hypothesis, as the probability of seeing something this small is very low.
- What gives?
- 98.6 F is due to Carl Reinhold August Wunderlich, 19th century physician.
- His thermometer was miscalibrated.
- Later experiment: (36.88 C $\approx$ 98.38 F), but rounded up to (37 C $\approx$ 98.6 F)

## A confidence interval

- A 95% confidence interval for the mean body temperature is:

In [None]:
# Sample mean plus/minus two SDs of the sample mean.
[sample_mean - 2*sample_mean_sd, sample_mean + 2*sample_mean_sd]

- Careful! This doesn't mean that 95% of temperatures fall in this range!

In [None]:
# Histogram of individual temperatures on the left axis; curve for the sample
# mean (x, y) on a twin right axis with its own scale.
plt.hist(temperatures.column('temperature'));
ax2 = plt.gca().twinx()
ax2.plot(x, y, color='black')
# Scale the headroom to the curve's peak rather than a fixed limit, so the
# curve is never clipped whatever its height.
ax2.set_ylim([-.01, 1.25 * np.max(y)])
# A string such as 'off' is truthy and switches the grid ON in current
# matplotlib; pass a real boolean to hide it.
ax2.grid(False)

# Experiment Design

## Example: Polling

- You are conducting a campus poll:
    - Yes/No: Does Eleanor Roosevelt College exist?
- You want to estimate the proportion in population who believe "Yes".
- Need to be accurate to within $\pm$ 0.03, 95% of the time.
- i.e., your 95% confidence interval should be 0.06 wide.
- How big of a sample do you need?

## Proportions are just means

- When polling, we'll write 1 if a person answers "Yes", 0 otherwise.
- The sample proportion answering "Yes" = mean of 0's and 1's
- I.e., it is the mean of a random sample.
- Hence the CLT applies.

In [None]:
# Five responses coded 0/1; the mean (3 ones out of 5) is the proportion of ones.
a = make_array(0, 1, 1, 1, 0)
a.mean()

## Our strategy

1. We will poll a random sample of $n$ people
2. Compute sample mean (i.e., proportion answering "Yes")
3. Compute sample standard deviation
4. Construct 95% confidence interval:
$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{\text{n}}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{\text{n}}}
\right]
$$

## Our strategy

- We want a CI whose width is 0.06 or less.
- The width of our confidence interval depends on two things:
    - sample SD
    - sample size, $n$
- If we know SD, we can pick $n$ to make CI the right size.
- But before polling, we don't know the sample SD (we don't have a sample!)

## Bounding the sample SD

- How big could the sample SD be?
- This is a simple calculation, but...
- Let's try to get the answer from a plot.

## Create a sample of zeros and ones

In [None]:
#: construct an array of zeros/ones
def polling_sample(size, number_of_ones):
    """Return an integer array of length ``size``: ones first, then zeros.

    Positions 0 .. number_of_ones-1 hold 1; every later position holds 0.
    """
    positions = np.arange(size)
    is_one = positions < number_of_ones
    return is_one.astype(int)

In [None]:
# Example: 10 responses, of which the first 3 are ones.
polling_sample(10, 3)

## Compute standard deviations

In [None]:
#: compute SD for each proportion
size = 100
# NOTE: despite the name, these values are the NUMBER of ones out of `size`
# (0, 10, ..., 100); with size = 100 they read as percentages.
proportions = np.arange(0, size+1, 10)

# SD of a 0/1 array for each count of ones. Built in one pass instead of
# growing an array with np.append, which copies it on every iteration.
sds = np.array(
    [np.std(polling_sample(size, proportion)) for proportion in proportions],
    dtype=float,
)

In [None]:
# Scatter the SDs against the 'Proportion' column to see where the SD peaks.
Table().with_columns(
    'Proportion', proportions,
    'SD', sds
).scatter('Proportion')

## Bounding the sample SD

- The most the sample SD can be is 0.5.
- Doesn't depend on size of sample.
- True for the population, too!
- Using 0.5 in place of sample SD results in a conservative confidence interval.

## Constructing the 95% confidence interval

- The interval is:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

- The width is:

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}}
$$

- Use 0.5 instead of sample SD:

$$
4 \cdot \frac{0.5}{\sqrt{n}} = \frac{2}{\sqrt{n}}
$$

## Constructing the 95% confidence interval

- Upper bound on CI width: $$\text{width} = \frac{2}{\sqrt{n}}$$
- Want our sample mean to be within $\pm$ 0.03.
- I.e., want our CI width to be smaller than 0.06.
- Solve for $n$:

$$
\sqrt{n} = \frac{2}{\text{width}} = \frac{2}{0.06}
$$

In [None]:
# Solve sqrt(n) = 2 / 0.06 for n.
(2/.06)**2

## Answer

- We must poll $\approx$ 1111 people to get a 95% CI which is 0.06 wide.

## Discussion question

We must poll 1111 people to get a 95% CI which is 0.06 wide.
Suppose we instead want a CI that is 0.03 wide. How many people do we need to poll?

- A) 555
- B) 2222
- C) 4444
- D) 8888

## Answer: the $\sqrt{\quad}$ rule

- We need 4444 samples.
- To double our accuracy, we must *quadruple* our sample size.

## Experiment

- Let's see how accurate this is.
- Code below chooses some true proportion answering "Yes". Don't peek!

In [None]:
#: don't peek
# Pick the hidden true proportion at random from a grid running from 0.30 to
# about 0.70 in steps of 0.01. No seed is set in this cell, so the value
# changes from run to run.
population_proportion = np.random.choice(np.arange(.3, .7, .01))

## Conduct a poll

In [None]:
# One simulated poll of 1111 people over two categories with probabilities
# (population_proportion, 1 - population_proportion).
# NOTE(review): presumably returns the sampled proportion of each category,
# in that order -- confirm against the datascience docs.
poll_results = sample_proportions(1111, make_array(population_proportion, 1-population_proportion))
poll_results

## Repeat the poll

In [None]:
#: simulate 5000 polls
# The category probabilities are the same for every poll; build them once.
poll_probabilities = make_array(population_proportion, 1-population_proportion)

# First entry of each simulated poll of 1111 people, for 5000 polls. Collected
# in one pass instead of np.append in a loop, which re-copies the array on
# every iteration.
distribution = np.array([
    sample_proportions(1111, poll_probabilities).item(0)
    for _ in np.arange(5000)
], dtype=float)

## Visualize

In [None]:
#: draw histogram
# Histogram of the simulated poll proportions.
Table().with_column('Sampled Proportions', distribution).hist(bins=20)

# Mark the true proportion and the +/- 0.03 band around it along y = 0.
plt.scatter(population_proportion, 0, marker='^', zorder=3, s=100)
plt.plot([population_proportion - .03, population_proportion + .03], [0, 0], color='C1', linewidth=5)

In [None]:
# Fraction of simulated polls that landed within +/- 0.03 of the true
# proportion. Dividing by len(distribution) rather than a hard-coded 5000
# keeps this correct if the number of simulated polls changes.
within_margin = (population_proportion - .03 <= distribution) & (distribution <= population_proportion + .03)
np.count_nonzero(within_margin) / len(distribution)