In [None]:
#: the usual imports
import babypandas as bpd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import warnings; warnings.simplefilter('ignore')

def normal_curve(x, mu=0, sigma=1):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Parameters
    ----------
    x : number or numpy array
        Point(s) at which to evaluate the density.
    mu : number, default 0
        Mean (centre) of the curve.
    sigma : number, default 1
        Standard deviation (spread) of the curve; must be positive.

    Returns
    -------
    Same shape as ``x``: the density of Normal(mu, sigma) at each point.
    """
    # The 1/sigma factor keeps the total area under the curve equal to 1 for
    # every sigma; without it the curve is only a density when sigma == 1.
    return 1 / (sigma * np.sqrt(2 * np.pi)) * np.exp(-(x - mu)**2 / (2 * sigma**2))

def plot_sample_mean(sample_size):
    """Plot a random sample of flight delays, its mean, and a CLT curve.

    Draws ``sample_size`` rows from the notebook-level ``flights`` table
    (assigned in a later cell, so that cell must run before this function is
    called), plots a density histogram of their 'Delay' values, overlays
    whatever ``plot_clt_curve(sample_size)`` draws, and marks the sample mean
    with a triangle.

    NOTE(review): ``plot_clt_curve`` is not defined in the visible part of
    this notebook; unless it is defined elsewhere, calling this function
    raises NameError.
    """
    sample = flights.sample(sample_size).get('Delay')

    fig, ax1 = plt.subplots()
    plt.xlim([-10, 80])
    # NOTE(review): recent matplotlib versions may treat the string 'off' as
    # truthy and turn the grid ON; plt.grid(False) is the unambiguous form — confirm.
    plt.grid('off')
    # Histogram bin edges are 0, 5, ..., 195.
    ax1.hist(sample, density=True, bins=np.arange(0, 200, 5), label='Sample')
    # Second y-axis sharing ax1's x-axis, so the curve can use its own vertical
    # scale. Presumably ax2 becomes pyplot's current axes here, so the plt.*
    # calls below draw on ax2 — confirm.
    ax2 = ax1.twinx()
    plot_clt_curve(sample_size)
    ax1.set_ylim([-.0, .055])
    ax2.set_ylim([-.01, .5])

    # Triangle on the horizontal axis marks the sample mean.
    plt.scatter(sample.mean(), 0, marker='^', color='C1', s=200, label='Sample Mean')
    plt.legend(loc='upper right')

# Use matplotlib's 'fivethirtyeight' style sheet for the figures drawn below.
plt.style.use('fivethirtyeight')

# Lecture 24

## Normal Confidence Intervals and Experiment Design

# Normal Confidence Intervals

## Normal confidence intervals

- We used bootstrapping to construct confidence intervals.
- But we can often use the CLT.
    - Computationally cheaper!

## Run the bootstrap

In [None]:
# the population
# Keep only the 'Delay' column; later cells treat `flights` as a table
# (they call .sample(...) and then .get('Delay') on it).
flights = bpd.read_csv('./data/united_summer2015.csv').get(['Delay'])
# Density histogram of delays with bin edges 0, 10, ..., 190.
flights.plot(kind='hist', y='Delay', bins=np.arange(0, 200, 10), density=True)

In [None]:
#: run the bootstrap for sample mean of flight delays
n_boot = 5000
# Fix the random seed so the sample (and the resamples below) are reproducible.
np.random.seed(4)
# The one sample we pretend is all we have: 400 rows drawn from the population.
sample = flights.sample(400)
boot_means = np.array([])

for i in np.arange(n_boot):
    # Resample the sample itself: same size, WITH replacement.
    resample = sample.sample(sample.shape[0], replace=True)
    boot_mean = resample.get('Delay').mean()
    # Collect one bootstrapped mean per iteration.
    boot_means = np.append(boot_means, boot_mean)

## Visualize the bootstrap

In [None]:
#: visualize
# Wrap the array of bootstrapped means in a one-column table so it can be
# drawn with the table's histogram plotting method.
bpd.DataFrame().assign(BootMeans=boot_means).plot(kind='hist', y='BootMeans', bins=30, density=True)

## Construct 95% confidence interval

In [None]:
#: construct confidence interval
# Middle 95% of the bootstrapped means: cut 2.5% off each tail.
left_boot, right_boot = np.percentile(boot_means, [2.5, 97.5])
[left_boot, right_boot]

In [None]:
# Histogram of the bootstrapped means ...
bpd.DataFrame().assign(BootMeans=boot_means).plot(kind='hist', y='BootMeans', bins=30, density=True)
# ... with the 95% bootstrap confidence interval drawn as a thick bar on the x-axis.
plt.plot([left_boot, right_boot], [0, 0], color='lime', linewidth=10);

## What did the bootstrap give us?

- We use bootstrapping to approximate the distribution of the sample mean.
- But CLT tells us it is approximately normal!
- We can use the CLT directly to construct CIs without bootstrapping.

## Normal confidence intervals

- CLT tells us distribution of the sample mean is a bell curve, centered at the population mean.
- Don't know the population mean. Instead, center a bell curve at the sample mean.
- CLT tells us SD of bell curve is
$$
    \frac{
        \text{population standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$
- Don't know population SD. Instead, use sample SD:
$$
    \frac{
        \text{sample standard dev.}
    }{
        \sqrt{\text{sample size}}
    }
$$

## The normal curve

In [None]:
#: draw the normal curve
from scipy import stats
# Same seed and sample size as the bootstrap cell, so this should reproduce
# the same 400 flights (here kept as just the 'Delay' column).
np.random.seed(4)
sample_size = 400
sample = flights.sample(sample_size).get('Delay')
# CLT: the sample mean is roughly normal with SD = sample SD / sqrt(sample size).
# Deriving the divisor from sample_size (rather than hard-coding sqrt(400) = 20)
# keeps the curve correct if the sample size is changed.
mean_sd = np.std(sample) / np.sqrt(sample_size)
# Evaluate the curve over +/- 3 SDs of the sample mean, centred at the sample mean.
x = np.linspace(sample.mean() - 3*mean_sd, sample.mean() + 3*mean_sd, 1000)
y = stats.norm.pdf(x, sample.mean(), mean_sd)
plt.plot(x, y, color='black', linestyle='--')
plt.scatter(sample.mean(), 0, marker='^', color='C1', s=80, label='Sample mean')
plt.legend(loc='upper right')
# Bootstrapped means underneath, for comparison with the CLT curve.
plt.hist(boot_means, density=True, bins=30);

## Computing the confidence interval

- Recall: $\approx$ 95% of normal curve's area falls within $\pm$ 2 SDs of mean
- Don't confuse sample SD with SD of sample mean! (divide by $\sqrt{n}$)

In [None]:
#: compute confidence interval
# Step 2 SDs of the sample mean (sample SD / sqrt(n)) either side of the sample mean.
# The 400 here is the sample size used when `sample` was drawn.
left_normal = sample.mean() - 2*np.std(sample) / np.sqrt(400)
right_normal = sample.mean() + 2*np.std(sample) / np.sqrt(400)
[left_normal, right_normal]

## Visualize the confidence interval

In [None]:
#:
# CLT curve (x, y come from the "draw the normal curve" cell) ...
plt.plot(x, y, color='black', linestyle='--')
# ... the normal confidence interval as a segment on the x-axis ...
plt.plot([left_normal, right_normal], [0, 0], zorder=1)
# ... and the sample mean drawn on top of it (higher zorder).
plt.scatter(sample.mean(), 0, marker='^', color='C1', zorder=2, s=80)

## Comparison with bootstrap CI

In [None]:
#:
plt.plot(x, y, color='black', linestyle='--')
# Normal (CLT) interval on the x-axis.
plt.plot([left_normal, right_normal], [0, 0], zorder=1, label='Normal')
# Bootstrap interval drawn slightly lower (y = -0.01) so the two segments don't overlap.
plt.plot([left_boot, right_boot], [-.01, -.01], color='lime', label='Boot')
plt.scatter(sample.mean(), 0, marker='^', color='C1', zorder=2, s=80)
plt.legend()

## Recap

To compute normal 95% confidence interval for population mean:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

## So why use the bootstrap?

- We can use CLT to make CIs for:
    - sample mean
    - proportion
- But CLT doesn't apply to, for example, median.
- Have to use bootstrap there.

## Discussion question

Can we use the Central Limit Theorem to construct a $P$% confidence interval where $P \neq 95$?

- A) Yes, but we should no longer center our CI at the sample mean. 
- B) Yes, but we should no longer step 2 SDs in either direction.
- C) Yes, but the square root in the denominator may not be valid anymore.
- D) No, the CLT is only for 95% CIs.

# Hypothesis testing

## Hypothesis testing for the mean

- We can use the CLT to run hypothesis tests, too.

## Example: Body temperature

- Everyone knows that the average body temperature is 98.6 F (or 37 C).
- We have a data set of body temperatures.

In [None]:
#: temperatures
temperatures = bpd.read_csv('data/temp.csv')
# Sample mean of the 'temperature' column, to compare against 98.6 F.
temperatures.get('temperature').mean()

## Hypothesis test

- **Null hypothesis**: The population mean is 98.6 F.
- **Alternative hypothesis**: The population mean is less than 98.6 F.

## Recall: Confidence Intervals for Hypothesis Testing

* Null hypothesis: Population parameter = x
* Alternative hypothesis: Population parameter ≠ x
* Cutoff for P-value: p%
* Method:
    - Construct a (100-p)% confidence interval for the population parameter
    - If x is not in the interval, reject the null
    - If x is in the interval, can’t reject the null

## CI for Mean Body Temperature

 - To compute normal 95% confidence interval for population mean:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

- This does **not** say that temperatures are distributed normally!

- We don't know population SD, use sample SD instead.

In [None]:
# SD of the sample mean: sample SD divided by the square root of the number of rows.
sample_mean_sd = np.std(temperatures.get('temperature')) / np.sqrt(temperatures.shape[0])
sample_mean_sd

In [None]:
#::
# Centre of the interval: the sample mean temperature. It must be assigned
# here because no other cell defines `sample_mean`.
sample_mean = temperatures.get('temperature').mean()
# Normal 95% confidence interval: 2 SDs of the sample mean either side.
[sample_mean - 2*sample_mean_sd, sample_mean + 2*sample_mean_sd]

Careful! This doesn't mean that 95% of temperatures fall in this range!

In [None]:
# Distribution of individual temperatures (wide) ...
temps = temperatures.get('temperature')
plt.hist(temps, density=True);
# ... versus the CLT curve for the *sample mean* of temperatures (narrow).
# The curve is rebuilt from the temperature data here; the notebook-level
# x and y hold the flight-delay curve from an earlier cell and do not apply.
temp_mean = temps.mean()
temp_mean_sd = np.std(temps) / np.sqrt(temperatures.shape[0])
temp_x = np.linspace(temp_mean - 3*temp_mean_sd, temp_mean + 3*temp_mean_sd, 1000)
temp_y = stats.norm.pdf(temp_x, temp_mean, temp_mean_sd)
plt.plot(temp_x, temp_y, color='black')

## Hypothesis testing

- We reject the null hypothesis at the 0.05 significance level, because 98.6 F is not in the 95% confidence interval for mean body temperature.
- What gives?
    - 98.6 F is due to Carl Reinhold August Wunderlich, 19th century physician, when thermometers were new.
    - [Later experiment:](https://med.stanford.edu/content/dam/sm/epidemiology/documents/HRP236/Parsonnet---Critical-Appraisal-of-98.6F.pdf) (36.8 C $\approx$ 98.2 F), but rounded up to (37 C $\approx$ 98.6 F)
    - [Interesting article here.](https://www.latimes.com/archives/la-xpm-2007-nov-26-he-esoterica26-story.html)

# Experiment Design

## Example: Polling

- You are conducting a campus poll:
    - Yes/No: Will you take a summer class?
- You want to estimate the proportion in population who say "Yes".
- Need to be accurate to within $\pm$ 0.03, 95% of the time.
- i.e., your 95% confidence interval should be 0.06 wide.
- How big of a sample do you need?

## Proportions are just means

- When polling, we'll write 1 if a person answers "Yes", 0 otherwise.
- The sample proportion who say "Yes" = mean of 0's and 1's
- Hence the CLT applies.

## Our strategy

1. We will poll a random sample of $n$ people
2. Compute sample mean (i.e., proportion of "Yes")
3. Compute sample standard deviation
4. Construct 95% confidence interval:
$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

Note that the width of our 95% CI is

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}}
$$

## Our strategy

- We want a CI whose width is 0.06 or less.
- The width of our confidence interval depends on two things:
    - sample SD
    - $\sqrt{n}$
- If we know SD, we can pick $n$ to make CI the right size by solving
$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}} \leq 0.06
$$

- **Problem**: Before polling, we don't know the sample SD because we don't have a sample!
- **Solution**: Find an upper bound for the sample SD.

## Bounding the sample SD

- How big could the sample SD be?
- This can be calculated with algebra, but...
- Let's try to get the answer from a plot.

## Create a population of 0's and 1's

In [None]:
#: construct an array of zeros/ones
def polling_population(size, number_of_ones):
    """Return an array of ``size`` 0's and 1's that starts with ``number_of_ones`` 1's.

    Positions 0 .. number_of_ones-1 hold 1 and every later position holds 0.
    If ``number_of_ones`` is at least ``size``, every entry is 1.
    """
    # True for the leading positions that should be 1, False for the rest.
    is_one = np.arange(size) < number_of_ones
    return is_one.astype(int)

In [None]:
# Example: 10 people, 3 of whom answered "Yes" (1).
polling_population(10, 3)

## Compute standard deviations

In [None]:
#: compute SD for each proportion
size = 100
proportions = np.arange(size+1)
sds = np.array([])

for proportion in proportions:
    population = polling_population(size, proportion)
    sd = np.std(population)
    sds = np.append(sds, sd)

In [None]:
# Scatter plot of SD against the number of 1's out of 100 (0 to 100 on the x-axis).
bpd.DataFrame().assign(
    Proportion=proportions,
    SD=sds
).plot(kind='scatter', x='Proportion', y='SD')

## Bounding the sample SD

- For any data set of 0's and 1's (sample or population), the most the SD can be is 0.5.
- Using 0.5 in place of the sample SD results in a conservative confidence interval.

## Constructing the 95% confidence interval

- The interval is:

$$
\left[
\text{sample mean} - 2\cdot \frac{\text{sample SD}}{\sqrt{n}},
\text{sample mean} + 2\cdot \frac{\text{sample SD}}{\sqrt{n}}
\right]
$$

- The width is:

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}}
$$

- Using 0.5 instead of the sample SD gives an upper bound:

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}} \leq 4 \cdot \frac{0.5}{\sqrt{n}} = \frac{2}{\sqrt{n}}
$$

## Constructing the 95% confidence interval

- Want our sample mean to be within $\pm$ 0.03 of the population proportion.
- I.e., want our CI to have width at most .06.

$$
4 \cdot \frac{\text{sample SD}}{\sqrt{n}} \leq 4 \cdot \frac{0.5}{\sqrt{n}} = \frac{2}{\sqrt{n}} \leq 0.06
$$

- Solve for $n$:

$$
\sqrt{n} = \frac{2}{0.06}
$$
$$
n = \left(\frac{2}{0.06}\right)^2
$$

In [None]:
# Value of n at which 2/sqrt(n) equals 0.06. The result is not a whole number,
# so round up when choosing an actual sample size.
(2/.06)**2

## Answer

- We must poll $\approx$ 1111 people to get a 95% CI that is at most 0.06 wide.

## Discussion question

We must poll 1111 people to get a 95% CI that is at most 0.06 wide.
Suppose we instead want a CI that is 0.03 wide. How many people do we need to poll?

- A) 555
- B) 2222
- C) 4444
- D) 8888

## Answer: the $\sqrt{\quad}$ rule

- We need 4444 people.
- To double our accuracy, we must *quadruple* our sample size.

## Experiment

- Let's see how accurate this is.
- Code below chooses some true proportion of "Yes" in the population. Don't peek!

In [None]:
#: don't peek
population_proportion = np.random.choice(np.arange(.3, .7, .01))

## Conduct a poll

In [None]:
#:
poll_results = np.random.multinomial(1111, [population_proportion, 1-population_proportion]) / 1111
poll_results

## Repeat the poll

In [None]:
#: simulate 5000 polls
distribution = np.array([])

for i in np.arange(5000):
    poll_results = np.random.multinomial(1111, [population_proportion, 1-population_proportion]) / 1111
    distribution = np.append(distribution, poll_results.item(0))

## Visualize

In [None]:
#: draw histogram
bpd.DataFrame().assign(SampledProportions=distribution).plot(kind='hist', y='SampledProportions', bins=30)

plt.scatter(population_proportion, 0, marker='^', zorder=3, color='lime', s=100)
plt.plot([population_proportion - .03, population_proportion + .03], [0, 0], color='C1', linewidth=5)

In [None]:
# Fraction of simulated polls whose sample proportion landed within 0.03 of
# the true proportion. Dividing by the number of simulated polls (instead of a
# hard-coded 5000) keeps the fraction correct if the simulation length changes.
within_margin = (population_proportion - .03 <= distribution) & (distribution <= population_proportion + .03)
np.count_nonzero(within_margin) / distribution.size