# <span style="color:red"> Confidence intervals </span>

In [3]:
# Import modules 

import numpy as np
from datascience import *
# from datascience_extensions import *
import seaborn as sns
sns.set_style("whitegrid")
%matplotlib inline
import os.path

A summary of the methods for Table is [here](http://data8.org/datascience/tables.html) <br>
A tutorial for the datascience module is [here](http://data8.org/datascience/tutorial.html) <br>
A cheatsheet for the datascience module is [here](https://github.com/wstuetzle/STAT180/blob/master/Computing/data8_sp17_midterm_ref_sheet.pdf)

### <span style="color:blue"> Confidence intervals for a population mean </span>

Let $\mu$ denote the population mean and $\sigma$ denote the population standard deviation. In the chapter on Sampling we found that approximately 95% of sample means of random samples of size $n$ fall in the interval

$$\left[\mu-\dfrac {2\sigma}{\sqrt {n}}, \mu+\dfrac {2\sigma}{\sqrt {n}}\right]$$

<br>

We can rephrase that: Let $\overline {x}$ be the mean of a random sample of size $n$. For sufficiently large $n$, the interval

$$\left[\overline {x}-\dfrac {2\sigma}{\sqrt {n}}, \overline {x}+\dfrac {2\sigma}{\sqrt {n}}\right]$$

has approximately a 95% chance of containing the population mean $\mu$.

**Note**: This is a random interval because it is centered at the sample mean $\overline {x}$.

<br>

In practice we do not know the population standard deviation $\sigma$. An obvious way out is to estimate $\sigma$
by the standard deviation of the sample - let's call that $\overline {\sigma}$. For sufficiently large sample size $n$, $\overline {\sigma}$ will be close to $\sigma$, and therefore the interval

$$\left[\overline {x}-\dfrac {2\overline {\sigma}}{\sqrt {n}}, \overline {x}+\dfrac {2\overline {\sigma}}{\sqrt {n}}\right]$$

has approximately a 95% chance of containing the population mean $\mu$.

<br>

**The (random) interval $\left[\overline {x}-\dfrac {2\overline {\sigma}}{\sqrt {n}}, \overline {x}+\dfrac {2\overline {\sigma}}{\sqrt {n}}\right]$ is a 95% confidence interval for the population mean $\mu$.**

### <span style="color:blue"> Experiment with confidence intervals for a population mean </span>

Use the flight delay data as in the "Sampling" chapter. 

Draw 100,000 samples each for sample sizes [4, 16, 64, 256, 1024].

For each sample size 

* Compute the 100,000 sample means, the 100,000 sample stds, and the 100,000 confidence intervals.

* Compute the percentage of confidence intervals that contain the population mean.


In [4]:
# Load the flight delay data used in the "Sampling" chapter.
delay_tab = Table.read_table(
    "https://github.com/wstuetzle/STAT180/raw/master/"
    "Lectures/Sampling/united.csv")

# The population for this experiment is the "Delay" column restricted to
# rows whose delay is below 100. N is the size of that filtered population
# (the earlier assignment of N from the unfiltered table was immediately
# overwritten, so it has been dropped).
Population = delay_tab.where("Delay", are.below(100)).select("Delay")
N = Population.num_rows
Population_mean = np.mean(Population.column("Delay"))

# Choose sample sizes and number of samples to be drawn

sample_sizes = [4, 16, 64, 256, 1024]
num_samples = 100000    

# NOTE(review): these two arrays are not read by any cell visible here
# (the sampling function allocates its own); kept in case other cells use them.
means = np.zeros(num_samples)
stds = np.zeros(num_samples)

def evaluate_mean_std_for_random_samples(sample_size, num_samples, with_replacement = True):
    """Repeatedly sample from the global ``Population`` table and summarize each draw.

    For each of ``num_samples`` iterations, a sample of ``sample_size`` rows is
    taken via ``Population.sample(sample_size, with_replacement)`` and the mean
    and standard deviation of its "Delay" column are recorded.

    Parameters
    ----------
    sample_size : number of rows requested for each sample.
    num_samples : number of samples to draw (length of each returned array).
    with_replacement : forwarded unchanged to ``Population.sample``.

    Returns
    -------
    A two-element list ``[sample_means, sample_stds]`` of numpy arrays, each of
    length ``num_samples``.

    Notes
    -----
    The standard deviation is computed with ``np.std`` using its default
    arguments (NumPy's default is ``ddof=0``).
    A progress counter is printed on one line every 10000 iterations,
    starting with iteration 0.
    """
    means_per_draw = np.zeros(num_samples)
    stds_per_draw = np.zeros(num_samples)

    for draw in range(num_samples):
        # Pull the column once and reuse it for both summaries.
        delays = Population.sample(sample_size, with_replacement).column("Delay")
        means_per_draw[draw] = np.mean(delays)
        stds_per_draw[draw] = np.std(delays)

        # Lightweight progress indicator for long runs.
        if draw % 10000 == 0:
            print(f"{draw} ", end="")

    return [means_per_draw, stds_per_draw]



In [5]:
mean_std_filename = "mean-std.csv"
file_exists = os.path.isfile(mean_std_filename)

# Column labels the simulation produces: "<n>-mean" and "<n>-std" per sample size.
expected_labels = [str(n) + suffix for n in sample_sizes for suffix in ("-mean", "-std")]

# Reuse the cached simulation only if it matches the current settings.
# A cache written with a different num_samples or different sample_sizes
# would otherwise be used silently and yield missing columns or wrong
# coverage percentages downstream.
mean_std_tab = None
if file_exists:
    cached_tab = Table.read_table(mean_std_filename)
    cache_matches = (cached_tab.num_rows == num_samples
                     and all(label in cached_tab.labels for label in expected_labels))
    if cache_matches:
        mean_std_tab = cached_tab
    else:
        print("Cached " + mean_std_filename + " does not match current settings; recomputing.")

if mean_std_tab is None:
    # Run the (slow) simulation once per sample size and cache the result.
    mean_std_tab = Table()
    for n in sample_sizes:
        mean_estimates, std_estimates = evaluate_mean_std_for_random_samples(n, num_samples)
        mean_std_tab = mean_std_tab.with_column(str(n) + "-mean", mean_estimates)\
                                    .with_column(str(n) + "-std", std_estimates)

    mean_std_tab.to_csv(mean_std_filename)

mean_std_tab.show(5)

4-mean,4-std,16-mean,16-std,64-mean,64-std,256-mean,256-std,1024-mean,1024-std
15.5,15.5804,13.1875,21.2639,9.95312,19.8875,12.2578,23.9784,11.7949,22.7022
-3.5,3.5,7.375,13.5595,10.0312,21.3761,10.7773,21.5452,10.5098,21.916
3.0,4.74342,6.5625,14.5558,10.1562,19.015,10.1562,20.0413,10.1875,20.6313
2.0,12.7083,17.1875,25.4907,9.20312,18.0392,9.62891,19.6009,9.53711,20.0376
26.25,20.1541,6.4375,19.6404,10.625,20.2319,11.5742,21.2046,10.3975,21.6582


In [6]:
# Compute coverage probabilities
#
# For each sample size n, build the interval
#   [mean - 2 * std / sqrt(n), mean + 2 * std / sqrt(n)]
# for every simulated sample and report the percentage of intervals
# that contain Population_mean.

for n in sample_sizes:
    mean_label = str(n) + "-mean"
    std_label = str(n) + "-std"
    sample_means = mean_std_tab.column(mean_label)
    sample_stds = mean_std_tab.column(std_label)

    # Half-width of the interval, computed once and used for both limits.
    half_width = 2 * sample_stds / n ** (0.5)
    ci_limits = Table().with_columns("lower", sample_means - half_width,
                                     "upper", sample_means + half_width)

    # Count intervals with lower < Population_mean < upper.
    cover_count = ci_limits.where("lower", are.below(Population_mean))\
                           .where("upper", are.above(Population_mean)).num_rows

    # Divide by the number of intervals actually evaluated rather than the
    # global num_samples: mean_std_tab may have been loaded from a cached file
    # whose row count differs from the current num_samples setting.
    cover_percentage = 100 * cover_count / ci_limits.num_rows
    print("n = " + str(n) + "   cover percentage = " + str(cover_percentage))

n = 4   cover percentage = 68.311
n = 16   cover percentage = 87.012
n = 64   cover percentage = 93.386
n = 256   cover percentage = 95.064
n = 1024   cover percentage = 95.427
