In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import t

# Constructing confidence intervals via the central limit theorem

In [2]:
# Results reported in the PTSD paper: 52 test cases in total, 42 of them correct.
n = 52
c = 42

# Encode each test case as an indicator: c ones (correct) followed by
# n - c zeros (incorrect), stored as a float array of length n.
rawdata = np.concatenate([np.ones(c), np.zeros(n - c)])

# Wrap the indicators in a one-column DataFrame; the column is named "Match".
data = pd.DataFrame({"Match": rawdata})

First we'll build the confidence interval assuming the sample mean is approximately normally distributed, as the central limit theorem suggests.

In [5]:
## Using the central limit theorem, compute a normal-approximation confidence interval

# Standard error of the mean: sample standard deviation (ddof=1) divided by
# the square root of the number of observations.
stderr = np.std(data.Match, ddof=1) / np.sqrt(len(data.Match))
print("Stderr: %.3f" % stderr)

# Area under a standard normal from -1.96 to 1.96 is about 95%
critval = 1.96

# Compute the sample mean once and reuse it for both endpoints.
sample_mean = data.Match.mean()
norm_ci = [sample_mean - critval * stderr,
           sample_mean + critval * stderr]

print("Norm ci:", norm_ci)

Stderr: 0.055
Norm ci: [0.6995259303230998, 0.9158586850615156]


What if we use the t distribution instead of the normal? The $100(1-\alpha)\%$ confidence interval is

$$ \bar{x} \pm  t_{1-\alpha/2, n-1} \dfrac{\hat{\sigma}}{\sqrt{n}} $$

The t distribution is available for us to use as the [`scipy.stats.t`](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.t.html) object.

In [6]:
# Critical value of the t distribution for a two-sided 95% interval:
# the 1 - 0.05/2 = 0.975 quantile with n - 1 degrees of freedom.
# t.ppf is the percent-point function (inverse CDF) from scipy.stats.
upper_tail_prob = 1 - 0.05 / 2
t_quantile = t.ppf(upper_tail_prob, df=n - 1)
print('The t-based critical value is equal to %.3f' % t_quantile)

# Half-width of the interval; the signs [-1, 1] give the lower and upper
# endpoints around the sample mean.
half_width = t_quantile * stderr
signs = np.array([-1, 1])
t_ci = data.Match.mean() + half_width * signs
print('The t-based confidence interval is equal to', t_ci)

The t-based critical value is equal to 2.008
The t-based confidence interval is equal to [0.69689993 0.91848469]
