# Statistische Inferenz: Bootstrap-Konfidenzintervalle



In [14]:
import pandas as pd
import numpy as np

# Read the library usage export; the literal string "None" is treated as missing.
df = pd.read_csv(
    "../data/Library_Usage.csv",
    na_values="None",
    low_memory=False,
)

# Keep only patrons who registered in 2010 and whose circulation-active year is 2016.
registered_2010 = df['Year Patron Registered'] == 2010
active_2016 = df['Circulation Active Year'] == 2016
df = df[registered_2010 & active_2016]
df.head()

Unnamed: 0,Patron Type Code,Patron Type Definition,Total Checkouts,Total Renewals,Age Range,Home Library Code,Home Library Definition,Circulation Active Month,Circulation Active Year,Notice Preference Code,Notice Preference Definition,Provided Email Address,Year Patron Registered,Within San Francisco County
37223,2,Teen,3,2,10 to 19 years,b4,Bernal,Mar,2016.0,z,Email,True,2010,False
57957,0,Adult,109,50,45 to 54 years,m8,Mission Bay,Sep,2016.0,z,Email,True,2010,False
64977,2,Teen,146,11,10 to 19 years,p7,Potrero,Aug,2016.0,z,Email,True,2010,False
65696,2,Teen,20,0,10 to 19 years,p7,Potrero,Feb,2016.0,z,Email,True,2010,False
72677,0,Adult,25,0,20 to 24 years,c2,Chinatown,Jul,2016.0,z,Email,True,2010,False


## CI for the mean of `Total Renewals`

In [15]:
# Point estimate: sample mean of `Total Renewals` in the filtered cohort.
renewals_column = df['Total Renewals']
renewals_column.mean()

22.15

In [16]:
# Bootstrap distribution of the mean of `Total Renewals`.
# `alpha` is the total tail probability for the percentile interval computed
# further down (alpha/2 in each tail); `S` is the number of bootstrap replicates.
alpha = 0.10
S = 10000

# A single seeded generator is shared by all replicates so the cell gives the
# same result after Restart & Run All. The generator object (not an int seed)
# is passed to `sample`: an int would re-seed on every call and return the
# identical resample S times.
bt_means_rng = np.random.RandomState(42)

renewals = df['Total Renewals']
bt_means = []
for i in range(S):
    # Draw len(df) observations with replacement and record their mean.
    stat = renewals.sample(
        len(renewals), replace=True, random_state=bt_means_rng
    ).mean()
    bt_means.append(stat)

In [17]:
# Share of bootstrap means that fall inside the closed interval [89, 92]
# (`between` includes both endpoints by default).
# NOTE(review): the recorded sample mean is 22.15 and this cell evaluates to
# 0.0, so these bounds may be left over from a different variable — confirm.
boot_mean_series = pd.Series(bt_means)
boot_mean_series.between(89, 92).mean()

0.0

In [18]:
# Percentile bootstrap interval: cut alpha/2 of the bootstrap means off each tail.
lower_q = alpha / 2
upper_q = 1 - alpha / 2
pd.Series(bt_means).quantile([lower_q, upper_q])

0.05    11.099167
0.95    35.433333
dtype: float64

## Case-Study: CI for teen and adult library users

In [20]:
# Number of patrons per patron type in the filtered cohort.
patron_types = df['Patron Type Definition']
patron_types.value_counts()

Teen     36
Adult    24
Name: Patron Type Definition, dtype: int64

In [22]:
# Split `Total Checkouts` into the two patron groups compared below.
is_teen = df['Patron Type Definition'] == 'Teen'
is_adult = df['Patron Type Definition'] == 'Adult'
teen = df.loc[is_teen, 'Total Checkouts']
adult = df.loc[is_adult, 'Total Checkouts']

# Group sizes first, then mean, median and variance, one line per statistic
# with the teen value before the adult value.
group_sizes = (len(teen), len(adult))
print(group_sizes)
for summarize in (pd.Series.mean, pd.Series.median, pd.Series.var):
    print(summarize(teen), summarize(adult))

(36, 24)
177.30555555555554 201.08333333333334
61.0 40.0
88054.90396825397 112411.81884057971


### Bootstrap CI for the difference in medians (teen − adult)

In [23]:
# Percentile bootstrap interval for the difference in median `Total Checkouts`
# (teen minus adult). `alpha` is the total tail probability and `repl` is the
# number of bootstrap replicates.
alpha = 0.1
repl = 10000

# One seeded generator shared across all draws makes the interval reproducible.
# The generator object is passed (rather than an int seed) so that each call
# to `sample` advances it and yields a new resample.
median_diff_rng = np.random.RandomState(42)

bt_diffs = []
for i in range(repl):
    # Resample each group independently, with replacement, at its own size.
    x = teen.sample(len(teen), replace=True, random_state=median_diff_rng).median()
    y = adult.sample(len(adult), replace=True, random_state=median_diff_rng).median()
    bt_diffs.append(x - y)
pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))

0.05   -55.0
0.95    93.0
dtype: float64

### Bootstrap CI for the difference in means (teen − adult)

In [24]:
# Percentile bootstrap interval for the difference in mean `Total Checkouts`
# (teen minus adult). `alpha` is the total tail probability and `repl` is the
# number of bootstrap replicates.
alpha = 0.1
repl = 10000

# One seeded generator shared across all draws makes the interval reproducible.
# The generator object is passed (rather than an int seed) so that each call
# to `sample` advances it and yields a new resample.
mean_diff_rng = np.random.RandomState(42)

bt_diffs = []
for i in range(repl):
    # Resample each group independently, with replacement, at its own size.
    x = teen.sample(len(teen), replace=True, random_state=mean_diff_rng).mean()
    y = adult.sample(len(adult), replace=True, random_state=mean_diff_rng).mean()
    bt_diffs.append(x - y)
pd.Series(bt_diffs).quantile((alpha/2, 1-alpha/2))

0.05   -165.555556
0.95    109.014583
dtype: float64

## Theoretically derived confidence intervals for the difference in means

In [25]:
import statsmodels.stats.api as sms

# Wrap each group in descriptive-statistics objects and compare their means.
teen_stats = sms.DescrStatsW(teen)
adult_stats = sms.DescrStatsW(adult)
cm = sms.CompareMeans(teen_stats, adult_stats)

# t-based interval for the difference in means (teen - adult) without assuming
# equal variances; alpha=0.10 is the same level the bootstrap cells use.
cm.tconfint_diff(alpha=0.10, usevar='unequal')

(-165.57260877264594, 118.01705321709034)