### Chapter 1: Uncertainty
#### The Bootstrap

In [10]:
# scikit's bootstrap functionality
from sklearn.utils import resample
import pandas as pd
# Useful javascript plotting API for pandas that improves on matplotlib's visualization.
#!pip install hvplot
import hvplot.pandas
import numpy as np

In [4]:
# Load the web-browsers data set from the local examples directory
df = pd.read_csv(filepath_or_buffer="./examples/web-browsers.csv")

In [5]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,id,anychildren,broadband,hispanic,race,region,spend
0,1,0,1,0,white,MW,424
1,2,1,1,0,white,MW,2335
2,3,1,1,0,white,MW,279
3,4,0,1,0,white,MW,829
4,5,0,1,0,white,S,221


In [6]:
# Summarise the frame: per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
id             10000 non-null int64
anychildren    10000 non-null int64
broadband      10000 non-null int64
hispanic       10000 non-null int64
race           10000 non-null object
region         10000 non-null object
spend          10000 non-null int64
dtypes: int64(5), object(2)
memory usage: 547.0+ KB


In [28]:
# Summary statistics of spend. np.var is called with its default ddof
# (0, i.e. the population variance), not the n-1 sample variance.
variance = np.var(df["spend"])
mean = np.mean(df["spend"])

print(f"Mean spend is {round(mean, 2)} and the variance is {round(variance, 2)}.")

Mean spend is 1946.44 and the variance is 64612792.26.


In [15]:
# Distribution of spend on the natural-log scale, drawn as a 50-bin histogram
np.log(df["spend"]).hvplot.hist(bins=50, color="seagreen", line_color="seagreen")

In [48]:
# Number of bootstrap samples to generate
B = 1000
# One shared, seeded random state: each resample still differs from the others
# (the state advances between draws), but a fresh-kernel re-run reproduces the
# same result. Passing an int seed to every call would instead return the
# identical resample B times.
rng = np.random.RandomState(42)
# Capture the bootstrap means in a list
mub = []

# range(B), not range(1, B): the latter only draws B - 1 resamples
for _ in range(B):
    # Using sklearn's bootstrap api to draw, with replacement, a resample of the
    # same size as the original data (len(df) rows)
    sample = resample(df["spend"], replace=True, n_samples=len(df), random_state=rng)
    # Calculate the mean of this resample and append it to the mub list
    mub.append(np.mean(sample))

# Standard deviation of the bootstrapped means, i.e. the bootstrap estimate of
# the standard error of the mean spend (np.std default ddof=0)
np.std(mub)

79.3937078566008

In [51]:
# Sampling distribution of the mean as estimated by the bootstrap (30 bins)
pd.Series(data=mub).hvplot.hist(bins=30, color="dodgerblue", line_color="dodgerblue")

#### The Generalized Linear Model of Log-Spend

In [52]:
# NOTE(review): notebook convention is to keep all imports in the first cell;
# consider moving this import there so a fresh-kernel run surfaces every dependency up front.
from sklearn import linear_model
# Unfitted ordinary least-squares estimator; nothing is trained until .fit() is called
reg = linear_model.LinearRegression()

In [54]:
# Response (y) and the two candidate predictors, pulled out of the frame.
# NOTE(review): the section heading says log-spend, but y is the raw spend
# column here - confirm whether the log transform is applied further down.
y, X1, X2 = df["spend"], df["broadband"], df["anychildren"]

0       0
1       1
2       1
3       0
4       0
       ..
9995    1
9996    1
9997    1
9998    0
9999    0
Name: anychildren, Length: 10000, dtype: int64