In [1]:
%matplotlib inline

In [2]:
import numpy as np
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import random
import models

# Apply seaborn's "whitegrid" style to the plots drawn in this notebook.
sns.set(style="whitegrid")

In [3]:
def logistic(z):
    """Evaluate the logistic (sigmoid) function 1 / (1 + exp(-z)).

    z may be a number or a NumPy array; arrays are transformed element-wise
    because the computation goes through np.exp.
    """
    decay = np.exp(-z)
    return 1.0 / (1.0 + decay)

In [4]:
# Simulate 1000 observations from a logistic model with an interaction term:
#   z = -50 + 1.0*x1 + 0.1*x2 + 0.25*x1*x2,   P(y = 1) = logistic(z)
np.random.seed(83474722)  # fixed seed so every run draws the same sample
n_obs = 1000
p = 0.67  # probability that the binary feature x2 equals 1

# The random draws stay in this order (x1, then x2, then y), one call per row
# for x2 and y, so the sequence of random numbers consumed is unchanged.
x1 = stats.norm.rvs(50.0, 5.0, n_obs)  # normal draws with loc 50 and scale 5
x2 = np.array([1.0 if np.random.rand() < p else 0.0 for _ in range(n_obs)])
z = x1 + 0.1 * x2 + 0.25 * x1 * x2 - 50
pr = [logistic(value) for value in z]
y = [1 if np.random.uniform() < prob else 0 for prob in pr]

data = pd.DataFrame({"x1": x1, "x2": x2, "z": z, "pr": pr, "y": y})

At first, I set $\beta_0$ to a positive number, but I kept getting the error "This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0". It turned out that when $z$ is far from zero (greater than about 5 or less than about -5), the logistic function is always very close to 1 or 0, respectively, so nearly every sampled $y$ lands in the same class and logistic_regression sees only one class in the data. So I set $\beta_0$ to -50 so that $z$ would fall in a range where logistic($z$) takes a variety of values between 0 and 1.

In [5]:
# Fit y on x1, x2 and the x1:x2 interaction, then display the model summary.
formula = "y ~ x1 + x2 + x1:x2"
result = models.logistic_regression(formula, data=data)
models.simple_describe_lgr(result)

0,1
model,y ~ x1 + x2 + x1:x2
coefficients,coefficients
$\beta_0$,-8.251427726352672
x1 ($\beta_1$),0.16762659722639675
x2 ($\beta_2$),-2.252408145921349
x1:x2 ($\beta_3$),0.13537017628300727
metrics,metrics
Error (%),5.2
$R^2$,0.6084918610068639


In [6]:
# Refit the same formula through the bootstrap variant and display its summary.
# NOTE(review): the trailing 3 presumably sets the number of decimals shown
# in the summary table; confirm against models.describe_bootstrap_lgr.
result = models.bootstrap_logistic_regression(
    "y ~ x1 + x2 + x1:x2",
    data,
)
models.describe_bootstrap_lgr(result, 3)

0,1,2,3
model,y ~ x1 + x2 + x1:x2,,
coefficients,coefficients,95% BCI,P(y=1)
$\beta_0$,-8.251,"(-8.841, -7.569)",0.000
x1 ($\beta_1$),0.168,"(0.153, 0.180)",0.042
x2 ($\beta_2$),-2.252,"(-2.884, -1.428)",-0.563
x1:x2 ($\beta_3$),0.135,"(0.125, 0.146)",0.034
metrics,metrics,95% BCI,
Error (%),5.200,"(3.990, 6.810)",
$R^2$,0.608,"(0.564, 0.642)",


Even though the error rate and $R^2$ are not bad, the coefficients are quite different from the ground truth ($\beta_0 = -50$, $\beta_1 = 1.0$, $\beta_2 = 0.1$, $\beta_3 = 0.25$).