In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as ss
import scipy.optimize as so
from scipy.stats import chi2

In [2]:
def log_likelihood(X, y, B):
    """Bernoulli log-likelihood of a logistic regression model.

    Parameters
    ----------
    X : array-like
        Design matrix; converted with ``np.asarray`` and multiplied by ``B``.
    y : array-like
        Responses; flattened to 1-D.
    B : array-like
        Coefficients; flattened to 1-D.

    Returns
    -------
    The sum over observations of ``y*log(p) + (1-y)*log(1-p)``, where ``p``
    is the logistic probability clipped to ``[1e-9, 1 - 1e-9]``.
    """
    design = np.asarray(X)
    response = np.asarray(y).ravel()
    beta = np.asarray(B).ravel()

    eps = 1e-9
    linear_predictor = design @ beta
    prob = 1 / (1 + np.exp(-linear_predictor))
    # Keep probabilities strictly inside (0, 1) so neither log below sees 0.
    prob = np.clip(prob, eps, 1 - eps)

    per_observation = response * np.log(prob) + (1 - response) * np.log(1 - prob)
    return per_observation.sum()

In [3]:
def logistic_regression_fit(y, X):
    """Fit logistic regression coefficients by maximum likelihood.

    Parameters
    ----------
    y : array-like
        Responses, passed through to ``log_likelihood``.
    X : pandas.DataFrame
        Design matrix; its column labels become the row labels of the result.

    Returns
    -------
    pandas.DataFrame
        A single-column frame holding one coefficient per column of ``X``.

    Raises
    ------
    AssertionError
        If the optimizer does not report success.
    """
    def negative_log_likelihood(beta):
        # Minimizing the negated log-likelihood maximizes the log-likelihood.
        return -log_likelihood(X, y, beta)

    start = [0] * X.shape[1]
    result = so.minimize(negative_log_likelihood, start)
    assert result.success

    coefficient_row = result.x[:, None].T
    return pd.DataFrame(data=coefficient_row, columns=X.columns).T

In [50]:
def logistic_summary(y, X, B):
    columns = X.columns
    #print(columns)
    X = np.asarray(X)
    y = np.asarray(y).flatten()
    B = np.asarray(B).flatten()
    w = np.exp(X@B)
    w = w/(1 + w**2)
    W = np.diag(w)
    #print('X.T@W@X',X.T@W@X)
    se = np.sqrt(np.diag(np.linalg.inv(X.T@W@X)))
    z = B/se
    p_value = 1-ss.norm.cdf(np.abs(z))
    return pd.DataFrame(data=(B, se, z, p_value), columns=columns, index=['B', 'se', 'z','p_value']).T

In [51]:
# Load the Chile dataset from the Applied-Regression-2E companion site.
# The separator is a regular expression, so it is written as a raw string:
# '\s' inside a normal string literal is an invalid escape sequence.
df = pd.read_csv('https://socialsciences.mcmaster.ca/jfox/Books/Applied-Regression-2E/datasets/Chile.txt',
                 sep=r'\s', engine='python')
# Keep only the rows whose vote is 'Y' or 'N'.
df = df.loc[df['vote'].isin(['Y', 'N'])]
# Drop every row that still has a missing value in any column.
df = df.dropna()
df

Unnamed: 0,region,population,sex,age,education,income,statusquo,vote
1,N,175000,M,65.0,P,35000.0,1.00820,Y
2,N,175000,M,29.0,PS,7500.0,-1.29617,N
3,N,175000,F,38.0,P,15000.0,1.23072,Y
4,N,175000,F,49.0,P,35000.0,-1.03163,N
5,N,175000,F,23.0,S,35000.0,-1.10496,N
...,...,...,...,...,...,...,...,...
2692,M,15000,F,21.0,S,35000.0,1.22231,Y
2696,M,15000,M,42.0,P,15000.0,-1.26247,N
2697,M,15000,F,28.0,P,15000.0,1.32950,Y
2698,M,15000,F,44.0,P,75000.0,1.42045,Y


In [52]:
# Full model: intercept, statusquo, and indicator regressors derived from
# sex, income, age, education and region. Stacking the columns into a single
# array coerces every regressor to float.
X = pd.DataFrame(
    np.column_stack([
        np.ones(len(df)),
        df['statusquo'],
        (df['sex'] == 'M').astype(int),
        (df['income'] > np.median(df['income'])).astype(int),
        df['age'] >= 65,
        # NOTE(review): this column is labelled 'young' but is age > 30 --
        # confirm the intended cutoff/direction.
        df['age'] > 30,
        (df['education'] == 'S').astype(int),
        (df['education'] == 'PS').astype(int),
        (df['region'] == 'C').astype(int),
        (df['region'] == 'M').astype(int),
        (df['region'] == 'N').astype(int),
        (df['region'] == 'S').astype(int),
    ]),
    columns=['intercept', 'statusquo', 'male', 'high-income', 'old', 'young',
             'sec_edu', 'post_sec_edu',
             'region_C', 'region_M', 'region_N', 'region_S'],
)
# Response: 1 where vote == 'Y', 0 otherwise, as a one-column frame.
y = pd.DataFrame({'vote': (df['vote'] == 'Y').to_numpy().astype(int)})

B = logistic_regression_fit(y, X)
print('B', B)
logistic_summary(y, X, B)

B                      0
intercept     1.090590
statusquo     3.200889
male         -0.573046
high-income  -0.163474
old          -0.169602
young        -0.028731
sec_edu      -0.666867
post_sec_edu -1.022815
region_C      0.133436
region_M      0.776404
region_N      0.069769
region_S     -0.177057


Unnamed: 0,B,se,z,p_value
intercept,1.09059,0.26099,4.178659,1.5e-05
statusquo,3.200889,0.138464,23.117107,0.0
male,-0.573046,0.170619,-3.358633,0.000392
high-income,-0.163474,0.193188,-0.846191,0.198723
old,-0.169602,0.423242,-0.400721,0.344313
young,-0.028731,0.18216,-0.157725,0.437337
sec_edu,-0.666867,0.205504,-3.245034,0.000587
post_sec_edu,-1.022815,0.282169,-3.624827,0.000145
region_C,0.133436,0.233875,0.570544,0.284155
region_M,0.776404,0.48018,1.616903,0.05295


In [62]:
# Coefficients as a flat 1-D array.
B.to_numpy().ravel()

array([[ 1.09059025],
       [ 3.20088889],
       [-0.57304628],
       [-0.16347373],
       [-0.16960192],
       [-0.02873124],
       [-0.66686682],
       [-1.0228149 ],
       [ 0.13343581],
       [ 0.77640403],
       [ 0.06976866],
       [-0.17705699]])

In [64]:
# Linear predictor: one value of X.B per row of the design matrix.
X.to_numpy() @ B.to_numpy().ravel()

array([ 3.4526419 , -4.58439842,  5.07102563, ...,  6.12257605,
        6.22149192,  4.53061562])

In [53]:
# Display the design matrix currently bound to X.
X

Unnamed: 0,intercept,statusquo,male,high-income,old,young,sec_edu,post_sec_edu,region_C,region_M,region_N,region_S
0,1.0,1.00820,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,-1.29617,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1.0,1.23072,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,-1.03163,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.0,-1.10496,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1698,1.0,1.22231,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1699,1.0,-1.26247,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1700,1.0,1.32950,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1701,1.0,1.42045,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [8]:
# Reduced model: intercept, statusquo, sex and education indicators only.
# Stacking the columns into a single array coerces every regressor to float.
X = pd.DataFrame(
    np.column_stack([
        np.ones(len(df)),
        df['statusquo'],
        (df['sex'] == 'M').astype(int),
        (df['education'] == 'S').astype(int),
        (df['education'] == 'PS').astype(int),
    ]),
    columns=['intercept', 'statusquo', 'male', 'sec_edu', 'post_sec_edu'],
)
# Response: 1 where vote == 'Y', 0 otherwise, as a one-column frame.
y = pd.DataFrame({'vote': (df['vote'] == 'Y').to_numpy().astype(int)})

B = logistic_regression_fit(y, X)
# Maximized log-likelihood of this model.
llf = log_likelihood(X, y, B)
logistic_summary(y, X, B)

Unnamed: 0,B,se,z,p_value
intercept,1.015271,0.156973,6.467796,4.972134e-11
statusquo,3.168931,0.135435,23.398145,0.0
male,-0.574245,0.168531,-3.40736,0.0003279723
sec_edu,-0.682755,0.184507,-3.700428,0.0001076181
post_sec_edu,-1.107408,0.244245,-4.534004,2.893802e-06


In [9]:
# Intercept-only model fitted to the same response y.
X0 = pd.DataFrame({'intercept': np.ones(len(df))})
B0 = logistic_regression_fit(y, X0)
# Maximized log-likelihood of the intercept-only model.
llf0 = log_likelihood(X0, y, B0)

In [10]:
# Show both log-likelihoods, one per line.
print(llf, llf0, sep='\n')

-354.1182799361284
-1180.1474837610176


In [11]:
# Likelihood-ratio test of the model in X against the intercept-only model X0.
# With deviance D = -2 * log-likelihood, dod = D0 - D is the drop in deviance.
dod = -2*(llf0 - llf)
print(f"D0 - D = {dod:.2f}")
# Degrees of freedom: the difference in the number of fitted coefficients.
k = X.shape[1]
k0 = X0.shape[1]
# Upper-tail chi-square probability; sf() avoids the cancellation that makes
# 1 - cdf() round to exactly 0 when the p-value is very small.
p_value = chi2.sf(dod, k-k0)
print(f"p-value = {p_value:.4e}")

D - D0 = 1652.06
p-value = 0.0000e+00
