In [11]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import metrics
from sklearn.metrics import roc_auc_score


In [3]:
# Read the SAheart data file directly from its URL; the first CSV column is
# used as the row index rather than as a regular column.
url = "http://statweb.stanford.edu/~tibs/ElemStatLearn/datasets/SAheart.data"
df = pd.read_csv(url, index_col=0)
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0_level_0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
2,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
3,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
4,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
5,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


In [4]:
# One-hot encode the categorical `famhist` column and store the indicator
# columns as integers instead of booleans.
dummies = pd.get_dummies(df.famhist).astype(int)


In [5]:
# Append the family-history indicator columns to the main frame.
# Any indicator columns left over from a previous execution of this cell are
# dropped first, so re-running the cell is idempotent and never produces
# duplicated indicator columns (errors='ignore' makes the first run a no-op).
df = pd.concat(
    [df.drop(columns=dummies.columns, errors='ignore'), dummies],
    axis=1,
)
df.head()

Unnamed: 0_level_0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd,Absent,Present
row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1,0,1
2,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1,1,0
3,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0,0,1
4,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1,0,1
5,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1,0,1


In [6]:
# Design matrix: remove the response (`chd`), the text column `famhist` and its
# 'Absent' indicator, plus `adiposity` and `typea`; then add a constant column.
X = sm.add_constant(
    df.drop(columns=['adiposity', 'typea', 'famhist', 'Absent', 'chd'])
)
# Response vector.
y = df['chd']

In [7]:
# Display the design matrix built in the previous cell.
X

Unnamed: 0_level_0,const,sbp,tobacco,ldl,obesity,alcohol,age,Present
row.names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,1.0,160,12.00,5.73,25.30,97.20,52,1
2,1.0,144,0.01,4.41,28.87,2.06,63,0
3,1.0,118,0.08,3.48,29.14,3.81,46,1
4,1.0,170,7.50,6.41,31.99,24.26,58,1
5,1.0,134,13.60,3.50,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...
459,1.0,214,0.40,5.98,28.45,0.00,58,0
460,1.0,182,4.20,4.41,28.61,18.72,52,0
461,1.0,108,3.00,1.59,20.09,26.64,55,0
462,1.0,118,5.40,11.61,27.35,23.97,40,0


In [8]:
# Fit a GLM with a binomial family for `y` on every column of `X`,
# then show the fit summary.
model = sm.GLM(
    endog=y,
    exog=X,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,chd,No. Observations:,462.0
Model:,GLM,Df Residuals:,454.0
Model Family:,Binomial,Df Model:,7.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-241.59
Date:,"Fri, 24 Nov 2023",Deviance:,483.17
Time:,10:24:29,Pearson chi2:,459.0
No. Iterations:,5,Pseudo R-squ. (CS):,0.2169
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-4.1296,0.964,-4.283,0.000,-6.019,-2.240
sbp,0.0058,0.006,1.023,0.306,-0.005,0.017
tobacco,0.0795,0.026,3.034,0.002,0.028,0.131
ldl,0.1848,0.057,3.218,0.001,0.072,0.297
obesity,-0.0345,0.029,-1.187,0.235,-0.092,0.023
alcohol,0.0006,0.004,0.136,0.892,-0.008,0.009
age,0.0425,0.010,4.181,0.000,0.023,0.062
Present,0.9392,0.225,4.177,0.000,0.498,1.380


In [9]:
# AIC reported by statsmodels for the fitted model stored in `result`.
result.aic


499.174032364739

In [10]:
# (number of rows, number of columns) of the design matrix.
X.shape

(462, 8)

In [36]:
# With AIC = -2*loglik + 2*p and BIC = -2*loglik + p*ln(n), the BIC follows
# from the AIC as AIC + p*(ln(n) - 2).
# n (observations) and p (columns of X, constant included) are read from the
# design matrix instead of being hard-coded, so they cannot drift out of sync
# with the data.
n, p = X.shape
result.aic + p*(np.log(n)-2) # compute BIC

532.2585514933929

In [37]:
def computeAIC(feature_set):
    """Fit a binomial GLM of ``y`` on selected columns of ``X`` and report its AIC.

    Parameters
    ----------
    feature_set : iterable of str
        Column labels of the notebook-level design matrix ``X`` to regress on.

    Returns
    -------
    dict
        ``{"model": fitted results object, "AIC": its ``aic`` attribute}``.
    """
    columns = list(feature_set)
    fitted = sm.GLM(y, X[columns], family=sm.families.Binomial()).fit()
    return dict(model=fitted, AIC=fitted.aic)

def computeBIC(feature_set):
    """Fit a binomial GLM of ``y`` on selected columns of ``X`` and report its BIC.

    The BIC is obtained from the fitted model's AIC by adding
    ``k * (ln(n) - 2)``, where ``k = len(feature_set)`` and ``n = len(y)``.

    Parameters
    ----------
    feature_set : sequence of str
        Column labels of the notebook-level design matrix ``X`` to regress on.

    Returns
    -------
    dict
        ``{"model": fitted results object, "BIC": computed criterion}``.
    """
    columns = list(feature_set)
    fitted = sm.GLM(y, X[columns], family=sm.families.Binomial()).fit()
    n_params = len(feature_set)
    n_obs = len(y)
    return dict(model=fitted, BIC=fitted.aic + n_params*(np.log(n_obs)-2))

def AICforward(features):
    """Perform one forward-selection step using AIC.

    Every column of ``X`` that is not yet in ``features`` is tried in turn by
    fitting a model on ``features + [column]``; the candidate with the lowest
    AIC wins (the first one on ties).

    Parameters
    ----------
    features : list of str
        Columns of ``X`` already included in the model.

    Returns
    -------
    pandas.Series
        The winning row, with entries ``"model"`` and ``"AIC"``.

    Raises
    ------
    ValueError
        If no column of ``X`` is left to add.
    """
    remaining_features = [col for col in X.columns if col not in features]
    if not remaining_features:
        # Without this guard the empty candidate table fails with an opaque
        # KeyError on the missing 'AIC' column.
        raise ValueError("AICforward: no remaining features to add")

    models = pd.DataFrame(
        [computeAIC(features + [col]) for col in remaining_features]
    )
    # argmin() returns a position, so it must be paired with position-based
    # iloc rather than label-based loc.
    best_model = models.iloc[models['AIC'].argmin()]

    return best_model

def BICforward(features):
    """Perform one forward-selection step using BIC.

    Every column of ``X`` that is not yet in ``features`` is tried in turn by
    fitting a model on ``features + [column]``; the candidate with the lowest
    BIC wins (the first one on ties).

    Parameters
    ----------
    features : list of str
        Columns of ``X`` already included in the model.

    Returns
    -------
    pandas.Series
        The winning row, with entries ``"model"`` and ``"BIC"``.

    Raises
    ------
    ValueError
        If no column of ``X`` is left to add.
    """
    remaining_features = [col for col in X.columns if col not in features]
    if not remaining_features:
        # Without this guard the empty candidate table fails with an opaque
        # KeyError on the missing 'BIC' column.
        raise ValueError("BICforward: no remaining features to add")

    models = pd.DataFrame(
        [computeBIC(features + [col]) for col in remaining_features]
    )
    # argmin() returns a position, so it must be paired with position-based
    # iloc rather than label-based loc.
    best_model = models.iloc[models['BIC'].argmin()]

    return best_model

In [43]:
%%time

# Greedy forward selection by AIC, starting from the constant-only feature list.
features = ['const']
records = []

# One step per non-constant column of X: each step adds the column that gives
# the lowest AIC, and the winner's regressors seed the next step.
for i in range(1, len(X.columns)):
    best = AICforward(features)
    records.append({"AIC": best["AIC"], "model": best["model"]})
    features = best["model"].model.exog_names

# Build the table once instead of growing it row by row; rows are labelled by
# step number, starting at 1.
AIC_fwd = pd.DataFrame(
    records,
    columns=["AIC", "model"],
    index=range(1, len(records) + 1),
)
AIC_fwd

CPU times: user 53.1 ms, sys: 3.06 ms, total: 56.2 ms
Wall time: 54.5 ms


Unnamed: 0,AIC,model
1,529.562337,<statsmodels.genmod.generalized_linear_model.G...
2,512.658154,<statsmodels.genmod.generalized_linear_model.G...
3,503.385399,<statsmodels.genmod.generalized_linear_model.G...
4,495.443861,<statsmodels.genmod.generalized_linear_model.G...
5,496.296748,<statsmodels.genmod.generalized_linear_model.G...
6,497.192536,<statsmodels.genmod.generalized_linear_model.G...
7,499.174032,<statsmodels.genmod.generalized_linear_model.G...


In [41]:
# idxmin() returns the index label of the lowest-AIC step directly, so the
# lookup no longer relies on a "+1" conversion that silently assumes the index
# starts at 1.
tmp = AIC_fwd['AIC'].idxmin()
# Regressors of the selected model, fetched with a single (row, column) lookup.
AIC_fwd.loc[tmp, "model"].model.exog_names

['const', 'age', 'Present', 'tobacco', 'ldl']

In [42]:
# Greedy forward selection by BIC, starting from the constant-only feature list.
features = ['const']
records = []

# One step per non-constant column of X: each step adds the column that gives
# the lowest BIC, and the winner's regressors seed the next step.
for i in range(1, len(X.columns)):
    best = BICforward(features)
    records.append({"BIC": best["BIC"], "model": best["model"]})
    features = best["model"].model.exog_names

# Build the table once instead of growing it row by row; rows are labelled by
# step number, starting at 1.
BIC_fwd = pd.DataFrame(
    records,
    columns=["BIC", "model"],
    index=range(1, len(records) + 1),
)
BIC_fwd


Unnamed: 0,BIC,model
1,537.833467,<statsmodels.genmod.generalized_linear_model.G...
2,525.064848,<statsmodels.genmod.generalized_linear_model.G...
3,519.927658,<statsmodels.genmod.generalized_linear_model.G...
4,516.121685,<statsmodels.genmod.generalized_linear_model.G...
5,521.110137,<statsmodels.genmod.generalized_linear_model.G...
6,526.14149,<statsmodels.genmod.generalized_linear_model.G...
7,532.258551,<statsmodels.genmod.generalized_linear_model.G...


In [44]:
# idxmin() returns the index label of the lowest-BIC step directly, so the
# lookup no longer relies on a "+1" conversion that silently assumes the index
# starts at 1.
tmp = BIC_fwd['BIC'].idxmin()
# Regressors of the selected model, fetched with a single (row, column) lookup.
BIC_fwd.loc[tmp, "model"].model.exog_names

['const', 'age', 'Present', 'tobacco', 'ldl']