# Pythonでロジスティック回帰

2017/07/26

Rとカテゴリカルデータのモデリング(1)
- https://www1.doshisha.ac.jp/~mjin/R/47/47.html

Should statsmodels's GLM produce the same results as R's lm?
- https://stats.stackexchange.com/questions/92862/should-statsmodelss-glm-produce-the-same-results-as-rs-lm

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

import statsmodels.api as sm

%matplotlib inline

In [2]:
# Load the data from Spector and Mazzeo (1980)
spector_data = sm.datasets.spector.load()
spector_data.exog = sm.add_constant(spector_data.exog)

# Follow statsmodles ipython notebook
logit_mod = sm.Logit(spector_data.endog, spector_data.exog)
logit_res = logit_mod.fit(disp=0)

In [3]:
'Parameters: ', logit_res.params

('Parameters: ',
 array([-13.02134686,   2.82611259,   0.09515766,   2.37868766]))

In [4]:
logit_res.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,32.0
Model:,Logit,Df Residuals:,28.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 26 Jul 2017",Pseudo R-squ.:,0.374
Time:,14:10:49,Log-Likelihood:,-12.89
converged:,True,LL-Null:,-20.592
,,LLR p-value:,0.001502

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-13.0213,4.931,-2.641,0.008,-22.687,-3.356
x1,2.8261,1.263,2.238,0.025,0.351,5.301
x2,0.0952,0.142,0.672,0.501,-0.182,0.373
x3,2.3787,1.065,2.234,0.025,0.292,4.465


In [5]:
spector_data.endog

array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
        1.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  1.,  0.,  0.,  1.,  1.,
        1.,  0.,  1.,  1.,  0.,  1.])

In [6]:
spector_data.exog

array([[  1.  ,   2.66,  20.  ,   0.  ],
       [  1.  ,   2.89,  22.  ,   0.  ],
       [  1.  ,   3.28,  24.  ,   0.  ],
       [  1.  ,   2.92,  12.  ,   0.  ],
       [  1.  ,   4.  ,  21.  ,   0.  ],
       [  1.  ,   2.86,  17.  ,   0.  ],
       [  1.  ,   2.76,  17.  ,   0.  ],
       [  1.  ,   2.87,  21.  ,   0.  ],
       [  1.  ,   3.03,  25.  ,   0.  ],
       [  1.  ,   3.92,  29.  ,   0.  ],
       [  1.  ,   2.63,  20.  ,   0.  ],
       [  1.  ,   3.32,  23.  ,   0.  ],
       [  1.  ,   3.57,  23.  ,   0.  ],
       [  1.  ,   3.26,  25.  ,   0.  ],
       [  1.  ,   3.53,  26.  ,   0.  ],
       [  1.  ,   2.74,  19.  ,   0.  ],
       [  1.  ,   2.75,  25.  ,   0.  ],
       [  1.  ,   2.83,  19.  ,   0.  ],
       [  1.  ,   3.12,  23.  ,   1.  ],
       [  1.  ,   3.16,  25.  ,   1.  ],
       [  1.  ,   2.06,  22.  ,   1.  ],
       [  1.  ,   3.62,  28.  ,   1.  ],
       [  1.  ,   2.89,  14.  ,   1.  ],
       [  1.  ,   3.51,  26.  ,   1.  ],
       [  1.  , 

In [7]:
"""
http://www.statsmodels.org/0.8.0/datasets/generated/spector.html

Spector and Mazzeo (1980) - Program Effectiveness Data
Description

Experimental data on the effectiveness of the personalized system of instruction (PSI) program

Notes

Number of Observations - 32

Number of Variables - 4

Variable name definitions::

    Grade - binary variable indicating whether or not a student's grade
            improved.  1 indicates an improvement.
    TUCE  - Test score on economics test
    PSI   - participation in program
    GPA   - Student's grade point average
""";

In [8]:
features = ["Grade","TUCE","PSI","GPA"]

In [9]:
df = pd.DataFrame(spector_data.exog, columns=features)

In [10]:
objective = "endog"
df[objective] = spector_data.endog

In [11]:
df.head()

Unnamed: 0,Grade,TUCE,PSI,GPA,endog
0,1.0,2.66,20.0,0.0,0.0
1,1.0,2.89,22.0,0.0,0.0
2,1.0,3.28,24.0,0.0,0.0
3,1.0,2.92,12.0,0.0,0.0
4,1.0,4.0,21.0,0.0,1.0


In [12]:
res = sm.formula.glm("endog ~ TUCE+PSI+GPA",  family=sm.families.Binomial(), 
                     data=df).fit() 
res.summary()

0,1,2,3
Dep. Variable:,endog,No. Observations:,32.0
Model:,GLM,Df Residuals:,28.0
Model Family:,Binomial,Df Model:,3.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-12.89
Date:,"Wed, 26 Jul 2017",Deviance:,25.779
Time:,14:10:49,Pearson chi2:,27.3
No. Iterations:,5,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-13.0213,4.931,-2.641,0.008,-22.686,-3.356
TUCE,2.8261,1.263,2.238,0.025,0.351,5.301
PSI,0.0952,0.142,0.672,0.501,-0.182,0.373
GPA,2.3787,1.065,2.234,0.025,0.292,4.465


In [13]:
logit_res.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,32.0
Model:,Logit,Df Residuals:,28.0
Method:,MLE,Df Model:,3.0
Date:,"Wed, 26 Jul 2017",Pseudo R-squ.:,0.374
Time:,14:10:49,Log-Likelihood:,-12.89
converged:,True,LL-Null:,-20.592
,,LLR p-value:,0.001502

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-13.0213,4.931,-2.641,0.008,-22.687,-3.356
x1,2.8261,1.263,2.238,0.025,0.351,5.301
x2,0.0952,0.142,0.672,0.501,-0.182,0.373
x3,2.3787,1.065,2.234,0.025,0.292,4.465
