In [1]:
#import modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score



In [3]:
#load data and rename columns
dta = sm.datasets.fair.load_pandas().data
dta['affair'] = (dta.affairs > 0).astype(int)
y, X = dmatrices('affair ~ rate_marriage + age + yrs_married + children + \
religious + educ + C(occupation) + C(occupation_husb)',
dta, return_type="dataframe")

X = X.rename(columns = {'C(occupation)[T.2.0]':'occ_2',
'C(occupation)[T.3.0]':'occ_3',

'C(occupation)[T.4.0]':'occ_4',
'C(occupation)[T.5.0]':'occ_5',
'C(occupation)[T.6.0]':'occ_6',
'C(occupation_husb)[T.2.0]':'occ_husb_2',
'C(occupation_husb)[T.3.0]':'occ_husb_3',
'C(occupation_husb)[T.4.0]':'occ_husb_4',
'C(occupation_husb)[T.5.0]':'occ_husb_5',
'C(occupation_husb)[T.6.0]':'occ_husb_6'})
y = np.ravel(y)

In [4]:
dta.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [24]:
#look at some of the data
print(dta.describe())
print("Mean # of women who had an affair %f" %dta.affair.mean())
print("Mean # of affairs %f" %dta.affairs.mean())
unique, counts = np.unique(dta.affair, return_counts=True)
print("Number of women who had an affair %d" %counts[1])

       rate_marriage          age  yrs_married     children    religious  \
count    6366.000000  6366.000000  6366.000000  6366.000000  6366.000000   
mean        4.109645    29.082862     9.009425     1.396874     2.426170   
std         0.961430     6.847882     7.280120     1.433471     0.878369   
min         1.000000    17.500000     0.500000     0.000000     1.000000   
25%         4.000000    22.000000     2.500000     0.000000     2.000000   
50%         4.000000    27.000000     6.000000     1.000000     2.000000   
75%         5.000000    32.000000    16.500000     2.000000     3.000000   
max         5.000000    42.000000    23.000000     5.500000     4.000000   

              educ   occupation  occupation_husb      affairs       affair  
count  6366.000000  6366.000000      6366.000000  6366.000000  6366.000000  
mean     14.209865     3.424128         3.850141     0.705374     0.322495  
std       2.178003     0.942399         1.346435     2.203374     0.467468  
min    

In [25]:
cols_tokeep = ['rate_marriage','age','yrs_married','religious','educ','affair']
data = dta[cols_tokeep]
data.head()

Unnamed: 0,rate_marriage,age,yrs_married,religious,educ,affair
0,3.0,32.0,9.0,3.0,17.0,1
1,3.0,27.0,13.0,1.0,14.0,1
2,4.0,22.0,2.5,1.0,16.0,1
3,4.0,37.0,16.5,3.0,16.0,1
4,5.0,27.0,9.0,1.0,14.0,1


In [37]:
train_cols = data.columns[0:5]
logit = sm.Logit(data['affair'], data[train_cols])
result = logit.fit()

Optimization terminated successfully.
         Current function value: 0.561212
         Iterations 5


In [38]:
result.summary()

0,1,2,3
Dep. Variable:,affair,No. Observations:,6366.0
Model:,Logit,Df Residuals:,6361.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 17 Jan 2019",Pseudo R-squ.:,0.1074
Time:,09:26:11,Log-Likelihood:,-3572.7
converged:,True,LL-Null:,-4002.5
,,LLR p-value:,8.912999999999999e-185

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
rate_marriage,-0.5490,0.028,-19.550,0.000,-0.604,-0.494
age,0.0170,0.008,2.106,0.035,0.001,0.033
yrs_married,0.0540,0.008,6.481,0.000,0.038,0.070
religious,-0.2790,0.033,-8.416,0.000,-0.344,-0.214
educ,0.0761,0.012,6.270,0.000,0.052,0.100


In [39]:
print("From the summary we can see some of the intuitively chosen variables hold a high probability of predicting an affair.")
print("How people rate their marriage, the number of years married, how they rate their level of religion, and level of education all show a very low P value.")

From the summary we can see some of the intuitively chosen variables hold a high probability of predicting an affair.
How people rate their marriage, the number of years married, how they rate their level of religion, and level of education all show a very low P value.
