#### packages

In [6]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn
import scipy

In [7]:
pd.options.display.float_format = '{:.5f}'.format

In [4]:
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')

  set_matplotlib_formats('retina')


##### Data

In [8]:
df = pd.read_csv('2023_07_27_data_step.csv', encoding='cp949')
df.isnull().sum()

pid              0
majorcat         0
province         0
area             0
sex              0
              ... 
삶의만족도_개인         0
삶의만족도_관계         0
삶의만족도_소속집단       0
종사자수          1235
업무만족도            0
Length: 117, dtype: int64

In [9]:
df_copy = df.copy()

In [10]:
df_copy.columns[100:]

Index(['supply', 'insurance', 'work_timeB', '나이', '학점', '성별', '건강상태', '결혼상태',
       '대학지역', '전공일치', '본전공', '재학중일자리경험', '삶의만족도_개인', '삶의만족도_관계', '삶의만족도_소속집단',
       '종사자수', '업무만족도'],
      dtype='object')

In [11]:
daegu = df_copy[df_copy['area']=='대구'].copy()

model = sm.Logit.from_formula('''LongWork2 ~ age + C(성별,Treatment("여자")) + C(건강상태) + C(결혼상태)''', daegu).fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.333293
         Iterations 7


0,1,2,3
Dep. Variable:,LongWork2,No. Observations:,247.0
Model:,Logit,Df Residuals:,241.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 28 Jul 2023",Pseudo R-squ.:,0.1682
Time:,16:57:37,Log-Likelihood:,-82.323
converged:,True,LL-Null:,-98.967
Covariance Type:,nonrobust,LLR p-value:,3.299e-06

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-10.6423,3.018,-3.526,0.000,-16.558,-4.727
"C(성별, Treatment(""여자""))[T.남자]",0.1697,0.487,0.348,0.728,-0.785,1.125
C(건강상태)[T.건강하지않다],-0.6552,0.804,-0.815,0.415,-2.230,0.920
C(건강상태)[T.보통이다],-0.2733,0.490,-0.558,0.577,-1.233,0.687
C(결혼상태)[T.미혼],-1.1813,0.875,-1.350,0.177,-2.896,0.533
age,0.3752,0.101,3.722,0.000,0.178,0.573


In [12]:
print(f'모델 AIC:{model.aic}')
print('\n')

# 오즈비 계산(Confidence interval 포함)
odds_ratios = pd.DataFrame(
    {
        "OR": model.params,
        "Lower CI": model.conf_int()[0],
        "Upper CI": model.conf_int()[1],
    }
)
odds_ratios = np.exp(odds_ratios)
odds_ratios

모델 AIC:176.64696084126564




Unnamed: 0,OR,Lower CI,Upper CI
Intercept,2e-05,0.0,0.00886
"C(성별, Treatment(""여자""))[T.남자]",1.18499,0.45608,3.07883
C(건강상태)[T.건강하지않다],0.51934,0.10751,2.50884
C(건강상태)[T.보통이다],0.76083,0.29133,1.98694
C(결혼상태)[T.미혼],0.30689,0.05526,1.70431
age,1.45524,1.19439,1.77306


In [13]:
model_2 = sm.Logit.from_formula('''LongWork2 ~ 나이 + C(성별,Treatment("여자")) + C(건강상태,Treatment("건강하지않다")) 
    + C(결혼상태, Treatment("미혼"))+ C(대학지역,Treatment('비수도권')) + C(전공일치,Treatment("맞지않다")) 
    + 학점 + C(재학중일자리경험,Treatment('없다'))''', df_copy).fit()
model_2.summary()

Optimization terminated successfully.
         Current function value: 0.334900
         Iterations 7


0,1,2,3
Dep. Variable:,LongWork2,No. Observations:,4783.0
Model:,Logit,Df Residuals:,4772.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 28 Jul 2023",Pseudo R-squ.:,0.1387
Time:,16:57:39,Log-Likelihood:,-1601.8
converged:,True,LL-Null:,-1859.9
Covariance Type:,nonrobust,LLR p-value:,1.629e-104

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-9.7013,0.654,-14.842,0.000,-10.982,-8.420
"C(성별, Treatment(""여자""))[T.남자]",-0.1505,0.106,-1.426,0.154,-0.358,0.056
"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",0.1175,0.151,0.780,0.435,-0.178,0.413
"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",-0.0689,0.167,-0.414,0.679,-0.395,0.258
"C(결혼상태, Treatment(""미혼""))[T.기혼]",1.3639,0.202,6.744,0.000,0.967,1.760
"C(대학지역, Treatment('비수도권'))[T.수도권]",0.0915,0.094,0.975,0.330,-0.092,0.275
"C(전공일치, Treatment(""맞지않다""))[T.보통이다]",0.0669,0.120,0.559,0.576,-0.168,0.302
"C(전공일치, Treatment(""맞지않다""))[T.잘맞다]",0.5293,0.107,4.953,0.000,0.320,0.739
"C(재학중일자리경험, Treatment('없다'))[T.있다]",-0.8043,0.094,-8.589,0.000,-0.988,-0.621


In [10]:
### 수도권
# 대구 셋, 수도권 셋, 지방권 셋
capital = df_copy[(df_copy['area']=='서울')|(df_copy['area']=='경기')].copy()

country_side = df_copy[(df_copy['area']!='서울')&(df_copy['area']!='경기')].copy()


In [15]:
daegu['LongWork2'].value_counts()

LongWork2
0    213
1     34
Name: count, dtype: int64