##### Module import

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor # 다중공선성

In [2]:
from matplotlib import font_manager, rc

# Configure matplotlib to draw text with the font stored at `font_path`
# (presumably so the Korean column names used in this notebook render in
# figures -- confirm). The path is a Windows system-font location, so guard it:
# on a machine without that file the notebook keeps matplotlib's default font
# instead of raising while resolving the font name.
font_path = "C:/Windows/Fonts/NGULIM.TTF"
if os.path.isfile(font_path):
    # Resolve the family name embedded in the font file and make it the default.
    font = font_manager.FontProperties(fname=font_path).get_name()
    rc('font', family=font)
    # Draw negative tick labels with the ASCII hyphen rather than U+2212, which
    # avoids missing-glyph boxes when the selected font lacks that character.
    rc('axes', unicode_minus=False)
else:
    font = None
    print(f"Font file not found: {font_path} -- keeping matplotlib's default font.")

##### Data import

In [3]:
# Show the kernel's working directory; the relative CSV path used when the
# data is loaded resolves against this location.
working_dir = os.getcwd()
print(working_dir)

c:\Users\Owner\Desktop\윤태준\고용패널\고용패널 파이썬 분석\2023_04_06_변수추가


In [4]:
# Load the dataset from the working directory, decoding the file as CP949.
data_file = '2023_04_06_data_step.csv'
df = pd.read_csv(data_file, encoding='cp949')

# Last expression of the cell: list the column labels that were read.
df.columns

Index(['pid', 'majorcat', 'province', 'area', 'sex', 'age', 'a003', 'a010',
       'a141', 'a144', 'a146', 'd016', 'd131', 'd134', 'd136', 'f073', 'f074',
       'h001', 'q001', 'q015', 'q016', 'q017', 'p001', 'First_Work', 'year',
       'Resign', 'a038', 'a039', 'a043', 'a045', 'g191a046', 'a048', 'a392',
       'a140', 'd130', 'Admission', 'Graduation', 'Cur', 'Cur_Work_date',
       'Start_Work1_date', 'End_Work1_date', 'Work_to_Cur', 'Start_to_End',
       'FirstWorkPeriod', 'LongWork', 'LongWork2', 'major_same', 'major_help',
       'big_company', 'work_sati', 'score', 'q001G', 'p001G', 'provinceG',
       'major_sameG', 'major_helpG', 'majorcatG', 'q015G', 'q016G', 'q017G',
       'big_companyG', 'work_satiG', '나이', '학점', '성별', '건강상태', '결혼상태', '대학지역',
       '전공일치', '본전공', '재학중일자리경험', '삶의만족도_개인', '삶의만족도_관계', '삶의만족도_소속집단', '종사자수',
       '업무만족도'],
      dtype='object')

In [5]:
# Columns kept for modelling, listed in the order they appear in df_1.
# First the ASCII-named columns ...
ascii_named_cols = [
    'pid', 'majorcat', 'province', 'area', 'sex', 'age', 'a003', 'a010',
    'a141', 'a144', 'a146', 'd016', 'd131', 'd134', 'd136', 'f073', 'f074',
    'h001', 'q001', 'q015', 'q016', 'q017', 'p001', 'First_Work', 'year',
    'Resign', 'Admission', 'Graduation', 'Cur', 'Cur_Work_date',
    'Start_Work1_date', 'End_Work1_date', 'Work_to_Cur', 'Start_to_End',
    'FirstWorkPeriod', 'LongWork', 'LongWork2', 'major_same', 'major_help',
    'big_company', 'work_sati', 'score', 'q001G', 'p001G', 'provinceG',
    'major_sameG', 'major_helpG', 'majorcatG', 'q015G', 'q016G', 'q017G',
    'big_companyG', 'work_satiG',
]
# ... then the Korean-named columns that the regression formulas refer to.
korean_named_cols = [
    '나이', '성별', '건강상태', '결혼상태', '대학지역', '전공일치', '본전공', '학점',
    '재학중일자리경험', '삶의만족도_개인', '삶의만족도_관계', '삶의만족도_소속집단',
    '종사자수', '업무만족도',
]

# Take an independent copy so later changes to df_1 cannot touch df.
df_1 = df.loc[:, ascii_named_cols + korean_named_cols].copy()

모델 Logistic Regression
1. 인구통계학적 특성
2. 인구통계학 + 전공 연관성, 대학 지역, 학점, 재학 중 경험 일자리(완전 대학 관련)
3. 인구통계학 + 대학 관련 변수(2번 모델과 동일) + 만족도(개인, 관계, 소속 집단, 업무만족도)
---------------------------------------------------
- 근속 종속변수 : LongWork, LongWork2가 존재
    - LongWork2가 2년 이상 일한 사람 
    - 2년 ~ 3년 사이 기간을 분석하기로 했으니 Target : LongWork2로 지정해서 사용

##### 다변량 모델

<h6> 1번 인구통계학적특징 로지스틱 </h6>

In [6]:
# Model 1: logistic regression of LongWork2 on age (나이), sex, health status
# and marital status. Each categorical term names its reference level
# explicitly through Treatment(...): 여자, 건강하지않다 and 미혼.
# NOTE(review): the recorded summary for this fit reports 5808 observations,
# while the summaries for models 2 and 3 report 5568, so the models are not
# estimated on the same rows -- confirm before comparing fit statistics.
formula_1 = '''LongWork2 ~ 나이 + C(성별,Treatment("여자")) + C(건강상태,Treatment("건강하지않다")) 
    + C(결혼상태, Treatment("미혼"))'''
logit_spec_1 = sm.Logit.from_formula(formula_1, data=df_1)
model_1 = logit_spec_1.fit()

print('==================================Model Summary=========================================')
model_1.summary()

Optimization terminated successfully.
         Current function value: 0.544678
         Iterations 6


0,1,2,3
Dep. Variable:,LongWork2,No. Observations:,5808.0
Model:,Logit,Df Residuals:,5802.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 07 Apr 2023",Pseudo R-squ.:,0.0611
Time:,20:48:12,Log-Likelihood:,-3163.5
converged:,True,LL-Null:,-3369.4
Covariance Type:,nonrobust,LLR p-value:,8.731e-87

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-9.1323,0.541,-16.887,0.000,-10.192,-8.072
"C(성별, Treatment(""여자""))[T.남자]",0.2175,0.072,3.040,0.002,0.077,0.358
"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",0.3406,0.103,3.298,0.001,0.138,0.543
"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",0.0796,0.113,0.702,0.483,-0.143,0.302
"C(결혼상태, Treatment(""미혼""))[T.기혼]",0.3799,0.202,1.878,0.060,-0.017,0.776
나이,0.2981,0.021,14.248,0.000,0.257,0.339


In [12]:
print('==================================Odds Ratio=========================================')
# Odds ratios with their confidence bounds: collect the fitted coefficients
# and the lower/upper limits from conf_int() side by side, then exponentiate
# the whole table to move from the log-odds scale to the odds scale.
ci_human = model_1.conf_int()
log_odds_human = pd.DataFrame(
    {"OR": model_1.params, "Lower CI": ci_human[0], "Upper CI": ci_human[1]}
)
odds_ratios_human = np.exp(log_odds_human)
odds_ratios_human



Unnamed: 0,OR,Lower CI,Upper CI
Intercept,0.000108,3.7e-05,0.000312
"C(성별, Treatment(""여자""))[T.남자]",1.242912,1.080308,1.429991
"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",1.40586,1.148189,1.721357
"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",1.082811,0.867054,1.352258
"C(결혼상태, Treatment(""미혼""))[T.기혼]",1.462103,0.983633,2.173315
나이,1.34728,1.293153,1.403673


In [7]:
# Design matrix that model_1 was fitted on.
X = model_1.model.exog

# Variance inflation factor of every design-matrix column, in column order.
vif_1 = []
for col_idx in range(X.shape[1]):
    vif_1.append(variance_inflation_factor(X, col_idx))

# Pair each VIF with the matching parameter label and display the table.
vif_df_1 = pd.DataFrame(
    list(zip(model_1.params.index, vif_1)),
    columns=['Variable', 'VIF'],
)
vif_df_1

Unnamed: 0,Variable,VIF
0,Intercept,269.307172
1,"C(성별, Treatment(""여자""))[T.남자]",1.369344
2,"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",2.392321
3,"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",2.358897
4,"C(결혼상태, Treatment(""미혼""))[T.기혼]",1.015278
5,나이,1.355762


###### 2번 인구통계 + 대학 생활 로지스틱 회귀모형

In [8]:
# Model 2: the model-1 terms plus university region (대학지역), major match
# (전공일치), 학점 and job experience while enrolled (재학중일자리경험).
# Reference levels are fixed through Treatment(...): 여자, 건강하지않다, 미혼,
# 비수도권, 맞지않다 and 없다.
formula_2 = '''LongWork2 ~ 나이 + C(성별,Treatment("여자")) + C(건강상태,Treatment("건강하지않다")) 
    + C(결혼상태, Treatment("미혼"))+ C(대학지역,Treatment('비수도권')) + C(전공일치,Treatment("맞지않다")) 
    + 학점 + C(재학중일자리경험,Treatment('없다'))'''
logit_spec_2 = sm.Logit.from_formula(formula_2, data=df_1)
model_2 = logit_spec_2.fit()
model_2.summary()

Optimization terminated successfully.
         Current function value: 0.528554
         Iterations 6


0,1,2,3
Dep. Variable:,LongWork2,No. Observations:,5568.0
Model:,Logit,Df Residuals:,5557.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 07 Apr 2023",Pseudo R-squ.:,0.09508
Time:,20:49:09,Log-Likelihood:,-2943.0
converged:,True,LL-Null:,-3252.2
Covariance Type:,nonrobust,LLR p-value:,2e-126

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-9.6145,0.598,-16.078,0.000,-10.787,-8.442
"C(성별, Treatment(""여자""))[T.남자]",0.2404,0.076,3.170,0.002,0.092,0.389
"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",0.2869,0.108,2.650,0.008,0.075,0.499
"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",0.0907,0.119,0.763,0.445,-0.142,0.324
"C(결혼상태, Treatment(""미혼""))[T.기혼]",0.3201,0.213,1.505,0.132,-0.097,0.737
"C(대학지역, Treatment('비수도권'))[T.수도권]",0.2251,0.065,3.462,0.001,0.098,0.352
"C(전공일치, Treatment(""맞지않다""))[T.보통이다]",0.5714,0.084,6.802,0.000,0.407,0.736
"C(전공일치, Treatment(""맞지않다""))[T.잘맞다]",1.0627,0.076,13.966,0.000,0.914,1.212
"C(재학중일자리경험, Treatment('없다'))[T.있다]",-0.1348,0.066,-2.043,0.041,-0.264,-0.005


In [9]:
print('==================================Odds Ratio=========================================')
# Odds ratios for model 2, including the confidence bounds from conf_int():
# assemble coefficient / lower / upper columns, then exponentiate them together.
coef_2 = model_2.params
bounds_2 = model_2.conf_int()
odds_ratios_2 = pd.DataFrame(
    {
        "OR": coef_2,
        "Lower CI": bounds_2[0],
        "Upper CI": bounds_2[1],
    }
)
odds_ratios_2 = np.exp(odds_ratios_2)
odds_ratios_2



Unnamed: 0,OR,Lower CI,Upper CI
Intercept,6.7e-05,2.1e-05,0.000216
"C(성별, Treatment(""여자""))[T.남자]",1.271821,1.09611,1.475698
"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",1.332348,1.077622,1.647285
"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",1.094952,0.867468,1.38209
"C(결혼상태, Treatment(""미혼""))[T.기혼]",1.377214,0.907891,2.089149
"C(대학지역, Treatment('비수도권'))[T.수도권]",1.252415,1.102581,1.422612
"C(전공일치, Treatment(""맞지않다""))[T.보통이다]",1.770699,1.501904,2.087599
"C(전공일치, Treatment(""맞지않다""))[T.잘맞다]",2.894271,2.493251,3.359792
"C(재학중일자리경험, Treatment('없다'))[T.있다]",0.873856,0.767799,0.994562
나이,1.355772,1.298379,1.415702


In [10]:
# Design matrix that model_2 was fitted on.
X = model_2.model.exog

# One variance inflation factor per design-matrix column, in column order.
n_design_cols = X.shape[1]
vif_2 = [variance_inflation_factor(X, position) for position in range(n_design_cols)]

# Label each VIF with the corresponding parameter name and show the result.
vif_df_2 = pd.DataFrame(
    list(zip(model_2.params.index, vif_2)),
    columns=['Variable', 'VIF'],
)
vif_df_2

Unnamed: 0,Variable,VIF
0,Intercept,305.795827
1,"C(성별, Treatment(""여자""))[T.남자]",1.412517
2,"C(건강상태, Treatment(""건강하지않다""))[T.건강하다]",2.423541
3,"C(건강상태, Treatment(""건강하지않다""))[T.보통이다]",2.386656
4,"C(결혼상태, Treatment(""미혼""))[T.기혼]",1.016712
5,"C(대학지역, Treatment('비수도권'))[T.수도권]",1.021192
6,"C(전공일치, Treatment(""맞지않다""))[T.보통이다]",1.229206
7,"C(전공일치, Treatment(""맞지않다""))[T.잘맞다]",1.225823
8,"C(재학중일자리경험, Treatment('없다'))[T.있다]",1.029485
9,나이,1.384516


###### 3번 삶의 만족도 로지스틱 

In [11]:
# Model 3: the model-2 terms plus four satisfaction variables
# (삶의만족도_개인, 삶의만족도_관계, 삶의만족도_소속집단, 업무만족도), each using
# 그렇지않다 as its reference level via Treatment(...).
formula_3 = '''LongWork2 ~ 나이 + C(성별,Treatment('여자')) + C(건강상태,Treatment('건강하지않다')) 
    + C(결혼상태, Treatment('미혼'))+ C(대학지역,Treatment('비수도권')) + C(전공일치,Treatment('맞지않다')) 
    + 학점 + C(재학중일자리경험,Treatment('없다')) + C(삶의만족도_개인,Treatment('그렇지않다')) 
    + C(삶의만족도_관계, Treatment('그렇지않다')) + C(삶의만족도_소속집단, Treatment('그렇지않다')) +
    C(업무만족도,Treatment('그렇지않다')) '''
logit_spec_3 = sm.Logit.from_formula(formula_3, data=df_1)
model_3 = logit_spec_3.fit()
model_3.summary()

Optimization terminated successfully.
         Current function value: 0.518176
         Iterations 6


0,1,2,3
Dep. Variable:,LongWork2,No. Observations:,5568.0
Model:,Logit,Df Residuals:,5549.0
Method:,MLE,Df Model:,18.0
Date:,"Fri, 07 Apr 2023",Pseudo R-squ.:,0.1128
Time:,20:49:34,Log-Likelihood:,-2885.2
converged:,True,LL-Null:,-3252.2
Covariance Type:,nonrobust,LLR p-value:,3.468e-144

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-10.1737,0.622,-16.354,0.000,-11.393,-8.954
"C(성별, Treatment('여자'))[T.남자]",0.2399,0.077,3.114,0.002,0.089,0.391
"C(건강상태, Treatment('건강하지않다'))[T.건강하다]",0.1070,0.117,0.911,0.362,-0.123,0.337
"C(건강상태, Treatment('건강하지않다'))[T.보통이다]",0.0307,0.123,0.250,0.802,-0.210,0.271
"C(결혼상태, Treatment('미혼'))[T.기혼]",0.2634,0.213,1.236,0.216,-0.154,0.681
"C(대학지역, Treatment('비수도권'))[T.수도권]",0.2385,0.066,3.622,0.000,0.109,0.368
"C(전공일치, Treatment('맞지않다'))[T.보통이다]",0.5324,0.085,6.243,0.000,0.365,0.699
"C(전공일치, Treatment('맞지않다'))[T.잘맞다]",0.8951,0.078,11.410,0.000,0.741,1.049
"C(재학중일자리경험, Treatment('없다'))[T.있다]",-0.1416,0.067,-2.115,0.034,-0.273,-0.010


In [12]:
print('==================================Odds Ratio=========================================')
# Odds ratios for model 3 together with the conf_int() bounds; the table is
# built on the coefficient scale and exponentiated in a single step.
interval_3 = model_3.conf_int()
odds_ratios_3 = np.exp(
    pd.DataFrame(
        {
            "OR": model_3.params,
            "Lower CI": interval_3[0],
            "Upper CI": interval_3[1],
        }
    )
)
odds_ratios_3



Unnamed: 0,OR,Lower CI,Upper CI
Intercept,3.8e-05,1.1e-05,0.000129
"C(성별, Treatment('여자'))[T.남자]",1.271118,1.092954,1.478324
"C(건강상태, Treatment('건강하지않다'))[T.건강하다]",1.112945,0.884178,1.400902
"C(건강상태, Treatment('건강하지않다'))[T.보통이다]",1.031208,0.810594,1.311864
"C(결혼상태, Treatment('미혼'))[T.기혼]",1.301393,0.856991,1.976244
"C(대학지역, Treatment('비수도권'))[T.수도권]",1.269382,1.11566,1.444284
"C(전공일치, Treatment('맞지않다'))[T.보통이다]",1.702945,1.440854,2.01271
"C(전공일치, Treatment('맞지않다'))[T.잘맞다]",2.447567,2.098734,2.85438
"C(재학중일자리경험, Treatment('없다'))[T.있다]",0.867928,0.761147,0.98969
"C(삶의만족도_개인, Treatment('그렇지않다'))[T.그렇다]",1.433478,1.088294,1.888146


In [13]:
# Design matrix that model_3 was fitted on.
X = model_3.model.exog

# Variance inflation factor for each design-matrix column, in column order.
vif = []
for column_position in range(X.shape[1]):
    vif.append(variance_inflation_factor(X, column_position))

# Combine parameter labels and VIF values into one table for display.
vif_df = pd.DataFrame(
    list(zip(model_3.params.index, vif)),
    columns=['Variable', 'VIF'],
)
vif_df

Unnamed: 0,Variable,VIF
0,Intercept,319.018362
1,"C(성별, Treatment('여자'))[T.남자]",1.422182
2,"C(건강상태, Treatment('건강하지않다'))[T.건강하다]",2.818593
3,"C(건강상태, Treatment('건강하지않다'))[T.보통이다]",2.496576
4,"C(결혼상태, Treatment('미혼'))[T.기혼]",1.020437
5,"C(대학지역, Treatment('비수도권'))[T.수도권]",1.025075
6,"C(전공일치, Treatment('맞지않다'))[T.보통이다]",1.239803
7,"C(전공일치, Treatment('맞지않다'))[T.잘맞다]",1.302388
8,"C(재학중일자리경험, Treatment('없다'))[T.있다]",1.03307
9,"C(삶의만족도_개인, Treatment('그렇지않다'))[T.그렇다]",3.608719
