In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

##### SAS Data import

In [2]:
# Show the kernel's working directory; the relative CSV path used by the load
# cell is resolved against it.
# NOTE(review): the recorded output of this cell is an absolute local path --
# clear it before sharing the notebook.
working_dir = os.getcwd()
print(working_dir)

c:\Users\Owner\Desktop\윤태준\고용패널\3월 분석


In [3]:
# Read the analysis table from a CP949-encoded CSV in the working directory.
# NOTE(review): the recorded preview contains an "Unnamed: 0" column, which
# looks like a row index written out by an earlier to_csv call -- confirm
# whether the file should be read with index_col=0 instead.
csv_name = '2023_03_30_data_step.csv'
df = pd.read_csv(csv_name, encoding='cp949')

# Preview the first rows (rich display as the cell's last expression).
df.head()

Unnamed: 0,pid,majorcat,province,area,sex,age,a003,a144,a146,d134,...,성별,건강상태,결혼상태,대학지역,전공일치,본전공,재학중일자리경험,삶의만족도_개인,삶의만족도_관계,삶의만족도_소속집단
0,100166,1,2,8,2,22.5,,,,3.0,...,여자,보통이다,미혼,수도권,보통이다,문과,있다,그렇다,그렇지않다,그렇지않다
1,100191,1,4,7,2,26.916667,,,,3.0,...,여자,건강하다,미혼,비수도권,보통이다,문과,없다,보통이다,그렇지않다,그렇지않다
2,100212,1,2,8,2,24.666667,,,,1.0,...,여자,보통이다,미혼,수도권,맞지않다,문과,있다,보통이다,보통이다,보통이다
3,100221,1,2,8,2,25.25,,,,1.0,...,여자,건강하다,미혼,수도권,맞지않다,문과,있다,그렇지않다,그렇지않다,그렇지않다
4,100232,1,3,11,1,25.25,2.0,2.0,2.0,,...,남자,건강하다,미혼,비수도권,맞지않다,문과,있다,그렇지않다,그렇지않다,그렇지않다


모델 Logistic Regression
1. 인구통계학적 특성
2. 전공 연관성, 대학 지역, 학점, 재학 중 일자리 경험(모두 대학 생활 관련 변수)
3. 삶의 만족도(개인, 관계, 소속 집단)
    3-1 삶의 만족도 평균
4. full model
---------------------------------------------------
- 근속 종속변수: LongWork, LongWork2가 존재
    - LongWork2는 2년 이상 일한 사람을 나타냄
    - 2년 ~ 3년 사이 기간을 분석하기로 했으므로 Target은 LongWork2로 지정해서 사용

###### 1번 인구통계학적 특성 로지스틱 회귀모형

In [4]:
# Model 1 -- demographic predictors only: age, sex, health status, marital status.
# Sex uses "여자" as the reference level; no explicit reference level is set for
# the other two categorical terms.
human_spec = sm.Logit.from_formula(
    '''LongWork2 ~ age + C(성별,Treatment("여자")) + C(건강상태) + C(결혼상태)''',
    data=df,
)
model_human = human_spec.fit()

# Report the coefficient table, the AIC, and a blank separator, in that order.
for report_item in (model_human.summary(), f'모델 AIC:{model_human.aic}', '\n'):
    print(report_item)

# Odds ratios with confidence bounds: exponentiate the coefficients together
# with the lower/upper limits returned by conf_int() (default alpha).
human_ci = model_human.conf_int()
odds_ratios = np.exp(
    pd.DataFrame(
        {
            "OR": model_human.params,
            "Lower CI": human_ci[0],
            "Upper CI": human_ci[1],
        }
    )
)
print(odds_ratios)

Optimization terminated successfully.
         Current function value: 0.544678
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:              LongWork2   No. Observations:                 5808
Model:                          Logit   Df Residuals:                     5802
Method:                           MLE   Df Model:                            5
Date:                Fri, 31 Mar 2023   Pseudo R-squ.:                 0.06110
Time:                        06:07:44   Log-Likelihood:                -3163.5
converged:                       True   LL-Null:                       -3369.4
Covariance Type:            nonrobust   LLR p-value:                 8.731e-87
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept                       -8.4118      0.591    -14.245      0.000

###### 2번 대학 생활 로지스틱 회귀모형

In [5]:
# Model 2 -- college-life predictors only: university region (reference
# '비수도권'), major match, score, and job experience while enrolled
# (reference '없다').
# NOTE(review): the recorded output reports 5568 observations here versus 5808
# for model 1 -- presumably rows with missing predictor values are dropped;
# confirm which column is responsible.
college_formula = '''LongWork2 ~ C(대학지역,Treatment('비수도권')) + C(전공일치) 
    + score + C(재학중일자리경험,Treatment('없다'))'''
model_college = sm.Logit.from_formula(college_formula, data=df).fit()

# Coefficient table, AIC, then a blank separator (same text as three
# consecutive print calls would emit).
print(model_college.summary(), f'모델 AIC:{model_college.aic}', '\n', sep='\n')

# Odds ratios with confidence bounds: attach the interval limits to the
# coefficients, then exponentiate the whole table.
college_bounds = model_college.conf_int().rename(columns={0: "Lower CI", 1: "Upper CI"})
odds_ratios = np.exp(model_college.params.to_frame("OR").join(college_bounds))
print(odds_ratios)


Optimization terminated successfully.
         Current function value: 0.563673
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:              LongWork2   No. Observations:                 5568
Model:                          Logit   Df Residuals:                     5562
Method:                           MLE   Df Model:                            5
Date:                Fri, 31 Mar 2023   Pseudo R-squ.:                 0.03495
Time:                        06:07:46   Log-Likelihood:                -3138.5
converged:                       True   LL-Null:                       -3252.2
Covariance Type:            nonrobust   LLR p-value:                 4.009e-47
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -1.4639      0.202    

###### 3번 삶의 만족도 로지스틱 회귀모형

In [7]:
# Model 3 -- life-satisfaction predictors only (personal, relationships,
# affiliated group). Each item is categorical with '그렇지않다' as the
# reference level.
model_satisfaction = sm.Logit.from_formula('''LongWork2 ~ C(삶의만족도_개인,Treatment('그렇지않다')) 
    + C(삶의만족도_관계, Treatment('그렇지않다')) + C(삶의만족도_소속집단, Treatment('그렇지않다')) ''', df).fit()
print(model_satisfaction.summary())
print(f'모델 AIC:{model_satisfaction.aic}')
print('\n')

# Odds ratios with confidence bounds, in the same OR / Lower CI / Upper CI
# table layout the other model cells print. Previously this cell printed only
# the point estimates, with the label fused onto the first row of the Series
# repr. The table is stored under its own name so the shared `odds_ratios`
# variable is left untouched by this cell.
satisfaction_ci = model_satisfaction.conf_int()
odds_ratios_satisfaction = np.exp(
    pd.DataFrame(
        {
            "OR": model_satisfaction.params,
            "Lower CI": satisfaction_ci[0],
            "Upper CI": satisfaction_ci[1],
        }
    )
)
print(odds_ratios_satisfaction)

Optimization terminated successfully.
         Current function value: 0.572184
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:              LongWork2   No. Observations:                 5808
Model:                          Logit   Df Residuals:                     5801
Method:                           MLE   Df Model:                            6
Date:                Fri, 31 Mar 2023   Pseudo R-squ.:                 0.01369
Time:                        06:08:52   Log-Likelihood:                -3323.2
converged:                       True   LL-Null:                       -3369.4
Covariance Type:            nonrobust   LLR p-value:                 1.040e-17
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
Intercept                                    -

###### 4번 Full model

In [9]:
# Model 4 -- full model: demographic, college-life and life-satisfaction
# predictors together.
# NOTE(review): the recorded outputs show 5568 observations for this model and
# model 2 but 5808 for models 1 and 3, so AIC / log-likelihood values are not
# directly comparable across the four models unless all are refit on the same
# rows.
full_formula = '''LongWork2 ~ age + C(성별,Treatment("여자")) + C(건강상태) 
    + C(결혼상태) + C(대학지역,Treatment('비수도권')) + C(전공일치) + score 
    + C(재학중일자리경험,Treatment('없다')) + C(삶의만족도_개인,Treatment('그렇지않다'))
    + C(삶의만족도_관계, Treatment('그렇지않다')) + C(삶의만족도_소속집단, Treatment('그렇지않다'))'''
full_spec = sm.Logit.from_formula(full_formula, data=df)
model_full = full_spec.fit()

# Coefficient table, AIC, then a blank separator.
print(model_full.summary())
full_aic = model_full.aic
print(f'모델 AIC:{full_aic}')
print('\n')

# Odds ratios with confidence bounds: exponentiate the coefficients and each
# interval limit from conf_int() (default alpha), then assemble the table.
full_ci = model_full.conf_int()
odds_ratios = pd.DataFrame(
    {
        "OR": np.exp(model_full.params),
        "Lower CI": np.exp(full_ci[0]),
        "Upper CI": np.exp(full_ci[1]),
    }
)
print(odds_ratios)



Optimization terminated successfully.
         Current function value: 0.525648
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:              LongWork2   No. Observations:                 5568
Model:                          Logit   Df Residuals:                     5551
Method:                           MLE   Df Model:                           16
Date:                Fri, 31 Mar 2023   Pseudo R-squ.:                  0.1000
Time:                        06:11:10   Log-Likelihood:                -2926.8
converged:                       True   LL-Null:                       -3252.2
Covariance Type:            nonrobust   LLR p-value:                3.824e-128
                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
Intercept                                    -