In [110]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


In [76]:
# Location of the 2024 Seoul Survey household-member workbook (data + codebook).
# NOTE(review): this is an absolute path on one user's machine; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
DATAPATH = '/home/wagyu0923/project/commuting_happiness/data/2024 서울서베이 가구원_data_코드북.xlsx'

# Only the first sheet (index 0) of the workbook is loaded.
excel_data = pd.read_excel(io=DATAPATH, sheet_name=0)


In [103]:
# Keep only the survey variables this analysis uses.
# Entries are the raw questionnaire codes found in the workbook header; the
# trailing comment on each line is the author's description of that code.
using_cols = [
    # Core variables
    'Q34',       # stress over the past two weeks
    'Q20',       # whether the respondent commutes (work/school)
    'Q20C1',     # commute time: hours part
    'Q20C2',     # commute time: minutes part

    # Controls: personal characteristics
    'SQ1_2',     # gender
    'SQ1_3',     # year of birth
    'SQ1_4',     # marital status
    'SQ1_7',     # registered-disability status

    # Controls: household / income
    'FAM1',      # total number of household members
    'AQ1',       # average monthly household income

    # Controls: education / occupation / labour
    'DQ1',       # education level
    'DQ3',       # occupation
    'DQ3A',      # employment type
    'DQ3B1',     # average weekly working time: hours part
    'DQ3B2',     # average weekly working time: minutes part

    # Controls: housing / region
    'GU',        # district (gu) code of residence
    'SQ0_2',     # housing type
    'SQ0_3',     # housing tenure type
]

# Select first, then copy: this yields the same independent frame as copying the
# whole workbook and then selecting, without duplicating every unused column.
# A code missing from the workbook still raises KeyError here.
raw_df = excel_data[using_cols].copy()

In [104]:
# Questionnaire code -> readable column name used throughout the rest of the notebook.
rename_dict = {
    # outcome and commute questions
    'Q34': 'Stress_Level',
    'Q20': 'Commute_Status',
    'Q20C1': 'Commute_Hour',
    'Q20C2': 'Commute_Min',
    # personal characteristics
    'SQ1_2': 'Gender',
    'SQ1_3': 'Birth_Year',
    'SQ1_4': 'Marital_Status',
    'SQ1_7': 'Disabled_Reg',
    # household and income
    'FAM1': 'Household_Size',
    'AQ1': 'Household_Income',
    # education and work (the two Work_* columns are the hours and minutes
    # parts of one weekly working-time answer)
    'DQ1': 'Education',
    'DQ3': 'Job_Type',
    'DQ3A': 'Employment_Type',
    'DQ3B1': 'Work_Hours_Week',
    'DQ3B2': 'Work_Mins_Week',
    # housing and region
    'GU': 'District_Code',
    'SQ0_2': 'House_Type',
    'SQ0_3': 'House_Occupancy',
}

# rename() returns a new frame; codes absent from raw_df would be skipped silently.
raw_df = raw_df.rename(columns=rename_dict)

In [105]:
# Complete-case filter: a respondent is removed if ANY selected column is missing.
# The printed labels read "rows before / after removing missing values".
# NOTE(review): this presumably also removes everyone who skipped the commute or
# working-time follow-up questions (e.g. non-commuters) -- confirm that restricting
# the sample this way is intended.
print('결측치 제거 전 행 수:', len(raw_df))
# .copy() after the filter gives an independent frame that later cells can add
# columns to without touching raw_df.
df = raw_df.dropna().copy()
print('결측치 제거 후 행 수:', df.shape[0])


결측치 제거 전 행 수: 36280
결측치 제거 후 행 수: 24707


In [106]:
# Derive the analysis variables from the raw survey answers.
#
# All inputs are read from `raw_df` (limited to the rows kept in `df`), never from
# `df` itself. That makes the cell safe to re-run: the 1/2 -> 0/1 recodes below are
# not idempotent when applied to already-recoded data (a second pass would turn
# every 1 into 0), and the hour/minute inputs are dropped from `df` by a later cell.
SURVEY_YEAR = 2024  # survey wave, used to convert year of birth into age

kept_rows = raw_df.loc[df.index]

df = df.assign(
    # Binary recode of the survey's 1/2 answers: code 1 -> 0, code 2 -> 1.
    # For Gender the author's convention is male = 0, female = 1.
    Gender=kept_rows['Gender'].replace({1: 0, 2: 1}),
    # NOTE(review): the same recode is applied to Disabled_Reg. The recorded
    # describe() output shows a mean of ~0.995 for this column, so a value of 1
    # presumably means "NOT registered as disabled" -- confirm against the
    # codebook before interpreting its coefficient.
    Disabled_Reg=kept_rows['Disabled_Reg'].replace({1: 0, 2: 1}),
    # Housing tenure collapsed to a dummy: code 1 (owner-occupied, per the
    # author's note) stays 1, codes 2-6 become 0.
    House_Occupancy=kept_rows['House_Occupancy'].replace(
        {1: 1, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
    ),
    # Hours + minutes answers combined into total minutes.
    Commute_Time=kept_rows['Commute_Hour'] * 60 + kept_rows['Commute_Min'],
    Work_Time=kept_rows['Work_Hours_Week'] * 60 + kept_rows['Work_Mins_Week'],
    # Age in the survey year.
    Age=SURVEY_YEAR - kept_rows['Birth_Year'],
)


In [107]:
# Raw inputs that are no longer needed once the derived columns exist: the
# hour/minute parts and birth year are superseded by Commute_Time, Work_Time and
# Age. Commute_Status is removed without a replacement.
# NOTE(review): Commute_Status is presumably constant in the complete-case sample --
# confirm before relying on that.
not_use_anymore = [
    'Commute_Status',
    'Commute_Hour',
    'Commute_Min',
    'Birth_Year',
    'Work_Hours_Week',
    'Work_Mins_Week',
]

# errors='ignore' keeps the cell re-runnable: on a second execution the columns
# are already gone, and the default errors='raise' would stop with KeyError.
df = df.drop(columns=not_use_anymore, errors='ignore')

In [108]:
# Summary statistics for every column of the analysis frame.
# Several columns hold category codes (e.g. District_Code and Job_Type, which the
# regression formula wraps in C(...)), so their mean/std rows are not meaningful;
# count/min/max remain useful as range checks.
df.describe()

Unnamed: 0,Stress_Level,Gender,Marital_Status,Disabled_Reg,Household_Size,Household_Income,Education,Job_Type,Employment_Type,District_Code,House_Type,House_Occupancy,Commute_Time,Work_Time,Age
count,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0
mean,3.135103,0.419031,1.58963,0.995062,2.394908,11.999798,5.139717,4.162545,2.008419,436.216457,2.201522,0.510786,33.504351,2549.930303,49.1058
std,0.911511,0.49341,1.024872,0.070098,0.996267,4.264721,0.972954,1.930376,1.779793,179.97364,0.935109,0.499894,17.637267,476.604444,13.321352
min,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,110.0,1.0,0.0,1.0,120.0,19.0
25%,2.0,0.0,1.0,1.0,2.0,9.0,4.0,3.0,1.0,290.0,2.0,0.0,20.0,2400.0,37.0
50%,3.0,0.0,1.0,1.0,2.0,12.0,5.0,4.0,1.0,440.0,2.0,1.0,30.0,2400.0,49.0
75%,4.0,1.0,2.0,1.0,3.0,15.0,6.0,5.0,2.0,590.0,3.0,1.0,40.0,2700.0,60.0
max,5.0,1.0,5.0,1.0,8.0,21.0,7.0,13.0,7.0,740.0,4.0,1.0,150.0,10020.0,90.0


In [None]:
# Scatter-matrix of every column in the analysis frame against every other.
# NOTE(review): per the recorded describe() output the frame has 15 columns and
# 24,707 rows, so this draws a 15x15 grid and is likely slow; this cell has no
# execution count. Consider plotting a sample and only the continuous columns.
sns.pairplot(data=df)

In [114]:
# OLS of stress level on commute time plus controls.
# Terms wrapped in C(...) are expanded into dummy variables by the formula
# interface; the remaining terms enter as single numeric regressors.
# The adjacent literals below concatenate to one formula string.
stress_formula = (
    "Stress_Level ~ Gender +  C(Marital_Status) + Disabled_Reg + Household_Size"
    " + Household_Income + Education + C(Job_Type) + C(Employment_Type)"
    " + C(District_Code) +  C(House_Type) + House_Occupancy"
    " + Commute_Time + Work_Time + Age"
)

ols_spec = smf.ols(formula=stress_formula, data=df)
model = ols_spec.fit()

In [115]:
# Render the regression results table for the fitted model.
# NOTE(review): the recorded output reports Cond. No. of about 1.19e+05. Work_Time
# is measured in minutes (max 10020 in the describe() output), so the large value
# may simply reflect regressor scaling -- consider rescaling before reading it as
# multicollinearity.
model.summary()

0,1,2,3
Dep. Variable:,Stress_Level,R-squared:,0.05
Model:,OLS,Adj. R-squared:,0.048
Method:,Least Squares,F-statistic:,23.59
Date:,"Sat, 06 Dec 2025",Prob (F-statistic):,1.25e-228
Time:,00:31:24,Log-Likelihood:,-32134.0
No. Observations:,24707,AIC:,64380.0
Df Residuals:,24651,BIC:,64840.0
Df Model:,55,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.4506,0.123,27.957,0.000,3.209,3.693
C(Marital_Status)[T.2],-0.0408,0.018,-2.211,0.027,-0.077,-0.005
C(Marital_Status)[T.3],0.1402,0.025,5.606,0.000,0.091,0.189
C(Marital_Status)[T.4],0.0403,0.064,0.630,0.529,-0.085,0.166
C(Marital_Status)[T.5],0.0222,0.030,0.748,0.455,-0.036,0.080
C(Job_Type)[T.2],0.1290,0.037,3.518,0.000,0.057,0.201
C(Job_Type)[T.3],-0.0266,0.031,-0.855,0.393,-0.087,0.034
C(Job_Type)[T.4],0.0030,0.032,0.092,0.927,-0.060,0.066
C(Job_Type)[T.5],0.0476,0.033,1.422,0.155,-0.018,0.113

0,1,2,3
Omnibus:,1410.483,Durbin-Watson:,1.374
Prob(Omnibus):,0.0,Jarque-Bera (JB):,620.854
Skew:,-0.17,Prob(JB):,1.5199999999999998e-135
Kurtosis:,2.302,Cond. No.,119000.0
