### 데이터 불러오기

In [153]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf


In [120]:
# Absolute path to the survey workbook (household-member data plus codebook).
DATAPATH = '/home/wagyu0923/project/commuting_happiness/data/2024 서울서베이 가구원_data_코드북.xlsx'

# Read only the first worksheet (index 0) of the workbook into a DataFrame.
excel_data = pd.read_excel(io=DATAPATH, sheet_name=0)


### 데이터 전처리

In [134]:
# Keep only the survey columns that this analysis uses.
using_cols = [
    # Core variables
    'Q34',       # stress over the past two weeks
    'Q20',       # whether the respondent commutes to work/school
    'Q20C1',     # commute time, hours part
    'Q20C2',     # commute time, minutes part

    # Controls - personal characteristics
    'SQ1_2',     # gender
    'SQ1_3',     # birth year
    'SQ1_4',     # marital status
    'SQ1_7',     # registered-disability status

    # Controls - household / income
    'FAM1',      # total number of household members
    'AQ1',       # average monthly household income

    # Controls - education / occupation / labor
    'DQ1',       # education level
    'DQ3A',      # employment type
    'DQ3B1',     # average weekly working time, hours part
    'DQ3B2',     # average weekly working time, minutes part

    # Controls - housing / region
    'GU',        # district (gu) code of residence
    'SQ0_2',     # housing type
    'SQ0_3',     # housing tenure type
]

# Select the columns first and copy only that subset. Copying the whole
# workbook frame before selecting would duplicate every unused column just to
# throw it away; the result here is the same independent DataFrame.
raw_df = excel_data[using_cols].copy()

In [137]:
# Map survey question codes to readable English column names.
rename_dict = {
    # Outcome and commute questions
    'Q34': 'Stress_Level',
    'Q20': 'Commute_Status',
    'Q20C1': 'Commute_Hour',
    'Q20C2': 'Commute_Min',
    # Personal characteristics
    'SQ1_2': 'Gender',
    'SQ1_3': 'Birth_Year',
    'SQ1_4': 'Marital_Status',
    'SQ1_7': 'Disabled_Reg',
    # Household and income
    'FAM1': 'Household_Size',
    'AQ1': 'Household_Income',
    # Education and work
    'DQ1': 'Education',
    'DQ3A': 'Employment_Type',
    'DQ3B1': 'Work_Hours_Week',
    'DQ3B2': 'Work_Mins_Week',
    # Housing and region
    'GU': 'District_Code',
    'SQ0_2': 'House_Type',
    'SQ0_3': 'House_Occupancy',
}

# Apply the mapping to the column axis; rename returns a new DataFrame.
raw_df = raw_df.rename(rename_dict, axis='columns')

In [147]:
# Drop every row that has a missing value in any selected column, and report
# the row count before and after. The trailing copy() keeps mid_df independent
# of raw_df so the column assignments below do not touch the original frame.
print('결측치 제거 전 행 수:' ,len(raw_df))
mid_df = raw_df.dropna().copy()
print('결측치 제거 후 행 수:' ,len(mid_df))

# Merge each (hours, minutes) column pair into one duration expressed in hours.
for hours_name, hour_part, minute_part in (
    ('Commute_Time', 'Commute_Hour', 'Commute_Min'),
    ('Work_Time', 'Work_Hours_Week', 'Work_Mins_Week'),
):
    total_minutes = mid_df[hour_part]*60 + mid_df[minute_part]
    mid_df[hours_name] = total_minutes/60

# Derive age from birth year, measured against the 2024 survey year.
SURVEY_YEAR = 2024
mid_df['Age'] = SURVEY_YEAR - mid_df['Birth_Year']

# Recode the 1/2 coded columns to 0/1 (1 -> 0, 2 -> 1). For Gender this gives
# male = 0, female = 1 per the original comment. Disabled_Reg receives the same
# recode. NOTE(review): confirm in the codebook which original code denotes a
# registered disability.
one_two_coded = ['Gender', 'Disabled_Reg']
mid_df[one_two_coded] = mid_df[one_two_coded].replace({1: 0, 2: 1})

# Collapse tenure into a binary flag: code 1 (owner-occupied, per the original
# comment) stays 1 and codes 2-6 all become 0.
tenure_recode = {code: int(code == 1) for code in range(1, 7)}
mid_df['House_Occupancy'] = mid_df['House_Occupancy'].replace(tenure_recode)


결측치 제거 전 행 수: 36280
결측치 제거 후 행 수: 24707


In [148]:
# Source columns that have already been folded into derived variables
# (Commute_Time, Work_Time, Age), plus the commute-status flag.
not_use_anymore = [
    'Commute_Status',
    'Commute_Hour',
    'Commute_Min',
    'Birth_Year',
    'Work_Hours_Week',
    'Work_Mins_Week',
]

# drop() returns a new DataFrame, so mid_df itself is left untouched.
later_df = mid_df.drop(columns=not_use_anymore)

In [None]:
# Mean-center the variables treated as continuous.
cols_to_cent = ['Household_Income', 'Commute_Time', 'Work_Time', 'Age']

# assign() works on a copy of later_df and overwrites each listed column in
# place with (value - column mean); all other columns are carried over as-is.
df_final = later_df.assign(
    **{name: later_df[name] - later_df[name].mean() for name in cols_to_cent}
)

In [150]:
# Show summary statistics for every column of the final frame; the bare
# expression is what the notebook renders as the cell output.
df_final.describe()

Unnamed: 0,Stress_Level,Gender,Marital_Status,Disabled_Reg,Household_Size,Household_Income,Education,Employment_Type,District_Code,House_Type,House_Occupancy,Commute_Time,Work_Time,Age
count,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0,24707.0
mean,3.135103,0.419031,1.58963,0.995062,2.394908,-1.030714e-15,5.139717,2.008419,436.216457,2.201522,0.510786,5.492924000000001e-17,3.128953e-16,-9.018748e-16
std,0.911511,0.49341,1.024872,0.070098,0.996267,4.264721,0.972954,1.779793,179.97364,0.935109,0.499894,0.2939545,7.943407,13.32135
min,1.0,0.0,1.0,0.0,1.0,-10.9998,1.0,1.0,110.0,1.0,0.0,-0.5417392,-40.49884,-30.1058
25%,2.0,0.0,1.0,1.0,2.0,-2.999798,4.0,1.0,290.0,2.0,0.0,-0.2250725,-2.498838,-12.1058
50%,3.0,0.0,1.0,1.0,2.0,0.0002023718,5.0,1.0,440.0,2.0,1.0,-0.05840585,-2.498838,-0.1058
75%,4.0,1.0,2.0,1.0,3.0,3.000202,6.0,2.0,590.0,3.0,1.0,0.1082608,2.501162,10.8942
max,5.0,1.0,5.0,1.0,8.0,9.000202,7.0,7.0,740.0,4.0,1.0,1.941594,124.5012,40.8942


### 매칭 전 결과

In [160]:
# Pre-matching OLS of stress on the controls, the commute time, and the
# commute-time x home-ownership interaction. Columns wrapped in C(...) are
# entered as categorical terms. The adjacent literals concatenate into a
# single formula string.
stress_formula = (
    "Stress_Level ~ "
    "Gender + C(Marital_Status) + Disabled_Reg + "
    "Household_Size + Household_Income + "
    "Education + C(Employment_Type)+ "
    "C(District_Code) +  C(House_Type) + "
    "House_Occupancy +Commute_Time + Commute_Time:House_Occupancy + "
    "Work_Time + Age"
)
ols_spec = smf.ols(formula=stress_formula, data=df_final)

# Use cluster-robust standard errors, clustered on the district code.
district_groups = df_final['District_Code']
model = ols_spec.fit(cov_type='cluster', cov_kwds={'groups': district_groups})
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           Stress_Level   R-squared:                       0.048
Model:                            OLS   Adj. R-squared:                  0.046
Method:                 Least Squares   F-statistic:                     866.1
Date:                Sat, 06 Dec 2025   Prob (F-statistic):           8.57e-30
Time:                        16:43:00   Log-Likelihood:                -32163.
No. Observations:               24707   AIC:                         6.442e+04
Df Residuals:                   24659   BIC:                         6.481e+04
Df Model:                          47                                         
Covariance Type:              cluster                                         
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
Intercept       



### 매칭

In [187]:
def smd(x_treat, x_control):
    """Return the standardized mean difference between two samples.

    The difference of the sample means (treated minus control) is divided by
    the square root of the average of the two sample variances, each computed
    with ``ddof=1``.

    Args:
        x_treat: Values observed in the treated group.
        x_control: Values observed in the control group.

    Returns:
        The standardized mean difference, or ``0`` when the pooled standard
        deviation is not strictly positive (this includes a NaN pooled value).
    """
    mean_gap = np.mean(x_treat) - np.mean(x_control)
    spread_t = np.std(x_treat, ddof=1)
    spread_c = np.std(x_control, ddof=1)
    pooled_sd = np.sqrt((spread_t**2 + spread_c**2) / 2)

    # `not (... > 0)` rather than `<= 0` so that NaN also takes the fallback.
    if not pooled_sd > 0:
        return 0
    return mean_gap / pooled_sd

def compute_balance(df, num_vars, cat_vars, treat_col='House_Occupancy'):
    """Build a covariate balance table comparing treated and control rows.

    Rows where ``treat_col`` equals 1 form the treated group and rows where it
    equals 0 form the control group.

    Args:
        df: DataFrame holding the treatment column and all listed variables.
        num_vars: Column names compared directly on their values.
        cat_vars: Column names expanded into one 0/1 indicator per distinct
            value found in ``df``; each indicator is compared separately.
        treat_col: Name of the column that defines the two groups.

    Returns:
        A DataFrame with columns ``variable``, ``mean_treat``,
        ``mean_control`` and ``SMD``: one row per numeric variable, followed
        by one row per (categorical variable, level) pair labelled
        ``"<variable>_<level>"``.
    """
    treated = df[df[treat_col] == 1].copy()
    control = df[df[treat_col] == 0].copy()

    # Numeric variables are compared on their raw values.
    records = [
        [
            name,
            treated[name].mean(),
            control[name].mean(),
            smd(treated[name], control[name]),
        ]
        for name in num_vars
    ]

    # Categorical variables: one indicator per level seen in the full frame,
    # so a level missing from one group still gets a row (with mean 0 there).
    for name in cat_vars:
        for level in df[name].unique():
            in_treated = (treated[name] == level).astype(int)
            in_control = (control[name] == level).astype(int)
            records.append([
                f"{name}_{level}",
                in_treated.mean(),
                in_control.mean(),
                smd(in_treated, in_control),
            ])

    return pd.DataFrame(
        records,
        columns=["variable", "mean_treat", 'mean_control', 'SMD'],
    )

In [188]:
# SMD values before matching.
# Variables compared on their raw values.
num_vars = [
    'Stress_Level',
    'Gender',
    'Disabled_Reg',
    'Household_Size',
    'Household_Income',
    'Education',
    'Commute_Time',
    'Work_Time',
    'Age',
]
# Variables expanded into one indicator per level.
cat_vars = [
    'Marital_Status',
    'Employment_Type',
    'District_Code',
    'House_Type',
]
smd_output = compute_balance(df_final, num_vars=num_vars, cat_vars=cat_vars)
# Bare expression so the notebook renders the balance table.
smd_output

Unnamed: 0,variable,mean_treat,mean_control,SMD
0,Stress_Level,3.125911,3.144701,-0.020612
1,Gender,0.387559,0.45189,-0.130626
2,Disabled_Reg,0.994929,0.995201,-0.003892
3,Household_Size,2.6271,2.152478,0.490239
4,Household_Income,0.826589,-0.863039,0.404744
5,Education,5.005864,5.279474,-0.284402
6,Commute_Time,-0.00571,0.005962,-0.039744
7,Work_Time,0.218478,-0.228112,0.056359
8,Age,5.00561,-5.226343,0.832555
9,Marital_Status_2,0.159509,0.274262,-0.28117
