In [None]:
# ---------------------------------------------------
# 로지스틱 회귀 (Logistic Regression)  - 범주형 예측 
# ---------------------------------------------------
# - Y가 "점수 자체"가 아니라 "합격/불합격"처럼 **범주형(0/1)** 일 때 사용  
# - 예) 시험 점수(Y)를 바로 예측하는 게 아니라,  
#   "합격(1) / 불합격(0)"을 예측  
# - 공식적으로는 선형결과를 S자 곡선(시그모이드 함수)에 통과시켜 0~1 확률로 변환  

# 예시:  
# - 공부시간 20시간 → 합격 확률 80% (Y=1일 가능성 높음)  
# - 공부시간 2시간 → 합격 확률 10% (Y=0일 가능성 높음)  

In [1]:
# -----------------------------
# 불러오기
# -----------------------------
import pandas as pd

df = pd.read_csv("health_survey.csv")
# df = pd.read_csv("https://raw.githubusercontent.com/lovedlim/bigdata_analyst_cert/main/part3/ch5/health_survey.csv")

In [2]:
df

Unnamed: 0,age,bmi,smoker,activity_level,disease
0,62,35.179089,0,0,1
1,65,18.576042,0,2,1
2,71,33.178426,0,1,1
3,18,37.063007,1,2,0
4,21,17.613266,0,0,0
...,...,...,...,...,...
995,75,23.600372,0,2,1
996,24,9.659333,0,2,0
997,37,25.959939,0,0,0
998,52,25.356929,0,1,1


In [3]:
# 모델학습
from statsmodels.formula.api import logit

model = logit('disease ~ age + bmi', data=df).fit()


Optimization terminated successfully.
         Current function value: 0.643725
         Iterations 5


In [4]:
print(model.summary())

                           Logit Regression Results                           
Dep. Variable:                disease   No. Observations:                 1000
Model:                          Logit   Df Residuals:                      997
Method:                           MLE   Df Model:                            2
Date:                Mon, 13 Oct 2025   Pseudo R-squ.:                 0.04996
Time:                        02:24:25   Log-Likelihood:                -643.72
converged:                       True   LL-Null:                       -677.58
Covariance Type:            nonrobust   LLR p-value:                 1.984e-15
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -1.8700      0.289     -6.482      0.000      -2.435      -1.305
age            0.0177      0.004      4.747      0.000       0.010       0.025
bmi            0.0563      0.009      6.418      0.0

In [7]:
# 오즈비값 확인
import numpy as np

print(model.params['bmi']) #회귀계수(coef) 확인 
print(np.exp(model.params['bmi']))#오즈비(Ratio) 확인 

0.056333879687088535
1.057950853075076


In [8]:
import numpy as np

print(model.params['age']) #회귀계수(coef) 확인 
print(np.exp(model.params['age']))#오즈비(Ratio) 확인 

0.017705261174417463
1.0178629284499419


In [14]:
# 질병 확률 확인
X_new = pd.DataFrame([{"age" : 50, "bmi":30.0}])
pred = model.predict(X_new)[0]
pred

0.6693529360380776