### 주제
뇌졸중(Stoke) 발생 여부를 예측하는 이진 분류(Binary Classification) 문제

### 목적
환자의 건강, 생활습관 및 인구통계학적 정보를 바탕으로:
- stroke를 예측함으로써
- 예방적 의료 개입 및 위험도 분류를 가능하게 하는 머신러닝 모델 구축이 목적

In [40]:
# https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
import os
print(os.getcwd())
print(os.listdir())

d:\workspace\hit_ml_dl\practice
['data', 'data_dictionary.png', 'linear_regression', 'stroke.ipynb', 'stroke_fixed.py', 'titanic.ipynb']


In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("./data/stroke.csv", encoding='utf-8')
df.info()
df.head()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [42]:
# 결측치 확인
print(df.isnull().sum())
df['bmi'].fillna(df['bmi'].median(), inplace=True) # median 선택 이유: 이상치 존재
df = df[df['gender'] != 'Other'].drop(columns='id')

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['bmi'].fillna(df['bmi'].median(), inplace=True) # median 선택 이유: 이상치 존재


In [43]:
# 컬럼 타입 분류 (범주형 vs 수치형)
# 숫자형 (int, float)
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_features.remove('stroke')

# 이진 변수 (binary feature)
# 전처리 및 해석에 따라 '범주형'처럼 취급하는 것이 일반적이지만
# 머신러닝 모델에 넣을 때는 '숫자형 그대로 사용'하는 것이 효율적입니다.
binary_features = ['stroke', 'hypertension', 'heart_disease']

# 범주형 (object, category 등)
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()

print("Numeric columns: ", numeric_features)
print("Categorical columns: ", categorical_features)

Numeric columns:  ['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi']
Categorical columns:  ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']


In [44]:
# 데이터 분할
X = df.drop("stroke", axis=1)
y = df["stroke"]

In [45]:
# 기초 통계 확인 (숫자형)
print(df.describe())

               age  hypertension  heart_disease  avg_glucose_level  \
count  5109.000000   5109.000000    5109.000000        5109.000000   
mean     43.229986      0.097475       0.054022         106.140399   
std      22.613575      0.296633       0.226084          45.285004   
min       0.080000      0.000000       0.000000          55.120000   
25%      25.000000      0.000000       0.000000          77.240000   
50%      45.000000      0.000000       0.000000          91.880000   
75%      61.000000      0.000000       0.000000         114.090000   
max      82.000000      1.000000       1.000000         271.740000   

               bmi       stroke  
count  5109.000000  5109.000000  
mean     28.863300     0.048738  
std       7.699785     0.215340  
min      10.300000     0.000000  
25%      23.800000     0.000000  
50%      28.100000     0.000000  
75%      32.800000     0.000000  
max      97.600000     1.000000  


In [46]:
# 범주형 데이터 분포 확인
for col in categorical_features:
    print(f"{col} 분포:")
    print(df[col].value_counts())
    print("\n")

gender 분포:
gender
Female    2994
Male      2115
Name: count, dtype: int64


ever_married 분포:
ever_married
Yes    3353
No     1756
Name: count, dtype: int64


work_type 분포:
work_type
Private          2924
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: count, dtype: int64


Residence_type 분포:
Residence_type
Urban    2596
Rural    2513
Name: count, dtype: int64


smoking_status 분포:
smoking_status
never smoked       1892
Unknown            1544
formerly smoked     884
smokes              789
Name: count, dtype: int64




In [47]:
# 타겟 변수 불균형 확인
print(df['stroke'].value_counts(normalize=True))

stroke
0    0.951262
1    0.048738
Name: proportion, dtype: float64


In [None]:
# 전처리 구성
from sklearn.preprocessing import OneHotEncoder # 범주형 변수를 0/1 벡터로 변환
from sklearn.compose import ColumnTransformer   # 열별로 다른 전처리를 적용
from sklearn.pipeline import Pipeline           # 모델 학습 과정을 하나로 묶어 재현성 보장
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트 분류기
from sklearn.model_selection import train_test_split # 데이터 분할
from sklearn.metrics import classification_report, confusion_matrix # 분류 성능 평가
from imblearn.over_sampling import SMOTE # 소수 클래스 샘플을 가상 생성하여 클래스 불균형 해결

#
categorical_transformer = Pipeline(steps=[
    # handle_unknown="ignore" : 테스트셋에 학습셋에 없는 범주가 있어도 무시
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# 
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough"  # binary_features 그대로 사용
)

# 학습/데이터 분리
# stratify=y: stroke 비율을 유지하면서 훈련/테스트 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# 전처리 실행
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# SMOTE 적용
# Synthetic Minority Over-sampling Technique
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

# 모델 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# 예측 및 평가
y_pred = model.predict(X_test_preprocessed)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [49]:
# 특성 중요도 확인
# 범주령 특성의 원-핫 인코딩된 컬럼명 생성
categorical_feature_names = []
for i, col in enumerate(categorical_features):
    categories = preprocessor.named_transformers_['cat'].named_steps['onehot'].categories_[i]
    categorical_feature_names.extend([f"{col}_{cat}" for cat in categories])

# 모든 특성명 결합
all_feature_names = numeric_features + categorical_feature_names

feature_importance = pd.DataFrame({
    'feature': all_feature_names,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n상위 10개 특성 중요도:")
print(feature_importance.head(10))

# 모델 성능 요약
print(f"\n모델 성능 요약:")
print(f"정확도 (Accuracy): {(968 + 2) / (968 + 4 + 48 + 2):.4f}")
print(f"정밀도 (Precision): {2 / (2 + 4):.4f}")
print(f"재현율 (Recall): {2 / (2 + 48):.4f}")
print(f"F1-Score: {2 * (2 / (2 + 4)) * (2 / (2 + 48)) / ((2 / (2 + 4)) + (2 / (2 + 48))):.4f}") 


상위 10개 특성 중요도:
                           feature  importance
0                              age    0.222844
1                     hypertension    0.087838
7                  ever_married_No    0.062443
18     smoking_status_never smoked    0.058872
3                avg_glucose_level    0.058664
16          smoking_status_Unknown    0.049021
2                    heart_disease    0.047132
17  smoking_status_formerly smoked    0.047103
8                 ever_married_Yes    0.046165
4                              bmi    0.043850

모델 성능 요약:
정확도 (Accuracy): 0.9491
정밀도 (Precision): 0.3333
재현율 (Recall): 0.0400
F1-Score: 0.0714
