In [16]:
# 라이브러리 import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import optuna
import joblib
import os

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import (classification_report, confusion_matrix, f1_score, 
                            roc_curve, auc, matthews_corrcoef, cohen_kappa_score, 
                            balanced_accuracy_score)
from imblearn.over_sampling import SMOTE

# 한글 폰트 설정
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
pd.options.display.float_format = '{:.2f}'.format


In [17]:
# 데이터 로드 및 전처리
try:
    df = pd.read_csv("../data/healthcare-dataset-stroke-data.csv", encoding='utf-8')
    # 결측치 처리 및 데이터 정리
    df['bmi'].fillna(df['bmi'].median(), inplace=True)
    df = df[df['gender'] != 'Other'].drop(columns='id')
    print(f"✅ 데이터 로드 완료 - Shape: {df.shape}")
except FileNotFoundError:
    print("❌ 데이터 파일을 찾을 수 없습니다.")
    df = None

✅ 데이터 로드 완료 - Shape: (5109, 11)



A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [18]:
# 기본 데이터 탐색
if df is not None:
    print(f"뇌졸중 비율: {df['stroke'].mean():.2%}")
    print(f"뇌졸중 환자 수: {df['stroke'].sum():,}")
    print("\n데이터 타입:")
    print(df.dtypes)
    df.head()

뇌졸중 비율: 4.87%
뇌졸중 환자 수: 249

데이터 타입:
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object


In [19]:
# 모델 학습 준비
if df is not None:
    # 특성 분류
    numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric_features.remove('stroke')
    
    # 데이터 분할
    X = df.drop("stroke", axis=1)
    y = df["stroke"]
    
    # 전처리 파이프라인 구성
    categorical_transformer = Pipeline(steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", "passthrough", numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ],
        remainder="passthrough"
    )
    
    # 학습/테스트 데이터 분리
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    
    print(f"훈련 데이터: {X_train.shape}, 테스트 데이터: {X_test.shape}")


훈련 데이터: (4087, 10), 테스트 데이터: (1022, 10)


In [20]:
# 기본 모델 학습 (SMOTE 적용)
if 'X_train' in locals():
    # 전처리 실행
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)
    
    # SMOTE 적용
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)
    
    # RandomForest 모델 학습
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train_resampled, y_train_resampled)
    
    # 예측
    y_pred = model.predict(X_test_preprocessed)
    y_pred_proba = model.predict_proba(X_test_preprocessed)
    
    print("✅ 기본 모델 학습 완료")
    print(f"학습 데이터 불균형 해결: {len(y_train_resampled)} 샘플")


✅ 기본 모델 학습 완료
학습 데이터 불균형 해결: 7776 샘플


In [21]:
# 모델 성능 평가
if 'model' in locals():
    # 기본 성능 지표
    report = classification_report(y_test, y_pred, output_dict=True)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # 추가 통계 지표
    mcc = matthews_corrcoef(y_test, y_pred)
    kappa = cohen_kappa_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    
    print("🎯 모델 성능 지표")
    print(f"정확도: {report['accuracy']:.3f}")
    print(f"정밀도: {report['1']['precision']:.3f}")
    print(f"재현율: {report['1']['recall']:.3f}")
    print(f"F1-Score: {report['1']['f1-score']:.3f}")
    print(f"Matthews Correlation: {mcc:.3f}")
    print(f"Cohen's Kappa: {kappa:.3f}")
    print(f"Balanced Accuracy: {bal_acc:.3f}")
    
    # 특성 중요도 계산
    categorical_feature_names = []
    for i, col in enumerate(categorical_features):
        categories = preprocessor.named_transformers_['cat'].named_steps['onehot'].categories_[i]
        categorical_feature_names.extend([f"{col}_{cat}" for cat in categories])
    
    all_feature_names = numeric_features + categorical_feature_names
    
    feature_importance = pd.DataFrame({
        'feature': all_feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\n📊 상위 10개 특성 중요도:")
    print(feature_importance.head(10))

🎯 모델 성능 지표
정확도: 0.949
정밀도: 0.333
재현율: 0.040
F1-Score: 0.071
Matthews Correlation: 0.101
Cohen's Kappa: 0.062
Balanced Accuracy: 0.518

📊 상위 10개 특성 중요도:
                           feature  importance
0                              age        0.22
1                     hypertension        0.09
7                  ever_married_No        0.06
18     smoking_status_never smoked        0.06
3                avg_glucose_level        0.06
16          smoking_status_Unknown        0.05
2                    heart_disease        0.05
17  smoking_status_formerly smoked        0.05
8                 ever_married_Yes        0.05
4                              bmi        0.04


In [22]:
# 시각화 1: 혼동 행렬
if 'conf_matrix' in locals():
    fig = go.Figure(data=go.Heatmap(
        z=conf_matrix,
        x=['뇌졸중 없음', '뇌졸중 있음'],
        y=['뇌졸중 없음', '뇌졸중 있음'],
        colorscale='Blues',
        text=conf_matrix,
        texttemplate="%{text}",
        textfont={"size": 16}
    ))
    fig.update_layout(
        title="혼동 행렬 (Confusion Matrix)",
        xaxis_title="예측값",
        yaxis_title="실제값"
    )
    fig.show()

In [23]:
# 시각화 2: 특성 중요도
if 'feature_importance' in locals():
    top_features = feature_importance.head(10)
    fig = px.bar(
        top_features, 
        x='importance', 
        y='feature',
        orientation='h',
        title="상위 10개 특성 중요도"
    )
    fig.update_layout(xaxis_title="중요도", yaxis_title="특성")
    fig.show()

In [24]:
# 시각화 3: ROC 커브
if 'y_pred_proba' in locals():
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba[:, 1])
    roc_auc = auc(fpr, tpr)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=fpr, y=tpr,
        mode='lines',
        name=f'ROC 커브 (AUC = {roc_auc:.3f})',
        line=dict(color='blue', width=2)
    ))
    fig.add_trace(go.Scatter(
        x=[0, 1], y=[0, 1],
        mode='lines',
        name='무작위 분류기',
        line=dict(color='red', width=2, dash='dash')
    ))
    fig.update_layout(
        title="ROC 커브",
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        xaxis=dict(range=[0, 1]),
        yaxis=dict(range=[0, 1])
    )
    fig.show()
    print(f"ROC AUC Score: {roc_auc:.3f}")

ROC AUC Score: 0.763


In [25]:
# 시각화 4: 연령대별 뇌졸중 발생률
if df is not None:
    df['age_group'] = pd.cut(df['age'], 
                            bins=[0, 20, 40, 60, 80, 100], 
                            labels=['0-20', '21-40', '41-60', '61-80', '80+'])
    age_stroke_rate = df.groupby('age_group')['stroke'].mean()
    
    fig = px.bar(
        x=age_stroke_rate.index,
        y=age_stroke_rate.values,
        title="연령대별 뇌졸중 발생률",
        labels={'x': '연령대', 'y': '뇌졸중 발생률'}
    )
    fig.show()
    
    # 성별 뇌졸중 발생률
    gender_stroke_rate = df.groupby('gender')['stroke'].mean()
    fig = px.bar(
        x=gender_stroke_rate.index,
        y=gender_stroke_rate.values,
        title="성별 뇌졸중 발생률",
        labels={'x': '성별', 'y': '뇌졸중 발생률'}
    )
    fig.show()





In [26]:
# 시각화 5: 상관관계 분석
if df is not None:
    numeric_df = df.select_dtypes(include=['int64', 'float64'])
    correlation_matrix = numeric_df.corr()
    
    # 뇌졸중과의 상관관계 순위
    stroke_corr = correlation_matrix['stroke'].sort_values(ascending=False)
    print("🎯 뇌졸중과의 상관관계 순위:")
    for var, corr in stroke_corr.items():
        if var != 'stroke':
            print(f"{var}: {corr:.3f}")
    
    # 상관관계 히트맵
    reversed_y = correlation_matrix.columns[::-1]
    reversed_z = correlation_matrix.values[::-1]
    
    fig = go.Figure(data=go.Heatmap(
        z=reversed_z,
        x=correlation_matrix.columns,
        y=reversed_y,
        colorscale='RdBu_r',
        zmid=0,
        text=np.round(reversed_z, 3),
        texttemplate="%{text}",
        textfont={"size": 10},
        hoverongaps=False
    ))
    fig.update_layout(
        title="변수 간 상관관계 히트맵",
        xaxis_title="변수",
        yaxis_title="변수",
        width=600,
        height=500
    )
    fig.show()

🎯 뇌졸중과의 상관관계 순위:
age: 0.245
heart_disease: 0.135
avg_glucose_level: 0.132
hypertension: 0.128
bmi: 0.036


In [27]:
# Optuna 하이퍼파라미터 최적화 (선택적 실행)
# 시간이 오래 걸리므로 필요할 때만 실행
RUN_OPTIMIZATION = False  # True로 변경하면 최적화 실행

if RUN_OPTIMIZATION and 'X_train' in locals():
    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 15)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
        
        opt_model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth,
            min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,
            random_state=42
        )
        
        opt_model.fit(X_train_resampled, y_train_resampled)
        y_pred_opt = opt_model.predict(X_test_preprocessed)
        
        return f1_score(y_test, y_pred_opt, average='weighted')
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20)
    
    print("🔧 최적화 결과:")
    print(f"최고 F1-Score: {study.best_value:.4f}")
    print(f"최적 파라미터: {study.best_params}")
    
    # 최적화된 모델로 재학습
    best_model = RandomForestClassifier(**study.best_params, random_state=42)
    best_model.fit(X_train_resampled, y_train_resampled)
    y_pred_best = best_model.predict(X_test_preprocessed)
    
    # 성능 비교
    print(f"\n기본 모델 F1-Score: {f1_score(y_test, y_pred, average='weighted'):.4f}")
    print(f"최적화 모델 F1-Score: {f1_score(y_test, y_pred_best, average='weighted'):.4f}")
else:
    print("⏩ 하이퍼파라미터 최적화 건너뛰기 (RUN_OPTIMIZATION = False)")

⏩ 하이퍼파라미터 최적화 건너뛰기 (RUN_OPTIMIZATION = False)


In [28]:
# 개인 뇌졸중 위험 예측 함수
def predict_stroke_risk(age, gender, hypertension, heart_disease, ever_married, 
                        work_type, residence_type, avg_glucose_level, bmi, smoking_status):
    """개인의 뇌졸중 위험을 예측하는 함수"""
    if 'model' not in locals() or 'preprocessor' not in locals():
        print("❌ 모델이 학습되지 않았습니다.")
        return None
    
    # 입력 데이터 생성
    input_data = pd.DataFrame({
        'age': [age],
        'gender': [gender],
        'hypertension': [hypertension],
        'heart_disease': [heart_disease],
        'ever_married': [ever_married],
        'work_type': [work_type],
        'Residence_type': [residence_type],
        'avg_glucose_level': [avg_glucose_level],
        'bmi': [bmi],
        'smoking_status': [smoking_status]
    })
    
    # 전처리 및 예측
    input_preprocessed = preprocessor.transform(input_data)
    prediction = model.predict(input_preprocessed)
    prediction_proba = model.predict_proba(input_preprocessed)
    
    risk_level = '높음' if prediction[0] == 1 else '낮음'
    probability = prediction_proba[0][1]
    
    print(f"🎯 뇌졸중 위험도: {risk_level}")
    print(f"📊 발생 확률: {probability:.2%}")
    
    return {'prediction': prediction[0], 'probability': probability, 'risk_level': risk_level}

# 예측 예시 (50세 남성, 고혈압 있음)
if 'model' in locals():
    example_result = predict_stroke_risk(
        age=50, gender="Male", hypertension=1, heart_disease=0,
        ever_married="Yes", work_type="Private", residence_type="Urban",
        avg_glucose_level=150.0, bmi=28.0, smoking_status="formerly smoked"
    )

❌ 모델이 학습되지 않았습니다.


In [29]:
# 최종 실험 결과 요약
if 'report' in locals():
    print("="*60)
    print("🎯 뇌졸중 예측 모델 실험 결과 요약")
    print("="*60)
    print(f"📊 데이터셋 크기: {df.shape[0]:,} 샘플, {df.shape[1]-1} 특성")
    print(f"⚖️ 클래스 불균형: 뇌졸중 {df['stroke'].mean():.2%} vs 정상 {1-df['stroke'].mean():.2%}")
    print(f"🔄 SMOTE 적용 후: {len(y_train_resampled):,} 샘플")
    print("\n📈 모델 성능:")
    print(f"   • 정확도: {report['accuracy']:.3f}")
    print(f"   • 정밀도: {report['1']['precision']:.3f}")
    print(f"   • 재현율: {report['1']['recall']:.3f}")
    print(f"   • F1-Score: {report['1']['f1-score']:.3f}")
    print(f"   • ROC AUC: {roc_auc:.3f}")
    print(f"   • Matthews 상관계수: {mcc:.3f}")
    print(f"   • Cohen's Kappa: {kappa:.3f}")
    print(f"   • 균형 정확도: {bal_acc:.3f}")
    
    print("\n🔝 상위 5개 중요 특성:")
    for i, (feature, importance) in enumerate(feature_importance.head(5).values):
        print(f"   {i+1}. {feature}: {importance:.3f}")
    
    print("\n💡 주요 발견사항:")
    # 연령 관련 발견사항
    elderly_rate = df[df['age'] >= 60]['stroke'].mean()
    young_rate = df[df['age'] < 60]['stroke'].mean()
    print(f"   • 60세 이상 뇌졸중 비율: {elderly_rate:.2%} (60세 미만: {young_rate:.2%})")
    
    # 성별 관련 발견사항
    male_rate = df[df['gender'] == 'Male']['stroke'].mean()
    female_rate = df[df['gender'] == 'Female']['stroke'].mean()
    print(f"   • 남성 뇌졸중 비율: {male_rate:.2%}, 여성: {female_rate:.2%}")
    
    # 고혈압 관련 발견사항
    hyper_rate = df[df['hypertension'] == 1]['stroke'].mean()
    normal_rate = df[df['hypertension'] == 0]['stroke'].mean()
    print(f"   • 고혈압 환자 뇌졸중 비율: {hyper_rate:.2%} (정상: {normal_rate:.2%})")
    
    print("="*60)


🎯 뇌졸중 예측 모델 실험 결과 요약
📊 데이터셋 크기: 5,109 샘플, 11 특성
⚖️ 클래스 불균형: 뇌졸중 4.87% vs 정상 95.13%
🔄 SMOTE 적용 후: 7,776 샘플

📈 모델 성능:
   • 정확도: 0.949
   • 정밀도: 0.333
   • 재현율: 0.040
   • F1-Score: 0.071
   • ROC AUC: 0.763
   • Matthews 상관계수: 0.101
   • Cohen's Kappa: 0.062
   • 균형 정확도: 0.518

🔝 상위 5개 중요 특성:
   1. age: 0.223
   2. hypertension: 0.088
   3. ever_married_No: 0.062
   4. smoking_status_never smoked: 0.059
   5. avg_glucose_level: 0.059

💡 주요 발견사항:
   • 60세 이상 뇌졸중 비율: 13.15% (60세 미만: 1.82%)
   • 남성 뇌졸중 비율: 5.11%, 여성: 4.71%
   • 고혈압 환자 뇌졸중 비율: 13.25% (정상: 3.97%)


In [30]:
# 추가 분석: 학습 곡선 (선택적)
PLOT_LEARNING_CURVE = False  # True로 변경하면 학습 곡선 표시

if PLOT_LEARNING_CURVE and 'model' in locals():
    train_sizes, train_scores, val_scores = learning_curve(
        model, X_train_preprocessed, y_train_resampled, cv=5, n_jobs=-1, 
        train_sizes=np.linspace(0.1, 1.0, 10), scoring='f1_weighted'
    )
    
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    val_mean = np.mean(val_scores, axis=1)
    val_std = np.std(val_scores, axis=1)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=train_sizes, y=train_mean,
        mode='lines+markers', name='훈련 점수',
        line=dict(color='blue'),
        error_y=dict(type='data', array=train_std, visible=True)
    ))
    fig.add_trace(go.Scatter(
        x=train_sizes, y=val_mean,
        mode='lines+markers', name='검증 점수',
        line=dict(color='red'),
        error_y=dict(type='data', array=val_std, visible=True)
    ))
    fig.update_layout(
        title="학습 곡선",
        xaxis_title='훈련 샘플 수',
        yaxis_title='F1 Score'
    )
    fig.show()
else:
    print("⏩ 학습 곡선 건너뛰기 (PLOT_LEARNING_CURVE = False)") 

⏩ 학습 곡선 건너뛰기 (PLOT_LEARNING_CURVE = False)
