In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import shap

# Install and configure a Korean font.
# The two `!` lines are notebook shell commands: refresh the apt package index
# and install the Nanum font packages quietly (-qq).
!apt-get update -qq
!apt-get install fonts-nanum* -qq

import matplotlib.font_manager as fm
# Location where this script expects the NanumGothic font file after install.
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
# Register the font file with matplotlib's font manager, then make it the
# default font family for all subsequent plots.
fm.fontManager.addfont(font_path)
plt.rc('font', family='NanumGothic')

# Load the dataset and discard the columns that are not used.
file_path = '/content/drive/MyDrive/2024/train_OutlierRemoval.csv'  # Use the path of the file uploaded to Colab
data = pd.read_csv(file_path).drop(columns=['Unnamed: 0', 'ID'])

# Feature columns fed to the model; the target column is 'label'
# (smoking status, per the original comment).
features = [
    '헤모글로빈',
    '혈청 크레아티닌',
    '충치',
    '키(cm)',
    '몸무게(kg)',
    '중성 지방',
    '시력',
    '공복 혈당',
    '요 단백',
    '저밀도지단백',
    '나이',
    '고밀도지단백',
    '간 효소율',
]
X = data.loc[:, features]
y = data.loc[:, 'label']

# Split, then impute / scale / oversample using the training portion only.
#
# The split happens FIRST so that nothing fitted below can see the test rows:
#   * the imputer means and scaler statistics come from the training rows only;
#   * SMOTE synthesizes samples from training rows only, so the test set
#     contains real observations exclusively and keeps the original class ratio.
# Fitting these steps on the full dataset (or oversampling before the split)
# leaks test information into training and inflates the reported metrics.
# `stratify=y` keeps the class proportions the same in both partitions.
# Note: `X` itself is left as the raw (unscaled) feature frame.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Missing-value imputation (column means) and standardization.
numeric_imputer = SimpleImputer(strategy='mean')
scaler = StandardScaler()

# Fit on the training rows, then wrap the result back into a DataFrame so the
# column names and the row index (aligned with y_train) are preserved.
X_train = pd.DataFrame(
    scaler.fit_transform(numeric_imputer.fit_transform(X_train)),
    columns=X.columns,
    index=X_train.index,
)
# The test rows are only transformed with the statistics learned above.
X_test = pd.DataFrame(
    scaler.transform(numeric_imputer.transform(X_test)),
    columns=X.columns,
    index=X_test.index,
)

# Oversample the minority class with SMOTE on the training partition only.
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# The model is trained on the balanced training set; X_test / y_test stay untouched.
# NOTE(review): cross-validation run on this already-resampled training set can
# still be optimistic, because synthetic points may derive from rows in the
# validation fold. Resampling inside each fold would remove that.
X_train, y_train = X_resampled, y_resampled

# Base estimator: logistic regression with a high iteration cap.
logreg = LogisticRegression(max_iter=10000)

# Candidate hyperparameters explored by the grid search
# (every combination of C, penalty and solver is evaluated).
param_grid = dict(
    C=[0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# Exhaustive search scored by accuracy over 5 stratified folds, using all CPU cores.
grid_search = GridSearchCV(
    estimator=logreg,
    param_grid=param_grid,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Report the winning hyperparameter combination.
print(f"Best Parameters: {grid_search.best_params_}")

# Estimator configured with the best parameters found by the search.
best_model = grid_search.best_estimator_

# Cross-validate the selected model on the training data and print the
# accuracy of each fold followed by the mean across folds.
cv_splitter = StratifiedKFold(n_splits=5)
logreg_scores = cross_val_score(
    best_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=cv_splitter,
)
for fold_idx, score in enumerate(logreg_scores, start=1):
    print(f"Fold {fold_idx} Accuracy: {score:.4f}")
print(f"Mean CV Accuracy: {np.mean(logreg_scores):.4f}")

# Evaluate the selected model on the held-out test set.
# Hard class predictions feed the threshold-based metrics; the probability of
# the positive class (column 1 of predict_proba) feeds ROC AUC.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Compute the label-based metrics in one pass over the metric functions.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print every metric with four decimal places.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print(f'ROC AUC: {roc_auc:.4f}')

# Additional evaluation: ROC curve points, per-class report and confusion matrix.
# fpr/tpr are the ROC curve coordinates; roc_auc is recomputed from them as the
# area under that curve.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
conf_matrix = confusion_matrix(y_test, y_pred)
# Actually print the matrix: previously only the header line was emitted.
print(conf_matrix)

# Confusion matrix heat map.
cm_display = ConfusionMatrixDisplay(conf_matrix)
cm_display.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

# ROC curve computed directly from the fitted model on the test set.
RocCurveDisplay.from_estimator(best_model, X_test, y_test)
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.show()

# Overlaid histograms of the actual and the predicted labels.
for values, series_label, series_color in (
    (y_test, 'Actual', 'blue'),
    (y_pred, 'Predicted', 'orange'),
):
    sns.histplot(values, kde=False, label=series_label, color=series_color, alpha=0.6)
plt.legend()
plt.title('Distribution of Actual vs Predicted')
plt.show()

# Horizontal bar chart of the fitted logistic-regression coefficients,
# labelled by feature name and ordered from smallest to largest value.
plt.figure(figsize=(10, 6))
logreg_coef = pd.Series(best_model.coef_[0], index=features).sort_values()
logreg_coef.plot.barh()
plt.title('Logistic Regression Coefficients')
plt.show()

# Compute SHAP values for the linear model on the test set (the test set is
# also passed to the explainer as its data argument) and visualize them.
explainer = shap.LinearExplainer(best_model, X_test)
shap_values = explainer.shap_values(X_test)

# Arguments shared by both summary plots.
summary_kwargs = dict(feature_names=features)

# First a bar-type summary plot, then the default summary plot.
shap.summary_plot(shap_values, X_test, plot_type="bar", **summary_kwargs)
shap.summary_plot(shap_values, X_test, **summary_kwargs)