In [1]:
from google.colab import drive
drive.mount("/content/drive")
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    roc_auc_score, confusion_matrix, accuracy_score,
    precision_score, recall_score, f1_score, classification_report
)
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/Multicampus-8/4_·ÑÜ·Ö•·Ñâ·Öµ·Ü´·ÑÖ·Ö•·ÑÇ·Öµ·Üº_·ÑÉ·Öµ·Ü∏·ÑÖ·Ö•·ÑÇ·Öµ·Üº/·Ñë·Ö≥·ÑÖ·Ö©·Ñå·Ö¶·Ü®·Ñê·Ö≥/'

train_df = pd.read_csv(DATA_PATH + "dataset/train.csv")
test_df = pd.read_csv(DATA_PATH + "dataset/test.csv")
target = "diagnosed_diabetes"
X = train_df.drop(["id", target], axis=1, errors="ignore")
y = train_df[target]

X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb
import warnings
from tqdm.auto import tqdm

warnings.filterwarnings('ignore')


features = [
    'age','physical_activity_minutes_per_week','diet_score','bmi',
    'systolic_bp','triglycerides','hdl_cholesterol',
    'family_history_diabetes','hypertension_history'
]

X = train_df[features]
y = train_df['diagnosed_diabetes']

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

preprocessor = Pipeline([('scaler', StandardScaler())])


model_configs = [
    {
        "name": "Logistic Regression",
        "model": LogisticRegression(max_iter=2000, class_weight='balanced', random_state=42),
        "params": {"model__C": [0.01,0.1,1,5,10]}
    },
    {
        "name": "Decision Tree",
        "model": DecisionTreeClassifier(class_weight='balanced', random_state=42),
        "params": {"model__max_depth":[3,5,7,10,None],"model__min_samples_leaf":[5,10,20]}
    },
    {
        "name": "XGBoost",
        "model": xgb.XGBClassifier(objective='binary:logistic', eval_metric='auc', use_label_encoder=False, random_state=42),
        "params": {
            "model__n_estimators":[300,500,700],
            "model__learning_rate":[0.01,0.03,0.05],
            "model__max_depth":[3,4,5],
            "model__subsample":[0.8,1.0],
            "model__colsample_bytree":[0.8,1.0]
        }
    },
    {
        "name": "LightGBM",
        "model": lgb.LGBMClassifier(objective='binary', class_weight='balanced', random_state=42),
        "params": {
            "model__n_estimators":[600,800,1000],
            "model__learning_rate":[0.01,0.02,0.03],
            "model__num_leaves":[31,63,127],
            "model__subsample":[0.8,0.9],
            "model__colsample_bytree":[0.8,0.9]
        }
    }
]


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for config in tqdm(model_configs, desc="Î™®Îç∏ ÏµúÏ†ÅÌôî ÏßÑÌñâ Ï§ë"):
    pipeline = Pipeline([('prep', preprocessor), ('model', config['model'])])

    search = RandomizedSearchCV(
        pipeline,
        param_distributions=config['params'],
        n_iter=10,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(X_train, y_train)

    val_probs = search.predict_proba(X_valid)[:, 1]
    val_auc = roc_auc_score(y_valid, val_probs)

    results.append({
        "Model": config['name'],
        "Val_AUC": round(val_auc,4),
        "Best_CV_AUC": round(search.best_score_,4),
        "Best_Params": search.best_params_
    })

results_df = pd.DataFrame(results).sort_values(by='Val_AUC', ascending=False).reset_index(drop=True)
display(results_df)


best_model_info = max(results, key=lambda x: x['Val_AUC'])
best_model_name = best_model_info['Model']
best_params = best_model_info['Best_Params']

selected_config = next(config for config in model_configs if config["name"] == best_model_name)
final_model_obj = selected_config["model"]

final_pipeline = Pipeline([
    ("prep", preprocessor),
    ("model", final_model_obj)
])
final_pipeline.set_params(**best_params)


X_full = pd.concat([X_train, X_valid], axis=0)
y_full = pd.concat([y_train, y_valid], axis=0)
final_pipeline.fit(X_full, y_full)


test_X = test_df[features]
test_proba = final_pipeline.predict_proba(test_X)[:, 1]

submission = pd.DataFrame({
    "id": test_df["id"] if "id" in test_df.columns else np.arange(len(test_df)),
    "diagnosed_diabetes": test_proba
})
submission.to_csv("submission_best_model.csv", index=False)
display(submission.head())


Î™®Îç∏ ÏµúÏ†ÅÌôî ÏßÑÌñâ Ï§ë:   0%|          | 0/4 [00:00<?, ?it/s]

Fitting 5 folds for each of 5 candidates, totalling 25 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.114823 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 994
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


Unnamed: 0,Model,Val_AUC,Best_CV_AUC,Best_Params
0,LightGBM,0.7246,0.724,"{'model__subsample': 0.9, 'model__num_leaves':..."
1,XGBoost,0.7213,0.721,"{'model__subsample': 1.0, 'model__n_estimators..."
2,Decision Tree,0.694,0.6918,"{'model__min_samples_leaf': 20, 'model__max_de..."
3,Logistic Regression,0.6933,0.6925,{'model__C': 5}


[LightGBM] [Info] Number of positive: 436307, number of negative: 263693
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069658 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 994
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


Unnamed: 0,id,diagnosed_diabetes
0,700000,0.338476
1,700001,0.600298
2,700002,0.637103
3,700003,0.26766
4,700004,0.876392


In [3]:
import time
from datetime import timedelta
import warnings
from tqdm.auto import tqdm # ÏßÑÌñâ Î∞î ÎùºÏù¥Î∏åÎü¨Î¶¨

warnings.filterwarnings('ignore', category=UserWarning)

# 1. ÍµêÏ∞® Í≤ÄÏ¶ù Î∞è Í≤∞Í≥º Ï†ÄÏû• Ï§ÄÎπÑ
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

# 2. Ï†ÑÏ≤¥ Î™®Îç∏ Î£®ÌîÑÏóê tqdm Ï†ÅÏö© (Ï†ÑÏ≤¥ ÏßÑÌñâÎ•† ÌôïÏù∏)
for config in tqdm(model_configs, desc="Ï†ÑÏ≤¥ Î™®Îç∏ ÏµúÏ†ÅÌôî ÏßÑÌñâ Ï§ë"):
    print(f"\n [{config['name']}] ÎûúÎç§ ÏÑúÏπò ÌÉêÏÉâÏùÑ ÏãúÏûëÌï©ÎãàÎã§...")

    # ÏãúÍ∞Ñ Ï∏°Ï†ï ÏãúÏûë
    start_time = time.perf_counter()

    pipeline = Pipeline([("prep", preprocessor), ("model", config['model'])])

    # RandomizedSearchCV ÏÑ§Ï†ï
    # verbose=1: Í∞ÑÎã®Ìïú ÏßÑÌñâ ÏÉÅÌô© Ï∂úÎ†• (Í∞Å ÌååÎùºÎØ∏ÌÑ∞ Ï°∞Ìï© ÏãúÎèÑ Ïãú Ï∂úÎ†•)
    # n_jobs=-1: Î≥ëÎ†¨ Ï≤òÎ¶¨ ÌôúÏÑ±Ìôî
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=config['params'],
        n_iter=10,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=1 # <--- Ïã§Ìñâ Ï§ëÏù∏ ÏÉÅÌÉúÎ•º Î°úÍ∑∏Î°ú Î≥¥Ïó¨Ï§çÎãàÎã§.
    )

    # Î™®Îç∏ ÌïôÏäµ (Ïã§Ï†ú ÌÉêÏÉâ ÏàòÌñâ)
    search.fit(X_train, y_train)

    # ÏãúÍ∞Ñ Ï∏°Ï†ï Ï¢ÖÎ£å Î∞è Î≥ÄÌôò
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    elapsed_hms = str(timedelta(seconds=int(elapsed_time)))

    # Í≤ÄÏ¶ùÏÖã ÌèâÍ∞Ä
    val_probs = search.predict_proba(X_valid)[:, 1]
    val_auc = roc_auc_score(y_valid, val_probs)
    val_pred = search.predict(X_valid)

    # ÏÉÅÏÑ∏ ÏßÄÌëú Í≥ÑÏÇ∞
    tn, fp, fn, tp = confusion_matrix(y_val, val_pred).ravel()

    print(f"[{config['name']}] ÌÉêÏÉâ ÏôÑÎ£å!")
    print(f"ÏÜåÏöî ÏãúÍ∞Ñ: {elapsed_hms} | Best CV AUC: {search.best_score_:.4f} | Val AUC: {val_auc:.4f}")

    # Í≤∞Í≥º Ï†ÄÏû•
    results.append({
        "Model": config['name'],
        "AUC": roc_auc_score(y_val, val_probs),
        "Accuracy": accuracy_score(y_val, val_pred),
        "Precision": precision_score(y_val, val_pred),
        "Recall": recall_score(y_val, val_pred),
        "F1": f1_score(y_val, val_pred),
        "TP": tp, "TN": tn, "FP": fp, "FN": fn,
        "Best_CV_AUC": search.best_score_,
        "Val_AUC": val_auc,
        "Time": elapsed_hms,
        "Best_Params": search.best_params_
    })

Ï†ÑÏ≤¥ Î™®Îç∏ ÏµúÏ†ÅÌôî ÏßÑÌñâ Ï§ë:   0%|          | 0/4 [00:00<?, ?it/s]


 [Logistic Regression] ÎûúÎç§ ÏÑúÏπò ÌÉêÏÉâÏùÑ ÏãúÏûëÌï©ÎãàÎã§...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[Logistic Regression] ÌÉêÏÉâ ÏôÑÎ£å!
ÏÜåÏöî ÏãúÍ∞Ñ: 0:00:13 | Best CV AUC: 0.6925 | Val AUC: 0.6933

 [Decision Tree] ÎûúÎç§ ÏÑúÏπò ÌÉêÏÉâÏùÑ ÏãúÏûëÌï©ÎãàÎã§...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Decision Tree] ÌÉêÏÉâ ÏôÑÎ£å!
ÏÜåÏöî ÏãúÍ∞Ñ: 0:01:39 | Best CV AUC: 0.6918 | Val AUC: 0.6940

 [XGBoost] ÎûúÎç§ ÏÑúÏπò ÌÉêÏÉâÏùÑ ÏãúÏûëÌï©ÎãàÎã§...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[XGBoost] ÌÉêÏÉâ ÏôÑÎ£å!
ÏÜåÏöî ÏãúÍ∞Ñ: 0:13:21 | Best CV AUC: 0.7210 | Val AUC: 0.7213

 [LightGBM] ÎûúÎç§ ÏÑúÏπò ÌÉêÏÉâÏùÑ ÏãúÏûëÌï©ÎãàÎã§...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.064395 seconds.
You can set `force_col_wise=true` to remove the overhead.
[

In [8]:
import pandas as pd
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,AUC,Accuracy,Precision,Recall,F1,TP,TN,FP,FN,Best_CV_AUC,Val_AUC,Time,Best_Params
0,Logistic Regression,0.69329,0.625829,0.754187,0.592945,0.663916,51741,35875,16864,35520,0.692541,0.69329,0:00:13,{'model__C': 5}
1,Decision Tree,0.693995,0.6302,0.75225,0.606422,0.67151,52917,35311,17428,34344,0.691762,0.693995,0:01:39,"{'model__min_samples_leaf': 20, 'model__max_de..."
2,XGBoost,0.721297,0.6809,0.70299,0.845085,0.767517,73743,21583,31156,13518,0.721047,0.721297,0:13:21,"{'model__subsample': 1.0, 'model__n_estimators..."
3,LightGBM,0.724612,0.653879,0.772623,0.630133,0.694141,54986,36557,16182,32275,0.723985,0.724612,0:39:40,"{'model__subsample': 0.9, 'model__num_leaves':..."


In [9]:
display(results_df[["Model", "Best_CV_AUC", "Val_AUC", "Time"]])

Unnamed: 0,Model,Best_CV_AUC,Val_AUC,Time
0,Logistic Regression,0.692541,0.69329,0:00:13
1,Decision Tree,0.691762,0.693995,0:01:39
2,XGBoost,0.721047,0.721297,0:13:21
3,LightGBM,0.723985,0.724612,0:39:40


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ÏÑ±Îä•(AUC)Í≥º ÏãúÍ∞Ñ(Ï¥à Îã®ÏúÑ) ÎπÑÍµê Ï∞®Ìä∏
results_df['Seconds'] = results_df['Time'].apply(lambda x: sum(int(a) * 60**i for i, a in enumerate(reversed(x.split(':')))))

fig = make_subplots(specs=[[{"secondary_y": True}]])

fig.add_trace(go.Bar(x=results_df['Model'], y=results_df['Val_AUC'], name="Validation AUC"), secondary_y=False)
fig.add_trace(go.Scatter(x=results_df['Model'], y=results_df['Seconds'], name="Time (seconds)", mode='lines+markers'), secondary_y=True)

fig.update_layout(title_text="Î™®Îç∏Î≥Ñ ÏÑ±Îä• Î∞è ÏµúÏ†ÅÌôî ÏÜåÏöî ÏãúÍ∞Ñ ÎπÑÍµê")
fig.update_yaxes(title_text="AUC Score", secondary_y=False)
fig.update_yaxes(title_text="Time (s)", secondary_y=True)
fig.show()

In [11]:
# 1. ÎûúÎç§ ÏÑúÏπò Í≤∞Í≥º Ï§ë Í∞ÄÏû• ÏÑ±Îä•(Val_AUC)Ïù¥ Ï¢ãÏùÄ Î™®Îç∏ Ï†ïÎ≥¥ Í∞ÄÏ†∏Ïò§Í∏∞
best_model_info = max(results, key=lambda x: x['Val_AUC'])
best_model_name = best_model_info['Model']
best_params = best_model_info['Best_Params']

print(f"üèÜ ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú Î™®Îç∏: {best_model_name}")
print(f"‚öôÔ∏è ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞: {best_params}")

# 2. ÏµúÏ†Å Î™®Îç∏ Í∞ùÏ≤¥ Ï∞æÍ∏∞ Î∞è ÌååÎùºÎØ∏ÌÑ∞ Ï†ÅÏö©
# model_configs Î¶¨Ïä§Ìä∏ÏóêÏÑú Ïù¥Î¶ÑÏù¥ Í∞ôÏùÄ Î™®Îç∏ ÏÑ§Ï†ïÏùÑ Ï∞æÏäµÎãàÎã§.
selected_config = next(config for config in model_configs if config["name"] == best_model_name)
final_model_obj = selected_config["model"]

# ÌååÏÉùÎêú ÌååÏù¥ÌîÑÎùºÏù∏ ÏÉùÏÑ± (Ï†ÑÏ≤òÎ¶¨Í∏∞ + ÏÑ†ÌÉùÎêú Î™®Îç∏)
final_pipeline = Pipeline([
    ("prep", preprocessor),
    ("model", final_model_obj)
])

# RandomizedSearchCVÏóêÏÑú Ï∞æÏùÄ 'best_params'Î•º ÌååÏù¥ÌîÑÎùºÏù∏Ïóê ÏßÅÏ†ë Ï£ºÏûÖÌï©ÎãàÎã§.
final_pipeline.set_params(**best_params)

# 3. Ï†ÑÏ≤¥ Îç∞Ïù¥ÌÑ∞Î°ú Ïû¨ÌïôÏäµ (Train + Validation Ìï©Ï≥êÏÑú ÌïôÏäµÌïòÎ©¥ ÏÑ±Îä•Ïù¥ Îçî Ï¢ãÏïÑÏßëÎãàÎã§)
# Í∞ïÏùòÏö©Ïù¥ÎùºÎ©¥ Í∞ÑÎã®ÌïòÍ≤å X_train_rawÎßå ÏÇ¨Ïö©Ìï¥ÎèÑ Î¨¥Î∞©Ìï©ÎãàÎã§.
print("ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞Î°ú Î™®Îç∏ Ïû¨ÌïôÏäµ Ï§ë...")
final_pipeline.fit(X_train, y_train)

# 4. ÌÖåÏä§Ìä∏ Îç∞Ïù¥ÌÑ∞ ÏòàÏ∏°
test_X = test_df[features]
test_proba = final_pipeline.predict_proba(test_X)[:, 1]

# 5. Ï†úÏ∂ú Îç∞Ïù¥ÌÑ∞ÌîÑÎ†àÏûÑ Íµ¨ÏÑ±
submission = pd.DataFrame({
    "id": test_df["id"] if "id" in test_df.columns else np.arange(len(test_df)),
    "diagnosed_diabetes": test_proba
})

# 6. ÌååÏùº Ï†ÄÏû• Î∞è ÌôïÏù∏
file_name = f"submission_best_{best_model_name.lower().replace(' ', '_')}.csv"
save_path = DATA_PATH + "output/" + file_name
submission.to_csv(save_path, index=False)
print(f"Ï†úÏ∂ú ÌååÏùº Ï†ÄÏû• ÏôÑÎ£å: {save_path}")
display(submission.head())

üèÜ ÏµúÏ¢Ö ÏÑ†ÌÉùÎêú Î™®Îç∏: LightGBM
‚öôÔ∏è ÏµúÏ†Å ÌïòÏù¥ÌçºÌååÎùºÎØ∏ÌÑ∞: {'model__subsample': 0.9, 'model__num_leaves': 63, 'model__n_estimators': 800, 'model__learning_rate': 0.03, 'model__colsample_bytree': 0.8}
ÏµúÏ†Å ÌååÎùºÎØ∏ÌÑ∞Î°ú Î™®Îç∏ Ïû¨ÌïôÏäµ Ï§ë...
[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.055406 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 994
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Ï†úÏ∂ú ÌååÏùº Ï†ÄÏû• ÏôÑÎ£å: /content/drive/MyDrive/Colab Notebooks/Multicampus-8/4_·ÑÜ·Ö•·Ñâ·Öµ·Ü´·ÑÖ·Ö•·ÑÇ·Öµ·Üº_·ÑÉ·Öµ·Ü∏·ÑÖ·Ö•·ÑÇ·Öµ·Üº/·Ñë·Ö≥·ÑÖ·Ö©·Ñå·Ö¶·Ü®·Ñê·Ö≥/output/submission_best_lightgbm.csv


Unnamed: 0,id,diagnosed_diabetes
0,700000,0.343998
1,700001,0.620765
2,700002,0.651108
3,700003,0.266749
4,700004,0.88484


In [12]:
# 5. ÌååÏùº Îã§Ïö¥Î°úÎìú (Colab)
from google.colab import files
files.download(save_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>