In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import stats

%matplotlib inline
from matplotlib import font_manager, rc

font_location = "C:\Windows\Fonts\malgun.ttf"
font_name = font_manager.FontProperties(fname=font_location).get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.feature_selection import mutual_info_classif
from lightgbm import LGBMClassifier
import random

df = pd.read_csv("국민건강보험공단_건강검진정보_2024.CSV", encoding="cp949")
df["고혈당_분석용"] = (df["식전혈당(공복혈당)"] >= 100).astype(int)

df_work = df.sample(n=100000, random_state=42).copy()

TARGET = "고혈당_분석용"
ID_LIKE = ["기준년도", "가입자일련번호"]
GLUCOSE_COLS = ["식전혈당(공복혈당)"]
MISSING_THRESH = 0.30
LOW_REL_TOP_K = 15
N_TRIALS = 10          
K_RANGE = (3, 5)

ANCHORS = ["연령대코드(5세단위)", "허리둘레", "수축기혈압", "이완기혈압",
           "HDL콜레스테롤", "LDL콜레스테롤", "흡연상태", "음주여부"]
ANCHORS = [c for c in ANCHORS if c in df_work.columns]

exclude_cols = set([TARGET] + ID_LIKE + [c for c in GLUCOSE_COLS if c in df_work.columns])
feature_pool = [c for c in df_work.columns if c not in exclude_cols]


missing_rate = df_work[feature_pool].isna().mean().sort_values()
kept_by_missing = missing_rate[missing_rate <= MISSING_THRESH].index.tolist()

def eval_random_combos(df, pool_cols, y_col, n_trials=10, k_range=(3,5),
                       always_include=None, random_state=42, verbose_every=5):
    rng = random.Random(random_state)
    results = []
    pool_cols = [c for c in pool_cols if c not in (always_include or [])]
    y = df[y_col].astype(int)

    for i in range(1, n_trials+1):
        k = rng.randint(k_range[0], k_range[1])
        subset = rng.sample(pool_cols, min(k, len(pool_cols)))
        if always_include:
            subset = list(set(subset + list(always_include)))

        X = df[subset].copy()
        for c in X.columns:
            if X[c].nunique() <= 15:
                X[c] = X[c].fillna(X[c].mode().iloc[0])
            else:
                X[c] = X[c].astype(float).fillna(X[c].median())

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=i
        )

        neg, pos = (y_train==0).sum(), (y_train==1).sum()
        scale = max(1.0, neg / max(1, pos))

        model = LGBMClassifier(
            n_estimators=100,     
            learning_rate=0.1,    
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=i,
            class_weight={0:1, 1:scale},
            n_jobs=-1
        )
        model.fit(X_train, y_train)

        proba = model.predict_proba(X_test)[:,1]
        auc = roc_auc_score(y_test, proba)

        results.append({
            "trial": i,
            "k": len(subset),
            "cols": subset,
            "AUC": round(auc, 4)
        })

        if verbose_every and i % verbose_every == 0:
            print(f"… {i}/{n_trials} done (last AUC={auc:.3f})")

    return pd.DataFrame(results).sort_values(by="AUC", ascending=False)

res_test = eval_random_combos(
    df=df_work,
    pool_cols=kept_by_missing,
    y_col=TARGET,
    n_trials=N_TRIALS,
    k_range=K_RANGE,
    always_include=ANCHORS,
    random_state=123
)

print("\n=== 빠른 테스트 결과 TOP 5 ===")
print(res_test.head(5))


[LightGBM] [Info] Number of positive: 31686, number of negative: 48314
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 982
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info] Number of positive: 31729, number of negative: 48271
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004617 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 865
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[Lig

In [7]:
low_rel_candidates = [
    "치아우식증유무", "치석"
]

In [8]:

df_sample = df_work.sample(n=100000, random_state=42)


DENTAL_EXTRA = [c for c in ["치아우식증유무", "치석"] if c in df_sample.columns]

res_with_dental = eval_random_combos(
    df=df_sample,
    pool_cols=low_rel_candidates,    
    y_col=TARGET,
    n_trials=20,                     
    k_range=(5, 8),                 
    always_include=ANCHORS + DENTAL_EXTRA,   
    random_state=123
)

print("\n=== 치과 변수 포함 (샘플 10만건, TOP 결과 10) ===")
print(res_with_dental.head(10))


[LightGBM] [Info] Number of positive: 31732, number of negative: 48268
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002991 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 848
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
[LightGBM] [Info] Number of positive: 31804, number of negative: 48196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003708 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 845
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
[LightGBM] [Info

### 치아변수 포함시 성능향상 = 치아와 혈당 관계 있음을 발견

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFE
from lightgbm import LGBMClassifier
import shap


df = pd.read_csv("국민건강보험공단_건강검진정보_2024.CSV", encoding="cp949")


df["고혈당_분석용"] = (df["식전혈당(공복혈당)"] >= 100).astype(int)

target = "고혈당_분석용"
y = df[target]


exclude_cols = ["고혈당_분석용", "고혈당_서비스용", "식전혈당(공복혈당)", "risk_level"]
X = df.drop(columns=exclude_cols, errors="ignore")


null_only_cols = [c for c in X.columns if X[c].isna().all()]
print(" 전부 NaN인 컬럼:", null_only_cols)

X = X.drop(columns=null_only_cols)


cat_cols = ["성별코드", "흡연상태", "음주여부"]
X = pd.get_dummies(X, columns=[c for c in cat_cols if c in X.columns], drop_first=True)

imputer = SimpleImputer(strategy="median")
scaler = StandardScaler()

X_imputed = imputer.fit_transform(X)
X_scaled = scaler.fit_transform(X_imputed)

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
y = y.astype(int)

log_model = LogisticRegression(max_iter=500, solver="liblinear")
rfe = RFE(log_model, n_features_to_select=10)
rfe.fit(X_scaled_df, y)

rfe_support = pd.DataFrame({
    "feature": X.columns,
    "selected": rfe.support_,
    "ranking": rfe.ranking_
}).sort_values("ranking")

print("\n=== RFE Top 10 피처 ===")
print(rfe_support[rfe_support["selected"]])

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df, y, test_size=0.2, random_state=42
)

lgb_model = LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
lgb_model.fit(X_train, y_train)


explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(X_test)

shap_importance = np.abs(shap_values).mean(axis=0)
shap_df = pd.DataFrame({
    "feature": X.columns,
    "shap_importance": shap_importance
}).sort_values("shap_importance", ascending=False)

print("\n=== SHAP Top 10 피처 ===")
print(shap_df.head(10))

y_pred = lgb_model.predict(X_test)
y_proba = lgb_model.predict_proba(X_test)[:,1]

print("\n=== LightGBM 성능 ===")
print(f"ACC = {accuracy_score(y_test, y_pred):.3f}")
print(f"F1  = {f1_score(y_test, y_pred):.3f}")
print(f"AUC = {roc_auc_score(y_test, y_proba):.3f}")


⚠️ 전부 NaN인 컬럼: ['결손치 유무', '치아마모증유무', '제3대구치(사랑니) 이상']

=== RFE Top 10 피처 ===
        feature  selected  ranking
22        감마지티피      True        1
21   혈청지피티(ALT)      True        1
3   연령대코드(5세단위)      True        1
20   혈청지오티(AST)      True        1
5     체중(5kg단위)      True        1
6          허리둘레      True        1
17          혈색소      True        1
16     LDL콜레스테롤      True        1
11        수축기혈압      True        1
13       총콜레스테롤      True        1
[LightGBM] [Info] Number of positive: 317970, number of negative: 482030
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.066355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2789
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.397462 -> initscore=-0.416049
[LightGBM] [Info] Start training from score -0.416049





=== SHAP Top 10 피처 ===
        feature  shap_importance
3   연령대코드(5세단위)         0.476083
6          허리둘레         0.253645
22        감마지티피         0.238442
11        수축기혈압         0.132355
21   혈청지피티(ALT)         0.110055
20   혈청지오티(AST)         0.104809
19      혈청크레아티닌         0.063159
17          혈색소         0.053821
14     트리글리세라이드         0.051776
5     체중(5kg단위)         0.043946

=== LightGBM 성능 ===
ACC = 0.688
F1  = 0.582
AUC = 0.745


In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

thresholds = [0.3, 0.4, 0.5, 0.55, 0.6, 0.65, 0.7]

results = []
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({
        "Threshold": t,
        "Precision": round(prec, 3),
        "Recall": round(rec, 3),
        "F1": round(f1, 3)
    })

df_thresh = pd.DataFrame(results)
print(df_thresh)


   Threshold  Precision  Recall     F1
0       0.30      0.523   0.850  0.648
1       0.40      0.569   0.727  0.639
2       0.50      0.623   0.551  0.585
3       0.55      0.653   0.446  0.530
4       0.60      0.685   0.331  0.446
5       0.65      0.721   0.219  0.335
6       0.70      0.759   0.121  0.208


In [12]:
df["콜레스테롤비율"] = df["총콜레스테롤"] / (df["HDL콜레스테롤"] + 1e-6)
df["AST_ALT비율"] = df["혈청지오티(AST)"] / (df["혈청지피티(ALT)"] + 1e-6)


target = "고혈당_분석용"
y = df[target].astype(int)

exclude_cols = ["고혈당_분석용","고혈당_서비스용","식전혈당(공복혈당)","risk_level"]
X = df.drop(columns=exclude_cols, errors="ignore")

cat_cols = ["성별코드","흡연상태","음주여부"]
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X = X.astype("float32").fillna(0)


from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

neg, pos = (y_train==0).sum(), (y_train==1).sum()
scale = neg / pos if pos > 0 else 1

model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight={0:1, 1:scale},
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)


y_pred_proba = model.predict_proba(X_test)[:,1]

thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Threshold={t:.2f} → Precision={prec:.3f}, Recall={rec:.3f}, F1={f1:.3f}, AUC={auc:.3f}")

[LightGBM] [Info] Number of positive: 317970, number of negative: 482030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.040490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3292
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Threshold=0.30 → Precision=0.486, Recall=0.917, F1=0.636, AUC=0.747
Threshold=0.40 → Precision=0.527, Recall=0.843, F1=0.648, AUC=0.747
Threshold=0.50 → Precision=0.570, Recall=0.728, F1=0.639, AUC=0.747
Threshold=0.60 → Precision=0.622, Recall=0.557, F1=0.588, AUC=0.747
Threshold=0.70 → Precision=0.688, Recall=0.319, F1=0.436, AUC=0.747


In [13]:
df["BMI"] = df["체중(5kg단위)"] * 5 / ((df["신장(5cm단위)"]*5 / 100) ** 2)  
df["혈압차"] = df["수축기혈압"] - df["이완기혈압"]
df["콜레스테롤비율"] = df["총콜레스테롤"] / (df["HDL콜레스테롤"] + 1e-6)
df["AST_ALT비율"] = df["혈청지오티(AST)"] / (df["혈청지피티(ALT)"] + 1e-6)

target = "고혈당_분석용"
y = df[target].astype(int)

exclude_cols = ["고혈당_분석용","고혈당_서비스용","식전혈당(공복혈당)","risk_level"]
X = df.drop(columns=exclude_cols, errors="ignore")

cat_cols = ["성별코드","흡연상태","음주여부"]
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
X = X.astype("float32").fillna(0)


from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

neg, pos = (y_train==0).sum(), (y_train==1).sum()
scale = neg / pos if pos > 0 else 1

model = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight={0:1, 1:scale},
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)


y_pred_proba = model.predict_proba(X_test)[:,1]


thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]
for t in thresholds:
    y_pred = (y_pred_proba >= t).astype(int)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"Threshold={t:.2f} → Precision={prec:.3f}, Recall={rec:.3f}, F1={f1:.3f}, AUC={auc:.3f}")


[LightGBM] [Info] Number of positive: 317970, number of negative: 482030
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 800000, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Threshold=0.30 → Precision=0.487, Recall=0.917, F1=0.636, AUC=0.747
Threshold=0.40 → Precision=0.527, Recall=0.842, F1=0.648, AUC=0.747
Threshold=0.50 → Precision=0.570, Recall=0.727, F1=0.639, AUC=0.747
Threshold=0.60 → Precision=0.622, Recall=0.556, F1=0.587, AUC=0.747
Threshold=0.70 → Precision=0.688, Recall=0.319, F1=0.436, AUC=0.747


In [None]:
import pickle
import numpy as np


model.fit(X_train, y_train)


meta = {
    "features": X.columns.tolist(),  
    "median_values": X.median().to_dict() 
}

with open("glucose_model.pkl", "wb") as f:
    pickle.dump((model, meta), f)
