In [1]:
import numpy as np, pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

df = pd.read_csv("d:/data/diabetes/diabetes2.csv")
for c in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    if c in df.columns: df[c] = df[c].replace(0, np.nan) #0을 nan으로 처리

X, y = df[["Age", "BMI", "Glucose"]], df["Outcome"].astype(int)

pipe = make_pipeline(
    SimpleImputer(strategy = "median"),
    StandardScaler(),
    LogisticRegression(max_iter = 1000)
)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
pipe.fit(X_tr, y_tr)
proba = pipe.predict_proba(X_te)[:, 1]
print(f"AUC={roc_auc_score(y_te, proba):.3f} | ACC={accuracy_score(y_te, (proba >= 0.5).astype(int)):3.f}")

dump({"model": pipe, "features": ["Age", "BMI", "Glucose"]}, 'd:/data/diabetes/diabetes_model.joblib')
print('완료되었습니다.')

ValueError: Format specifier missing precision