In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

%matplotlib inline
from matplotlib import font_manager, rc

font_location = "C:\Windows\Fonts\malgun.ttf"
font_name = font_manager.FontProperties(fname=font_location).get_name()
rc('font', family=font_name)
plt.rcParams['axes.unicode_minus'] = False


In [6]:
import xml.etree.ElementTree as ET
import pandas as pd

def load_xml_to_df(file):
    tree = ET.parse(file)
    root = tree.getroot()

    rows = []
    for event in root.find("glucose_level").findall("event"):
        ts = event.attrib["ts"]
        value = event.attrib["value"]
        rows.append([ts, float(value)])

    return pd.DataFrame(rows, columns=["timestamp", "glucose"])


files = ["570-ws-training.xml", "563-ws-training.xml", "588-ws-training.xml"]
patients = [load_xml_to_df(f) for f in files]

for i, df in enumerate(patients):
    print(f"환자 {i+1} 데이터 크기:", df.shape)
    print(df.head(), "\n")


환자 1 데이터 크기: (10982, 2)
             timestamp  glucose
0  07-12-2021 16:29:00    101.0
1  07-12-2021 16:34:00    100.0
2  07-12-2021 16:39:00    100.0
3  07-12-2021 16:44:00     99.0
4  07-12-2021 16:49:00     98.0 

환자 2 데이터 크기: (12124, 2)
             timestamp  glucose
0  13-09-2021 12:33:00    219.0
1  13-09-2021 12:38:00    229.0
2  13-09-2021 12:43:00    224.0
3  13-09-2021 12:48:00    221.0
4  13-09-2021 12:53:00    215.0 

환자 3 데이터 크기: (12640, 2)
             timestamp  glucose
0  30-08-2021 11:53:00    116.0
1  30-08-2021 11:58:00    117.0
2  30-08-2021 12:03:00    119.0
3  30-08-2021 12:08:00    116.0
4  30-08-2021 12:13:00    111.0 



In [7]:
for i, df in enumerate(patients):
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%d-%m-%Y %H:%M:%S")
    df.set_index("timestamp", inplace=True)
    
    patients[i] = df.resample("5min").mean().interpolate()


In [8]:
import numpy as np
import pandas as pd

def make_xy_from_df(df, recent_len=12, horizon=24):
    g = df["glucose"].values.astype(float)
    X, y = [], []
    limit = len(g) - (recent_len + horizon)
    for i in range(limit):
        X.append(g[i:i+recent_len])
        y.append(g[i+recent_len+horizon-1])  # 120분 뒤 1포인트
    return np.array(X), np.array(y)

per_patient = [make_xy_from_df(df, recent_len=12, horizon=24) for df in patients]


In [9]:
import numpy as np
import pandas as pd

def make_xy_with_aggs(df, recent_len=12, horizon=24, step_minutes=5):
    """
    df: ['glucose'] 컬럼을 가지고 있고 5분 간격으로 resample & interpolate 된 상태(DatetimeIndex)
    recent_len: 입력 시퀀스 길이(예: 12=60분)
    horizon: 예측 시점까지의 간격(예: 24=120분)
    """
    g = df["glucose"].values.astype(float)
    n = len(g)
    X, y = [], []

    
    W30 = 6    
    W60 = 12   
    W120 = 24  


    def slope_of(arr):
        
        if len(arr) < 2: 
            return 0.0
        x = np.arange(len(arr))
        b1, b0 = np.polyfit(x, arr, 1)  # y ≈ b1*x + b0
        return float(b1)

    limit = n - (recent_len + horizon)
    for i in range(limit):
        seg = g[i:i+recent_len]              
        last_idx = i + recent_len             
        last_val = seg[-1]

        
        win30 = g[i+recent_len-W30 : i+recent_len] if i+recent_len >= W30 else seg[-W30:]
        win60 = g[i+recent_len-W60 : i+recent_len] if i+recent_len >= W60 else seg[-W60:]
        win120= g[i+recent_len-W120: i+recent_len] if i+recent_len >= W120 else seg[-W120:]

        
        feats = [
            
            *seg.tolist(),

            
            np.mean(win30), np.std(win30), np.min(win30), np.max(win30), slope_of(win30),
            
            np.mean(win60), np.std(win60), np.min(win60), np.max(win60), slope_of(win60),
            
            np.mean(win120), np.std(win120), np.min(win120), np.max(win120), slope_of(win120),

            
            last_val - g[last_idx-2] if last_idx-2 >= 0 else 0.0,          
            last_val - g[last_idx-W30] if last_idx-W30 >= 0 else 0.0,      
        ]

        X.append(feats)
        y.append(g[i + recent_len + horizon - 1])  

    X = np.array(X)
    y = np.array(y)
    return X, y


In [10]:
per_patient = [make_xy_with_aggs(df, recent_len=12, horizon=24) for df in patients]


X_all = np.vstack([pp[0] for pp in per_patient])
y_all = np.hstack([pp[1] for pp in per_patient])



In [7]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle



X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])


X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


mix_model = LGBMRegressor(
    n_estimators=800, learning_rate=0.05, num_leaves=63,
    max_depth=-1, subsample=0.9, colsample_bytree=0.9,
    random_state=42
)
mix_model.fit(X_train, y_train)


train_preds = mix_model.predict(X_train)
test_preds  = mix_model.predict(X_test)

mae_train = mean_absolute_error(y_train, train_preds)
rmse_train = np.sqrt(mean_squared_error(y_train, train_preds))

mae_test = mean_absolute_error(y_test, test_preds)
rmse_test = np.sqrt(mean_squared_error(y_test, test_preds))

print("\n=== LGBM Mixed 결과 (Train/Test Split) ===")
print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")


mixed_rows = []
for i in range(len(per_patient)):
    X_te, y_te = per_patient[i]
    preds = mix_model.predict(X_te)
    mae = mean_absolute_error(y_te, preds)
    rmse = np.sqrt(mean_squared_error(y_te, preds))
    mixed_rows.append({"환자": f"환자 {i+1}", "MAE": mae, "RMSE": rmse})

mixed_df = pd.DataFrame(mixed_rows)
print("\n=== LGBM Mixed 결과 (모든 환자 합쳐 학습 → 환자별 평가) ===")
print(mixed_df)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145





=== LGBM Mixed 결과 (Train/Test Split) ===
[Train] MAE: 16.02, RMSE: 21.25
[Test ] MAE: 25.36, RMSE: 33.95





=== LGBM Mixed 결과 (모든 환자 합쳐 학습 → 환자별 평가) ===
     환자        MAE       RMSE
0  환자 1  17.040109  23.433518
1  환자 2  18.773607  25.389373
2  환자 3  17.763482  24.013105


In [8]:
with open("lgbm_mixed_glucose_model.pkl", "wb") as f:
    pickle.dump(mix_model, f)

print("\n모델 저장 완료 → lgbm_mixed_glucose_model.pkl")


모델 저장 완료 → lgbm_mixed_glucose_model.pkl


In [11]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error
import optuna

X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 2000),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.05, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 1.0),
        "random_state": 42,
        "n_jobs": -1
    }
    model = LGBMRegressor(**params)
    score = -np.mean(cross_val_score(model, X_train, y_train,
                                     scoring="neg_mean_absolute_error", cv=3, n_jobs=-1))
    return score

print("⏳ Optuna 튜닝 중...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)  # trial 수 조절 가능
best_params = study.best_params
print("✅ Best Params:", best_params)


lgb_best = LGBMRegressor(**best_params)


xgb = XGBRegressor(
    n_estimators=800, learning_rate=0.05, max_depth=6,
    subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1
)
ridge = Ridge(alpha=1.0)

ensemble = VotingRegressor(
    estimators=[("lgb", lgb_best), ("xgb", xgb), ("ridge", ridge)]
)


lgb_best.fit(X_train, y_train)
ensemble.fit(X_train, y_train)


def eval_model(name, model):
    y_pred_train = model.predict(X_train)
    y_pred_test  = model.predict(X_test)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
    mae_test = mean_absolute_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
    print(f"\n=== {name} 결과 ===")
    print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
    print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")

eval_model("최적화된 LGBM", lgb_best)
eval_model("앙상블 (LGBM+XGB+Ridge)", ensemble)


[I 2025-09-29 14:26:21,827] A new study created in memory with name: no-name-46eed3f1-f314-47ce-aab8-27f78d72c85b


⏳ Optuna 튜닝 중...


[I 2025-09-29 14:26:31,039] Trial 0 finished with value: 29.174622328111937 and parameters: {'n_estimators': 1066, 'learning_rate': 0.011031450960541369, 'num_leaves': 210, 'max_depth': 7, 'subsample': 0.9586223370858629, 'colsample_bytree': 0.8673638777018334, 'min_child_samples': 28, 'reg_alpha': 0.046582011802972834, 'reg_lambda': 0.8392112231447361}. Best is trial 0 with value: 29.174622328111937.
[I 2025-09-29 14:26:40,027] Trial 1 finished with value: 27.8491256934668 and parameters: {'n_estimators': 1419, 'learning_rate': 0.03987108557381485, 'num_leaves': 32, 'max_depth': 15, 'subsample': 0.9333171409567212, 'colsample_bytree': 0.8720560592852002, 'min_child_samples': 50, 'reg_alpha': 0.013461671300301292, 'reg_lambda': 0.4830942773715716}. Best is trial 1 with value: 27.8491256934668.
[I 2025-09-29 14:26:51,377] Trial 2 finished with value: 27.694091174887763 and parameters: {'n_estimators': 1225, 'learning_rate': 0.030904361272655913, 'num_leaves': 61, 'max_depth': 0, 'subsam

✅ Best Params: {'n_estimators': 1773, 'learning_rate': 0.03323878009176854, 'num_leaves': 98, 'max_depth': 15, 'subsample': 0.9245231069729564, 'colsample_bytree': 0.9432466479883282, 'min_child_samples': 12, 'reg_alpha': 0.8756381018461254, 'reg_lambda': 0.518796289570948}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145

=== 최적화된 LGBM 결과 ===
[Train] MAE: 8.

In [13]:
import pickle

lgb_best.fit(X_train, y_train)

# 저장
with open("lgb_best.pkl", "wb") as f:
    pickle.dump(lgb_best, f)

print("✅ 모델 저장 완료: lgb_best.pkl")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
✅ 모델 저장 완료: lgb_best.pkl


In [16]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import pandas as pd


X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


X_tr2, X_va, y_tr2, y_va = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, shuffle=True
)

def eval_model(name, model):
    model.fit(
        X_tr2, y_tr2,
        eval_set=[(X_va, y_va)],
        eval_metric="l1",            
        callbacks=[],
    )

    pred_tr = model.predict(X_tr)
    pred_te = model.predict(X_te)
    mae_tr = mean_absolute_error(y_tr, pred_tr)
    mae_te = mean_absolute_error(y_te, pred_te)
    rmse_tr = np.sqrt(mean_squared_error(y_tr, pred_tr))
    rmse_te = np.sqrt(mean_squared_error(y_te, pred_te))
    gap = mae_te - mae_tr
    print(f"\n=== {name} ===")
    print(f"[Train] MAE={mae_tr:.2f}, RMSE={rmse_tr:.2f}")
    print(f"[Test ] MAE={mae_te:.2f}, RMSE={rmse_te:.2f}, Gap={gap:.2f}")
    return {"name": name, "model": model, "mae_tr": mae_tr, "mae_te": mae_te, "gap": gap}


candidates = [
    ("Preset-Soft(과적합 위험)", LGBMRegressor(
        n_estimators=3000, learning_rate=0.015,
        num_leaves=128, max_depth=-1,
        min_child_samples=30, reg_alpha=0.05, reg_lambda=0.3,
        subsample=0.85, subsample_freq=1, colsample_bytree=0.85,
        random_state=42, n_jobs=-1
    )),
    ("Preset-Balanced(권장)", LGBMRegressor(
        n_estimators=2500, learning_rate=0.02,
        num_leaves=96, max_depth=8,
        min_child_samples=50, reg_alpha=0.1, reg_lambda=0.6,
        subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
        random_state=42, n_jobs=-1
    )),
    ("Preset-Strong(규제 강함)", LGBMRegressor(
        n_estimators=1800, learning_rate=0.03,
        num_leaves=64, max_depth=6,
        min_child_samples=80, reg_alpha=0.2, reg_lambda=1.0,
        subsample=0.7, subsample_freq=1, colsample_bytree=0.7,
        random_state=42, n_jobs=-1
    )),
]

results = [eval_model(name, mdl) for name, mdl in candidates]


def score(row, gap_tolerance=10.0, gap_weight=0.3):
    penalty = max(0.0, row["gap"] - gap_tolerance) * gap_weight
    return row["mae_te"] + penalty

sel = min(results, key=lambda r: score(r))
print(f"\n✅ 선택: {sel['name']}  (Test MAE={sel['mae_te']:.2f}, Gap={sel['gap']:.2f})")

best_model = sel["model"]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002910 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7204
[LightGBM] [Info] Number of data points in the train set: 24132, number of used features: 29
[LightGBM] [Info] Start training from score 165.771874

=== Preset-Soft(과적합 위험) ===
[Train] MAE=11.61, RMSE=18.01
[Test ] MAE=25.06, RMSE=33.90, Gap=13.45
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001997 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7204
[LightGBM] [Info] Number of data points in the train set: 24132, number of used features: 29
[LightGBM] [Info] Start training from score 165.771874

=== Preset-Balanced(권장) ===
[Train] MAE=20.72, RMSE=27.62
[Test ] MAE=27.45, RMSE=36.52, Gap=6.74
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002780 seconds.
You can s

In [17]:
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


preset_medium = LGBMRegressor(
    n_estimators=1200,       
    learning_rate=0.03,      
    num_leaves=63,           
    max_depth=10,           
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.2,           
    reg_lambda=0.3,        
    min_child_samples=30,
    random_state=42,
    n_jobs=-1
)


preset_medium.fit(X_train, y_train)

y_pred_tr = preset_medium.predict(X_train)
y_pred_te = preset_medium.predict(X_test)

mae_tr = mean_absolute_error(y_train, y_pred_tr)
rmse_tr = mean_squared_error(y_train, y_pred_tr, squared=False)
mae_te = mean_absolute_error(y_test, y_pred_te)
rmse_te = mean_squared_error(y_test, y_pred_te, squared=False)

gap = mae_te - mae_tr

print("=== Preset-Medium(균형형) ===")
print(f"[Train] MAE={mae_tr:.2f}, RMSE={rmse_tr:.2f}")
print(f"[Test ] MAE={mae_te:.2f}, RMSE={rmse_te:.2f}, Gap={gap:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019797 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
=== Preset-Medium(균형형) ===
[Train] MAE=18.36, RMSE=24.24
[Test ] MAE=26.10, RMSE=34.81, Gap=7.74


In [20]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)

from lightgbm import early_stopping, log_evaluation

model = LGBMRegressor(
    n_estimators=5000,
    learning_rate=0.01,
    num_leaves=63,
    subsample=0.7,
    colsample_bytree=0.7,
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="l1",   
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(100)   
    ]
)


y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n=== LGBM Mixed (Early Stopping) 결과 ===")
print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 34.5437	valid_0's l2: 1878.28
[200]	valid_0's l1: 31.4119	valid_0's l2: 1646.12
[300]	valid_0's l1: 30.32	valid_0's l2: 1569.82
[400]	valid_0's l1: 29.7279	valid_0's l2: 1520.89
[500]	valid_0's l1: 29.31	valid_0's l2: 1484.17
[600]	valid_0's l1: 28.9682	valid_0's l2: 1453.4
[700]	valid_0's l1: 28.6919	valid_0's l2: 1428.84
[800]	valid_0's l1: 28.447	valid_0's l2: 1407.22
[900]	valid_0's l1: 28.2347	valid_0's l2: 1389.1
[1000]	valid_0's l1: 28.034	valid_0's l2: 1372.28
[1100]	valid_0's l1: 27.8511	valid_0's l2: 1356.02
[1200]	valid_0's l1: 27.6799	valid_0's l2: 1341.56

In [71]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


model = LGBMRegressor(
    n_estimators=3850,
    learning_rate=0.015,    
    num_leaves=49,           
    max_depth=-1,            
    subsample=0.75,         
    colsample_bytree=0.75,   
    reg_alpha=0.0,          
    reg_lambda=0.1,          
    min_child_samples=30,    
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="l1",
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(200)
    ]
)


y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n=== 목표: Train MAE ≈ 15 근처 ===")
print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")
print(f"Gap = {mae_test - mae_train:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005837 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 30.5905	valid_0's l2: 1596.87
[400]	valid_0's l1: 29.3337	valid_0's l2: 1489.92
[600]	valid_0's l1: 28.6929	valid_0's l2: 1431.91
[800]	valid_0's l1: 28.2253	valid_0's l2: 1391.74
[1000]	valid_0's l1: 27.8218	valid_0's l2: 1357.16
[1200]	valid_0's l1: 27.4744	valid_0's l2: 1326.81
[1400]	valid_0's l1: 27.1875	valid_0's l2: 1301.56
[1600]	valid_0's l1: 26.938	valid_0's l2: 1280.35
[1800]	valid_0's l1: 26.723	valid_0's l2: 1262.6
[2000]	valid_0's l1: 26.5313	valid_0's l2: 1247.51
[2200]	valid_0's l1: 26.3211	valid_0's l2: 1231.11
[2400]	valid_0's l1: 26.1522	valid_0's l

In [70]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


model = LGBMRegressor(
    n_estimators=3830,
    learning_rate=0.015,    
    num_leaves=49,           
    max_depth=-1,            
    subsample=0.75,          
    colsample_bytree=0.75,   
    reg_alpha=0.0,           
    reg_lambda=0.2,          
    min_child_samples=30,    
    random_state=42,
    n_jobs=-1
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="l1",
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(200)
    ]
)


y_pred_train = model.predict(X_train)
y_pred_test  = model.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n=== 목표: Train MAE ≈ 15 근처 ===")
print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")
print(f"Gap = {mae_test - mae_train:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 30.5793	valid_0's l2: 1596.24
[400]	valid_0's l1: 29.3518	valid_0's l2: 1493.4
[600]	valid_0's l1: 28.6689	valid_0's l2: 1432.46
[800]	valid_0's l1: 28.1795	valid_0's l2: 1389.39
[1000]	valid_0's l1: 27.8132	valid_0's l2: 1356.96
[1200]	valid_0's l1: 27.4978	valid_0's l2: 1329.04
[1400]	valid_0's l1: 27.2255	valid_0's l2: 1305.44
[1600]	valid_0's l1: 26.9821	valid_0's l2: 1285.88
[1800]	valid_0's l1: 26.7505	valid_0's l2: 1266.71
[2000]	valid_0's l1: 26.5161	valid_0's l2: 1248.35
[2200]	valid_0's l1: 26.3281	valid_0's l2: 1232.41
[2400]	valid_0's l1: 26.1629	valid_0's

In [72]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


X_all = np.vstack([per_patient[i][0] for i in range(len(per_patient))])
y_all = np.hstack([per_patient[i][1] for i in range(len(per_patient))])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)


lgbm_final = LGBMRegressor(
    n_estimators=3830,
    learning_rate=0.015,
    num_leaves=49,
    max_depth=-1,
    subsample=0.75,
    colsample_bytree=0.75,
    reg_alpha=0.0,
    reg_lambda=0.2,
    min_child_samples=30,
    random_state=42,
    n_jobs=-1
)

lgbm_final.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="l1",
    callbacks=[
        early_stopping(stopping_rounds=100),
        log_evaluation(200)
    ]
)


y_pred_train = lgbm_final.predict(X_train)
y_pred_test  = lgbm_final.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print("\n=== 목표: Train MAE ≈ 15 근처 ===")
print(f"[Train] MAE: {mae_train:.2f}, RMSE: {rmse_train:.2f}")
print(f"[Test ] MAE: {mae_test:.2f}, RMSE: {rmse_test:.2f}")
print(f"Gap = {mae_test - mae_train:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004687 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 30.5793	valid_0's l2: 1596.24
[400]	valid_0's l1: 29.3518	valid_0's l2: 1493.4
[600]	valid_0's l1: 28.6689	valid_0's l2: 1432.46
[800]	valid_0's l1: 28.1795	valid_0's l2: 1389.39
[1000]	valid_0's l1: 27.8132	valid_0's l2: 1356.96
[1200]	valid_0's l1: 27.4978	valid_0's l2: 1329.04
[1400]	valid_0's l1: 27.2255	valid_0's l2: 1305.44
[1600]	valid_0's l1: 26.9821	valid_0's l2: 1285.88
[1800]	valid_0's l1: 26.7505	valid_0's l2: 1266.71
[2000]	valid_0's l1: 26.5161	valid_0's l2: 1248.35
[2200]	valid_0's l1: 26.3281	valid_0's l2: 1232.41
[2400]	valid_0's l1: 26.1629	valid_0's

In [77]:
import pickle
with open("lgbm_final.pkl", "wb") as f:
    pickle.dump(lgbm_final, f)
    

In [78]:
print("✅ 모델 저장 완료: lgbm_final.pkl")

✅ 모델 저장 완료: lgbm_final.pkl


In [75]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


def load_xml_to_df(file):
    tree = ET.parse(file)
    root = tree.getroot()
    rows = []
    for event in root.find("glucose_level").findall("event"):
        ts = event.attrib["ts"]
        value = float(event.attrib["value"])
        rows.append([ts, value])
    df = pd.DataFrame(rows, columns=["timestamp", "glucose"])
    df["timestamp"] = pd.to_datetime(df["timestamp"], format="%d-%m-%Y %H:%M:%S")
    df.set_index("timestamp", inplace=True)
    return df.resample("5min").mean().interpolate()


files = ["570-ws-training.xml", "563-ws-training.xml", "588-ws-training.xml"]
patients = [load_xml_to_df(f) for f in files]


def make_xy_with_aggs(df, recent_len=12, horizon=24):
    g = df["glucose"].values.astype(float)
    n = len(g)
    X, y = [], []
    def slope_of(arr):
        if len(arr) < 2: return 0.0
        x = np.arange(len(arr))
        b1, b0 = np.polyfit(x, arr, 1)
        return float(b1)

    W30, W60, W120 = 6, 12, 24
    limit = n - (recent_len + horizon)
    for i in range(limit):
        seg = g[i:i+recent_len]
        last_idx = i + recent_len
        last_val = seg[-1]
        win30 = g[max(0,last_idx-W30):last_idx]
        win60 = g[max(0,last_idx-W60):last_idx]
        win120= g[max(0,last_idx-W120):last_idx]

        feats = [
            *seg.tolist(),
            np.mean(win30), np.std(win30), np.min(win30), np.max(win30), slope_of(win30),
            np.mean(win60), np.std(win60), np.min(win60), np.max(win60), slope_of(win60),
            np.mean(win120),np.std(win120),np.min(win120),np.max(win120),slope_of(win120),
            last_val - g[last_idx-2] if last_idx-2 >= 0 else 0.0,
            last_val - g[last_idx-W30] if last_idx-W30 >= 0 else 0.0,
        ]
        X.append(feats)
        y.append(g[i+recent_len+horizon-1])
    return np.array(X), np.array(y)

per_patient = [make_xy_with_aggs(df) for df in patients]
X_all = np.vstack([pp[0] for pp in per_patient])
y_all = np.hstack([pp[1] for pp in per_patient])

X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, shuffle=True
)

lgbm_final = LGBMRegressor(
    n_estimators=3830, learning_rate=0.015,
    num_leaves=49, max_depth=-1,
    subsample=0.75, colsample_bytree=0.75,
    reg_alpha=0.0, reg_lambda=0.2,
    min_child_samples=30,
    random_state=42, n_jobs=-1
)

lgbm_final.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="l1",
    callbacks=[early_stopping(100), log_evaluation(200)]
)


y_pred_train = lgbm_final.predict(X_train)
y_pred_test  = lgbm_final.predict(X_test)

mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
mae_test = mean_absolute_error(y_test, y_pred_test)
rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)

print("\n=== 최종 모델 결과 ===")
print(f"[Train] MAE={mae_train:.2f}, RMSE={rmse_train:.2f}")
print(f"[Test ] MAE={mae_test:.2f}, RMSE={rmse_test:.2f}")
print(f"Gap={mae_test - mae_train:.2f}")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002726 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7226
[LightGBM] [Info] Number of data points in the train set: 30165, number of used features: 29
[LightGBM] [Info] Start training from score 165.830145
Training until validation scores don't improve for 100 rounds
[200]	valid_0's l1: 30.5887	valid_0's l2: 1596.03
[400]	valid_0's l1: 29.3683	valid_0's l2: 1493.44
[600]	valid_0's l1: 28.697	valid_0's l2: 1434.43
[800]	valid_0's l1: 28.175	valid_0's l2: 1389.49
[1000]	valid_0's l1: 27.7899	valid_0's l2: 1354.28
[1200]	valid_0's l1: 27.443	valid_0's l2: 1324.64
[1400]	valid_0's l1: 27.1662	valid_0's l2: 1301.2
[1600]	valid_0's l1: 26.9452	valid_0's l2: 1283.02
[1800]	valid_0's l1: 26.7327	valid_0's l2: 1266.07
[2000]	valid_0's l1: 26.5438	valid_0's l2: 1250.26
[2200]	valid_0's l1: 26.3664	valid_0's l2: 1236.07
[2400]	valid_0's l1: 26.1868	valid_0's l2