In [None]:
# # 코랩 환경에서 실행시 필요. 로컬에서 실행시 이 셀 전체를 주석처리 등으로 제외하면 됩니다.
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir('/content/drive/MyDrive/shared-acorn/dacon-palsaekjo-shared')
# print(os.getcwd())

# LightGBM을 활용한 모델 학습 및 피쳐엔지니어링 (학습)

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
# Hook tqdm into pandas (registers the progress_* variants such as progress_apply).
# NOTE(review): no progress_* call is visible in this notebook — presumably it is
# used inside the mymodules helpers; confirm before removing.
tqdm.pandas()

In [None]:
# Project-local path container. This notebook reads the train_A / train_B /
# train_meta CSV locations and the model_A / model_B output locations from it.
from mymodules.paths import Paths
PATH = Paths()

## 데이터 불러오기

In [None]:
# Read the two raw test tables and the shared metadata table.
train_A = pd.read_csv(PATH.train_A)
train_B = pd.read_csv(PATH.train_B)
train_meta = pd.read_csv(PATH.train_meta)

# Split the metadata by test type; .copy() detaches the slices from train_meta.
meta_A = train_meta.loc[train_meta["Test"] == "A"].copy()
meta_B = train_meta.loc[train_meta["Test"] == "B"].copy()

# Report the shape of every frame that was just created.
_loaded_frames = {
    "train_A": train_A,
    "train_B": train_B,
    "meta_A": meta_A,
    "meta_B": meta_B,
}
for _frame_name, _frame in _loaded_frames.items():
    print(f"{_frame_name}: {_frame.shape}")

## 데이터 전처리

### 1차 Feature Engineering

In [None]:
from mymodules.preprocess import preprocess_A, preprocess_B

# First-pass feature extraction: one project-local preprocessor per test type.
train_A_features = preprocess_A(train_A)
train_B_features = preprocess_B(train_B)

# Show both resulting shapes on a single line.
print(f"A: {train_A_features.shape} B: {train_B_features.shape}")

### 2차 Feature Engineering

In [None]:
from mymodules.feature_engineering import add_features_A, add_features_B

# Second-pass feature engineering on top of the first-pass frames.
# NOTE: the results are bound back to the same names, so re-running this cell
# without re-running the first-pass cell feeds already-augmented frames back
# into add_features_*.
train_A_features = add_features_A(train_A_features)
train_B_features = add_features_B(train_B_features)

# Show both resulting shapes on a single line.
print(f"A+feat: {train_A_features.shape} B+feat: {train_B_features.shape}")

## 학습용/검증용 데이터세트 분리

In [None]:
# Identifier / bookkeeping columns that must not be used as model features.
COLS_TO_DROP = ["Test_id", "Test", "PrimaryKey", "Age", "TestDate"]


def _labels_for(features, meta, group_label):
    """Return the ``Label`` array for ``features``, one entry per feature row.

    When ``meta`` has a ``Test_id`` column, each label is looked up by the
    ``Test_id`` of the corresponding feature row, so the pairing no longer
    depends on both frames sharing the same row order. Without that column the
    labels are taken positionally.

    Args:
        features: feature frame that still contains its ``Test_id`` column.
        meta: metadata frame holding the ``Label`` column.
        group_label: short name used in the error message.

    Returns:
        NumPy array of labels with ``len(features)`` entries.

    Raises:
        KeyError: a feature row's ``Test_id`` does not exist in ``meta``.
        ValueError: the number of labels differs from the number of feature rows.
    """
    if "Test_id" in meta.columns:
        labels_by_id = meta.set_index("Test_id")["Label"]
        labels = labels_by_id.loc[features["Test_id"].to_numpy()].to_numpy()
    else:
        labels = meta["Label"].to_numpy()

    # A mismatch here means rows were dropped/duplicated somewhere upstream;
    # fail loudly instead of training on misaligned labels.
    if len(labels) != len(features):
        raise ValueError(
            f"[{group_label}] feature/label row count mismatch: "
            f"{len(features)} feature rows vs {len(labels)} labels"
        )
    return labels


X_A = train_A_features.drop(columns=COLS_TO_DROP)
y_A = _labels_for(train_A_features, meta_A, "A")
X_B = train_B_features.drop(columns=COLS_TO_DROP)
y_B = _labels_for(train_B_features, meta_B, "B")

# Sanity check: X and y must have the same number of rows per set.
print(f"A 세트: X={len(X_A)}, y={len(y_A)}")
print(f"B 세트: X={len(X_B)}, y={len(y_B)}")

In [None]:
# Hold out 20% of each set for validation with a fixed seed; each split is
# stratified on its own label array.
_SPLIT_OPTIONS = {"test_size": 0.2, "random_state": 42}

X_train_A, X_val_A, y_train_A, y_val_A = train_test_split(
    X_A, y_A, stratify=y_A, **_SPLIT_OPTIONS
)
X_train_B, X_val_B, y_train_B, y_val_B = train_test_split(
    X_B, y_B, stratify=y_B, **_SPLIT_OPTIONS
)

## 모델 정의, 학습

In [None]:
def train_and_eval(X_train, y_train, X_val, y_val, group_label):
    """Fit a LightGBM binary classifier and print its validation AUC.

    The validation pair is passed to ``fit`` as the evaluation set, with an
    early-stopping callback (200 rounds) and an evaluation-log callback
    (period 100).

    Args:
        X_train: training features.
        y_train: training labels.
        X_val: validation features.
        y_val: validation labels.
        group_label: short tag printed in front of the AUC line.

    Returns:
        The fitted ``lgb.LGBMClassifier``.
    """
    classifier_params = {
        "objective": "binary",
        "metric": "auc",
        "n_estimators": 3000,
        "learning_rate": 0.05,
        "n_jobs": -1,
        "random_state": 42,
    }
    training_callbacks = [
        lgb.early_stopping(200),
        lgb.log_evaluation(100),
    ]

    classifier = lgb.LGBMClassifier(**classifier_params)
    classifier.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=training_callbacks,
    )

    # Score the validation set using the second column of predict_proba.
    positive_proba = classifier.predict_proba(X_val)[:, 1]
    val_auc = roc_auc_score(y_val, positive_proba)
    print(f"[{group_label}] Validation AUC: {val_auc:.4f}")
    return classifier

In [None]:
# Train one independent model per test type on its own train/validation split.
model_A = train_and_eval(
    X_train=X_train_A,
    y_train=y_train_A,
    X_val=X_val_A,
    y_val=y_val_A,
    group_label="A",
)
model_B = train_and_eval(
    X_train=X_train_B,
    y_train=y_train_B,
    X_val=X_val_B,
    y_val=y_val_B,
    group_label="B",
)

In [None]:
# Feature importance: print each model's hyper-parameters, then plot its top-20
# features. Every plot gets a per-model title so the A and B figures can be told
# apart, and running the calls inside a loop keeps the trailing Axes repr out of
# the cell output.
for _group_label, _fitted_model in (("A", model_A), ("B", model_B)):
    print(_fitted_model.get_params())
    lgb.plot_importance(
        _fitted_model,
        max_num_features=20,
        title=f"Feature importance - model {_group_label}",
    )

## 모델 저장

In [None]:
import joblib

# Persist each fitted model to the location configured on PATH.
_models_to_save = (
    (model_A, PATH.model_A),
    (model_B, PATH.model_B),
)
for _fitted_model, _target_path in _models_to_save:
    joblib.dump(_fitted_model, _target_path)

print(f"모델 저장 완료: {PATH.model_A}, {PATH.model_B}")