In [None]:
# # Required only when running on Google Colab. When running locally, leave this whole cell out (e.g. keep it commented out).
# from google.colab import drive
# drive.mount('/content/drive')

# import os
# os.chdir('/content/drive/MyDrive/shared-acorn/dacon-palsaekjo-shared')
# print(os.getcwd())

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from mymodules.wkqehdtksl import Paths, COLS_TO_DROP
from mymodules.preprocess import preprocess_A, preprocess_B
from mymodules.feature_engineering import add_features_A, add_features_B

In [None]:
PATH = Paths('open_v2')
CHUNK_SIZE = 10000

In [None]:
model_A = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    n_estimators=3000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)
model_B = lgb.LGBMClassifier(
    objective="binary",
    metric="auc",
    n_estimators=3000,
    learning_rate=0.05,
    n_jobs=-1,
    random_state=42,
)

In [None]:
def get_data_A(train_A: pd.DataFrame) -> tuple:

    train_A_features = preprocess_A(train_A)
    train_A_features = add_features_A(train_A_features)
    train_A_features = train_A_features.drop(columns=COLS_TO_DROP)

    X_A = train_A_features.drop(columns=['Label'])
    y_A = train_A_features['Label']

    X_train_A, X_val_A, y_train_A, y_val_A = train_test_split(X_A, y_A, test_size=0.2, stratify=y_A, random_state=42)
    return X_train_A, X_val_A, y_train_A, y_val_A

def get_data_B(train_B: pd.DataFrame) -> tuple:

    train_B_features = preprocess_B(train_B)
    train_B_features = add_features_B(train_B_features)
    train_B_features = train_B_features.drop(columns=COLS_TO_DROP)

    X_B = train_B_features.drop(columns=['Label'])
    y_B = train_B_features['Label']

    X_train_B, X_val_B, y_train_B, y_val_B = train_test_split(X_B, y_B, test_size=0.2, stratify=y_B, random_state=42)
    return X_train_B, X_val_B, y_train_B, y_val_B


In [None]:
def train_and_eval(model, X_train, X_val, y_train, y_val):

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="auc",
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)]
    )

    val_pred = model.predict_proba(X_val)[:,1] # type: ignore
    auc = roc_auc_score(y_val, val_pred)
    print(f"Validation AUC: {auc:.4f}")
    return model

In [None]:
reader = pd.read_csv(PATH.aggt_A, index_col='Test_id', chunksize=CHUNK_SIZE)
for i, train_A_chunk in enumerate(reader):
    if(i%10==0): print('반복', i)
    X_train_A, X_val_A, y_train_A, y_val_A = get_data_A(train_A_chunk)
    train_and_eval(model_A, X_train_A, X_val_A, y_train_A, y_val_A)

In [None]:
reader = pd.read_csv(PATH.aggt_B, index_col='Test_id', chunksize=CHUNK_SIZE)
for i, train_B_chunk in enumerate(reader):
    if(i%10==0): print('반복', i)
    X_train_B, X_val_B, y_train_B, y_val_B = get_data_B(train_B_chunk)
    train_and_eval(model_B, X_train_B, X_val_B, y_train_B, y_val_B)

In [None]:
# 피처 중요도 (Feature Importance)
print(model_A.get_params())
lgb.plot_importance(model_A, max_num_features=20)
print(model_B.get_params())
lgb.plot_importance(model_B, max_num_features=20)

In [None]:
# 모델 저장
import joblib

joblib.dump(model_A, PATH.model_A)
joblib.dump(model_B, PATH.model_B)

print(f"모델 저장 완료: {PATH.model_A}, {PATH.model_B}")