### seq 컬럼 통계치 추가
min, max, mean, std, sum 통계 피처 추가

In [11]:
import pandas as pd
import numpy as np
import polars as pl
import lightgbm as lgb
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, log_loss
from src.metrics import toss_metric, lgbm_toss_metric, weighted_log_loss
from src.parallel import generate_seq_stats

warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_parquet("./data/train_optimized.parquet")
test_df = pd.read_parquet("./data/test_optimized.parquet")

In [3]:
train_df = generate_seq_stats(train_df, col="seq")  # swifter 버전
test_df = generate_seq_stats(test_df, col="seq")  # swifter 버전

Generating stats features with swifter...


Pandas Apply:   0%|          | 0/10704179 [00:00<?, ?it/s]

Generating stats features with swifter...


Pandas Apply:   0%|          | 0/1527298 [00:00<?, ?it/s]

In [4]:
stats_features = ['seq_len', 'seq_first', 'seq_last', 'seq_max', 'seq_min', 'seq_mean', 'seq_std', 'seq_sum']
train_df[stats_features].to_parquet("./data/processed/train_seq_stats.parquet")
test_df[stats_features].to_parquet("./data/processed/test_seq_stats.parquet")

In [6]:
# 상호작용피처 생성
train_df['age_inv_interaction'] = train_df['age_group'].astype(str) + '_' + train_df['inventory_id'].astype(str)
test_df['age_inv_interaction'] = test_df['age_group'].astype(str) + '_' + test_df['inventory_id'].astype(str)

In [None]:
# 피처 및 타겟 정의
TARGET = 'clicked'
# ID, seq, 타겟을 제외한 모든 컬럼을 피처로 사용
features = [col for col in train_df.columns if col not in ['ID', 'seq', TARGET]]
categorical_features = [
    'gender', 'age_group', 'inventory_id', 'day_of_week', 'hour', 
    'seq_len', 'seq_first', 'seq_last', 'seq_max', 'seq_min', 'seq_mean', 'seq_std', 'seq_sum',
    'age_inv_interaction'
]

# LightGBM이 카테고리 피처를 인식하도록 타입 변경
for col in categorical_features:
    if col in train_df.columns:
        train_df[col] = train_df[col].astype('category')
        test_df[col] = test_df[col].astype('category')

print("Feature lists and types updated.")

X_train = train_df[features]
y_train = train_df[TARGET]

# 5단계. 모델 학습 및 교차 검증
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train_df))
test_preds = np.zeros(len(test_df))
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"===== Fold {fold+1} =====")
    X_train_fold, y_train_fold = X_train.iloc[train_idx], y_train.iloc[train_idx]
    X_val_fold, y_val_fold = X_train.iloc[val_idx], y_train.iloc[val_idx]
    
    # 클래스 불균형 해소를 위한 scale_pos_weight 계산
    scale_pos_weight = np.sum(y_train_fold == 0) / np.sum(y_train_fold == 1)

    lgbm = lgb.LGBMClassifier(
        objective='binary',
        metric='none', # 커스텀 평가지표 사용
        # device='gpu',              # GPU 사용
        # gpu_platform_id=0,         # 기본 GPU 플랫폼
        # gpu_device_id=0,           # GPU ID
        scale_pos_weight=scale_pos_weight,
        random_state=42,
        n_estimators=1000, # 조기 종료를 사용하므로 넉넉하게 설정
        learning_rate=0.05,
        num_leaves=31,
        # 속도 향상을 위한 파라미터
        n_jobs=-1,
        verbose=-1
    )
    
    lgbm.fit(X_train_fold, y_train_fold,
             eval_set=[(X_val_fold, y_val_fold)],
             eval_metric=lgbm_toss_metric,
             callbacks=[lgb.early_stopping(100, verbose=True)])
    
    val_preds = lgbm.predict_proba(X_val_fold)[:, 1]
    oof_preds[val_idx] = val_preds
    
    fold_score = toss_metric(y_val_fold, val_preds)
    cv_scores.append(fold_score)
    print(f"Fold {fold+1} Score: {fold_score}")

    test_preds += lgbm.predict_proba(test_df[features])[:, 1] / N_SPLITS

print(f"\nAverage CV Score: {np.mean(cv_scores):.5f} (+/- {np.std(cv_scores):.5f})")

# --- 5. 제출 파일 생성 ---
submission = pd.read_csv('./data/sample_submission.csv')
submission['clicked'] = test_preds

from datetime import datetime
now = datetime.now().strftime('%Y%m%d_%H%M%S')
submission.to_csv(f'./submissions/submission_{now}.csv', index=False)
print("Submission file created.")

pd.DataFrame({
    "y_train": y_train,
    # "lgbm_baseline_pred": oof_preds
    "oof_preds": oof_preds
}).to_csv(f"./oof_preds/oof_preds_{now}.csv", index=False)
print(f"oof_preds_*.csv created.")

Feature lists and types updated.
===== Fold 1 =====


LightGBMError: bin size 9766 cannot run on GPU

In [12]:
# 모델 학습이 끝난 후, lgbm 모델 객체로 피처 중요도를 그립니다.
# (CV를 사용했다면, 마지막 fold의 모델이나 모든 fold 모델의 평균을 사용)
plt.figure(figsize=(10, 10))
lgb.plot_importance(lgbm, max_num_features=30)
plt.title("Feature Importance")
plt.show()

NotFittedError: No booster found. Need to call fit beforehand.

<Figure size 1000x1000 with 0 Axes>