In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the training data and discard the row identifier column
df = pd.read_csv('train.csv').drop(columns='ID')

In [3]:
# Missing-value handling and dtype normalization
fill_cols = ['F04', 'F11', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']

# Gaps in the listed columns become 0; every remaining gap gets the 'NAN' placeholder.
df[fill_cols] = df[fill_cols].fillna(0)
df = df.fillna('NAN')

# Every column that is still float64 is cast to int64.
float_columns = df.select_dtypes(include=['float64']).columns
df = df.astype({name: 'int64' for name in float_columns})

# Every object column is converted to pandas' category dtype.
object_columns = df.select_dtypes(include=['object']).columns
df = df.astype({name: 'category' for name in object_columns})

In [4]:
# Standardize the numeric feature columns (the 'Click' target is excluded).
# The fitted `scaler` is kept at notebook level so the same transformation
# can be applied to the test data later.
scaler = StandardScaler()
numeric_feature_columns = df.drop('Click', axis=1).select_dtypes(include=[np.number]).columns
# Assign the scaled values directly instead of going through DataFrame.update:
# update() aligns on the index and writes floats into the existing int64
# columns, which relies on an implicit upcast that recent pandas deprecates.
df[numeric_feature_columns] = scaler.fit_transform(df[numeric_feature_columns])

In [5]:
# Names of the categorical features, taken from the training frame's
# columns that were of object dtype before the category conversion
cat_features = object_columns.tolist()

In [6]:
# Optuna objective function
def objective(trial):
    """Score one Optuna trial by mean 5-fold validation AUC.

    Reads the notebook-level training frame ``df`` (target column 'Click')
    and the notebook-level ``cat_features`` list.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the LightGBM hyperparameters.

    Returns
    -------
    float
        Mean ROC AUC over the five validation folds.
    """
    # The search space is sampled once per trial; it does not depend on the
    # fold, so it is built outside the cross-validation loop.
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    # NOTE(review): 'n_estimators' is not set, so the classifier's default
    # number of boosting rounds applies. If that default is not larger than
    # stopping_rounds=100, early stopping cannot cut training short and only
    # selects the best iteration -- confirm this is intended.

    # Split features and target once instead of re-dropping 'Click' per fold.
    features = df.drop('Click', axis=1)
    target = df['Click']

    # K-fold cross-validation setup
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in kf.split(features):
        train_x, valid_x = features.iloc[train_idx], features.iloc[valid_idx]
        train_y, valid_y = target.iloc[train_idx], target.iloc[valid_idx]

        model = lgb.LGBMClassifier(**param)
        model.fit(
            train_x, train_y,
            eval_set=[(valid_x, valid_y)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
            categorical_feature=cat_features
        )

        # Probability of the positive class on the held-out fold
        preds = model.predict_proba(valid_x)[:, 1]
        aucs.append(roc_auc_score(valid_y, preds))

    return np.mean(aucs)

In [None]:
# Hyperparameter optimization with Optuna.
# The sampler is seeded so that a re-run explores the same sequence of
# trials; the seed matches the random_state used for the CV split.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

[I 2024-06-02 18:32:12,726] A new study created in memory with name: no-name-3ad7c798-d5f7-44a7-93c5-204bc5ec9b3b
[I 2024-06-02 18:35:06,700] Trial 0 finished with value: 0.76512031372364 and parameters: {'num_leaves': 85, 'max_depth': 13, 'learning_rate': 0.028645948552580586, 'feature_fraction': 0.9168534977041871, 'bagging_fraction': 0.7955834197017922, 'bagging_freq': 7, 'min_child_samples': 15}. Best is trial 0 with value: 0.76512031372364.
[I 2024-06-02 18:37:38,699] Trial 1 finished with value: 0.7361917646326145 and parameters: {'num_leaves': 123, 'max_depth': 10, 'learning_rate': 0.0003358918572924058, 'feature_fraction': 0.7679235165813328, 'bagging_fraction': 0.7518429865743834, 'bagging_freq': 1, 'min_child_samples': 59}. Best is trial 0 with value: 0.76512031372364.
[I 2024-06-02 18:39:28,048] Trial 2 finished with value: 0.7072879683163231 and parameters: {'num_leaves': 83, 'max_depth': 6, 'learning_rate': 0.0009727618373533279, 'feature_fraction': 0.9112267038910702, 'ba

In [None]:
# Fit the final model on the full training frame using the best trial's hyperparameters
best_params = study.best_params
model = lgb.LGBMClassifier(**best_params)
model.fit(
    df.drop(columns='Click'),
    df['Click'],
    categorical_feature=cat_features,
)

In [None]:
# Load and preprocess the test data
def load_data():
    """Read 'test.csv' and apply the same preprocessing as the training data.

    Relies on notebook-level state: the training frame ``df`` (source of the
    category levels), ``cat_features`` (categorical column names) and the
    already-fitted ``scaler``.

    Returns
    -------
    pandas.DataFrame
        Preprocessed test features.
    """
    # Named `test` so it does not shadow the notebook-level training frame
    # `df`, which is needed below for the category levels.
    test = pd.read_csv('test.csv')
    test = test.drop('ID', axis=1)
    fill_cols = ['F04', 'F11', 'F18', 'F19', 'F24', 'F27', 'F29', 'F32', 'F33', 'F36', 'F38']
    test[fill_cols] = test[fill_cols].fillna(0)
    test = test.fillna('NAN')
    float_columns = test.select_dtypes(include=['float64']).columns
    test[float_columns] = test[float_columns].astype('int64')
    object_columns = test.select_dtypes(include=['object']).columns
    test[object_columns] = test[object_columns].astype('category')

    # Give the test categorical features the same category levels as the
    # training data; values not seen during training become missing.
    for col in cat_features:
        train_categories = df[col].cat.categories
        if col in test.columns:
            test[col] = test[col].astype('category').cat.set_categories(train_categories)
        else:
            # Column absent from the test file: add it as all-missing.
            test[col] = pd.Categorical([None] * len(test), categories=train_categories)

    # Standardize with the scaler fitted on the training data. Direct column
    # assignment avoids DataFrame.update's index alignment and its implicit
    # int64 -> float upcast.
    numeric_columns = test.select_dtypes(include=[np.number]).columns
    test[numeric_columns] = scaler.transform(test[numeric_columns])

    return test

test_df = load_data()

In [None]:
# Predict class probabilities for the preprocessed test frame with the fitted classifier
pred = model.predict_proba(test_df)

In [None]:
# Write the submission file: column 1 of the predicted probabilities goes into 'Click'
sample_submission = pd.read_csv('sample_submission.csv').assign(Click=pred[:, 1])
sample_submission.to_csv('lgbm_optuna.csv', index=False)