# 改善版: ワンホットエンコーディング + 交差検証 + LightGBM最適化

このノートブックでは以下の改善を実装します:
1. カテゴリ特徴量をワンホットエンコーディングで変換
2. StratifiedKFold交差検証で汎化性能を向上
3. LightGBMのハイパーパラメータ最適化

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    accuracy_score,
    confusion_matrix,
    classification_report
)
import optuna
import lightgbm as lgb
import warnings
# Fix the global NumPy seed up front so later cells that draw random numbers
# are repeatable; RANDOM_STATE is reused by the CV splitters and the models.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings(action='ignore')

print("ライブラリのインポート完了")

In [None]:
# Load the train / test CSV files.
train_df, test_df = (
    pd.read_csv("/home/user/bank/data/train.csv"),
    pd.read_csv("/home/user/bank/data/test.csv"),
)

# Quick sanity check: frame sizes and how the binary target 'y' is distributed.
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")
print(f"\nTarget distribution:")
print(train_df['y'].value_counts())
print(f"\nPositive rate: {train_df['y'].mean():.4f}")

In [None]:
# Offsets learned from the most recent training-frame call. Non-training
# frames reuse them so train and test go through the same transform.
_FITTED_STATS = {}


def feature_engineering(df, is_train=True, balance_min=None):
    """
    Build the engineered features used ahead of one-hot encoding.

    Parameters:
    -----------
    df : DataFrame
        Frame to process. It is copied; the input is not modified.
    is_train : bool
        True for the training frame, False for any other frame.
        When True, the balance offset used here is remembered.
        When False (and ``balance_min`` is not given), the remembered
        training offset is reused so the same balance maps to the same
        ``balance_log`` value; if no training frame has been processed
        yet, the frame's own minimum is used.
    balance_min : float, optional
        Explicit offset for the ``balance_log`` shift. Overrides both the
        frame's own minimum and the remembered training offset.

    Returns:
    --------
    df : DataFrame
        Copy of the input with the engineered columns added and the
        ``month`` / ``month_numeric`` columns removed.
    categorical_cols : list of str
        Names of the columns that should be one-hot encoded afterwards.
    """
    df = df.copy()

    # ===== 1. Numeric feature transforms =====
    # Age buckets
    df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65, 100],
                              labels=['0-25', '26-35', '36-45', '46-55', '56-65', '65+'])

    # Log transform of balance. Balances can be negative, so they are shifted
    # by a minimum first. That minimum must be the same for every frame,
    # otherwise identical balances get different feature values.
    if balance_min is None:
        if is_train or 'balance_min' not in _FITTED_STATS:
            balance_min = df['balance'].min()
        else:
            balance_min = _FITTED_STATS['balance_min']
    if is_train:
        _FITTED_STATS['balance_min'] = balance_min

    # Clip at 0 so a balance below the reference minimum cannot push the
    # log1p argument to -1 or lower (which would give -inf / NaN).
    shifted_balance = (df['balance'] - balance_min).clip(lower=0)
    df['balance_log'] = np.log1p(shifted_balance + 1)
    df['balance_positive'] = (df['balance'] > 0).astype(int)
    df['balance_negative'] = (df['balance'] < 0).astype(int)

    # ===== 2. Contact-history features =====
    # duration-based ratios (+1 in the denominators avoids division by zero)
    df['duration_per_day'] = df['duration'] / (df['day'] + 1)
    df['campaign_efficiency'] = df['duration'] / (df['campaign'] + 1)
    df['duration_log'] = np.log1p(df['duration'])

    # previous-contact features; pdays == -1 is treated as "no previous contact"
    df['has_previous_contact'] = (df['pdays'] != -1).astype(int)
    df['previous_per_pdays'] = df['previous'] / (df['pdays'].replace(-1, 1) + 1)

    # ===== 3. Month mapping and cyclical encoding =====
    month_mapping = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
        'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
        'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12
    }
    df['month_numeric'] = df['month'].map(month_mapping)
    df['month_sin'] = np.sin(2 * np.pi * df['month_numeric'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month_numeric'] / 12)

    # ===== 4. Loan features =====
    df['total_loans'] = (df['housing'] == 'yes').astype(int) + (df['loan'] == 'yes').astype(int)
    df['has_any_loan'] = (df['total_loans'] > 0).astype(int)

    # ===== 5. Categorical feature preparation =====
    # Turn the yes/no columns into 1/0 (any other value becomes NaN)
    binary_cols = ['default', 'housing', 'loan']
    for col in binary_cols:
        df[col] = df[col].map({'yes': 1, 'no': 0})

    # Categorical columns to be one-hot encoded by the caller
    categorical_cols = ['job', 'marital', 'education', 'contact', 'poutcome', 'age_group']

    # ===== 6. Interaction features (built before one-hot encoding) =====
    df['job_education'] = df['job'].astype(str) + '_' + df['education'].astype(str)
    df['contact_month'] = df['contact'].astype(str) + '_' + df['month'].astype(str)

    # The interaction columns are one-hot encoded as well
    interaction_cols = ['job_education', 'contact_month']
    categorical_cols.extend(interaction_cols)

    # month is already covered by the cyclical encoding, so drop the raw columns
    df = df.drop(columns=['month', 'month_numeric'])

    return df, categorical_cols

# Run the feature pipeline on both frames, training frame first. The list of
# categorical columns returned by the train call drives the one-hot step;
# the test call returns the same list, so only its frame is kept.
train_processed, categorical_cols = feature_engineering(train_df, is_train=True)
test_processed = feature_engineering(test_df, is_train=False)[0]

print("特徴量エンジニアリング完了")
print(f"Train shape: {train_processed.shape}")
print(f"\nカテゴリカル変数: {categorical_cols}")

In [None]:
# One-hot encode the training frame. drop_first removes one reference level
# per categorical feature; which level that is depends on the levels present
# in the training data.
train_encoded = pd.get_dummies(train_processed, columns=categorical_cols, drop_first=True)

# Encode the test frame WITHOUT drop_first. Dropping independently on the test
# frame can remove a different level than on train (whenever the first level
# differs between the two frames); that level would then be re-added as an
# all-zero column and those rows would be read as the train reference level.
test_encoded = pd.get_dummies(test_processed, columns=categorical_cols)

# Align the test frame to the training feature columns in one step:
#   - columns missing from test are created and filled with 0,
#   - columns unknown to train (including train's dropped reference levels)
#     are discarded,
#   - column order follows the training frame (target 'y' excluded).
feature_columns = train_encoded.columns.drop('y')
test_encoded = test_encoded.reindex(columns=feature_columns, fill_value=0)

print(f"ワンホットエンコーディング後のTrain shape: {train_encoded.shape}")
print(f"ワンホットエンコーディング後のTest shape: {test_encoded.shape}")
print(f"\n総特徴量数: {train_encoded.shape[1] - 2}")  # excludes id and y

In [None]:
# Separate the target from the model inputs; 'id' is an identifier, not a
# feature, so it is removed from both design matrices.
X_test = test_encoded.drop(['id'], axis=1)
X = train_encoded.drop(['id', 'y'], axis=1)
y = train_encoded.loc[:, 'y']

print(f"特徴量数: {X.shape[1]}")
print(f"訓練データサンプル数: {X.shape[0]}")
print(f"テストデータサンプル数: {X_test.shape[0]}")

## 交差検証を用いたLightGBMのハイパーパラメータ最適化

In [None]:
def objective_lgb_cv(trial):
    """
    Optuna objective: mean validation AUC of LightGBM over 5-fold stratified CV.

    Parameters:
    -----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns:
    --------
    float
        Mean ROC-AUC across the five validation folds (to be maximized).

    Reads the module-level ``X``, ``y`` and ``RANDOM_STATE``.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 150),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # LightGBM only performs bagging when subsample_freq > 0; without it
        # the tuned `subsample` value has no effect. It is sampled through the
        # trial (not hard-coded) so that it is recorded in study.best_params
        # and travels with the other tuned values.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "n_estimators": 3000,  # upper bound; early stopping picks the actual count
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "random_state": RANDOM_STATE,
        "class_weight": "balanced",  # compensates for the imbalanced target
        "boosting_type": "gbdt"
    }

    # 5-fold stratified cross-validation with a fixed seed, so every trial is
    # scored on the same splits.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
        y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_valid_fold, y_valid_fold)],
            callbacks=[
                # Stop once validation AUC has not improved for 100 rounds.
                lgb.early_stopping(stopping_rounds=100, verbose=False),
                lgb.log_evaluation(period=0)
            ]
        )

        preds = model.predict_proba(X_valid_fold)[:, 1]
        cv_scores.append(roc_auc_score(y_valid_fold, preds))

    # Mean AUC over the folds is the value Optuna maximizes.
    return np.mean(cv_scores)

# Run the Optuna search.
# The sampler is seeded explicitly: without a seed the trial sequence (and
# therefore the best parameters) changes from run to run, even though the
# CV splits and the models already use RANDOM_STATE.
print("ハイパーパラメータ最適化を開始します...")
study_lgb = optuna.create_study(
    direction="maximize",
    study_name="lgbm_cv",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_lgb.optimize(objective_lgb_cv, n_trials=50, show_progress_bar=True)

print(f"\nBest CV AUC: {study_lgb.best_value:.5f}")
print(f"\nBest params:")
for key, value in study_lgb.best_params.items():
    print(f"  {key}: {value}")

## 最適パラメータで交差検証学習

In [None]:
# Start from the tuned values and add the fixed settings that were not part
# of the search space.
best_params = study_lgb.best_params.copy()
best_params.update({
    "n_estimators": 3000,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "random_state": RANDOM_STATE,
    "class_weight": "balanced",
    "boosting_type": "gbdt"
})

# Single source of truth for the fold count: it drives the splitter, the
# progress message and the test-prediction averaging, so they cannot drift
# apart if the number of folds is changed.
N_SPLITS = 5

# Train one model per fold, keeping out-of-fold (OOF) predictions for the
# training rows and the fold-averaged predictions for the test rows.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(X_test))
cv_scores = []
models = []

print("交差検証で学習を開始します...\n")

for fold, (train_idx, valid_idx) in enumerate(cv.split(X, y)):
    print(f"Fold {fold + 1}/{N_SPLITS}")

    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    model = lgb.LGBMClassifier(**best_params)
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=[(X_valid_fold, y_valid_fold)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100, verbose=False),
            lgb.log_evaluation(period=100)
        ]
    )

    # Out-of-fold predictions for the rows this model did not train on
    oof_predictions[valid_idx] = model.predict_proba(X_valid_fold)[:, 1]

    # Each fold contributes 1/N_SPLITS of the final test prediction
    test_predictions += model.predict_proba(X_test)[:, 1] / N_SPLITS

    # Per-fold score
    fold_auc = roc_auc_score(y_valid_fold, oof_predictions[valid_idx])
    cv_scores.append(fold_auc)
    models.append(model)

    print(f"  Fold {fold + 1} AUC: {fold_auc:.5f}")
    print()

# AUC over all OOF predictions, plus mean / std of the per-fold scores
overall_auc = roc_auc_score(y, oof_predictions)
print(f"\n{'='*50}")
print(f"Overall OOF AUC: {overall_auc:.5f}")
print(f"Mean CV AUC: {np.mean(cv_scores):.5f} ± {np.std(cv_scores):.5f}")
print(f"{'='*50}")

In [None]:
# Scan decision thresholds from 0.30 upward in steps of 0.01 and score each
# one by F1 on the out-of-fold predictions.
candidate_thresholds = np.arange(0.3, 0.8, 0.01)
f1_per_threshold = [
    f1_score(y, (oof_predictions > cutoff).astype(int))
    for cutoff in candidate_thresholds
]

# argmax returns the first maximum, i.e. the lowest threshold among ties.
# If no threshold reaches an F1 above 0, keep the defaults (0.5 / 0).
top_idx = int(np.argmax(f1_per_threshold))
if f1_per_threshold[top_idx] > 0:
    best_f1 = f1_per_threshold[top_idx]
    best_threshold = candidate_thresholds[top_idx]
else:
    best_f1 = 0
    best_threshold = 0.5

print(f"\n最適閾値: {best_threshold:.3f}")
print(f"最適F1スコア: {best_f1:.5f}")

# Evaluate the OOF predictions at the selected threshold
oof_binary = (oof_predictions > best_threshold).astype(int)
print(f"\n=== 最適閾値での評価 ===")
print(f"Accuracy: {accuracy_score(y, oof_binary):.5f}")
print(f"\nConfusion Matrix:")
print(confusion_matrix(y, oof_binary))
print(f"\nClassification Report:")
print(classification_report(y, oof_binary))

In [None]:
# Feature importances taken from the model trained on the last fold only,
# ordered from most to least important.
last_fold_importances = models[-1].feature_importances_
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': last_fold_importances})
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 highest-ranked features
top_features = feature_importance.head(20)
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 20 Feature Importances')
plt.gca().invert_yaxis()  # largest bar at the top
plt.tight_layout()
plt.show()

print("\nTop 20 重要な特徴量:")
print(feature_importance.head(20))

## テストデータ予測と提出ファイル作成

In [None]:
# Turn the fold-averaged test probabilities into 0/1 labels using the
# threshold selected on the out-of-fold predictions.
test_pred_binary = np.greater(test_predictions, best_threshold).astype(int)

# Submission frame: id plus predicted label, written without a header row
submission = test_df[['id']].assign(y=test_pred_binary)

submission.to_csv('/home/user/bank/data/improved_onehot_cv_submission.csv', index=False, header=False)

print("提出ファイルを作成しました: improved_onehot_cv_submission.csv")
print(f"\n予測分布:")
print(submission['y'].value_counts())
print(f"\nPositive予測率: {submission['y'].mean():.4f}")

In [None]:
# Also store the raw probabilities next to the labels so the threshold can be
# re-tuned later without re-running the models.
submission_proba = test_df[['id']].assign(
    y_proba=test_predictions,
    y_pred=test_pred_binary,
)

submission_proba.to_csv('/home/user/bank/data/improved_onehot_cv_submission_with_proba.csv', index=False)
print("確率値付き提出ファイルも作成しました: improved_onehot_cv_submission_with_proba.csv")