# 高精度モデル構築

## 改善点
1. クロスバリデーションによる堅牢な評価
2. 特徴量エンジニアリング（組み合わせ特徴量、エンコーディング改善）
3. 複数アルゴリズムのアンサンブル（LightGBM、XGBoost、CatBoost）
4. より適切な評価指標（AUC、F1スコア）
5. Target Encodingの使用
6. より詳細なハイパーパラメータチューニング

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
    mean_squared_error, 
    roc_auc_score, 
    f1_score, 
    accuracy_score,
    confusion_matrix,
    classification_report
)
import optuna
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
import warnings
warnings.filterwarnings('ignore')

# Seed for reproducibility: reused below wherever a random_state is accepted,
# and applied here to NumPy's global random number generator.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. データ読み込み

In [None]:
# Read the raw training and test tables from disk.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/home/takato/bita/bank/data/train.csv",
        "/home/takato/bita/bank/data/test.csv",
    )
)

# Sanity check: table sizes and the relative frequency of each target value.
for label, frame in (("Train", train_df), ("Test", test_df)):
    print(f"{label} shape: {frame.shape}")
print(f"\nTarget distribution:\n{train_df['y'].value_counts(normalize=True)}")

## 2. 特徴量エンジニアリング

In [None]:
def feature_engineering(df, is_train=True, target_encoders=None):
    """
    Build the model feature table from a raw frame.

    Adds numeric transforms, ratio features, categorical combinations,
    loan indicators, smoothed target encodings, frequency encodings and
    integer label codes for every categorical column.

    All statistics that must be identical between training and inference
    (smoothed target means, category frequencies, label encoders, the
    ``balance`` shift) are fitted when ``is_train=True`` and stored in the
    returned ``target_encoders`` dict; with ``is_train=False`` they are
    re-applied instead of being recomputed from the frame being transformed.

    Args:
        df: Raw input frame. It is copied, never modified in place.
        is_train: ``True`` to fit the encoders on ``df``; ``False`` to apply
            previously fitted ones.
        target_encoders: Dict returned by a previous ``is_train=True`` call.
            Required when ``is_train`` is ``False``.

    Returns:
        ``(features, target_encoders)`` when ``is_train`` is ``True``.
        ``target_encoders`` maps each categorical column to its smoothed
        target-mean Series and also carries the fitted state under reserved
        ``'__...__'`` keys.
        ``(features, label_encoders)`` when ``is_train`` is ``False``, where
        ``label_encoders`` maps each categorical column to the
        ``LabelEncoder`` that produced its codes.

    Raises:
        ValueError: If ``is_train`` is ``False`` and ``target_encoders`` is
            ``None``.
    """
    # Reserved keys for fitted state stored next to the per-column target
    # encodings. They cannot collide with the categorical column names.
    label_key = '__label_encoders__'
    freq_key = '__frequencies__'
    balance_key = '__balance_min__'
    prior_key = '__global_mean__'
    smoothing = 10

    if not is_train and target_encoders is None:
        raise ValueError(
            "target_encoders fitted on the training data are required "
            "when is_train=False"
        )

    df = df.copy()
    # State fitted on the training data; empty while fitting.
    fitted = {} if is_train else target_encoders

    # 1. Numeric transforms
    # Age buckets
    df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65, 100],
                             labels=['0-25', '26-35', '36-45', '46-55', '56-65', '65+'])

    # Log transform of balance. The column is shifted by its training minimum
    # because it can be negative; reusing that minimum keeps train and test on
    # the same scale. Clipping guards test balances below the training minimum.
    balance_min = fitted.get(balance_key, df['balance'].min())
    df['balance_log'] = np.log1p((df['balance'] - balance_min + 1).clip(lower=0))

    # 2. Ratio features
    df['duration_per_day'] = df['duration'] / (df['day'] + 1)
    df['campaign_efficiency'] = df['duration'] / (df['campaign'] + 1)

    # Previous-contact features; 999 is treated as the "no previous contact"
    # marker in pdays.
    # NOTE(review): if this dataset marks "never contacted" with -1 instead,
    # has_previous_contact is constant and the denominator below is 0 for
    # those rows -- confirm the sentinel against the data dictionary.
    df['has_previous_contact'] = (df['pdays'] != 999).astype(int)
    contact_rate = df['previous'] / (df['pdays'].replace(999, 1) + 1)
    # A zero denominator yields +/-inf; store NaN rather than infinite values.
    df['previous_success_rate'] = contact_rate.replace([np.inf, -np.inf], np.nan)

    # 3. Categorical combinations
    df['job_education'] = df['job'].astype(str) + '_' + df['education'].astype(str)
    df['marital_education'] = df['marital'].astype(str) + '_' + df['education'].astype(str)
    df['contact_month'] = df['contact'].astype(str) + '_' + df['month'].astype(str)

    # 4. Loan features
    df['total_loans'] = (df['housing'] == 'yes').astype(int) + (df['loan'] == 'yes').astype(int)
    df['has_any_loan'] = (df['total_loans'] > 0).astype(int)

    # 5. Categorical columns
    categorical_feats = ['job', 'marital', 'education', 'default', 'housing',
                         'loan', 'contact', 'month', 'poutcome',
                         'age_group', 'job_education', 'marital_education', 'contact_month']

    # String view of every categorical column, taken before the columns are
    # overwritten with integer codes in step 8. Every encoder below is keyed
    # by these strings so lookups behave the same on train and test.
    keys = {col: df[col].astype(str) for col in categorical_feats}

    # 6. Target encoding (fitted on the training data only)
    # NOTE: training rows are encoded with statistics that include their own
    # target value, so scores measured on rows of this frame are optimistic.
    if is_train:
        target_encoders = {}
        if 'y' in df.columns:
            global_mean = df['y'].mean()
            target_encoders[prior_key] = global_mean
            for col in categorical_feats:
                by_category = df['y'].groupby(keys[col])
                counts = by_category.size()
                # Shrink small categories towards the global mean.
                smooth_target = (
                    (by_category.mean() * counts + global_mean * smoothing)
                    / (counts + smoothing)
                )
                # Store the smoothed values so inference uses exactly the
                # encoding the models were trained on.
                target_encoders[col] = smooth_target
                df[f'{col}_target_enc'] = keys[col].map(smooth_target)
    else:
        for col in categorical_feats:
            if col in target_encoders:
                encoding = target_encoders[col]
                # Unseen categories fall back to the training prior, or to the
                # mean of the category means for encoders saved without one.
                fallback = target_encoders.get(prior_key, encoding.mean())
                df[f'{col}_target_enc'] = keys[col].map(encoding).fillna(fallback)

    # 7. Frequency encoding (training frequencies are reused at inference;
    #    categories unseen in training get frequency 0)
    stored_freqs = fitted.get(freq_key, {})
    frequencies = {}
    for col in categorical_feats:
        freq = stored_freqs.get(col)
        if freq is None:
            freq = keys[col].value_counts(normalize=True)
        frequencies[col] = freq
        df[f'{col}_freq'] = keys[col].map(freq).fillna(0.0)

    # 8. Label encoding (training encoders are reused at inference so a given
    #    category always maps to the same integer; unseen categories get -1)
    stored_encoders = fitted.get(label_key, {})
    label_encoders = {}
    for col in categorical_feats:
        le = stored_encoders.get(col)
        if le is None:
            le = LabelEncoder()
            df[col] = le.fit_transform(keys[col])
        else:
            codes = {category: code for code, category in enumerate(le.classes_)}
            df[col] = keys[col].map(codes).fillna(-1).astype(int)
        label_encoders[col] = le

    if is_train:
        target_encoders[label_key] = label_encoders
        target_encoders[freq_key] = frequencies
        target_encoders[balance_key] = balance_min
        return df, target_encoders
    return df, label_encoders

# Fit the feature pipeline on the training frame; the returned encoders are
# kept so the same transformation can later be applied to the test frame.
train_processed, target_encoders = feature_engineering(df=train_df, is_train=True)
print("Feature engineering completed!")
print(f"New train shape: {train_processed.shape}")

## 3. データの準備

In [None]:
# Split the processed frame into the label vector and the feature matrix;
# the 'id' and 'y' columns are not used as features.
exclude_cols = ['id', 'y']
X = train_processed.drop(columns=exclude_cols)
y = train_processed['y']

print(f"Features: {X.shape[1]}")
print(f"\nFeature names: {X.columns.tolist()}")

# Stratified 80/20 hold-out split so both parts keep the class ratio of y.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print(f"\nTrain set: {X_train.shape}")
print(f"Valid set: {X_valid.shape}")

## 4. LightGBMモデルの最適化

In [None]:
def objective_lgb(trial):
    """
    Optuna objective for LightGBM: validation AUC of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC on ``(X_valid, y_valid)`` of a model fitted on
        ``(X_train, y_train)`` with early stopping on the validation set.
    """
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        # LightGBM only performs row subsampling when subsample_freq > 0; with
        # the default of 0 the sampled "subsample" would be ignored. Sampling
        # it here also puts it into study.best_params for the final model.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 7),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
        "n_estimators": 2000,
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "random_state": RANDOM_STATE,
        "class_weight": "balanced"  # compensate for class imbalance
    }

    model = lgb.LGBMClassifier(**params)
    # n_estimators is only an upper bound: training stops once the validation
    # AUC has not improved for 100 rounds.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    preds = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

# Run the search. The sampler is seeded so the sequence of sampled
# configurations is repeatable from run to run.
study_lgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_lgb.optimize(objective_lgb, n_trials=100, show_progress_bar=True)

print("✅ Best AUC (LightGBM):", study_lgb.best_value)
print("✅ Best params (LightGBM):", study_lgb.best_params)

## 5. XGBoostモデルの最適化

In [None]:
def objective_xgb(trial):
    """
    Optuna objective for XGBoost: validation AUC of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC on ``(X_valid, y_valid)`` of a model fitted on
        ``(X_train, y_train)`` with early stopping on the validation set.
    """
    # Ratio of negative to positive training rows, used to re-weight the
    # positive class.
    neg_pos_ratio = (y_train == 0).sum() / (y_train == 1).sum()

    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "n_estimators": 2000,
        "objective": "binary:logistic",
        "eval_metric": "auc",
        "random_state": RANDOM_STATE,
        "tree_method": "hist",
        "scale_pos_weight": neg_pos_ratio,  # compensate for class imbalance
        # Early stopping is configured on the estimator: passing it to fit()
        # is deprecated since xgboost 1.6 and no longer accepted in 2.0.
        "early_stopping_rounds": 100,
    }

    model = xgb.XGBClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

# Run the search. The sampler is seeded so the sequence of sampled
# configurations is repeatable from run to run.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_xgb.optimize(objective_xgb, n_trials=100, show_progress_bar=True)

print("✅ Best AUC (XGBoost):", study_xgb.best_value)
print("✅ Best params (XGBoost):", study_xgb.best_params)

## 6. CatBoostモデルの最適化

In [None]:
def objective_cat(trial):
    """
    Optuna objective for CatBoost: validation AUC of one sampled configuration.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ROC AUC on ``(X_valid, y_valid)`` of a model fitted on
        ``(X_train, y_train)``.
    """
    # Hyperparameters explored by the search (sampling order is kept).
    search_space = {
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-8, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 0.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
    }
    # Settings shared by every trial.
    fixed_settings = {
        "iterations": 2000,
        "random_state": RANDOM_STATE,
        "verbose": False,
        "early_stopping_rounds": 100,
        "auto_class_weights": "Balanced",  # compensate for class imbalance
    }

    classifier = CatBoostClassifier(**search_space, **fixed_settings)
    classifier.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=False)

    positive_proba = classifier.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, positive_proba)

# Run the search. The sampler is seeded so the sequence of sampled
# configurations is repeatable from run to run.
study_cat = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
study_cat.optimize(objective_cat, n_trials=50, show_progress_bar=True)

print("✅ Best AUC (CatBoost):", study_cat.best_value)
print("✅ Best params (CatBoost):", study_cat.best_params)

## 7. 最終モデルの訓練とアンサンブル

In [None]:
# Train each model with the best hyperparameters found by its study.
# Every model is fitted on X_train only; X_valid drives early stopping.

# LightGBM
best_params_lgb = study_lgb.best_params
best_params_lgb.update({
    "n_estimators": 2000,
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "random_state": RANDOM_STATE,
    "class_weight": "balanced"
})
model_lgb = lgb.LGBMClassifier(**best_params_lgb)
model_lgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

# XGBoost
best_params_xgb = study_xgb.best_params
best_params_xgb.update({
    "n_estimators": 2000,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "random_state": RANDOM_STATE,
    "tree_method": "hist",
    # Ratio of negative to positive training rows.
    "scale_pos_weight": (y_train == 0).sum() / (y_train == 1).sum(),
    # Early stopping is configured on the estimator: passing it to fit() is
    # deprecated since xgboost 1.6 and no longer accepted in 2.0.
    "early_stopping_rounds": 100,
})
model_xgb = xgb.XGBClassifier(**best_params_xgb)
model_xgb.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)
# Early stopping needs an eval_set, so clear the setting once the model is
# fitted; otherwise any later refit of this estimator without an eval_set
# (e.g. the clones made by cross_val_score) would fail.
# NOTE(review): the fitted booster is expected to keep its recorded best
# iteration for prediction -- confirm against the installed xgboost version.
model_xgb.set_params(early_stopping_rounds=None)

# CatBoost
best_params_cat = study_cat.best_params
best_params_cat.update({
    "iterations": 2000,
    "random_state": RANDOM_STATE,
    "verbose": False,
    "early_stopping_rounds": 100,
    "auto_class_weights": "Balanced"
})
model_cat = CatBoostClassifier(**best_params_cat)
model_cat.fit(
    X_train, y_train,
    eval_set=(X_valid, y_valid),
    verbose=False
)

print("✅ All models trained!")

## 8. モデル評価とアンサンブル

In [None]:
# Positive-class probabilities of each model on the validation set
pred_lgb = model_lgb.predict_proba(X_valid)[:, 1]
pred_xgb = model_xgb.predict_proba(X_valid)[:, 1]
pred_cat = model_cat.predict_proba(X_valid)[:, 1]

# Validation AUC per model, computed once and reused for both the report and
# the ensemble weights.
auc_lgb = roc_auc_score(y_valid, pred_lgb)
auc_xgb = roc_auc_score(y_valid, pred_xgb)
auc_cat = roc_auc_score(y_valid, pred_cat)

print("Individual Model Performance:")
print(f"LightGBM AUC: {auc_lgb:.5f}")
print(f"XGBoost AUC: {auc_xgb:.5f}")
print(f"CatBoost AUC: {auc_cat:.5f}")

# Weighted-average ensemble: each weight is the model's share of the summed
# validation AUCs.
total_auc = auc_lgb + auc_xgb + auc_cat
w_lgb, w_xgb, w_cat = auc_lgb / total_auc, auc_xgb / total_auc, auc_cat / total_auc

pred_ensemble = w_lgb * pred_lgb + w_xgb * pred_xgb + w_cat * pred_cat

print(f"\nEnsemble weights: LGB={w_lgb:.3f}, XGB={w_xgb:.3f}, CAT={w_cat:.3f}")
print(f"\nEnsemble AUC: {roc_auc_score(y_valid, pred_ensemble):.5f}")

# Scan decision thresholds from 0.30 in steps of 0.01 (upper bound 0.70
# exclusive) and keep the first one with the highest validation F1; 0.5 is
# kept if no threshold scores above 0.
from sklearn.metrics import f1_score
best_threshold, best_f1 = 0.5, 0
for threshold in np.arange(0.3, 0.7, 0.01):
    pred_binary = (pred_ensemble > threshold).astype(int)
    f1 = f1_score(y_valid, pred_binary)
    if f1 <= best_f1:
        continue
    best_f1, best_threshold = f1, threshold

print(f"\nBest threshold: {best_threshold:.3f}")
print(f"Best F1 Score: {best_f1:.5f}")

# Hard predictions at the selected threshold
pred_final = (pred_ensemble > best_threshold).astype(int)
print(f"\nAccuracy: {accuracy_score(y_valid, pred_final):.5f}")
print(f"\nClassification Report:\n{classification_report(y_valid, pred_final)}")

## 9. クロスバリデーション

In [None]:
# Check robustness with 5-fold stratified cross-validation on the full
# training table. cross_val_score is given only X and y, so the folds are
# fitted without the eval_set that was used for the models above.
from sklearn.model_selection import cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

print("Cross-Validation AUC Scores:")

cv_results = {}
for model_label, estimator in (
    ("LightGBM", model_lgb),
    ("XGBoost", model_xgb),
    ("CatBoost", model_cat),
):
    fold_scores = cross_val_score(
        estimator, X, y, cv=cv, scoring='roc_auc', n_jobs=-1
    )
    cv_results[model_label] = fold_scores
    print(f"{model_label}: {fold_scores.mean():.5f} (+/- {fold_scores.std():.5f})")

# Per-model fold scores under their individual names
cv_scores_lgb = cv_results["LightGBM"]
cv_scores_xgb = cv_results["XGBoost"]
cv_scores_cat = cv_results["CatBoost"]

## 10. 特徴量重要度の可視化

In [None]:
# Pair every feature name with its LightGBM importance, most important first
importance_lgb = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': model_lgb.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

# Horizontal bar chart of the 20 most important features; the y axis is
# inverted so the top-ranked feature is drawn at the top.
top_features = importance_lgb.head(20)
plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('Top 20 Feature Importances (LightGBM)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

print("\nTop 10 Important Features:")
print(importance_lgb.head(10))

## 11. テストデータへの予測

In [None]:
# Apply the feature pipeline to the test data, reusing the encoders fitted on
# the training data.
test_processed, _ = feature_engineering(test_df, is_train=False, target_encoders=target_encoders)

# Select exactly the training features, in training order. This guarantees the
# models see the columns in the layout they were fitted on, and raises a
# KeyError if the test frame lacks any of them.
X_test = test_processed[X.columns]

# Positive-class probability from each model
test_pred_lgb = model_lgb.predict_proba(X_test)[:, 1]
test_pred_xgb = model_xgb.predict_proba(X_test)[:, 1]
test_pred_cat = model_cat.predict_proba(X_test)[:, 1]

# Ensemble with the weights derived on the validation set
test_pred_ensemble = w_lgb * test_pred_lgb + w_xgb * test_pred_xgb + w_cat * test_pred_cat

# Binarize with the threshold selected on the validation set
test_pred_final = (test_pred_ensemble > best_threshold).astype(int)

# Save the result; the file is written without a header row or index column
submission = pd.DataFrame({
    'id': test_df['id'],
    'y': test_pred_final
})

submission.to_csv('/home/takato/bita/bank/data/high_accuracy_submission.csv', index=False, header=False)
print("✅ Predictions saved to 'high_accuracy_submission.csv'")
print(f"\nPrediction distribution:\n{submission['y'].value_counts(normalize=True)}")

## 12. モデルの保存

In [None]:
import pickle

# Bundle the fitted models, the ensemble weights, the decision threshold and
# the fitted encoders into a single object.
models = dict(
    lgb=model_lgb,
    xgb=model_xgb,
    cat=model_cat,
    weights=dict(lgb=w_lgb, xgb=w_xgb, cat=w_cat),
    threshold=best_threshold,
    target_encoders=target_encoders,
)

# Serialize the bundle to disk.
with open('/home/takato/bita/bank/models/ensemble_model.pkl', 'wb') as f:
    pickle.dump(models, f)

print("✅ Models saved successfully!")