## LightGBM用の調整

In [58]:
# ✅ 必要ライブラリ
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

# ✅ データ読み込み
PATH = '../data/'
X = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'test.csv')

# ✅ Age 欠損処理
X["Age_filled"] = X["Age"].fillna(-1)
X["Age_missing"] = X["Age"].isna().astype(int)
X = X.drop(columns=["Age"])
test["Age_filled"] = test["Age"].fillna(-1)
test["Age_missing"] = test["Age"].isna().astype(int)
test = test.drop(columns=["Age"])

# ✅ 数値カラム欠損補完
for col in X.columns:
    if X[col].isnull().sum() > 0:
        median = X[col].median()
        X[col] = X[col].fillna(median)
        test[col] = test[col].fillna(median)

# ✅ 不要カラム削除
X = X.drop(columns=["Id"])
test = test.drop(columns=["Id"])

# ✅ 目的変数分離
y = X["Drafted"]
X = X.drop(columns=["Drafted"])

# ✅ School, Player_Type, Position_Type 削除
X = X.drop(columns=["School", "Player_Type", "Position_Type"])
test = test.drop(columns=["School", "Player_Type", "Position_Type"])

# ✅ Position Target Encoding
position_stats = X.copy()
position_stats["Drafted"] = y
position_target_map = position_stats.groupby("Position")["Drafted"].mean()
X["Position_encoded"] = X["Position"].map(position_target_map)
test["Position_encoded"] = test["Position"].map(position_target_map)
test["Position_encoded"] = test["Position_encoded"].fillna(X["Position_encoded"].mean())

# ✅ Position Group Encoding (fold-safe)
def map_position_group(pos):
    if pos in ["K", "P", "LS"]:
        return "Specialist"
    elif pos in ["WR", "RB", "TE"]:
        return "OffensiveSkill"
    elif pos in ["OT", "OG", "C"]:
        return "OffensiveLine"
    elif pos in ["DE", "DT"]:
        return "DefensiveLine"
    elif pos in ["OLB", "ILB"]:
        return "Linebacker"
    elif pos in ["CB", "FS", "SS", "S", "DB"]:
        return "DefensiveBack"
    elif pos == "QB":
        return "Quarterback"
    elif pos == "FB":
        return "Fullback"
    else:
        return "Other"

X["Position_group"] = X["Position"].apply(map_position_group)
test["Position_group"] = test["Position"].apply(map_position_group)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
X["Position_group_encoded"] = 0
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    group_map = X_train.assign(Drafted=y_train).groupby("Position_group")["Drafted"].mean()
    X.loc[val_idx, "Position_group_encoded"] = X.loc[val_idx, "Position_group"].map(group_map)
final_group_map = X.assign(Drafted=y).groupby("Position_group")["Drafted"].mean()
test["Position_group_encoded"] = test["Position_group"].map(final_group_map)
test["Position_group_encoded"] = test["Position_group_encoded"].fillna(X["Position_group_encoded"].mean())

X = X.drop(columns=["Position", "Position_group"])
test = test.drop(columns=["Position", "Position_group"])

# ✅ SpeedScore, BurstScore, AgilityScore, ASI, RSA特徴量
X["Weight_lbs"] = X["Weight"] * 2.20462
test["Weight_lbs"] = test["Weight"] * 2.20462

X["SpeedScore"] = X["Weight_lbs"] * (200 / X["Sprint_40yd"]**2)
test["SpeedScore"] = test["Weight_lbs"] * (200 / test["Sprint_40yd"]**2)

X["BurstScore"] = X["Vertical_Jump"] + X["Broad_Jump"]
test["BurstScore"] = test["Vertical_Jump"] + test["Broad_Jump"]

X["AgilityScore"] = X["Shuttle"] + X["Agility_3cone"]
test["AgilityScore"] = test["Shuttle"] + test["Agility_3cone"]

X["ASI"] = 0.5 * X["SpeedScore"] + 0.3 * X["BurstScore"] + 0.2 * X["AgilityScore"]
test["ASI"] = 0.5 * test["SpeedScore"] + 0.3 * test["BurstScore"] + 0.2 * test["AgilityScore"]

rsa_features = ["Sprint_40yd", "Vertical_Jump", "Bench_Press_Reps", "Shuttle", "Agility_3cone"]
for col in rsa_features:
    scaler = MinMaxScaler(feature_range=(0, 10))
    if col in ["Sprint_40yd", "Shuttle", "Agility_3cone"]:
        X[f"RSA_{col}"] = 10 - scaler.fit_transform(X[[col]])
        test[f"RSA_{col}"] = 10 - scaler.transform(test[[col]])
    else:
        X[f"RSA_{col}"] = scaler.fit_transform(X[[col]])
        test[f"RSA_{col}"] = scaler.transform(test[[col]])

# ✅ BMI
X["BMI"] = X["Weight"] / (X["Height"]/100)**2
test["BMI"] = test["Weight"] / (test["Height"]/100)**2

# ✅ School特徴量（Top, Drafted Count, Drafted Rate TE）
df_raw = pd.read_csv(PATH + 'train.csv')
test_raw = pd.read_csv(PATH + 'test.csv')
X["School"] = df_raw["School"]
test["School"] = test_raw["School"]

school_stats = X.copy()
school_stats["Drafted"] = y
school_agg = school_stats.groupby("School")["Drafted"].agg(["sum", "count"])
school_agg["Drafted_Rate"] = school_agg["sum"] / school_agg["count"]

top_n = 20
top_schools = school_agg["sum"].sort_values(ascending=False).head(top_n).index.tolist()

X["School_Top"] = X["School"].isin(top_schools).astype(int)
test["School_Top"] = test["School"].isin(top_schools).astype(int)

X["School_Drafted_Count"] = X["School"].map(school_agg["sum"])
test["School_Drafted_Count"] = test["School"].map(school_agg["sum"])
test["School_Drafted_Count"] = test["School_Drafted_Count"].fillna(0)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
X["School_Drafted_Rate_TE"] = 0.0
for train_idx, val_idx in kf.split(X):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    rate_map = X_train.assign(Drafted=y_train).groupby("School")["Drafted"].mean()
    X.loc[val_idx, "School_Drafted_Rate_TE"] = X.loc[val_idx, "School"].map(rate_map)
final_rate_map = X.assign(Drafted=y).groupby("School")["Drafted"].mean()
test["School_Drafted_Rate_TE"] = test["School"].map(final_rate_map)
test["School_Drafted_Rate_TE"] = test["School_Drafted_Rate_TE"].fillna(y.mean())

X = X.drop(columns=["School"])
test = test.drop(columns=["School"])


X["Speed_BMI_Ratio"] = X["SpeedScore"] / X["BMI"]
test["Speed_BMI_Ratio"] = test["SpeedScore"] / test["BMI"]

X["Sprint_ASI"] = X["Sprint_40yd"] * X["ASI"]
test["Sprint_ASI"] = test["Sprint_40yd"] * test["ASI"]

X["Age_Speed"] = X["Age_filled"] * X["SpeedScore"]
test["Age_Speed"] = test["Age_filled"] * test["SpeedScore"]

 0.65632458 0.62895928 0.7109375  0.71884984 0.6        0.65632458
 0.65632458 0.71884984 0.62895928 0.62895928 0.65632458 0.71884984
 0.62895928 0.65632458 0.65632458 0.62895928 0.62895928 0.65632458
 0.6        0.62895928 0.62895928 0.65632458 0.71884984 0.7109375
 0.62895928 0.7109375  0.71884984 0.62895928 0.65632458 0.62895928
 0.2739726  0.65632458 0.65632458 0.2739726  0.68376068 0.65632458
 0.62895928 0.48275862 0.68376068 0.68376068 0.71884984 0.62895928
 0.62895928 0.62895928 0.7109375  0.62895928 0.71884984 0.65632458
 0.7109375  0.62895928 0.6        0.62895928 0.7109375  0.65632458
 0.62895928 0.62895928 0.71884984 0.71884984 0.68376068 0.7109375
 0.68376068 0.65632458 0.62895928 0.62895928 0.62895928 0.62895928
 0.68376068 0.6        0.71884984 0.65632458 0.6        0.71884984
 0.68376068 0.71884984 0.68376068 0.68376068 0.2739726  0.71884984
 0.71884984 0.65632458 0.62895928 0.68376068 0.65632458 0.68376068
 0.62895928 0.71884984 0.65632458 0.68376068 0.62895928 0.683760

In [59]:
# ✅ 高相関特徴量削除
drop_cols_high_corr = [
    "Weight_lbs",
    "RSA_Sprint_40yd",
    "RSA_Vertical_Jump",
    "RSA_Bench_Press_Reps",
    "RSA_Agility_3cone",
    "RSA_Shuttle",
    "SpeedScore",
    "Age_missing"
]
X = X.drop(columns=drop_cols_high_corr)
test = test.drop(columns=drop_cols_high_corr)

# CatBoostのOptunaによる最適化（特徴量などは今後いじる）

```python
import optuna
from catboost import CatBoostClassifier, Pool

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 200, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 10),
        "bagging_temperature": trial.suggest_uniform("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_uniform("random_strength", 0, 1),
        "border_count": trial.suggest_int("border_count", 32, 255),
        "task_type": "CPU",  # GPUがなければCPUに変更
        "verbose": 0,
        "random_state": 42
    }

    cv = KFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=30, verbose=0)

        y_valid_pred = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_valid_pred)
        aucs.append(auc)

    return np.mean(aucs)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("✅ Best params:", study.best_params)
print("✅ Best CV AUC:", study.best_value)

✅ Best params: {'iterations': 993, 'depth': 5, 'learning_rate': 0.07455966036915365, 'l2_leaf_reg': 0.010557909406604652, 'bagging_temperature': 0.733088086114168, 'random_strength': 0.6639499886391435, 'border_count': 177}
✅ Best CV AUC: 0.8511843672980401

In [60]:
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import numpy as np

cat_model = CatBoostClassifier(
    iterations=993,
    depth=5,
    learning_rate=0.07455966036915365,
    l2_leaf_reg=0.010557909406604652,
    bagging_temperature=0.733088086114168,
    random_strength=0.6639499886391435,
    border_count=177,
    eval_metric='AUC',
    random_seed=42,
    verbose=0
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_aucs = []
val_aucs = []

print("✅ CatBoost 最適化パラメータで再学習開始")
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]
    
    cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid), early_stopping_rounds=30, use_best_model=True)
    
    y_train_pred = cat_model.predict_proba(X_train)[:, 1]
    y_valid_pred = cat_model.predict_proba(X_valid)[:, 1]
    
    train_auc = roc_auc_score(y_train, y_train_pred)
    val_auc = roc_auc_score(y_valid, y_valid_pred)
    
    train_aucs.append(train_auc)
    val_aucs.append(val_auc)
    
    print(f"✅ [CatBoost] Fold {fold + 1} - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

print(f"\n✅ [CatBoost] Average Train AUC: {np.mean(train_aucs):.4f}")
print(f"✅ [CatBoost] Average Validation AUC: {np.mean(val_aucs):.4f}")


✅ CatBoost 最適化パラメータで再学習開始
✅ [CatBoost] Fold 1 - Train AUC: 0.9226, Validation AUC: 0.8684
✅ [CatBoost] Fold 2 - Train AUC: 0.9509, Validation AUC: 0.8543
✅ [CatBoost] Fold 3 - Train AUC: 0.9324, Validation AUC: 0.8344
✅ [CatBoost] Fold 4 - Train AUC: 0.8573, Validation AUC: 0.8132
✅ [CatBoost] Fold 5 - Train AUC: 0.8774, Validation AUC: 0.8568

✅ [CatBoost] Average Train AUC: 0.9081
✅ [CatBoost] Average Validation AUC: 0.8454


# XGBoostのOptunaによる最適化（特徴量などは今後いじる）

In [61]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import numpy as np


In [62]:
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1.0,
    'reg_lambda': 1.0,
    'random_state': 42
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_aucs = []
val_aucs = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=30,
        verbose_eval=100
    )
    
    y_train_pred = model.predict(dtrain)
    y_valid_pred = model.predict(dvalid)
    
    train_auc = roc_auc_score(y_train, y_train_pred)
    val_auc = roc_auc_score(y_valid, y_valid_pred)
    
    train_aucs.append(train_auc)
    val_aucs.append(val_auc)
    
    print(f"✅ [XGBoost] Fold {fold+1} - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

print(f"\n✅ [XGBoost] Average Train AUC: {np.mean(train_aucs):.4f}")
print(f"✅ [XGBoost] Average Validation AUC: {np.mean(val_aucs):.4f}")


[0]	valid-auc:0.80475
[100]	valid-auc:0.86342
[132]	valid-auc:0.86196
✅ [XGBoost] Fold 1 - Train AUC: 0.9619, Validation AUC: 0.8612
[0]	valid-auc:0.77742
[100]	valid-auc:0.85508
[142]	valid-auc:0.85547
✅ [XGBoost] Fold 2 - Train AUC: 0.9682, Validation AUC: 0.8555
[0]	valid-auc:0.76348
[35]	valid-auc:0.81223
✅ [XGBoost] Fold 3 - Train AUC: 0.9076, Validation AUC: 0.8122
[0]	valid-auc:0.79296
[30]	valid-auc:0.80043
✅ [XGBoost] Fold 4 - Train AUC: 0.8996, Validation AUC: 0.8014
[0]	valid-auc:0.81862
[100]	valid-auc:0.86026
[107]	valid-auc:0.86029
✅ [XGBoost] Fold 5 - Train AUC: 0.9500, Validation AUC: 0.8603

✅ [XGBoost] Average Train AUC: 0.9375
✅ [XGBoost] Average Validation AUC: 0.8381


```python

import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

# ✅ Optuna objective
def objective(trial):
    params = {
        "n_estimators": 1000,
        "max_depth": trial.suggest_int("max_depth", 3, 8),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "subsample": trial.suggest_uniform("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_uniform("gamma", 0, 5),
        "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-3, 10.0),
        "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-3, 10.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "random_state": 42,
        "tree_method": "hist",  # CPUなら "hist"
        "use_label_encoder": False,
        "eval_metric": "auc"
    }
    
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs = []

    for train_idx, valid_idx in cv.split(X, y):
        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = XGBClassifier(**params)

        # ✅ カラム数エラー防止のため eval_set 使用時は verbose=0
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=0,
        )

        y_valid_pred = model.predict_proba(X_valid)[:, 1]
        auc = roc_auc_score(y_valid, y_valid_pred)
        aucs.append(auc)

    return np.mean(aucs)

# ✅ Optuna 実行
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

print(f"✅ Best params: {study.best_params}")
print(f"✅ Best CV AUC: {study.best_value:.6f}")


✅ Best params: {'max_depth': 3, 'learning_rate': 0.01813146526150014, 'subsample': 0.8306991416735662, 'colsample_bytree': 0.9931326288237515, 'gamma': 0.6283945388318324, 'reg_alpha': 0.08282377808150526, 'reg_lambda': 0.1838172412783487, 'min_child_weight': 1}
✅ Best CV AUC: 0.850278

In [63]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import numpy as np

xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 3,
    'learning_rate': 0.01813146526150014,
    'subsample': 0.8306991416735662,
    'colsample_bytree': 0.9931326288237515,
    'gamma': 0.6283945388318324,
    'reg_alpha': 0.08282377808150526,
    'reg_lambda': 0.1838172412783487,
    'min_child_weight': 1,
    'random_state': 42
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_aucs = []
val_aucs = []

print("✅ XGBoost 最適化パラメータで再学習開始")
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[val_idx]
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=2000,
        evals=[(dvalid, 'valid')],
        early_stopping_rounds=30,
        verbose_eval=100
    )
    
    y_train_pred = model.predict(dtrain)
    y_valid_pred = model.predict(dvalid)
    
    train_auc = roc_auc_score(y_train, y_train_pred)
    val_auc = roc_auc_score(y_valid, y_valid_pred)
    
    train_aucs.append(train_auc)
    val_aucs.append(val_auc)
    
    print(f"✅ [XGBoost] Fold {fold + 1} - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

print(f"\n✅ [XGBoost] Average Train AUC: {np.mean(train_aucs):.4f}")
print(f"✅ [XGBoost] Average Validation AUC: {np.mean(val_aucs):.4f}")


✅ XGBoost 最適化パラメータで再学習開始
[0]	valid-auc:0.81306
[100]	valid-auc:0.84525
[200]	valid-auc:0.85822
[300]	valid-auc:0.86526
[400]	valid-auc:0.86784
✅ [XGBoost] Fold 1 - Train AUC: 0.9146, Validation AUC: 0.8681
[0]	valid-auc:0.77620
[100]	valid-auc:0.84471
[200]	valid-auc:0.85515
[300]	valid-auc:0.86162
[400]	valid-auc:0.86351
[409]	valid-auc:0.86347
✅ [XGBoost] Fold 2 - Train AUC: 0.9192, Validation AUC: 0.8635
[0]	valid-auc:0.76874
[100]	valid-auc:0.82217
[200]	valid-auc:0.82742
[300]	valid-auc:0.83053
[322]	valid-auc:0.82979
✅ [XGBoost] Fold 3 - Train AUC: 0.9104, Validation AUC: 0.8298
[0]	valid-auc:0.75801
[39]	valid-auc:0.79494
✅ [XGBoost] Fold 4 - Train AUC: 0.8581, Validation AUC: 0.7949
[0]	valid-auc:0.77890
[100]	valid-auc:0.85050
[200]	valid-auc:0.86061
[300]	valid-auc:0.86528
[387]	valid-auc:0.86603
✅ [XGBoost] Fold 5 - Train AUC: 0.9136, Validation AUC: 0.8659

✅ [XGBoost] Average Train AUC: 0.9032
✅ [XGBoost] Average Validation AUC: 0.8444


## ✅ LightGBM 最適化パラメータ（直近ベストスコア使用）

In [64]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import numpy as np

# ✅ LightGBM 最適化パラメータ（直近ベストスコア使用）
model_lgb = LGBMClassifier(
    max_depth=6,
    num_leaves=10,
    min_child_samples=38,
    reg_alpha=8.18,
    reg_lambda=8.07,
    learning_rate=0.0442,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
train_aucs_lgb = []
val_aucs_lgb = []

print("✅ LightGBM 最適化パラメータで再学習開始")

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    model_lgb.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=30), lgb.log_evaluation(100)]
    )

    y_train_pred = model_lgb.predict_proba(X_train)[:, 1]
    y_valid_pred = model_lgb.predict_proba(X_valid)[:, 1]

    train_auc = roc_auc_score(y_train, y_train_pred)
    val_auc = roc_auc_score(y_valid, y_valid_pred)

    train_aucs_lgb.append(train_auc)
    val_aucs_lgb.append(val_auc)

    print(f"✅ [LightGBM] Fold {fold + 1} - Train AUC: {train_auc:.4f}, Validation AUC: {val_auc:.4f}")

print(f"\n✅ [LightGBM] Average Train AUC: {np.mean(train_aucs_lgb):.4f}")
print(f"✅ [LightGBM] Average Validation AUC: {np.mean(val_aucs_lgb):.4f}")


✅ LightGBM 最適化パラメータで再学習開始
[LightGBM] [Info] Number of positive: 1445, number of negative: 779
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000961 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 2224, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.649730 -> initscore=0.617854
[LightGBM] [Info] Start training from score 0.617854
Training until validation scores don't improve for 30 rounds
[100]	valid_0's auc: 0.857416	valid_0's binary_logloss: 0.418876
Early stopping, best iteration is:
[112]	valid_0's auc: 0.858342	valid_0's binary_logloss: 0.416497
✅ [LightGBM] Fold 1 - Train AUC: 0.8853, Validation AUC: 0.8583
[LightGBM] [Info] Number of positive: 1448, number of negative: 777
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the ov

In [66]:
from catboost import CatBoostClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
import numpy as np

assert isinstance(y, pd.Series)  # 安全確認

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_cat = np.zeros(len(X))
oof_xgb = np.zeros(len(X))
oof_lgb = np.zeros(len(X))
y_true = np.zeros(len(X))

# ✅ CatBoost 最適化済パラメータ
cat_model = CatBoostClassifier(
    iterations=993,
    depth=5,
    learning_rate=0.07456,
    l2_leaf_reg=0.01056,
    bagging_temperature=0.7331,
    random_strength=0.6640,
    border_count=177,
    eval_metric='AUC',
    random_seed=42,
    verbose=0
)

# ✅ XGBoost 最適化済パラメータ
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 3,
    'learning_rate': 0.01813,
    'subsample': 0.8307,
    'colsample_bytree': 0.9931,
    'gamma': 0.6284,
    'reg_alpha': 0.0828,
    'reg_lambda': 0.1838,
    'min_child_weight': 1,
    'random_state': 42
}

# ✅ LightGBM 最適化済パラメータ
lgb_model = LGBMClassifier(
    max_depth=6,
    num_leaves=10,
    min_child_samples=38,
    reg_alpha=8.18,
    reg_lambda=8.07,
    learning_rate=0.0442,
    n_estimators=1000,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

print("✅ アンサンブル (Weighted Voting + Stacking) 開始")

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

    # CatBoost
    cat_model.fit(X_train, y_train, eval_set=(X_valid, y_valid),
                  early_stopping_rounds=30, use_best_model=True)
    oof_cat[valid_idx] = cat_model.predict_proba(X_valid)[:, 1]

    # XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    xgb_model = xgb.train(xgb_params, dtrain, num_boost_round=2000,
                           evals=[(dvalid, 'valid')], early_stopping_rounds=30, verbose_eval=100)
    oof_xgb[valid_idx] = xgb_model.predict(dvalid)

    # LightGBM
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=30), lgb.log_evaluation(100)]
    )
    oof_lgb[valid_idx] = lgb_model.predict_proba(X_valid)[:, 1]

    y_true[valid_idx] = y_valid

# ✅ Weighted Soft Voting
weights = [1, 1, 1]  # 必要に応じて調整
ensemble_probs = (weights[0] * oof_cat + weights[1] * oof_xgb + weights[2] * oof_lgb) / sum(weights)
auc_voting = roc_auc_score(y_true, ensemble_probs)
print(f"\n✅ [Weighted Voting] Validation AUC: {auc_voting:.4f}")

# ✅ Stacking
stack_X = np.vstack([oof_cat, oof_xgb, oof_lgb]).T
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(stack_X, y_true)
meta_preds = meta_model.predict_proba(stack_X)[:, 1]
auc_stacking = roc_auc_score(y_true, meta_preds)
print(f"✅ [Stacking] Validation AUC: {auc_stacking:.4f}")


✅ アンサンブル (Weighted Voting + Stacking) 開始
[0]	valid-auc:0.81306
[100]	valid-auc:0.84525
[200]	valid-auc:0.85823
[300]	valid-auc:0.86526
[400]	valid-auc:0.86783
[401]	valid-auc:0.86808
[LightGBM] [Info] Number of positive: 1445, number of negative: 779
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2589
[LightGBM] [Info] Number of data points in the train set: 2224, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.649730 -> initscore=0.617854
[LightGBM] [Info] Start training from score 0.617854
Training until validation scores don't improve for 30 rounds
[100]	valid_0's auc: 0.857416	valid_0's binary_logloss: 0.418876
Early stopping, best iteration is:
[112]	valid_0's auc: 0.858342	valid_0's binary_logloss: 0.416497
[0]	valid-auc:0.77620
[100]	valid-auc:0.84471
[200]	valid-auc:0.85516
[300]	valid-auc:0.86162
[400]	val

In [69]:
# ✅ 提出用ファイル作成処理（アンサンブル予測）

import pandas as pd
import numpy as np
import os
import datetime
import re

# ✅ テストデータ再読み込み（Id復元用）
original_test = pd.read_csv(PATH + "test.csv")

# ✅ 提出用特徴量列（現在のX.columnsで固定）
feature_cols = X.columns.tolist()
X_test = test[feature_cols]

# ✅ それぞれモデルで予測

# CatBoost
cat_preds = cat_model.predict_proba(X_test)[:, 1]

# XGBoost
dtest = xgb.DMatrix(X_test)
xgb_preds = xgb_model.predict(dtest)

# LightGBM
lgb_preds = lgb_model.predict_proba(X_test)[:, 1]

# ✅ Weighted Voting（等重み、必要に応じて調整可能）
weights = [1, 1, 1]
ensemble_preds = (weights[0] * cat_preds + weights[1] * xgb_preds + weights[2] * lgb_preds) / sum(weights)

# ✅ Stackingも作成可能（必要に応じて切替）
# stack_X_test = np.vstack([cat_preds, xgb_preds, lgb_preds]).T
# ensemble_preds = meta_model.predict_proba(stack_X_test)[:, 1]

# ✅ 提出用DataFrame
submission = pd.DataFrame({
    "Id": original_test["Id"],
    "Drafted": ensemble_preds
})

# ✅ 保存ディレクトリをプロジェクトルートに作成
root_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
save_dir = os.path.join(root_dir, "submissions")
os.makedirs(save_dir, exist_ok=True)

# ✅ Notebook名取得 → ファイル名決定
try:
    import ipynbname
    notebook_path = ipynbname.path()
    notebook_name = notebook_path.stem
except:
    notebook_name = "20_0705_notebook"

match = re.search(r"\d{2}_\d{4}", notebook_name)
tag = match.group() if match else notebook_name

filename = f"submission_{tag}().csv"
save_path = os.path.join(save_dir, filename)

# ✅ 書き出し
submission.to_csv(save_path, index=False)
print(f"✅ アンサンブル提出ファイルを保存しました: {save_path}")


✅ アンサンブル提出ファイルを保存しました: c:\Users\81807\Desktop\Kaggle\GCI②(NFL Draft Prediction)\submissions\submission_20_0705().csv


In [67]:
import pandas as pd
import numpy as np

# ✅ 相関係数計算
corr_matrix = X.corr().abs()

# ✅ 高相関ペア抽出
threshold = 0.90
high_corr = np.where(corr_matrix > threshold)
high_corr_pairs = []

for x, y in zip(*high_corr):
    if x < y:
        high_corr_pairs.append((
            X.columns[x],
            X.columns[y],
            corr_matrix.iloc[x, y]
        ))

# ✅ 結果表示
if high_corr_pairs:
    print(f"✅ 高相関ペア (|r| > {threshold}):")
    for col1, col2, corr in sorted(high_corr_pairs, key=lambda x: -x[2]):
        print(f"{col1} & {col2}: {corr:.4f}")
else:
    print(f"✅ 高相関ペアは存在しません (|r| > {threshold})")


✅ 高相関ペア (|r| > 0.9):
Broad_Jump & BurstScore: 0.9815
Agility_3cone & AgilityScore: 0.9775
Age_filled & Age_Speed: 0.9709
Weight & Sprint_ASI: 0.9630
Shuttle & AgilityScore: 0.9458
Weight & BMI: 0.9386
Position_encoded & Position_group_encoded: 0.9115
Vertical_Jump & BurstScore: 0.9050
BMI & Sprint_ASI: 0.9013
