<a href="https://colab.research.google.com/github/zaku2590/classGCI/blob/main/comp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install Optuna into the notebook runtime before it is imported below.
# The leading "!" is an IPython/Colab shell escape, so this line only runs
# inside a notebook, not as plain Python.
!pip install optuna



In [None]:
# Module imports
import optuna
import numpy as np  # numerical computing and array operations
import pandas as pd  # tabular data handling
import matplotlib.pyplot as plt  # basic plotting library for data visualisation
import seaborn as sns  # higher-level statistical plotting library
from sklearn.preprocessing import LabelEncoder  # encoder that converts categorical values to integers
from sklearn.ensemble import RandomForestClassifier  # random-forest classifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold ,cross_val_score # stratified K-fold cross-validation splitter
from sklearn.metrics import roc_auc_score  # ROC AUC evaluation metric
from sklearn.impute import KNNImputer

In [None]:
# Directory that holds the competition CSV files (Colab working directory).
PATH = '/content/'

# Load the labelled training set and the unlabelled test set.
train = pd.read_csv(f"{PATH}train.csv")
test = pd.read_csv(f"{PATH}test.csv")


In [None]:
# Drop columns that are not used as features.
train = train.drop(columns=["Id", "School"])
test = test.drop(columns=["Id", "School"])

# Numeric columns whose missing values are imputed.
cols_to_fill = ['Age', 'Sprint_40yd', 'Vertical_Jump', 'Bench_Press_Reps',
                'Broad_Jump', 'Agility_3cone', 'Shuttle']

# Earlier alternative (kept for reference): fill gaps with the per-Position_Type
# mean computed on train, optionally adding "<col>_was_missing" indicator flags.
# for col in cols_to_fill:
#     # train[col + "_was_missing"] = train[col].isnull().astype(int)
#     # test[col + "_was_missing"] = test[col].isnull().astype(int)
#
#     group_mean = train.groupby("Position_Type")[col].mean()
#     train[col] = train[col].fillna(train["Position_Type"].map(group_mean))
#     test[col] = test[col].fillna(test["Position_Type"].map(group_mean))

# Imputer that fills each gap from the 5 nearest neighbours.
# NOTE(review): the columns are not scaled before imputation, so columns with
# larger magnitudes presumably dominate the neighbour distance -- confirm
# whether standardising first improves the imputation.
knn_imputer = KNNImputer(n_neighbors=5)

# Fit on train only and reuse the fitted imputer for test, so that no test
# statistics influence the imputation.
train[cols_to_fill] = knn_imputer.fit_transform(train[cols_to_fill])
test[cols_to_fill] = knn_imputer.transform(test[cols_to_fill])


# Target-encode the categorical columns: each category is replaced by the
# mean of "Drafted" observed for that category in train.
# NOTE(review): the means are computed over all train rows, including the row
# being encoded, so cross-validation scores computed on `train` afterwards see
# some target information. Consider out-of-fold encoding if that matters.
target_cols = ["Player_Type", "Position_Type", "Position"]

# Fallback for categories that have no per-category mean (a category that
# appears only in test, or a missing category value): the overall draft rate.
global_target_mean = train["Drafted"].mean()

for col in target_cols:
    # Mean draft rate per category, computed on train only.
    target_mean = train.groupby(col)["Drafted"].mean()

    # Name of the encoded column (e.g. Player_Type_TE).
    new_col = col + "_TE"

    # Map categories to their train mean; unmapped values would otherwise
    # become NaN, so they are filled with the global mean.
    train[new_col] = train[col].map(target_mean).fillna(global_target_mean)
    test[new_col] = test[col].map(target_mean).fillna(global_target_mean)

    # Remove the original categorical column.
    train = train.drop(columns=[col])
    test = test.drop(columns=[col])

In [None]:
# Body-mass-index feature: Weight divided by Height squared, added to both
# frames (sample rows suggest kilograms and metres -- TODO confirm units).
train['BMI'] = train['Weight'] / train['Height'] ** 2
test['BMI'] = test['Weight'] / test['Height'] ** 2

# Disabled experiments, kept for reference:
#
# Drop the raw columns once BMI exists.
# train = train.drop(columns=["Weight", "Height"])
# test = test.drop(columns=["Weight","Height"])
#
# 1. Jump_per_kg = Vertical_Jump / Weight
# train["Jump_per_kg"] = train["Vertical_Jump"] / train["Weight"]
# test["Jump_per_kg"] = test["Vertical_Jump"] / test["Weight"]
#
# 2. Strength_per_kg = Bench_Press_Reps / Weight
# train["Strength_per_kg"] = train["Bench_Press_Reps"] / train["Weight"]
# test["Strength_per_kg"] = test["Bench_Press_Reps"] / test["Weight"]
#
# 3. Agility_Diff = Shuttle - Agility_3cone
# train["Agility_Diff"] = train["Shuttle"] - train["Agility_3cone"]
# test["Agility_Diff"] = test["Shuttle"] - test["Agility_3cone"]

# Show the first rows to inspect the engineered features.
train.head()

Unnamed: 0,Year,Age,Height,Weight,Sprint_40yd,Vertical_Jump,Bench_Press_Reps,Broad_Jump,Agility_3cone,Shuttle,Drafted,Player_Type_TE,Position_Type_TE,Position_TE,BMI
0,2011,21.0,1.905,140.160042,5.39,59.69,29.0,251.46,7.91,4.94,1.0,0.63045,0.664368,0.642384,38.621956
1,2011,24.0,1.8288,87.089735,4.31,101.6,16.0,332.74,7.028157,4.287189,1.0,0.63045,0.615842,0.594937,26.039614
2,2018,21.0,1.8542,92.986436,4.51,91.44,10.0,309.88,6.95,4.37,1.0,0.63045,0.615842,0.594937,27.046212
3,2010,21.0,1.9304,148.778297,5.09,76.2,39.0,254.0,8.12,4.71,1.0,0.69863,0.70844,0.715,39.925004
4,2016,21.0,1.8796,92.079251,4.64,78.74,17.518142,281.94,7.13,4.2,1.0,0.63045,0.615842,0.594937,26.06339


In [None]:
# Split the training frame into the feature matrix and the target.
X = train.drop(columns=["Drafted"])
y = train["Drafted"]

# Select the test features by name, in the same order as the training
# features, so prediction does not depend on the two frames happening to share
# an identical column layout (a missing column fails here with a KeyError).
X_test = test[X.columns]

# Candidate models with fixed hyperparameters
# (presumably the result of an earlier tuning run -- not shown here).
models = {
    "RandomForest": RandomForestClassifier(
        n_estimators=378,
        max_depth=7,
        min_samples_split=4,
        min_samples_leaf=5,
        max_features=None,
        random_state=2025,
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        learning_rate=0.010488294829397215,
        num_leaves=96,
        max_depth=3,
        min_child_samples=74,
        subsample=0.6219322010855215,
        colsample_bytree=0.6022862354233319,
        reg_alpha=0.8351184813440499,
        reg_lambda=0.6559328766031582,
        n_estimators=1000,
        random_state=2025,
        verbosity=-1
    )
}

# Cross-validation setup: 5 stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model results: mean validation AUC and fold-averaged test probabilities.
model_auc_dict = {}
model_test_pred_dict = {}

for name, model in models.items():
    print(f"\n=== Model: {name} ===")
    auc_scores = []
    test_pred_proba_list = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X, y)):
        print(f" Fold {fold + 1}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        # The same estimator object is refitted on each fold's training part.
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is taken as the positive-class probability.
        y_valid_pred_proba = model.predict_proba(X_valid)[:, 1]

        auc = roc_auc_score(y_valid, y_valid_pred_proba)
        auc_scores.append(auc)
        print(f"  AUC: {round(auc, 4)}")

        # Keep this fold's test predictions; they are averaged after the loop.
        test_pred_proba = model.predict_proba(X_test)[:, 1]
        test_pred_proba_list.append(test_pred_proba)

    mean_auc = np.mean(auc_scores)
    print(f"→ Average AUC for {name}: {round(mean_auc, 4)}")

    model_auc_dict[name] = mean_auc
    # Element-wise mean over folds gives one probability per test row.
    model_test_pred_dict[name] = np.mean(test_pred_proba_list, axis=0)

# Print the AUC comparison across models.
print("\n=== Model Comparison ===")
for name, auc in model_auc_dict.items():
    print(f"{name}: AUC = {round(auc, 4)}")


=== Model: RandomForest ===
 Fold 1
  AUC: 0.7884
 Fold 2
  AUC: 0.8365
 Fold 3
  AUC: 0.8344
 Fold 4
  AUC: 0.7736
 Fold 5
  AUC: 0.828
→ Average AUC for RandomForest: 0.8122

=== Model: LightGBM ===
 Fold 1
  AUC: 0.81
 Fold 2
  AUC: 0.8513
 Fold 3
  AUC: 0.8513
 Fold 4
  AUC: 0.7949
 Fold 5
  AUC: 0.8574
→ Average AUC for LightGBM: 0.833

=== Model Comparison ===
RandomForest: AUC = 0.8122
LightGBM: AUC = 0.833


In [None]:
# Load the submission skeleton that defines the expected output layout.
submission_template = pd.read_csv(f"{PATH}sample_submission.csv")

# Write one submission file per model, named "<model name in lowercase>_submission.csv".
for model_name, averaged_proba in model_test_pred_dict.items():
    # assign() returns a new frame, so the template itself is never modified.
    submission = submission_template.assign(Drafted=averaged_proba)
    output_file = PATH + f"{model_name.lower()}_submission.csv"
    submission.to_csv(output_file, index=False)