# Library

In [1]:
import pandas as pd
import numpy as np

import optuna
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from datetime import datetime
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


# Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Add New Feature

In [3]:
def add_features(df):
    df['assistRate'] = df['blueAssists'] / (df['blueKills'] + 1)
    df['blueHerald'] = (df['blueEliteMonsters'] - df['blueDragons']).clip(lower=0)
    df['eliteMonsterDragonRatio'] = df['blueEliteMonsters'] / (df['blueDragons'] + 1)
    df['logTotalGold'] = np.log1p(df['blueTotalGold'])
    df['logTotalExperience'] = np.log1p(df['blueTotalExperience'])
    df['firstBloodKillRatio'] = df['blueFirstBlood'] * df['blueKills']
    df['totalCombatPoints'] = df['blueKills'] + df['blueDeaths'] + df['blueAssists']

    total_combat_actions = df['blueKills'] + df['blueDeaths'] + df['blueAssists']
    df['killRatio'] = df['blueKills'] / total_combat_actions
    df['deathRatio'] = df['blueDeaths'] / total_combat_actions
    df['assistRatio'] = df['blueAssists'] / total_combat_actions

    df['teamStrength'] = df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']
    df['dragonKillImpact'] = df['blueDragons'] / (df['blueKills'] + 1)
    df['eliteMonsterUtilization'] = df['blueEliteMonsters'] / (df['blueKills'] + df['blueAssists'] + 1)
    df['goldExperienceRatio'] = df['blueTotalGold'] / (df['blueTotalExperience'] + 1)
    df['teamEfficiency'] = (df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']) / (df['blueDeaths'] + 1)
    df['killToMonsterRatio'] = df['blueKills'] / (df['blueEliteMonsters'] + 1)
    df['avgGoldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)
    df['expToKillRatio'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killsMinusDeaths'] = df['blueKills'] - df['blueDeaths']
    df['killDeathRatio'] = df['blueKills'] / (df['blueKills'] + df['blueDeaths'])
    df['avgExperiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killDeathDiff'] = (df['blueKills'] - df['blueDeaths']).clip(lower=0)
    df['eliteMonsterHeraldRatio'] = df['blueEliteMonsters'] / (df['blueHerald'] + 1)
    df['goldXexperience'] = np.log1p(df['blueTotalGold'] * df['blueTotalExperience'])
    df['goldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['experiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['eliteMonstersPerGold'] = df['blueEliteMonsters'] / df['blueTotalGold']
    df['eliteMonstersPerExperience'] = df['blueEliteMonsters'] / df['blueTotalExperience']

    
    return df

- apply

In [4]:
train = add_features(train)
test = add_features(test)

# Data

In [5]:
# データの準備
X = train.drop(['blueWins', 'gameId'], axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

test_drop_gameId = test.drop('gameId', axis=1)

# test_drop_gameId_temp = test.drop('gameId', axis=1)

# # 特徴量の標準化
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)  # 検証データも同じスケーラーを使用して標準化

# # テストデータも同じスケーラーを使用して標準化
# test_drop_gameId_scaled = scaler.transform(test_drop_gameId_temp)

# # NumPy配列からPandasデータフレームに変換
# X_train = pd.DataFrame(X_train, columns=X.columns)
# X_val = pd.DataFrame(X_val, columns=X.columns)
# test_drop_gameId = pd.DataFrame(test_drop_gameId_scaled, columns=test_drop_gameId_temp.columns)

# Model

- variable

In [6]:
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)

# 各フォールドでのスコアを保存するリスト
lgb_fold_scores = []
xgb_fold_scores = []
cb_fold_scores = []

# testを保存するリスト
lgb_test_predictions = []
xgb_test_predictions = []
cb_test_predictions = []

## LightGBM

In [7]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # LightGBMではDMatrixの代わりに普通のデータフレームを使用します
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'feature_pre_filter': False,  # 特徴量の事前フィルタリングを無効化
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 100),  # Optunaでこの値を調整する
        }

        callbacks = lgb.early_stopping(stopping_rounds=50, verbose=False)
        gbm = lgb.train(param, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=[callbacks])
        preds = gbm.predict(X_val_fold, num_iteration=gbm.best_iteration)
        pred_labels = np.rint(preds)
        accuracy = accuracy_score(y_val_fold, pred_labels)
        return accuracy

    lgb_study = optuna.create_study(direction='maximize')
    lgb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    lgb_best_params = lgb_study.best_params
    lgb_best_params['objective'] = 'binary'
    lgb_best_params['metric'] = 'binary_logloss'
    lgb_best_params['verbosity'] = -1

    lgb_final = lgb.train(lgb_best_params, train_data, num_boost_round=lgb_study.best_trial.number)

    # 各フォールドのモデル性能を評価
    lgb_y_val_pred = lgb_final.predict(X_val_fold, num_iteration=lgb_final.best_iteration)
    lgb_y_val_pred_int = np.rint(lgb_y_val_pred)
    lgb_fold_accuracy = accuracy_score(y_val_fold, lgb_y_val_pred_int)
    lgb_fold_scores.append(lgb_fold_accuracy)

    # テストデータセットでの予測
    lgb_y_test_pred_fold = lgb_final.predict(test_drop_gameId, num_iteration=lgb_final.best_iteration)
    lgb_test_predictions.append(lgb_y_test_pred_fold)

[I 2024-01-28 13:23:32,178] A new study created in memory with name: no-name-c16c8857-8c1a-460e-9460-36257569d4ac
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-01-28 13:23:32,577] Trial 0 finished with value: 0.7775 and parameters: {'lambda_l1': 0.14446063397028677, 'lambda_l2': 1.9540539107885817e-06, 'num_leaves': 206, 'feature_fraction': 0.823551803705272, 'bagging_fraction': 0.5578735099477603, 'bagging_freq': 3, 'min_child_samples': 52, 'min_data_in_leaf': 79}. Best is trial 0 with value: 0.7775.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_unifor

- lgb_平均スコア

In [8]:
# 全フォールドの平均スコアを計算
lgb_average_score = sum(lgb_fold_scores) / n_splits
print(f"LightGBM Average accuracy across all folds: {lgb_average_score}")

LightGBM Average accuracy across all folds: 0.7825


- lgb_予測平均

In [9]:
# 予測の平均を計算
lgb_y_test_pred_avg = np.mean(lgb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
lgb_y_test_pred_int = (lgb_y_test_pred_avg > 0.5).astype(int)

- output

In [20]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_LightGBM_cv{lgb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': lgb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

In [21]:
submit

Unnamed: 0,0,1
0,9,1
1,15,1
2,18,1
3,23,0
4,31,1
...,...,...
1995,9971,0
1996,9980,0
1997,9983,1
1998,9996,1


## XGBoost

In [11]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    dtest = xgb.DMatrix(test_drop_gameId)

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1)
        }

        bst = xgb.train(param, dtrain, num_boost_round=1000, evals=[(dval, "eval")], early_stopping_rounds=50, verbose_eval=False)
        preds = bst.predict(dval)
        pred_labels = np.rint(preds)
        accuracy = accuracy_score(y_val_fold, pred_labels)
        return accuracy

    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    xgb_best_params = xgb_study.best_params
    xgb_best_params['objective'] = 'binary:logistic'
    xgb_best_params['verbosity'] = 0

    xgb_final_bst = xgb.train(xgb_best_params, dtrain, num_boost_round=xgb_study.best_trial.number)

    # 各フォールドのモデル性能を評価
    xgb_y_val_pred = xgb_final_bst.predict(dval)
    xgb_y_val_pred_int = np.rint(xgb_y_val_pred)
    fold_accuracy = accuracy_score(y_val_fold, xgb_y_val_pred_int)
    xgb_fold_scores.append(fold_accuracy)

    # テストデータセットでの予測
    xgb_y_test_pred_fold = xgb_final_bst.predict(dtest)
    xgb_test_predictions.append(xgb_y_test_pred_fold)

[I 2024-01-28 13:25:13,061] A new study created in memory with name: no-name-7d55ae16-403b-4236-861a-fb1c9fca02de


  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-28 13:25:16,720] Trial 0 finished with value: 0.78125 and parameters: {'alpha': 0.1643802844301823, 'lambda': 0.4118773366204541, 'max_depth': 9, 'eta': 0.0018604438104187612, 'subsample': 0.8047077015188477, 'colsample_bytree': 0.895352962749242}. Best is trial 0 with value: 0.78125.
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-28 13:25:20,187] Trial 1 finished with value: 0.5225 and parameters: {'alpha': 3.342727372540969e-05, 'lambda': 0.05258742228446596, 'max_depth': 9, 'eta': 8.009544323282864e-07, 'subsample': 0.527858308344481, 'colsample_bytree': 0.9733937924212166}. Best is trial 0 with value: 0.78125.
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambd

- xgb_平均スコア

In [12]:
# 全フォールドの平均スコアを計算
xgb_average_score = sum(xgb_fold_scores) / n_splits
print(f"XGB Average accuracy across all folds: {xgb_average_score}")

XGB Average accuracy across all folds: 0.773875


- xgb_予測平均

In [13]:
# 予測の平均を計算
xgb_y_test_pred_avg = np.mean(xgb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
xgb_y_test_pred_int = (xgb_y_test_pred_avg > 0.5).astype(int)

- output

In [14]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_XGBoost_cv{xgb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': xgb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

## CatBoost

In [15]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'iterations': 1000,
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'random_strength': trial.suggest_int('random_strength', 1, 20),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 100),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
            'loss_function': 'Logloss'
        }

        model = cb.CatBoostClassifier(**param, verbose=False)
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50)
        preds = model.predict(X_val_fold)
        accuracy = accuracy_score(y_val_fold, preds)
        return accuracy

    cb_study = optuna.create_study(direction='maximize')
    cb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    cb_best_params = cb_study.best_params
    cb_best_params['loss_function'] = 'Logloss'
    model_fold = cb.CatBoostClassifier(**cb_best_params, verbose=False)
    model_fold.fit(X_train_fold, y_train_fold)

    # 各フォールドのモデル性能を評価
    cb_y_val_pred = model_fold.predict(X_val_fold)
    cb_fold_accuracy = accuracy_score(y_val_fold, cb_y_val_pred)
    cb_fold_scores.append(cb_fold_accuracy)

    # テストデータセットでの予測
    cb_y_test_pred_fold = model_fold.predict(test_drop_gameId)
    cb_test_predictions.append(cb_y_test_pred_fold)



[I 2024-01-28 13:35:32,361] A new study created in memory with name: no-name-5812fa33-4cd4-4c53-90a4-7ba530121dbb


[I 2024-01-28 13:35:32,817] Trial 0 finished with value: 0.72375 and parameters: {'depth': 5, 'learning_rate': 0.28225054627699603, 'random_strength': 7, 'bagging_temperature': 0.5363999562852277, 'l2_leaf_reg': 1.6144086252433827, 'scale_pos_weight': 4.367663233977826}. Best is trial 0 with value: 0.72375.
[I 2024-01-28 13:35:33,559] Trial 1 finished with value: 0.763125 and parameters: {'depth': 8, 'learning_rate': 0.13115271161798325, 'random_strength': 8, 'bagging_temperature': 0.32675740028195366, 'l2_leaf_reg': 31.365779274932223, 'scale_pos_weight': 1.8108466864660258}. Best is trial 1 with value: 0.763125.
[I 2024-01-28 13:35:34,244] Trial 2 finished with value: 0.769375 and parameters: {'depth': 5, 'learning_rate': 0.16939042752981975, 'random_strength': 16, 'bagging_temperature': 0.9630271431711523, 'l2_leaf_reg': 29.815379482464557, 'scale_pos_weight': 1.938591556660435}. Best is trial 2 with value: 0.769375.
[I 2024-01-28 13:35:35,057] Trial 3 finished with value: 0.690625 

- Catboost平均スコア

In [16]:
# 全フォールドの平均スコアを計算
cb_average_score = sum(cb_fold_scores) / n_splits
print(f"CatBoost Average accuracy across all folds: {cb_average_score}")

CatBoost Average accuracy across all folds: 0.7745


- CB_予測平均

In [17]:
# 予測の平均を計算
cb_y_test_pred_avg = np.mean(cb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
cb_y_test_pred_int = (cb_y_test_pred_avg > 0.5).astype(int)

- output

In [18]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_CatBoost_cv{cb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': cb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

In [19]:
submit

Unnamed: 0,0,1
0,9,1
1,15,1
2,18,1
3,23,0
4,31,0
...,...,...
1995,9971,0
1996,9980,0
1997,9983,1
1998,9996,1


## RandomForest