# Library

In [1]:
import pandas as pd
import numpy as np

import optuna
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from datetime import datetime
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler


# Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Add New Feature

In [3]:
def add_features(df):
    df['assistRate'] = df['blueAssists'] / (df['blueKills'] + 1)
    df['blueHerald'] = (df['blueEliteMonsters'] - df['blueDragons']).clip(lower=0)
    df['eliteMonsterDragonRatio'] = df['blueEliteMonsters'] / (df['blueDragons'] + 1)
    df['logTotalGold'] = np.log1p(df['blueTotalGold'])
    df['logTotalExperience'] = np.log1p(df['blueTotalExperience'])
    # df['firstBloodKillRatio'] = df['blueFirstBlood'] * df['blueKills']
    df['totalCombatPoints'] = df['blueKills'] + df['blueDeaths'] + df['blueAssists']

    total_combat_actions = df['blueKills'] + df['blueDeaths'] + df['blueAssists']
    df['killRatio'] = df['blueKills'] / total_combat_actions
    df['deathRatio'] = df['blueDeaths'] / total_combat_actions
    df['assistRatio'] = df['blueAssists'] / total_combat_actions

    df['teamStrength'] = df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']
    df['dragonKillImpact'] = df['blueDragons'] / (df['blueKills'] + 1)
    df['eliteMonsterUtilization'] = df['blueEliteMonsters'] / (df['blueKills'] + df['blueAssists'] + 1)
    df['goldExperienceRatio'] = df['blueTotalGold'] / (df['blueTotalExperience'] + 1)
    df['teamEfficiency'] = (df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']) / (df['blueDeaths'] + 1)
    df['killToMonsterRatio'] = df['blueKills'] / (df['blueEliteMonsters'] + 1)
    df['avgGoldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)
    df['expToKillRatio'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killsMinusDeaths'] = df['blueKills'] - df['blueDeaths']
    df['killDeathRatio'] = df['blueKills'] / (df['blueKills'] + df['blueDeaths'])
    df['avgExperiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killDeathDiff'] = (df['blueKills'] - df['blueDeaths']).clip(lower=0)
    df['eliteMonsterHeraldRatio'] = df['blueEliteMonsters'] / (df['blueHerald'] + 1)
    df['goldXexperience'] = np.log1p(df['blueTotalGold'] * df['blueTotalExperience'])
    df['goldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['experiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['eliteMonstersPerGold'] = df['blueEliteMonsters'] / df['blueTotalGold']
    df['eliteMonstersPerExperience'] = df['blueEliteMonsters'] / df['blueTotalExperience']

    
    return df

- apply

In [4]:
train = add_features(train)
test = add_features(test)

# Data

In [5]:
# データの準備
X = train.drop(['blueWins', 'gameId'], axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

test_drop_gameId = test.drop('gameId', axis=1)

# test_drop_gameId_temp = test.drop('gameId', axis=1)

# # 特徴量の標準化
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_val = scaler.transform(X_val)  # 検証データも同じスケーラーを使用して標準化

# # テストデータも同じスケーラーを使用して標準化
# test_drop_gameId_scaled = scaler.transform(test_drop_gameId_temp)

# # NumPy配列からPandasデータフレームに変換
# X_train = pd.DataFrame(X_train, columns=X.columns)
# X_val = pd.DataFrame(X_val, columns=X.columns)
# test_drop_gameId = pd.DataFrame(test_drop_gameId_scaled, columns=test_drop_gameId_temp.columns)

# Model

- variable

In [6]:
n_splits = 4
kf = KFold(n_splits=n_splits, shuffle=True)

lgb_threshold = 0.5
xgb_threshold = 0.5
cb_threshold = 0.5

# 各フォールドでのスコアを保存するリスト
lgb_fold_scores = []
xgb_fold_scores = []
cb_fold_scores = []

# 各フォール度での混同行列を保存するリスト
lgb_conf_scores = []
xgb_conf_scores = []
cb_conf_scores = []

# testを保存するリスト
lgb_test_predictions = []
xgb_test_predictions = []
cb_test_predictions = []

## LightGBM

In [7]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # LightGBMではDMatrixの代わりに普通のデータフレームを使用します
    train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
    val_data = lgb.Dataset(X_val_fold, label=y_val_fold, reference=train_data)

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'verbosity': -1,
            'boosting_type': 'gbdt',
            'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
            'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
            'feature_pre_filter': False,  # 特徴量の事前フィルタリングを無効化
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 100),  # Optunaでこの値を調整する
        }

        callbacks = lgb.early_stopping(stopping_rounds=100, verbose=False)
        gbm = lgb.train(param, train_data, valid_sets=[val_data], num_boost_round=1000, callbacks=[callbacks])
        preds = gbm.predict(X_val_fold, num_iteration=gbm.best_iteration)
        pred_labels = np.rint(preds)
        accuracy = accuracy_score(y_val_fold, pred_labels)
        return accuracy

    lgb_study = optuna.create_study(direction='maximize')
    lgb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    lgb_best_params = lgb_study.best_params
    lgb_best_params['objective'] = 'binary'
    lgb_best_params['metric'] = 'binary_logloss'
    lgb_best_params['verbosity'] = -1

    lgb_final = lgb.train(lgb_best_params, train_data, num_boost_round=lgb_study.best_trial.number)

    # 各フォールドのモデル性能を評価
    lgb_y_val_pred = lgb_final.predict(X_val_fold, num_iteration=lgb_final.best_iteration)
    lgb_y_val_pred_int = np.rint(lgb_y_val_pred)
    lgb_fold_accuracy = accuracy_score(y_val_fold, lgb_y_val_pred_int)
    lgb_fold_scores.append(lgb_fold_accuracy)

    # 混同行列
    lgb_conf_matrix = confusion_matrix(y_val_fold, lgb_y_val_pred_int)
    lgb_conf_scores.append(lgb_conf_matrix)


    # テストデータセットでの予測
    lgb_y_test_pred_fold = lgb_final.predict(test_drop_gameId, num_iteration=lgb_final.best_iteration)
    lgb_test_predictions.append(lgb_y_test_pred_fold)

[I 2024-01-29 13:53:26,296] A new study created in memory with name: no-name-9afdaf51-ca45-40c6-a532-4b804403e6ae
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-01-29 13:53:26,503] Trial 0 finished with value: 0.78 and parameters: {'lambda_l1': 0.0005704789124811092, 'lambda_l2': 6.982185439265371e-06, 'num_leaves': 12, 'feature_fraction': 0.47445665684336463, 'bagging_fraction': 0.6262429194416748, 'bagging_freq': 3, 'min_child_samples': 74, 'min_data_in_leaf': 82}. Best is trial 0 with value: 0.78.


  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-01-29 13:53:26,817] Trial 1 finished with value: 0.7745 and parameters: {'lambda_l1': 0.009500953986325635, 'lambda_l2': 2.0339750850250913e-06, 'num_leaves': 28, 'feature_fraction': 0.9218119904306901, 'bagging_fraction': 0.41649334455740045, 'bagging_freq': 3, 'min_child_samples': 53, 'min_data_in_leaf': 71}. Best is trial 0 with value: 0.78.
  'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-8, 10.0),
  'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-8, 10.0),
  'feature_fraction': trial.suggest_uniform('feature_fraction', 0.4, 1.0),
  'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.4, 1.0),
[I 2024-01-29 13:53:27,126] Trial 2 finished with value: 0.7735 and parameters: {

- lgb_平均スコア

In [8]:
# 全フォールドの平均スコアを計算
lgb_average_score = sum(lgb_fold_scores) / n_splits
print(f"LightGBM Average accuracy across all folds: {lgb_average_score}")


lgb_average_conf = sum(lgb_conf_scores) / n_splits
print(f"LightGBM Average conf-matrix across all folds: {lgb_average_conf}")

LightGBM Average accuracy across all folds: 0.7706250000000001
LightGBM Average conf-matrix across all folds: [[770.75 245.75]
 [213.   770.5 ]]


- lgb_予測平均

In [9]:
# 予測の平均を計算
lgb_y_test_pred_avg = np.mean(lgb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
lgb_y_test_pred_int = (lgb_y_test_pred_avg > lgb_threshold).astype(int)

- output

In [10]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_LightGBM_cv{lgb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': lgb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

In [11]:
submit

Unnamed: 0,0,1
0,9,1
1,15,1
2,18,0
3,23,0
4,31,1
...,...,...
1995,9971,0
1996,9980,0
1997,9983,1
1998,9996,1


## XGBoost

In [12]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    dtest = xgb.DMatrix(test_drop_gameId)

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1)
        }

        bst = xgb.train(param, dtrain, num_boost_round=1000, evals=[(dval, "eval")], early_stopping_rounds=100, verbose_eval=False)
        preds = bst.predict(dval)
        pred_labels = np.rint(preds)
        accuracy = accuracy_score(y_val_fold, pred_labels)
        return accuracy

    xgb_study = optuna.create_study(direction='maximize')
    xgb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    xgb_best_params = xgb_study.best_params
    xgb_best_params['objective'] = 'binary:logistic'
    xgb_best_params['verbosity'] = 0

    xgb_final_bst = xgb.train(xgb_best_params, dtrain, num_boost_round=xgb_study.best_trial.number)

    # 各フォールドのモデル性能を評価
    xgb_y_val_pred = xgb_final_bst.predict(dval)
    xgb_y_val_pred_int = np.rint(xgb_y_val_pred)
    fold_accuracy = accuracy_score(y_val_fold, xgb_y_val_pred_int)
    xgb_fold_scores.append(fold_accuracy)

    # 混同行列
    xgb_conf_matrix = confusion_matrix(y_val_fold, xgb_y_val_pred_int)
    xgb_conf_scores.append(xgb_conf_matrix)

    # テストデータセットでの予測
    xgb_y_test_pred_fold = xgb_final_bst.predict(dtest)
    xgb_test_predictions.append(xgb_y_test_pred_fold)

[I 2024-01-29 13:55:39,710] A new study created in memory with name: no-name-0af07aa8-c57e-432c-8e30-ceade5fcc2ca
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),


[I 2024-01-29 13:55:45,279] Trial 0 finished with value: 0.7725 and parameters: {'alpha': 2.42106846133571e-05, 'lambda': 0.22401467487862906, 'max_depth': 9, 'eta': 0.0009695264879520253, 'subsample': 0.5969254685627335, 'colsample_bytree': 0.9637210467975192}. Best is trial 0 with value: 0.7725.
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-29 13:55:45,885] Trial 1 finished with value: 0.7785 and parameters: {'alpha': 0.004353632784853184, 'lambda': 0.005108251615192868, 'max_depth': 6, 'eta': 0.021413348490325758, 'subsample': 0.9710383168035224, 'colsample_bytree': 0.6832994390557645}. Best is trial 1 with value: 0.7785.
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-29 13:55:47,431] Trial 2 finished with value: 0.7755 

- xgb_平均スコア

In [13]:
# 全フォールドの平均スコアを計算
xgb_average_score = sum(xgb_fold_scores) / n_splits
print(f"XGB Average accuracy across all folds: {xgb_average_score}")

xgb_average_conf = sum(xgb_conf_scores) / n_splits
print(f"XGB Average conf-matrix across all folds: {xgb_average_conf}")

XGB Average accuracy across all folds: 0.77
XGB Average conf-matrix across all folds: [[762.5 254. ]
 [206.  777.5]]


- xgb_予測平均

In [14]:
# 予測の平均を計算
xgb_y_test_pred_avg = np.mean(xgb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
xgb_y_test_pred_int = (xgb_y_test_pred_avg > xgb_threshold).astype(int)

- output

In [15]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_XGBoost_cv{xgb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': xgb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

## CatBoost

In [16]:
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'iterations': 1000,
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'random_strength': trial.suggest_int('random_strength', 1, 20),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 100),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
            'loss_function': 'Logloss'
        }

        model = cb.CatBoostClassifier(**param, verbose=False)
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=100)
        preds = model.predict(X_val_fold)
        accuracy = accuracy_score(y_val_fold, preds)
        return accuracy

    cb_study = optuna.create_study(direction='maximize')
    cb_study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    cb_best_params = cb_study.best_params
    cb_best_params['loss_function'] = 'Logloss'
    model_fold = cb.CatBoostClassifier(**cb_best_params, verbose=False)
    model_fold.fit(X_train_fold, y_train_fold)

    # 各フォールドのモデル性能を評価
    cb_y_val_pred = model_fold.predict(X_val_fold)
    cb_fold_accuracy = accuracy_score(y_val_fold, cb_y_val_pred)
    cb_fold_scores.append(cb_fold_accuracy)

    # 混同行列
    cb_conf_matrix = confusion_matrix(y_val_fold, cb_y_val_pred)
    cb_conf_scores.append(cb_conf_matrix)

    # テストデータセットでの予測
    cb_y_test_pred_fold = model_fold.predict(test_drop_gameId)
    cb_test_predictions.append(cb_y_test_pred_fold)



[I 2024-01-29 14:04:11,935] A new study created in memory with name: no-name-1fa5c85d-d084-4387-8658-6f9418d34226


[I 2024-01-29 14:04:13,093] Trial 0 finished with value: 0.707 and parameters: {'depth': 7, 'learning_rate': 0.29148363477623773, 'random_strength': 14, 'bagging_temperature': 0.4348089347935692, 'l2_leaf_reg': 33.092886987383935, 'scale_pos_weight': 5.920971661138992}. Best is trial 0 with value: 0.707.
[I 2024-01-29 14:04:17,094] Trial 1 finished with value: 0.7125 and parameters: {'depth': 10, 'learning_rate': 0.19855954180469565, 'random_strength': 2, 'bagging_temperature': 0.3166317883746238, 'l2_leaf_reg': 91.2104658344991, 'scale_pos_weight': 4.479957075156587}. Best is trial 1 with value: 0.7125.
[I 2024-01-29 14:04:21,353] Trial 2 finished with value: 0.708 and parameters: {'depth': 10, 'learning_rate': 0.19952416607618192, 'random_strength': 19, 'bagging_temperature': 0.7332348831895464, 'l2_leaf_reg': 10.410608790077328, 'scale_pos_weight': 5.7636845575714934}. Best is trial 1 with value: 0.7125.
[I 2024-01-29 14:04:22,030] Trial 3 finished with value: 0.7355 and parameters:

- Catboost平均スコア

In [17]:
# 全フォールドの平均スコアを計算
cb_average_score = sum(cb_fold_scores) / n_splits
print(f"CatBoost Average accuracy across all folds: {cb_average_score}")

cb_average_conf = sum(cb_conf_scores) / n_splits
print(f"CatBoost Average conf-matrix across all folds: {cb_average_conf}")

CatBoost Average accuracy across all folds: 0.777875
CatBoost Average conf-matrix across all folds: [[799.5  217.  ]
 [227.25 756.25]]


- CB_予測平均

In [18]:
# 予測の平均を計算
cb_y_test_pred_avg = np.mean(cb_test_predictions, axis=0)

# 最終的な予測結果を整数に変換
cb_y_test_pred_int = (cb_y_test_pred_avg > cb_threshold).astype(int)

- output

In [19]:
# 現在の日時を取得してフォーマットする
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
output = f'../data/output/{current_time}_CatBoost_cv{cb_average_score:.4f}.csv'

submit = pd.DataFrame({
    '0': test['gameId'],
    '1': cb_y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

In [20]:
submit

Unnamed: 0,0,1
0,9,1
1,15,1
2,18,1
3,23,0
4,31,1
...,...,...
1995,9971,0
1996,9980,0
1997,9983,1
1998,9996,1


# 合成

- 予測リストに追加

In [21]:
# 予測結果をNumPy配列に変換
cb_predictions = np.array(cb_y_test_pred_int)  # CatBoostの予測結果
lgb_predictions = np.array(lgb_y_test_pred_int)  # LightGBMの予測結果
xgb_predictions = np.array(xgb_y_test_pred_int)  # XGBoostの予測結果

# 3つのモデルの予測結果を組み合わせる
combined_predictions = np.vstack((cb_predictions, lgb_predictions, xgb_predictions))

# 多数決で最終的な予測を決定（行方向に合計し、1.5より大きい場合は1とする）
final_predictions = np.mean(combined_predictions, axis=0) > 0.5
final_predictions = final_predictions.astype(int)

# 最終的な予測をCSVファイルに保存
merge_submit = pd.DataFrame({
    '0': test['gameId'],  # 適切なID列を使用
    '1': final_predictions
})
merge_submit.to_csv(f'../data/output/{current_time}_final_predictions.csv', index=False)

In [22]:
merge_submit

Unnamed: 0,0,1
0,9,1
1,15,1
2,18,0
3,23,0
4,31,1
...,...,...
1995,9971,0
1996,9980,0
1997,9983,1
1998,9996,1


# 

# Logistic classifier