# Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb
import catboost as cb


from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train

Unnamed: 0,gameId,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience,blueWins
0,0,0,5,8,6,0,0,14536,17256,0
1,1,1,10,1,5,0,0,14536,17863,0
2,2,0,3,10,2,0,0,17409,17256,0
3,3,1,7,10,8,0,0,19558,18201,0
4,4,0,4,9,4,0,0,17409,17256,0
...,...,...,...,...,...,...,...,...,...,...
7995,9993,1,9,6,14,0,0,18513,18201,1
7996,9994,0,3,10,4,0,0,17381,19797,0
7997,9995,1,9,3,9,2,1,18274,18491,1
7998,9997,0,5,10,6,1,1,18274,18491,1


In [4]:
test

Unnamed: 0,gameId,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience
0,9,0,7,6,6,0,0,16961,18201
1,15,0,6,6,6,2,1,18513,18021
2,18,1,6,4,3,0,0,13475,17256
3,23,0,5,4,7,0,0,17409,17256
4,31,0,10,8,9,0,0,18117,18472
...,...,...,...,...,...,...,...,...,...
1995,9971,0,6,3,7,0,0,17409,17256
1996,9980,0,4,4,4,0,0,17409,17256
1997,9983,1,6,3,3,1,1,18513,18201
1998,9996,1,10,9,9,1,1,18513,18201


# Feature

- 各特長量の説明

| ヘッダ名称            | データ型 | 説明                                                       |
|---------------------|-------|----------------------------------------------------------|
| gameId             | int   | ゲームID                                                    |
| blueFirstBlood      | int   | ゲームの最初のキル。青チームが最初のキルを行った場合は1、それ以外の場合は0 |
| blueKills           | int   | 青チームによって殺された敵の数                                     |
| blueDeaths          | int   | 青チームの死亡者数                                            |
| blueAssists         | int   | 青チームのキルアシストの数                                      |
| blueEliteMonsters   | int   | 青チームによって殺されたエリートモンスターの数（ドラゴンとヘラルド）       |
| blueDragons         | int   | 青チームによって殺されたドラゴンの数                               |
| blueTotalGold       | int   | 青チームの得たゴールド合計                                       |
| blueTotalExperience | int   | 青チームの得た経験値合計                                       |
| blueWins            | int   | 目的変数（青チームが勝った場合は1、それ以外の場合は0。）                |


gameIdとgoldは相関無さそう

# Feature Engineering

- Add new feature

In [5]:
def add_features(df):
    df['assistRate'] = df['blueAssists'] / (df['blueKills'] + 1)
    df['blueHerald'] = (df['blueEliteMonsters'] - df['blueDragons']).clip(lower=0)
    df['eliteMonsterDragonRatio'] = df['blueEliteMonsters'] / (df['blueDragons'] + 1)
    df['logTotalGold'] = np.log1p(df['blueTotalGold'])
    df['logTotalExperience'] = np.log1p(df['blueTotalExperience'])
    df['firstBloodKillRatio'] = df['blueFirstBlood'] * df['blueKills']
    df['totalCombatPoints'] = df['blueKills'] + df['blueDeaths'] + df['blueAssists']

    total_combat_actions = df['blueKills'] + df['blueDeaths'] + df['blueAssists']
    df['killRatio'] = df['blueKills'] / total_combat_actions
    df['deathRatio'] = df['blueDeaths'] / total_combat_actions
    df['assistRatio'] = df['blueAssists'] / total_combat_actions

    df['teamStrength'] = df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']
    df['dragonKillImpact'] = df['blueDragons'] / (df['blueKills'] + 1)
    df['eliteMonsterUtilization'] = df['blueEliteMonsters'] / (df['blueKills'] + df['blueAssists'] + 1)
    df['goldExperienceRatio'] = df['blueTotalGold'] / (df['blueTotalExperience'] + 1)
    df['teamEfficiency'] = (df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']) / (df['blueDeaths'] + 1)
    df['killToMonsterRatio'] = df['blueKills'] / (df['blueEliteMonsters'] + 1)
    df['avgGoldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)
    df['expToKillRatio'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killsMinusDeaths'] = df['blueKills'] - df['blueDeaths']
    df['killDeathRatio'] = df['blueKills'] / (df['blueKills'] + df['blueDeaths'])
    df['avgExperiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killDeathDiff'] = (df['blueKills'] - df['blueDeaths']).clip(lower=0)
    df['eliteMonsterHeraldRatio'] = df['blueEliteMonsters'] / (df['blueHerald'] + 1)
    df['goldXexperience'] = np.log1p(df['blueTotalGold'] * df['blueTotalExperience'])
    df['goldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['experiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['eliteMonstersPerGold'] = df['blueEliteMonsters'] / df['blueTotalGold']
    df['eliteMonstersPerExperience'] = df['blueEliteMonsters'] / df['blueTotalExperience']

    
    return df

- Apply data

In [6]:
train = add_features(train)
test = add_features(test)

# model

- CatBoost

In [7]:


# データの準備
X = train.drop(['blueWins', 'gameId'], axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

test_drop_gameId = test.drop('gameId', axis=1)


n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True)

# 各フォールドでのスコアを保存するリスト
fold_scores = []

# testを保存するリスト
test_predictions = []

for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'iterations': 1000,
            'depth': trial.suggest_int('depth', 4, 10),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'random_strength': trial.suggest_int('random_strength', 1, 20),
            'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
            'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-3, 100),
            'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.1, 10.0),
            'loss_function': 'Logloss'
        }

        model = cb.CatBoostClassifier(**param, verbose=False)
        model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), early_stopping_rounds=50)
        preds = model.predict(X_val_fold)
        accuracy = accuracy_score(y_val_fold, preds)
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    best_params = study.best_params
    best_params['loss_function'] = 'Logloss'
    model_fold = cb.CatBoostClassifier(**best_params, verbose=False)
    model_fold.fit(X_train_fold, y_train_fold)

    # 各フォールドのモデル性能を評価
    y_val_pred = model_fold.predict(X_val_fold)
    fold_accuracy = accuracy_score(y_val_fold, y_val_pred)
    fold_scores.append(fold_accuracy)

    # テストデータセットでの予測
    y_test_pred_fold = model_fold.predict(test_drop_gameId)
    test_predictions.append(y_test_pred_fold)

# 全フォールドの平均スコアを計算
average_score = sum(fold_scores) / n_splits
print(f"Average accuracy across all folds: {average_score}")


[I 2024-01-26 13:45:06,194] A new study created in memory with name: no-name-03ba8500-8dc0-45f3-9060-abaaa2550fc8


[I 2024-01-26 13:45:06,832] Trial 0 finished with value: 0.679375 and parameters: {'depth': 6, 'learning_rate': 0.2568664087633512, 'random_strength': 12, 'bagging_temperature': 0.4099488159839112, 'l2_leaf_reg': 0.08237487033800113, 'scale_pos_weight': 7.679838581717268}. Best is trial 0 with value: 0.679375.
[I 2024-01-26 13:45:08,954] Trial 1 finished with value: 0.69375 and parameters: {'depth': 5, 'learning_rate': 0.07428770523636587, 'random_strength': 4, 'bagging_temperature': 0.6683468229259301, 'l2_leaf_reg': 79.7687445742275, 'scale_pos_weight': 7.206332760936996}. Best is trial 1 with value: 0.69375.
[I 2024-01-26 13:45:11,062] Trial 2 finished with value: 0.769375 and parameters: {'depth': 8, 'learning_rate': 0.10111671326445153, 'random_strength': 7, 'bagging_temperature': 0.2244449877243473, 'l2_leaf_reg': 64.27916871568031, 'scale_pos_weight': 1.9224050338207286}. Best is trial 2 with value: 0.769375.
[I 2024-01-26 13:45:11,793] Trial 3 finished with value: 0.755625 and 

Average accuracy across all folds: 0.7789999999999999


- KFoldの結果確認

In [8]:
# 全フォールドの平均スコアを計算
average_score = sum(fold_scores) / n_splits
print(f"Average accuracy across all folds: {average_score}")

Average accuracy across all folds: 0.7789999999999999


- testの平均を計算

In [9]:
# 予測の平均を計算
y_test_pred_avg = np.mean(test_predictions, axis=0)

# 最終的な予測結果を整数に変換
y_test_pred_int = (y_test_pred_avg > 0.5).astype(int)

- testにy_test_pred結合

In [10]:
output = f'../data/output/KFold_val{average_score:.4f}.csv'

In [11]:
submit = pd.DataFrame({
    '0': test['gameId'],
    '1': y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

- 出力csvチェック

In [12]:
check = pd.read_csv(output)

In [13]:
check

Unnamed: 0,9,1
0,15,1
1,18,1
2,23,0
3,31,1
4,32,1
...,...,...
1994,9971,0
1995,9980,0
1996,9983,1
1997,9996,1
