# Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import optuna
import xgboost as xgb


from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import f1_score


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

  from .autonotebook import tqdm as notebook_tqdm


# Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Feature

- 各特長量の説明

| ヘッダ名称            | データ型 | 説明                                                       |
|---------------------|-------|----------------------------------------------------------|
| gameId             | int   | ゲームID                                                    |
| blueFirstBlood      | int   | ゲームの最初のキル。青チームが最初のキルを行った場合は1、それ以外の場合は0 |
| blueKills           | int   | 青チームによって殺された敵の数                                     |
| blueDeaths          | int   | 青チームの死亡者数                                            |
| blueAssists         | int   | 青チームのキルアシストの数                                      |
| blueEliteMonsters   | int   | 青チームによって殺されたエリートモンスターの数（ドラゴンとヘラルド）       |
| blueDragons         | int   | 青チームによって殺されたドラゴンの数                               |
| blueTotalGold       | int   | 青チームの得たゴールド合計                                       |
| blueTotalExperience | int   | 青チームの得た経験値合計                                       |
| blueWins            | int   | 目的変数（青チームが勝った場合は1、それ以外の場合は0。）                |


gameIdとgoldは相関無さそう

# Feature Engineering

- Add new feature

In [3]:
def add_features(df):
    df['assistRate'] = df['blueAssists'] / (df['blueKills'] + 1)
    df['blueHerald'] = (df['blueEliteMonsters'] - df['blueDragons']).clip(lower=0)
    df['eliteMonsterDragonRatio'] = df['blueEliteMonsters'] / (df['blueDragons'] + 1)
    df['logTotalGold'] = np.log1p(df['blueTotalGold'])
    df['logTotalExperience'] = np.log1p(df['blueTotalExperience'])
    df['firstBloodKillRatio'] = df['blueFirstBlood'] * df['blueKills']
    df['totalCombatPoints'] = df['blueKills'] + df['blueDeaths'] + df['blueAssists']

    total_combat_actions = df['blueKills'] + df['blueDeaths'] + df['blueAssists']
    df['killRatio'] = df['blueKills'] / total_combat_actions
    df['deathRatio'] = df['blueDeaths'] / total_combat_actions
    df['assistRatio'] = df['blueAssists'] / total_combat_actions

    df['teamStrength'] = df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']
    df['dragonKillImpact'] = df['blueDragons'] / (df['blueKills'] + 1)
    df['eliteMonsterUtilization'] = df['blueEliteMonsters'] / (df['blueKills'] + df['blueAssists'] + 1)
    df['goldExperienceRatio'] = df['blueTotalGold'] / (df['blueTotalExperience'] + 1)
    df['teamEfficiency'] = (df['blueKills'] + df['blueAssists'] + df['blueEliteMonsters']) / (df['blueDeaths'] + 1)
    df['killToMonsterRatio'] = df['blueKills'] / (df['blueEliteMonsters'] + 1)
    df['avgGoldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)
    df['expToKillRatio'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killsMinusDeaths'] = df['blueKills'] - df['blueDeaths']
    df['killDeathRatio'] = df['blueKills'] / (df['blueKills'] + df['blueDeaths'])
    df['avgExperiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)
    df['killDeathDiff'] = (df['blueKills'] - df['blueDeaths']).clip(lower=0)
    df['eliteMonsterHeraldRatio'] = df['blueEliteMonsters'] / (df['blueHerald'] + 1)
    df['goldXexperience'] = np.log1p(df['blueTotalGold'] * df['blueTotalExperience'])
    df['goldPerKill'] = df['blueTotalGold'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['experiencePerKill'] = df['blueTotalExperience'] / (df['blueKills'] + 1)  # ゼロ除算を防ぐために+1
    df['eliteMonstersPerGold'] = df['blueEliteMonsters'] / df['blueTotalGold']
    df['eliteMonstersPerExperience'] = df['blueEliteMonsters'] / df['blueTotalExperience']

    
    return df

- Apply data

In [4]:
train = add_features(train)
test = add_features(test)

# model

- XGBoost

In [5]:


# データの準備
X = train.drop(['blueWins', 'gameId'], axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

test_drop_gameId = test.drop('gameId', axis=1)


# KFoldの設定
n_splits = 3
kf = KFold(n_splits=n_splits, shuffle=True)

# 各フォールドでのスコアを保存するリスト
fold_scores = []

# testを保存するリスト
test_predictions = []

for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold, y_val_fold = y[train_index], y[val_index]

    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
    dtest = xgb.DMatrix(test_drop_gameId)

    # Optunaでのハイパーパラメータチューニング
    def objective(trial):
        param = {
            'verbosity': 0,
            'objective': 'binary:logistic',
            'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
            'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
            'max_depth': trial.suggest_int('max_depth', 3, 9),
            'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1)
        }

        bst = xgb.train(param, dtrain, num_boost_round=1000, evals=[(dval, "eval")], early_stopping_rounds=50, verbose_eval=False)
        preds = bst.predict(dval)
        pred_labels = np.rint(preds)
        accuracy = accuracy_score(y_val_fold, pred_labels)
        return accuracy

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    # 各フォールドの最適パラメータでモデルを再学習
    best_params = study.best_params
    best_params['objective'] = 'binary:logistic'
    best_params['verbosity'] = 0

    final_bst = xgb.train(best_params, dtrain, num_boost_round=study.best_trial.number)

    # 各フォールドのモデル性能を評価
    y_val_pred = final_bst.predict(dval)
    y_val_pred_int = np.rint(y_val_pred)
    fold_accuracy = accuracy_score(y_val_fold, y_val_pred_int)
    fold_scores.append(fold_accuracy)

    # テストデータセットでの予測
    y_test_pred_fold = final_bst.predict(dtest)
    test_predictions.append(y_test_pred_fold)

# 全フォールドの平均スコアを計算
average_score = sum(fold_scores) / n_splits
print(f"Average accuracy across all folds: {average_score}")


[I 2024-01-27 13:56:03,859] A new study created in memory with name: no-name-5468a5a8-a2a9-46ff-a762-f4ab1068c18d


  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-27 13:56:05,848] Trial 0 finished with value: 0.4878140232470941 and parameters: {'alpha': 0.8702602880551715, 'lambda': 0.2881187026303871, 'max_depth': 6, 'eta': 1.5570622733291593e-07, 'subsample': 0.9229222716902389, 'colsample_bytree': 0.9170908936994595}. Best is trial 0 with value: 0.4878140232470941.
  'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0),
  'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
  'eta': trial.suggest_loguniform('eta', 1e-8, 1.0),
[I 2024-01-27 13:56:06,919] Trial 1 finished with value: 0.7409073865766779 and parameters: {'alpha': 2.5895743597260727e-05, 'lambda': 1.712647816359415e-07, 'max_depth': 4, 'eta': 0.00019659568591924578, 'subsample': 0.8090455135070684, 'colsample_bytree': 0.7400448704884199}. Best is trial 1 with value: 0.7409073865766779.
  'alpha': tr

Average accuracy across all folds: 0.7738735396260014


- KFoldの結果確認

In [6]:
# 全フォールドの平均スコアを計算
average_score = sum(fold_scores) / n_splits
print(f"Average accuracy across all folds: {average_score}")

Average accuracy across all folds: 0.7738735396260014


- testの平均を計算

In [7]:
# 予測の平均を計算
y_test_pred_avg = np.mean(test_predictions, axis=0)

# 最終的な予測結果を整数に変換
y_test_pred_int = (y_test_pred_avg > 0.5).astype(int)

- testにy_test_pred結合

In [8]:
output = f'../data/output/KFold_val{average_score:.4f}.csv'

In [9]:
submit = pd.DataFrame({
    '0': test['gameId'],
    '1': y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

- 出力csvチェック

In [10]:
check = pd.read_csv(output)

In [11]:
check

Unnamed: 0,9,1
0,15,1
1,18,0
2,23,0
3,31,1
4,32,1
...,...,...
1994,9971,0
1995,9980,0
1996,9983,1
1997,9996,1
