# Library

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler


# Data Loading

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [3]:
train

Unnamed: 0,gameId,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience,blueWins
0,0,0,5,8,6,0,0,14536,17256,0
1,1,1,10,1,5,0,0,14536,17863,0
2,2,0,3,10,2,0,0,17409,17256,0
3,3,1,7,10,8,0,0,19558,18201,0
4,4,0,4,9,4,0,0,17409,17256,0
...,...,...,...,...,...,...,...,...,...,...
7995,9993,1,9,6,14,0,0,18513,18201,1
7996,9994,0,3,10,4,0,0,17381,19797,0
7997,9995,1,9,3,9,2,1,18274,18491,1
7998,9997,0,5,10,6,1,1,18274,18491,1


In [4]:
test

Unnamed: 0,gameId,blueFirstBlood,blueKills,blueDeaths,blueAssists,blueEliteMonsters,blueDragons,blueTotalGold,blueTotalExperience
0,9,0,7,6,6,0,0,16961,18201
1,15,0,6,6,6,2,1,18513,18021
2,18,1,6,4,3,0,0,13475,17256
3,23,0,5,4,7,0,0,17409,17256
4,31,0,10,8,9,0,0,18117,18472
...,...,...,...,...,...,...,...,...,...
1995,9971,0,6,3,7,0,0,17409,17256
1996,9980,0,4,4,4,0,0,17409,17256
1997,9983,1,6,3,3,1,1,18513,18201
1998,9996,1,10,9,9,1,1,18513,18201


# Feature

- 各特長量の説明

| ヘッダ名称            | データ型 | 説明                                                       |
|---------------------|-------|----------------------------------------------------------|
| gameId             | int   | ゲームID                                                    |
| blueFirstBlood      | int   | ゲームの最初のキル。青チームが最初のキルを行った場合は1、それ以外の場合は0 |
| blueKills           | int   | 青チームによって殺された敵の数                                     |
| blueDeaths          | int   | 青チームの死亡者数                                            |
| blueAssists         | int   | 青チームのキルアシストの数                                      |
| blueEliteMonsters   | int   | 青チームによって殺されたエリートモンスターの数（ドラゴンとヘラルド）       |
| blueDragons         | int   | 青チームによって殺されたドラゴンの数                               |
| blueTotalGold       | int   | 青チームの得たゴールド合計                                       |
| blueTotalExperience | int   | 青チームの得た経験値合計                                       |
| blueWins            | int   | 目的変数（青チームが勝った場合は1、それ以外の場合は0。）                |


# Feature Engineering

- Add new feature

In [5]:
def create_features(df):
    """
    与えられたデータフレームに対して特徴量エンジニアリングを行う関数。

    Args:
    df (DataFrame): 特徴量エンジニアリングを行うデータフレーム

    Returns:
    DataFrame: 新しい特徴量が追加されたデータフレーム
    """
    # キルとデスの比率
    df['killDeathRatio'] = df['blueKills'] / (df['blueDeaths'] + 1)  # ゼロ除算を防ぐために+1

    # アシストの割合
    df['assistRate'] = df['blueAssists'] / (df['blueKills'] + 1)

    # ヘラルドの討伐数
    df['blueHerald'] = df['blueEliteMonsters'] - df['blueDragons']
    # blueHerald列の負の値を0で置き換え
    df['blueHerald'] = df['blueHerald'].clip(lower=0)

    # エリートモンスターとドラゴンの比率
    df['eliteMonsterDragonRatio'] = df['blueEliteMonsters'] / (df['blueDragons'] + 1)
    # エリートモンスターとヘラルドの比率
    df['eliteMonsterHeraldRatio'] = df['blueEliteMonsters'] / (df['blueHerald'] + 1)

    # ゴールドと経験値の対数変換
    df['logTotalGold'] = np.log1p(df['blueTotalGold'])
    df['logTotalExperience'] = np.log1p(df['blueTotalExperience'])

    # ゴールドと経験値の相互作用
    df['goldXexperience'] = df['blueTotalGold'] * df['blueTotalExperience']
    
    return df

- Apply data

In [6]:
train = create_features(train)
test = create_features(test)

In [7]:
train['eliteMonsterHeraldRatio'].isna().sum()

0

# model

- LightGBM

In [8]:
# データの準備
X = train.drop('blueWins', axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# X_trainの欠損値を確認
print("X_trainの欠損値数:")
print(X_train.isna().sum())

# X_valの欠損値を確認
print("\nX_valの欠損値数:")
print(X_val.isna().sum())

# testデータの欠損値を確認
print("\ntestデータの欠損値数:")
print(test.isna().sum())

X_trainの欠損値数:
gameId                     0
blueFirstBlood             0
blueKills                  0
blueDeaths                 0
blueAssists                0
blueEliteMonsters          0
blueDragons                0
blueTotalGold              0
blueTotalExperience        0
killDeathRatio             0
assistRate                 0
blueHerald                 0
eliteMonsterDragonRatio    0
eliteMonsterHeraldRatio    0
logTotalGold               0
logTotalExperience         0
goldXexperience            0
dtype: int64

X_valの欠損値数:
gameId                     0
blueFirstBlood             0
blueKills                  0
blueDeaths                 0
blueAssists                0
blueEliteMonsters          0
blueDragons                0
blueTotalGold              0
blueTotalExperience        0
killDeathRatio             0
assistRate                 0
blueHerald                 0
eliteMonsterDragonRatio    0
eliteMonsterHeraldRatio    0
logTotalGold               0
logTotalExperience         0
gol

In [10]:
# X_trainの欠損値を含む行を表示
print("X_trainの欠損値を含む行:")
print(X_train[X_train.isna().any(axis=1)])

# X_valの欠損値を含む行を表示
print("\nX_valの欠損値を含む行:")
print(X_val[X_val.isna().any(axis=1)])

# testデータの欠損値を含む行を表示
print("\ntestデータの欠損値を含む行:")
print(test[test.isna().any(axis=1)])


X_trainの欠損値を含む行:
Empty DataFrame
Columns: [gameId, blueFirstBlood, blueKills, blueDeaths, blueAssists, blueEliteMonsters, blueDragons, blueTotalGold, blueTotalExperience, killDeathRatio, assistRate, blueHerald, eliteMonsterDragonRatio, eliteMonsterHeraldRatio, logTotalGold, logTotalExperience, goldXexperience]
Index: []

X_valの欠損値を含む行:
Empty DataFrame
Columns: [gameId, blueFirstBlood, blueKills, blueDeaths, blueAssists, blueEliteMonsters, blueDragons, blueTotalGold, blueTotalExperience, killDeathRatio, assistRate, blueHerald, eliteMonsterDragonRatio, eliteMonsterHeraldRatio, logTotalGold, logTotalExperience, goldXexperience]
Index: []

testデータの欠損値を含む行:
Empty DataFrame
Columns: [gameId, blueFirstBlood, blueKills, blueDeaths, blueAssists, blueEliteMonsters, blueDragons, blueTotalGold, blueTotalExperience, killDeathRatio, assistRate, blueHerald, eliteMonsterDragonRatio, eliteMonsterHeraldRatio, logTotalGold, logTotalExperience, goldXexperience]
Index: []


- ensemble

In [11]:
# データの準備
X = train.drop('blueWins', axis=1)
y = train['blueWins']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# スケーリング
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test)

# ベースモデルの設定
base_estimators = [
    ('lgb', lgb.LGBMClassifier(n_estimators=100, learning_rate=0.05)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=0)),
    ('svc', SVC(probability=True))
]

# スタッキング分類器の作成
stacking_clf = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression(),
    cv=5
)

# スタッキング分類器のハイパーパラメータチューニング
params = {
    'final_estimator__C': [0.1, 1.0],
    'svc__C': [0.1, 1.0]
}
grid = GridSearchCV(estimator=stacking_clf, param_grid=params, cv=5, scoring='accuracy')
grid.fit(X_train_scaled, y_train)

print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

# 最適なパラメータでモデルを再訓練
stacking_clf_best = grid.best_estimator_
stacking_clf_best.fit(X_train_scaled, y_train)

# valデータに対する予測
y_val_pred = stacking_clf_best.predict(X_val_scaled)
y_val_pred_int = [int(pred) for pred in y_val_pred]

[LightGBM] [Info] Number of positive: 2515, number of negative: 2605
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 705
[LightGBM] [Info] Number of data points in the train set: 5120, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491211 -> initscore=-0.035160
[LightGBM] [Info] Start training from score -0.035160


[LightGBM] [Info] Number of positive: 2012, number of negative: 2084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 694
[LightGBM] [Info] Number of data points in the train set: 4096, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491211 -> initscore=-0.035160
[LightGBM] [Info] Start training from score -0.035160
[LightGBM] [Info] Number of positive: 2012, number of negative: 2084
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 690
[LightGBM] [Info] Number of data points in the train set: 4096, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.491211 -> initscore=-0.035160
[LightGBM] [Info] Start training from score -0.035160
[LightGBM] [Info] Nu

- 結果確認

In [16]:
# valデータのaccuracy確認
accuracy = accuracy_score(y_val, y_val_pred_int)
print("Validation Accuracy:", accuracy)

# testデータに対する予測
y_test_pred = stacking_clf_best.predict(test_scaled)
y_test_pred_int = [int(pred) for pred in y_test_pred]

# 変換された予測結果の表示
print(y_test_pred_int)

Validation Accuracy: 0.80875
[1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1

- testにy_test_pred結合

In [17]:
output = '../data/output/submit_ensemble_lgb-rf-lr_gridsearch.csv'

In [18]:
submit = pd.DataFrame({
    '0': test['gameId'],
    '1': y_test_pred_int
})

# カラムヘッダー消去して出力
# ヘッダーなしでCSVファイルとして保存
submit.to_csv(output, header=False, index=False)

- 出力csvチェック

In [19]:
check = pd.read_csv(output)

In [20]:
check

Unnamed: 0,9,1
0,15,1
1,18,0
2,23,0
3,31,0
4,32,1
...,...,...
1994,9971,0
1995,9980,0
1996,9983,1
1997,9996,1
