# 2024/02/14更新
- 書籍発売後にライブラリのバージョンアップが生じたため、書籍のコードを動かすためには下記が必要です。
    - (手順1) 下記URLをクリックしてノートブックにアクセス
        - https://www.kaggle.com/code/moromoromoro/kaggle-mlb-environment
    - (手順2) ノートブックの右上をクリックして、「Copy & edit notebook」を選択してノートブックを起動
    - (手順3) この環境上で書籍のコードを実行（これでmlbライブラリのインポートが可能となるはずです）
    - (手順4) 本notebookをコピーしたい場合は、起動したノートブックの「File」メニューの「Import Notebook」をクリックしてインポート
- このような操作が必要な理由は、mlbライブラリがpython=3.7でなければ動作しないためです。
    - 現行版ではpython=3.10となっており、「import mlb」を実行した時点でエラーとなります。
    - これを回避するために、python=3.7環境を用意しました。
    - また、サブミット時は「インターネット非接続(Internet off)」が必須のため注意してください（用意したノートブックはインターネット非接続に設定しています）

# Kaggleで磨く 機械学習の実践力
# 第8章 回帰問題のコンペ (MLB Player Digital Engagement Forecasting)

# 8.3 ベースライン作成
## 8.3.2 データ前処理

### ● train_updated.csvの読み込みと加工
#### スクリプト8-1: ライブラリのインポート

In [None]:
import numpy as np
import pandas as pd
import gc
import pickle
import os
import datetime as dt

# plot
import matplotlib.pyplot as plt

# LightGBM
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

import warnings
warnings.simplefilter("ignore")

# 表示桁数の指定
pd.options.display.float_format = '{:10.4f}'.format

#### スクリプト8-2: train_updated.csvファイルの読み込み

In [None]:
# Load the competition's training table from the Kaggle input directory,
# then show its shape and first rows.
train = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/train_updated.csv")
print(train.shape)
train.head()

#### スクリプト8-3: 処理速度を上げるためにデータを絞り込む

In [None]:
# Keep only rows whose integer date is 20200401 or later (speeds up the
# following steps) and renumber the index from zero.
train = train[train["date"] >= 20200401].reset_index(drop=True)
print(train.shape)

#### スクリプト8-4: train_updated.csv専用の変換関数の作成

In [None]:
def unpack_json(json_str):
    """Parse one JSON-encoded cell into a DataFrame.

    Args:
        json_str: JSON text taken from a column of the training table, or a
            missing value.

    Returns:
        ``np.nan`` when ``json_str`` is missing, otherwise the DataFrame
        produced by ``pd.read_json``.
    """
    # Local import keeps the cell self-contained.
    from io import StringIO

    if pd.isna(json_str):
        return np.nan
    # Newer pandas releases deprecate passing literal JSON text to read_json;
    # wrapping it in StringIO works on old and new versions alike.
    return pd.read_json(StringIO(json_str))

def extract_data(input_df, col="events", show=False):
    """Unpack a JSON column into one flat DataFrame.

    Every non-missing cell of ``input_df[col]`` is parsed with ``unpack_json``
    and the resulting frames are stacked row-wise with a fresh 0..n-1 index.
    Missing cells and cells that cannot be parsed are skipped.

    Args:
        input_df: frame holding the JSON column.
        col: name of the JSON column to unpack.
        show: when True, print progress, the number of unparsable cells (if
            any), the result shape, and display the first rows.

    Returns:
        The stacked DataFrame, or an empty DataFrame when nothing was parsed.

    Raises:
        KeyError: if ``col`` is not a column of ``input_df``.
    """
    frames = []
    n_skipped = 0
    n_rows = len(input_df)
    for i, json_str in enumerate(input_df[col]):
        if show: print("\r{}/{}".format(i+1, n_rows), end="")
        try:
            frame = unpack_json(json_str)
        except (ValueError, TypeError):
            # Malformed JSON or a non-text cell: best-effort skip, but counted.
            n_skipped += 1
            continue
        # unpack_json returns np.nan for missing cells; only keep real frames.
        if isinstance(frame, pd.DataFrame):
            frames.append(frame)

    # Concatenate once at the end instead of growing a frame row by row.
    if frames:
        output_df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    if show:
        print("")
        if n_skipped:
            print("skipped {} unparsable rows".format(n_skipped))
        print(output_df.shape)
        display(output_df.head())
    return output_df

#### スクリプト8-5: train_updated.csvから「nextDayPlayerEngagement」を取り出して表形式に変換

In [None]:
# Unpack the "nextDayPlayerEngagement" JSON column of train into one flat table;
# show=True prints progress and previews the result.
df_engagement = extract_data(train, col="nextDayPlayerEngagement", show=True)

#### スクリプト8-6: 結合キーであるdate_playerIdの作成

In [None]:
# Join key: engagementMetricsDate with the hyphens removed, an underscore, then playerId.
df_engagement["date_playerId"] = df_engagement["engagementMetricsDate"].str.replace("-", "") + "_" + df_engagement["playerId"].astype(str)
df_engagement.head()

#### スクリプト8-7: 日付から簡単な特徴量を作成

In [None]:
# Prediction date = the day before the engagement (target) date.
df_engagement["date"] = pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d") - dt.timedelta(days=1)

# Day-of-week and "YYYY-MM" features derived from the prediction date.
df_engagement["dayofweek"] = df_engagement["date"].dt.dayofweek
df_engagement["yearmonth"] = df_engagement["date"].astype(str).str[:7]
df_engagement.head()

### ● players.csvの読み込みと加工
#### スクリプト8-8: players.csvの読み込み

In [None]:
# Load the player master table; print its shape and the number of distinct playerId values.
df_players = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/players.csv")
print(df_players.shape)
print(df_players["playerId"].agg("nunique"))
df_players.head()

#### スクリプト8-9: 評価対象の人数確認

In [None]:
# Encode the evaluation flag as 1 where it equals True and 0 otherwise,
# then print how many players are flagged and their share of all players.
df_players["playerForTestSetAndFuturePreds"] = df_players["playerForTestSetAndFuturePreds"].eq(True).astype(int)
print(df_players["playerForTestSetAndFuturePreds"].sum())
print(df_players["playerForTestSetAndFuturePreds"].mean())

## 8.3.3 データセット作成
#### スクリプト8-10: テーブル結合 

In [None]:
# Attach player attributes to every engagement row (left join on playerId).
df_train = pd.merge(df_engagement, df_players, on=["playerId"], how="left")
print(df_train.shape)

#### スクリプト8-11: 学習用データセットの作成

In [None]:
# Features. Take an explicit copy: the next cell converts columns in place, and
# that should operate on an independent frame rather than a slice of df_train.
x_train = df_train[[
    "playerId", "dayofweek",
    "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
    "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds"]].copy()
# Targets, and the identifier columns used for fold splitting and reporting.
y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]
print(x_train.shape, y_train.shape, id_train.shape)
x_train.head()

#### スクリプト8-12: カテゴリ変数をcategory型に変換

In [None]:
# Cast the categorical features to pandas' category dtype in a single call.
x_train = x_train.astype({
    name: "category"
    for name in ["playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"]
})

## 8.3.4 バリデーション設計

#### スクリプト8-13: 学習データと検証データの期間の設定

In [None]:
# Three walk-forward folds: twelve consecutive training months starting at the
# first entry, followed by the single validation month given as the second entry.
list_cv_month = [
    [
        pd.period_range(start=month_first, periods=12, freq="M").strftime("%Y-%m").tolist(),
        [month_valid],
    ]
    for month_first, month_valid in [
        ("2020-05", "2021-05"),
        ("2020-06", "2021-06"),
        ("2020-07", "2021-07"),
    ]
]

#### スクリプト8-14: 学習データと検証データのindexリストの作成

In [None]:
# For every fold: [index of training rows, index of validation rows].
# Validation rows are restricted to players flagged for evaluation.
cv = [
    [
        id_train.index[id_train["yearmonth"].isin(months_fit)],
        id_train.index[id_train["yearmonth"].isin(months_eval) & (id_train["playerForTestSetAndFuturePreds"]==1)],
    ]
    for months_fit, months_eval in list_cv_month
]
# Index lists of fold 0
cv[0]

## 8.3.5 モデル学習
#### スクリプト8-15: 学習データと検証データに分離

In [None]:
# Work on target "target1" and fold 0.
target = "target1"
nfold = 0

# Row indices of the chosen fold
idx_tr, idx_va = cv[nfold]

# Features, target and identifiers for the training and validation parts
x_tr, x_va = x_train.loc[idx_tr, :], x_train.loc[idx_va, :]
y_tr, y_va = y_train.loc[idx_tr, target], y_train.loc[idx_va, target]
id_tr, id_va = id_train.loc[idx_tr, :], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

#### スクリプト8-16: モデル学習

In [None]:
# Hyperparameter settings (L1 objective, evaluated with mean absolute error)
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1', 
    'metric': 'mean_absolute_error',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 0.8,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 50,
    'n_estimators': 1000,
    "random_state": 123,
    "importance_type": "gain",
}

# Train the model; both the training and validation parts are passed as
# eval_set, with early stopping after 50 rounds and a log line every 100 rounds.
model = lgb.LGBMRegressor(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr,y_tr), (x_va,y_va)],
          early_stopping_rounds=50,
          verbose=100,
         )

# Save the model (the file is written with pickle despite its .h5 suffix)
with open("model_lgb_target1_fold0.h5", "wb") as f:
    pickle.dump(model, f, protocol=4)

#### スクリプト8-17: モデル評価

In [None]:
# Predict on the validation rows
y_va_pred = model.predict(x_va)

# Accumulator for the predictions of every target/fold
df_valid_pred = pd.DataFrame()

# Store identifiers, truth and prediction side by side, labelled with the
# current target and fold (taken from the variables, not hard-coded)
tmp_pred = pd.concat([
    id_va,
    pd.DataFrame({"target": target, "nfold": nfold, "true": y_va, "pred": y_va_pred}),
], axis=1)
df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

# Accumulator for the metric of every target/fold
metrics = []

# Mean absolute error on the validation rows
metric_va = mean_absolute_error(y_va, y_va_pred)
metrics.append([target, nfold, metric_va])
metrics

#### スクリプト8-18: 説明変数の重要度取得

In [None]:
# Feature importances of the model just trained, labelled with the current
# target and fold (taken from the variables, not hard-coded)
tmp_imp = pd.DataFrame({"col":x_tr.columns, "imp":model.feature_importances_, "target":target, "nfold":nfold})
# Show every feature, most important first
display(tmp_imp.sort_values("imp", ascending=False))

# Accumulator for the importances of every target/fold
df_imp = pd.DataFrame()
# Append this model's importances
df_imp = pd.concat([df_imp, tmp_imp], axis=0, ignore_index=True)

#### スクリプト8-19: モデルの評価（全target/foldのサマリ）

In [None]:
# Convert the list of [target, nfold, mae] records to a DataFrame
df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
display(df_metrics.head())

# Overall score: mean of the MAE values over all recorded target/fold pairs
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

# MAE per fold (rows) and target (columns), with marginal means
display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

#### スクリプト8-20: 検証データの推論値の形式変換（全target/foldのサマリ）

In [None]:
# Reshape to one row per identifier combination with one column per
# (value, target, fold); columns are then renamed "<target>_fold<nfold>_<true|pred>".
df_valid_pred_all = pd.pivot_table(df_valid_pred, index=["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"], columns=["target",  "nfold"], values=["true", "pred"], aggfunc=np.sum)
df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i) for i,j,k in df_valid_pred_all.columns]
df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)
df_valid_pred_all.head()

#### スクリプト8-21: 説明変数の重要度取得（全target/foldのサマリ）

In [None]:
# Mean and standard deviation of each feature's importance over all recorded models, largest mean first.
df_imp.groupby(["col"])["imp"].agg(["mean", "std"]).sort_values("mean", ascending=False)

#### スクリプト8-22: 学習用関数の作成

In [None]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2],
              mode_train="train",
             ):
    """Fit (or reload) one LightGBM regressor per fold and target and score it.

    Folds are built from the module-level ``list_cv_month``; validation rows
    are limited to players with ``playerForTestSetAndFuturePreds == 1``.
    Each model is pickled to ``model_lgb_<target>_fold<nfold>.h5`` in the
    working directory.

    Args:
        input_x: feature frame.
        input_y: frame with the columns target1..target4, indexed like
            ``input_x``.
        input_id: identifier frame indexed like ``input_x``; supplies
            ``yearmonth`` and ``playerForTestSetAndFuturePreds`` for the
            split and the identifier columns for the prediction table.
        params: keyword arguments passed to ``lgb.LGBMRegressor``.
        list_nfold: positions in ``list_cv_month`` to run. The default list
            is only read, never mutated.
        mode_train: ``"train"`` fits and saves the models; any other value
            loads previously saved models instead.

    Returns:
        Tuple ``(df_valid_pred_all, df_metrics, df_imp)``: validation truth
        and predictions in wide form (columns named
        ``<target>_fold<nfold>_<true|pred>``), the MAE per target/fold, and
        the feature importances per target/fold.
    """
    # Validation predictions, metrics and importances accumulated over all models
    df_valid_pred = pd.DataFrame()
    metrics = []
    df_imp = pd.DataFrame()

    # Row indices of the training and validation part of every fold
    cv = []
    for month_tr, month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) & (input_id["playerForTestSetAndFuturePreds"]==1)],
        ])

    # One model per fold and target
    for nfold in list_nfold:
        for target in ["target1", "target2", "target3", "target4"]:
            print("-"*20, target, ", fold:", nfold, "-"*20)
            # Split the frames that were passed in (not the notebook globals)
            idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
            x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, target], input_id.loc[idx_tr, :]
            x_va, y_va, id_va = input_x.loc[idx_va, :], input_y.loc[idx_va, target], input_id.loc[idx_va, :]
            print(x_tr.shape, y_tr.shape, id_tr.shape)
            print(x_va.shape, y_va.shape, id_va.shape)

            # File the model is saved to / loaded from
            filepath = "model_lgb_{}_fold{}.h5".format(target, nfold)

            if mode_train=="train":
                print("training start.")
                model = lgb.LGBMRegressor(**params)
                # NOTE(review): early_stopping_rounds/verbose are fit() keywords of
                # the LightGBM version pinned for this notebook; LightGBM 4 expects
                # callbacks instead.
                model.fit(x_tr,
                          y_tr,
                          eval_set=[(x_tr,y_tr), (x_va,y_va)],
                          early_stopping_rounds=50,
                          verbose=100,
                         )
                with open(filepath, "wb") as f:
                    pickle.dump(model, f, protocol=4)
            else:
                print("model load.")
                # Only load model files written by this notebook: unpickling
                # untrusted files can execute arbitrary code.
                with open(filepath, "rb") as f:
                    model = pickle.load(f)
                print("Done.")

            # Validation predictions, labelled with the actual fold number
            y_va_pred = model.predict(x_va)
            tmp_pred = pd.concat([
                id_va,
                pd.DataFrame({"target": target, "nfold": nfold, "true": y_va, "pred": y_va_pred}),
            ], axis=1)
            df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

            # Mean absolute error on the validation rows
            metric_va = mean_absolute_error(y_va, y_va_pred)
            metrics.append([target, nfold, metric_va])

            # Feature importances
            tmp_imp = pd.DataFrame({"col":x_tr.columns, "imp":model.feature_importances_, "target":target, "nfold":nfold})
            df_imp = pd.concat([df_imp, tmp_imp], axis=0, ignore_index=True)

    print("-"*10, "result", "-"*10)
    # Overall score: mean of the per-target/fold MAE values
    df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    # Wide table of validation truth/predictions
    df_valid_pred_all = pd.pivot_table(df_valid_pred, index=["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"], columns=["target",  "nfold"], values=["true", "pred"], aggfunc=np.sum)
    df_valid_pred_all.columns = ["{}_fold{}_{}".format(j,k,i) for i,j,k in df_valid_pred_all.columns]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics, df_imp

#### スクリプト8-23: モデル学習

In [None]:
# Same hyperparameters as the single-model run above
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1', 
    'metric': 'mean_absolute_error',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 0.8,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 50,
    'n_estimators': 1000,
    "random_state": 123,
    "importance_type": "gain",
}

# Train one model per target for folds 0-2 and collect predictions, metrics and importances
df_valid_pred, df_metrics, df_imp = train_lgb(x_train,
                                              y_train,
                                              id_train,
                                              params,
                                              list_nfold=[0,1,2],
                                              mode_train="train",
                                             )

#### スクリプト8-24: 評価値（MCMAE）の確認

In [None]:
# Overall score, followed by the MAE per fold (rows) and target (columns) with marginal means
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))
display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

#### スクリプト8-25: 説明変数の重要度の確認

In [None]:
# Mean and standard deviation of each feature's importance over all models, largest mean first.
df_imp.groupby(["col"])["imp"].agg(["mean", "std"]).sort_values("mean", ascending=False)

## 8.3.6 モデル推論
### **パート１：推論用データセットの作成**

#### スクリプト8-26: 推論時に受け取るデータのフォーマット確認①（サブミット時はコメントアウト）

In [None]:
# import mlb

# env = mlb.make_env()
# iter_test = env.iter_test()

# for (test_df, prediction_df) in iter_test:
#     # forループで受け取るデータの確認
#     display(test_df.head())
#     display(prediction_df.head())
#     break

#### スクリプト8-27: 推論時に受け取るデータのフォーマット確認②（サブミット時はコメントアウト）

In [None]:
# # forループで受け取るtest_dfのサンプルデータ
# test_df = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_test.csv")
# display(test_df.head())

# # forループで受け取るprediction_dfのサンプルデータ
# prediction_df = pd.read_csv("../input/mlb-player-digital-engagement-forecasting/example_sample_submission.csv")
# display(prediction_df.head())

#### スクリプト8-28: 推論時に受け取るデータの疑似生成（2021/4/26分）

In [None]:
# Simulated test_df (the data that would be received on 2021-04-26)
test_df = train.loc[train["date"]==20210426, :]
display(test_df.head())

# Simulated prediction_df for the same day: date as a yyyymmdd integer,
# the date_playerId key, and the four target columns initialised to 0
prediction_df = df_engagement.loc[df_engagement["date"]=="2021-04-26", ["date","date_playerId"]].reset_index(drop=True)
prediction_df["date"] = prediction_df["date"].apply(lambda x: int(str(x).replace("-","")[:8]))
for col in ["target1","target2","target3","target4"]:
    prediction_df[col] = 0
display(prediction_df.head())

#### スクリプト8-29: 推論用データセット作成の関数

In [None]:
def makedataset_for_predict(input_test, input_prediction):
    """Build the baseline model inputs from the frames received at inference time.

    Args:
        input_test: the day's test frame. It is part of the interface but is
            not read by the baseline features.
        input_prediction: frame with a ``date`` column (parsed with format
            ``%Y%m%d``) and a ``date_playerId`` key whose first eight
            characters are the engagement date and whose characters from
            position nine onward are the player id.

    Returns:
        Tuple ``(x_test, id_test)``: the feature frame (categorical columns
        cast to ``category``) and the identifier frame. Player attributes
        come from the module-level ``df_players``.
    """
    prediction = input_prediction.copy()

    # Parse the prediction date
    prediction["date"] = pd.to_datetime(prediction["date"], format="%Y%m%d")
    # Split the key into the engagement date and the player id
    prediction["engagementMetricsDate"] = pd.to_datetime(prediction["date_playerId"].str[:8], format="%Y%m%d")
    prediction["playerId"] = prediction["date_playerId"].str[9:].astype(int)

    # Day-of-week and "YYYY-MM" features from the prediction date
    prediction["dayofweek"] = prediction["date"].dt.dayofweek
    prediction["yearmonth"] = prediction["date"].astype(str).str[:7]

    # Attach player attributes
    df_test = pd.merge(prediction, df_players, on=["playerId"], how="left")

    # Copy the feature columns so the dtype conversion below writes to an
    # independent frame rather than to a slice of df_test.
    x_test = df_test[[
        "playerId", "dayofweek",
        "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
        "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds"]].copy()
    id_test = df_test[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]

    # Cast the categorical features to the category dtype
    for col in ["playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"]:
        x_test[col] = x_test[col].astype("category")

    return x_test, id_test

#### スクリプト8-30: 推論用データセット作成の実行

In [None]:
# Build the inference inputs from the simulated frames and preview them.
x_test, id_test = makedataset_for_predict(test_df, prediction_df)
display(x_test.head())
display(id_test.head())

### **パート２：モデル推論**
#### スクリプト8-31: モデルの読み込み

In [None]:
# Load the pickled target1 / fold 0 model saved during training.
with open("model_lgb_target1_fold0.h5", "rb") as f:
    model = pickle.load(f)

#### スクリプト8-32: モデルを用いた推論

In [None]:
# Predict with the loaded model and store the result next to the identifier columns.
pred = model.predict(x_test)

df_test_pred = id_test.copy()
df_test_pred["target1_fold0"] = pred

#### スクリプト8-33: 推論値の計算

In [None]:
# Prediction for target1: row-wise mean over every column whose name contains "target1"
df_test_pred["target1"] = df_test_pred[df_test_pred.columns[df_test_pred.columns.str.contains("target1")]].mean(axis=1)
# target2, 3 and 4 are computed the same way (omitted here).

print(df_test_pred.shape)
df_test_pred.head()

#### スクリプト8-34: 推論処理の関数

In [None]:
def predict_lgb(input_test,
                input_id,
                list_nfold=[0,1,2],
               ):
    """Predict the four targets by averaging the saved per-fold LightGBM models.

    Models are read from ``model_lgb_<target>_fold<nfold>.h5`` in the working
    directory.

    Args:
        input_test: feature frame passed to each model's ``predict``.
        input_id: identifier frame; a copy of it forms the left part of the
            result, so the caller's frame is not modified.
        list_nfold: fold numbers whose models are averaged. The default list
            is only read, never mutated.

    Returns:
        A copy of ``input_id`` with one ``<target>_fold<nfold>`` column per
        model followed by one ``<target>`` column holding the fold mean.
    """
    targets = ["target1","target2","target3","target4"]
    # Start from the identifiers that were passed in (not a notebook global)
    df_test_pred = input_id.copy()

    for target in targets:
        for nfold in list_nfold:
            # Only load model files written by this notebook: unpickling
            # untrusted files can execute arbitrary code.
            with open("model_lgb_{}_fold{}.h5".format(target, nfold), "rb") as f:
                model = pickle.load(f)

            # Store this fold's prediction
            df_test_pred["{}_fold{}".format(target, nfold)] = model.predict(input_test)

    # Final prediction: mean over exactly this target's fold columns
    for target in targets:
        fold_cols = ["{}_fold{}".format(target, nfold) for nfold in list_nfold]
        df_test_pred[target] = df_test_pred[fold_cols].mean(axis=1)

    return df_test_pred

#### スクリプト8-35: モデル推論の実行

In [None]:
# Run inference with the saved models (default folds 0-2) and preview the result.
df_test_pred = predict_lgb(x_test, id_test)
df_test_pred.head()

### **パート３：提出用フォーマットへの変換**
#### スクリプト8-36: 提出用フォーマットへの変換

In [None]:
# Keep only the key and the four averaged target columns for submission.
df_submit = df_test_pred[["date_playerId", "target1","target2","target3","target4"]]
df_submit.head()

#### スクリプト8-37: 推論処理の実行
- mlbライブラリは一度しか実行できません。再度実行したい場合はカーネルを再起動する必要があります。
- 本notebookではmlbライブラリを3ヶ所で実行しています。実行したいセル以外はコメントアウトしてから実行してください。
    - 8.3 ベースライン作成: スクリプト8-37
    - 8.4 特徴量エンジニアリング: スクリプト8-47
    - 8.5 モデルチューニング: スクリプト8-62

In [None]:
import mlb

env = mlb.make_env()
iter_test = env.iter_test()

for (test_df, prediction_df) in iter_test:
    test = test_df.copy()
    # Move the index of the received frame into a regular column
    prediction = prediction_df.copy().reset_index(drop=False)

    print("date:", prediction["date"][0])

    # Build the model inputs
    x_test, id_test = makedataset_for_predict(test, prediction)

    # Run inference
    df_test_pred = predict_lgb(x_test, id_test)

    # Submission frame; copied so the post-processing below writes to an
    # independent frame rather than to a slice of df_test_pred
    df_submit = df_test_pred[["date_playerId", "target1","target2","target3","target4"]].copy()

    # Post-processing: fill missing predictions with 0 and clip to [0, 100]
    for col in ["target1","target2","target3","target4"]:
        df_submit[col] = df_submit[col].fillna(0.).clip(0, 100)

    # Hand the predictions to the competition API
    env.predict(df_submit)
print("Done.")

# 8.4 特徴量エンジニアリング
## 8.4.1 データ前処理

#### スクリプト8-38: train_updated.csvからrostersカラムのデータ取り出し

In [None]:
# Unpack the "rosters" JSON column of train into one flat table; show=True prints progress and a preview.
df_rosters = extract_data(train, col="rosters", show=True)

#### スクリプト8-39: rostersのデータ前処理加工

In [None]:
# Rename gameDate to date and parse it as a datetime
df_rosters = df_rosters.rename(columns={"gameDate":"date"})
df_rosters["date"] = pd.to_datetime(df_rosters["date"], format="%Y-%m-%d")

# Roster columns to add as features (date and playerId are the join keys)
col_rosters = ["teamId","statusCode","status"]

df_rosters.head()

#### スクリプト8-40: targetの特徴量の計算

In [None]:
# Summary statistics of the four targets per (yearmonth, playerId);
# the two-level column index is flattened to "<target>_<statistic>".
df_agg_target = df_train.groupby(["yearmonth", "playerId"])[["target1", "target2", "target3", "target4"]].agg(["mean", "median", "std", "min", "max"])
df_agg_target.columns = ["{}_{}".format(i,j) for i,j in df_agg_target.columns]
df_agg_target = df_agg_target.reset_index(drop=False)
df_agg_target.head()

#### スクリプト8-41: ラグ特徴量の作成

In [None]:
# Sort by month so the table reads in chronological order
df_agg_target = df_agg_target.sort_values("yearmonth").reset_index(drop=True)

# Re-key every monthly aggregate to the calendar month that follows it, so a
# join on (playerId, yearmonth) attaches the previous month's statistics.
# Calendar arithmetic stays correct when a player has no rows for some month
# and needs no hard-coded label for the month after the last one in the data.
df_agg_target["yearmonth"] = (
    pd.PeriodIndex(df_agg_target["yearmonth"], freq="M") + 1
).strftime("%Y-%m").tolist()

# Rename the aggregates so they are recognisable as lag features
df_agg_target.columns = [col+"_lag1month" if col not in ["playerId","yearmonth"] else col for col in df_agg_target.columns ]

# Names of the added feature columns
col_agg_target = list(df_agg_target.columns[df_agg_target.columns.str.contains("lag1month")])

df_agg_target.head()

## 8.4.2 データセット作成
#### スクリプト8-42: 学習用データセットの作成

In [None]:
# Join player attributes, rosters and lag features onto the engagement rows.
# NOTE(review): a left merge repeats an engagement row when df_rosters holds
# more than one row for a (date, playerId) pair — confirm the keys are unique.
df_train = pd.merge(df_engagement, df_players, on=["playerId"], how="left")
df_train = pd.merge(df_train, df_rosters, on=["date", "playerId"], how="left")
df_train = pd.merge(df_train, df_agg_target, on=["playerId", "yearmonth"], how="left")

# Features (copied so the dtype conversion below writes to an independent
# frame rather than to a slice of df_train), targets and identifiers
x_train = df_train[[
    "playerId", "dayofweek",
    "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
    "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds"
] + col_rosters + col_agg_target].copy()
y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]

# Cast the categorical features to the category dtype
for col in ["playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"] + col_rosters:
    x_train[col] = x_train[col].astype("category")

print(x_train.shape, y_train.shape, id_train.shape)
x_train.head()

## 8.4.3 モデル学習
#### スクリプト8-43: モデル学習の実行

In [None]:
# Hyperparameters for the feature-engineered run; n_estimators is 10000 here
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression_l1', 
    'metric': 'mean_absolute_error',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'subsample': 0.7,
    'subsample_freq': 1,
    'feature_fraction': 0.8,
    'min_data_in_leaf': 50,
    'min_sum_hessian_in_leaf': 50,
    'n_estimators': 10000,
    "random_state": 123,
    "importance_type": "gain",
}

# Train one model per target for folds 0-2 and collect predictions, metrics and importances
df_valid_pred, df_metrics, df_imp = train_lgb(x_train,
                                              y_train,
                                              id_train,
                                              params,
                                              list_nfold=[0,1,2],
                                              mode_train="train",
                                             )

#### スクリプト8-44: 評価値の取得

In [None]:
# Overall score, followed by the MAE per fold (rows) and target (columns) with marginal means
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))
display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

#### スクリプト8-45: 説明変数の重要度の確認

In [None]:
# Ten features with the largest mean importance over all models (mean and standard deviation).
df_imp.groupby(["col"])["imp"].agg(["mean", "std"]).sort_values("mean", ascending=False)[:10]

## 8.4.4 モデル推論
#### スクリプト8-46: 推論用データセット作成の関数

In [None]:
def makedataset_for_predict(input_x, input_prediction):
    """Build the feature-engineered model inputs from the frames received at inference time.

    Besides the baseline features this adds the roster columns unpacked from
    ``input_x`` and the lag features in the module-level ``df_agg_target``.

    Args:
        input_x: the day's test frame; its ``rosters`` JSON column is unpacked
            with ``extract_data``.
        input_prediction: frame with a ``date`` column (parsed with format
            ``%Y%m%d``) and a ``date_playerId`` key whose first eight
            characters are the engagement date and whose characters from
            position nine onward are the player id.

    Returns:
        Tuple ``(x_test, id_test)``: the feature frame (categorical columns
        cast to ``category``) and the identifier frame.
    """
    test = input_x.copy()
    prediction = input_prediction.copy()

    # Parse the prediction date
    prediction["date"] = pd.to_datetime(prediction["date"], format="%Y%m%d")
    # Split the key into the engagement date and the player id
    prediction["engagementMetricsDate"] = pd.to_datetime(prediction["date_playerId"].str[:8], format="%Y%m%d")
    prediction["playerId"] = prediction["date_playerId"].str[9:].astype(int)

    # Day-of-week and "YYYY-MM" features from the prediction date
    prediction["dayofweek"] = prediction["date"].dt.dayofweek
    prediction["yearmonth"] = prediction["date"].astype(str).str[:7]

    # Roster table for the day, keyed by a datetime "date" column.
    # NOTE(review): if no roster cell can be parsed, extract_data returns an
    # empty frame and the "date" lookup below raises KeyError — confirm the
    # API always supplies rosters.
    rosters = extract_data(test, col="rosters")
    rosters = rosters.rename(columns={"gameDate":"date"})
    rosters["date"] = pd.to_datetime(rosters["date"], format="%Y-%m-%d")

    # Attach player attributes, rosters and lag features
    df_test = pd.merge(prediction, df_players, on=["playerId"], how="left")
    df_test = pd.merge(df_test, rosters, on=["date", "playerId"], how="left")
    df_test = pd.merge(df_test, df_agg_target, on=["playerId", "yearmonth"], how="left")

    # Copy the feature columns so the dtype conversion below writes to an
    # independent frame rather than to a slice of df_test.
    x_test = df_test[[
        "playerId", "dayofweek",
        "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
        "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds"
    ] + col_rosters + col_agg_target].copy()
    id_test = df_test[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]

    # Cast the categorical features to the category dtype
    for col in ["playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"] + col_rosters:
        x_test[col] = x_test[col].astype("category")

    return x_test, id_test

#### スクリプト8-47: 推論処理の実行（ベースラインと同一）
- mlbライブラリは一度しか実行できません。再度実行したい場合はカーネルを再起動する必要があります。
- 本notebookではmlbライブラリを3ヶ所で実行しています。実行したいセル以外はコメントアウトしてから実行してください。
    - 8.3 ベースライン作成: スクリプト8-37
    - 8.4 特徴量エンジニアリング: スクリプト8-47
    - 8.5 モデルチューニング: スクリプト8-62

In [None]:
# import mlb

# env = mlb.make_env()
# iter_test = env.iter_test()

# for (test_df, sample_prediction_df) in iter_test:
#     test = test_df.copy()
#     prediction = sample_prediction_df.copy()
#     prediction = prediction.reset_index(drop=False)
    
#     print("date:", prediction["date"][0])
    
#     # データセット作成
#     x_test, id_test = makedataset_for_predict(test, prediction)
    
#     # 推論処理
#     df_test_pred = predict_lgb(x_test, id_test)
    
#     # 提出データの作成
#     df_submit = df_test_pred[["date_playerId", "target1","target2","target3","target4"]]
    
#     # 後処理：欠損値埋め，0-100の範囲外のデータをクリッピング
#     for i,col in enumerate(["target1","target2","target3","target4"]):
#         df_submit[col] = df_submit[col].fillna(0.)
#         df_submit[col] = df_submit[col].clip(0, 100)
    
#     # 予測値データの提出
#     env.predict(df_submit)
# print("Done.")

# 8.5 モデルチューニング
#### スクリプト8-48: 目的変数間の相関係数の算出

In [None]:
# Pairwise correlation coefficients between the four targets.
df_engagement[["target1", "target2", "target3", "target4"]].corr()

#### スクリプト8-49: ライブラリのインポート

In [None]:
from sklearn.preprocessing import LabelEncoder

# import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Activation, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Embedding, Flatten

#### スクリプト8-xx: 再現性のためのシード指定

In [None]:
def seed_everything(seed):
    """Seed the Python, NumPy and TensorFlow random generators for reproducibility.

    Also sets PYTHONHASHSEED and installs a TF1-compat Keras session whose
    intra-op and inter-op thread counts are both 1.

    Args:
        seed: integer seed applied to every generator.
    """
    import random

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Session restricted to one thread per parallelism setting
    tf1 = tf.compat.v1
    single_thread_conf = tf1.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
    tf1.keras.backend.set_session(
        tf1.Session(graph=tf1.get_default_graph(), config=single_thread_conf)
    )

## 8.5.1 データセット作成
#### スクリプト8-50: 学習用データセットの作成

In [None]:
# Features for the neural network. Copied because the following cells fill
# and rescale the columns in place; that should operate on an independent
# frame rather than a slice of df_train.
x_train = df_train[[
    "playerId", "dayofweek",
    "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
    "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds"
] + col_rosters + col_agg_target].copy()
# Targets and identifier columns
y_train = df_train[["target1","target2","target3","target4"]]
id_train = df_train[["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"]]

print(x_train.shape, y_train.shape, id_train.shape)

#### スクリプト8-51: 数値とカテゴリ変数のカラムリストを作成

In [None]:
# Split the feature names into numeric columns and categorical columns, then print how many of each.
col_num = ["heightInches", "weight","playerForTestSetAndFuturePreds"] + col_agg_target
col_cat = ["playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"] + col_rosters
print(len(col_num), len(col_cat))

#### スクリプト8-52: 数値データの欠損値補間・数値化

In [None]:
# Per numeric column: the fill value and the min/max used for scaling,
# stored so the same transformation can be applied to test data.
dict_num = {}
for col in col_num:
    print(col)
#     # Missing values: fill with the mean (alternative, disabled)
#     value_fillna = x_train[col].mean()
    # Missing values: fill with 0
    value_fillna = 0
    x_train[col] = x_train[col].fillna(value_fillna)
    
    # Min-max scaling to the 0-1 range
    # NOTE(review): a column whose min equals its max divides by zero here — confirm none is constant.
    value_min = x_train[col].min()
    value_max = x_train[col].max()
    x_train[col] = (x_train[col] - value_min) / (value_max - value_min)
    
    # Keep the parameters so test data can be transformed identically
    dict_num[col] = {}
    dict_num[col]["fillna"] = value_fillna
    dict_num[col]["min"] = value_min
    dict_num[col]["max"] = value_max
    
print("Done.")

#### スクリプト8-53: カテゴリ変数の欠損値補間・数値化

In [None]:
# Per categorical column: the fill value, the label-to-integer mapping and
# the number of labels, stored so test data can be encoded identically.
dict_cat = {}
for col in col_cat:
    print(col)
    # Missing values: fill with "unknown"
    value_fillna = "unknown"
    x_train[col] = x_train[col].fillna(value_fillna)
    
    # Convert to str
    x_train[col] = x_train[col].astype(str)
    
    # Label encoding: map each label to an integer starting from 0
    le = LabelEncoder()
    le.fit(x_train[col])
    # Always include an "unknown" label so unseen values at inference time can be encoded.
    list_label = sorted(list(set(le.classes_) | set(["unknown"])))
    map_label = {j:i for i,j in enumerate(list_label)}
    x_train[col] = x_train[col].map(map_label)
    
    # Keep the parameters so test data can be transformed identically
    dict_cat[col] = {}
    dict_cat[col]["fillna"] = value_fillna
    dict_cat[col]["map_label"] = map_label
    dict_cat[col]["num_label"] = len(list_label)
    
print("Done.")

#### スクリプト8-54: 欠損値補間・正規化の関数化（推論用）

In [None]:
def transform_data(input_x):
    """Apply the imputation, scaling and label encoding fitted on the training data.

    Uses the module-level ``col_num``/``dict_num`` and ``col_cat``/``dict_cat``
    built by the preceding cells.

    Args:
        input_x: feature frame containing every column in ``col_num`` and
            ``col_cat``. Categorical columns may be of any dtype, including
            pandas ``category``.

    Returns:
        A transformed copy of ``input_x``; the input frame is not modified.
    """
    output_x = input_x.copy()

    # Numeric columns: fill missing values with the stored fill value, then
    # min-max scale with the stored training minimum and maximum.
    for col in col_num:
        stats = dict_num[col]
        output_x[col] = output_x[col].fillna(stats["fillna"])
        output_x[col] = (output_x[col] - stats["min"]) / (stats["max"] - stats["min"])

    # Categorical columns: fill missing values, convert to str, encode.
    for col in col_cat:
        value_fillna = dict_cat[col]["fillna"]
        map_label = dict_cat[col]["map_label"]

        # Cast to object first: fillna() on a category-typed column raises
        # when the fill value is not one of its categories.
        labels = output_x[col].astype(object).fillna(value_fillna).astype(str)

        # Labels not seen during training fall back to the "unknown" code.
        output_x[col] = labels.map(map_label).fillna(map_label["unknown"])

    return output_x

## 8.5.2 モデル学習
#### スクリプト8-55: ニューラルネットワークのモデル定義

In [None]:
def create_model(col_num=["heightInches", "weight"],
                 col_cat=["playerId", "teamId", "dayofweek"], 
                 show=False,
                ):
    """Build and compile the two-input Keras model (numeric features + embedded categoricals).

    Args:
        col_num: names of the numeric features; only their count is used,
            as the width of the numeric input.
        col_cat: names of the categorical features, in the column order of
            the categorical input. Each needs an entry in the module-level
            ``dict_cat`` providing ``num_label``.
        show: when True, print the model summary and return nothing.

    Returns:
        The compiled model (Adam optimizer, MAE loss, four linear outputs)
        when ``show`` is False; ``None`` when ``show`` is True.
    """
    input_num = Input(shape=(len(col_num),))
    input_cat = Input(shape=(len(col_cat),))
    
    # numeric: passed through unchanged
    x_num = input_num #Dense(30, activation="relu")(input_num)
    
    # category: one embedding per column, concatenated pairwise in column order
    for i,col in enumerate(col_cat):
        # i-th column of the categorical input
        tmp_cat = input_cat[:, i]
        input_dim = dict_cat[col]["num_label"]
        # embedding width is half the number of labels, rounded down
        output_dim = int(input_dim/2)
        tmp_cat = Embedding(input_dim=input_dim, output_dim=output_dim)(tmp_cat)
        tmp_cat = Dropout(0.2)(tmp_cat)
        tmp_cat = Flatten()(tmp_cat)
        if i==0:
            x_cat = tmp_cat
        else:
            x_cat = Concatenate()([x_cat, tmp_cat])

    # concat numeric and embedded categorical features
    x = Concatenate()([x_num, x_cat])
    
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)
    # one linear output per target
    output = Dense(4, activation="linear")(x)
    
    model = Model(inputs=[input_num, input_cat], outputs=output)
    model.compile(optimizer="Adam", loss="mae", metrics=["mae"])
    
    if show:
        print(model.summary())
    else:
        return model

#### スクリプト8-56: モデル構造の確認

In [None]:
# Print the model structure for the actual feature lists (show=True returns nothing).
create_model(col_num=col_num,
             col_cat=col_cat,
             show=True)

#### スクリプト8-57: 学習用の関数をニューラルネットワーク用にカスタマイズ

In [None]:
def train_tf(input_x,
             input_y,
             input_id,
             list_nfold=[0,1,2],
             mode_train="train",
             batch_size=1024,
             epochs=100,
            ):
    """Train (or reload) one network per fold and collect validation results.

    Folds come from the module-level ``list_cv_month``, a sequence of
    ``(train_months, valid_months)`` pairs matched against
    ``input_id["yearmonth"]``. Validation rows are additionally restricted to
    ``playerForTestSetAndFuturePreds == 1``. Feature columns are selected with
    the module-level ``col_num`` / ``col_cat`` lists.

    Args:
        input_x: Feature DataFrame containing the ``col_num`` and ``col_cat``
            columns, indexed like ``input_id``.
        input_y: Target DataFrame with four columns, indexed like ``input_id``.
        input_id: Key DataFrame with the columns ``engagementMetricsDate``,
            ``playerId``, ``date_playerId``, ``date``, ``yearmonth`` and
            ``playerForTestSetAndFuturePreds``.
        list_nfold: Fold numbers (positions in ``list_cv_month``) to process.
        mode_train: ``"train"`` fits a new model and checkpoints its weights to
            ``model_tf_fold{nfold}.h5``; any other value loads those weights.
        batch_size: Batch size passed to ``model.fit``.
        epochs: Maximum number of epochs passed to ``model.fit``.

    Returns:
        Tuple ``(df_valid_pred_all, df_metrics)``:
        ``df_valid_pred_all`` holds the key columns plus one
        ``target{k}_fold{n}_true`` / ``target{k}_fold{n}_pred`` column per
        target and fold; ``df_metrics`` has the columns ``target``, ``nfold``
        and ``mae``.
    """
    target_names = ["target1", "target2", "target3", "target4"]

    # Accumulator for the validation predictions of every fold
    df_valid_pred = pd.DataFrame()
    # Accumulator for the per-target, per-fold scores
    metrics = []

    # validation: one [train index, valid index] pair per fold
    cv = []
    for month_tr, month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) & (input_id["playerForTestSetAndFuturePreds"]==1)],
        ])

    # Model training (one model per fold)
    for nfold in list_nfold:
        print("-"*20, "fold:", nfold, "-"*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

        x_num_tr, x_cat_tr, y_tr = input_x.loc[idx_tr, col_num].values, input_x.loc[idx_tr, col_cat].values, input_y.loc[idx_tr, :].values
        x_num_va, x_cat_va, y_va = input_x.loc[idx_va, col_num].values, input_x.loc[idx_va, col_cat].values, input_y.loc[idx_va, :].values
        # Keys of this fold's validation rows. The index is dropped so the rows
        # line up positionally with the array-backed frames built below;
        # otherwise the column-wise concat would align on mismatched labels.
        id_va = input_id.loc[idx_va, :].reset_index(drop=True)
        print(x_num_tr.shape, x_cat_tr.shape, y_tr.shape)
        print(x_num_va.shape, x_cat_va.shape, y_va.shape)

        filepath = "model_tf_fold{}.h5".format(nfold)

        if mode_train=="train":
            print("training start.")
            seed_everything(seed=123)
            model = create_model(col_num=col_num, col_cat=col_cat, show=False)
            model.fit(x=[x_num_tr, x_cat_tr],
                      y=y_tr,
                      validation_data=([x_num_va, x_cat_va], y_va),
                      batch_size=batch_size,
                      epochs=epochs,
                      callbacks=[
                          ModelCheckpoint(filepath= filepath, monitor="val_loss", mode="min", verbose=1, save_best_only=True, save_weights_only=True),
                          EarlyStopping(monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True),
                          ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
                      ],
                      verbose=1,
                     )
        else:
            print("model load.")
            model = create_model(col_num=col_num, col_cat=col_cat, show=False)
            model.load_weights(filepath)
            print("Done.")

        # Predictions on the validation rows
        y_va_pred = model.predict([x_num_va, x_cat_va])
        tmp_pred = pd.concat([
            id_va,
            pd.DataFrame(y_va, columns=["{}_true".format(t) for t in target_names]),
            pd.DataFrame(y_va_pred, columns=["{}_pred".format(t) for t in target_names]),
        ], axis=1)
        tmp_pred["nfold"] = nfold
        df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

        # Mean absolute error per target column
        for k, target in enumerate(target_names):
            metrics.append([target, nfold, np.mean(np.abs(y_va[:, k] - y_va_pred[:, k]))])

    print("-"*10, "result", "-"*10)
    # Scores: the printed value is the mean over all targets and folds
    df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    # Validation predictions: spread the folds into columns, one row per key
    df_valid_pred_all = pd.pivot_table(df_valid_pred,
                                       index=["engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"],
                                       columns=["nfold"], values=list(df_valid_pred.columns[df_valid_pred.columns.str.contains("target")]), aggfunc=np.sum)
    # Flatten ("target1_true", 0) into "target1_fold0_true"
    df_valid_pred_all.columns = ["{}_fold{}_{}".format(i.split("_")[0], j,i.split("_")[1]) for i,j in df_valid_pred_all.columns]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics

#### スクリプト8-58: 学習の実行

In [None]:
# Train all three folds; epochs is only an upper bound because train_tf
# attaches an EarlyStopping callback to the fit.
df_valid_pred, df_metrics = train_tf(
    x_train,
    y_train,
    id_train,
    list_nfold=[0, 1, 2],
    mode_train="train",
    batch_size=1024,
    epochs=1000,
)

#### スクリプト8-59: 評価値の確認

In [None]:
# Overall score: mean MAE over every target and fold
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))
# MAE broken down by fold (rows) and target (columns), with margins
display(
    df_metrics.pivot_table(
        index="nfold",
        columns="target",
        values="mae",
        aggfunc=np.mean,
        margins=True,
    )
)

## 8.5.3 モデル推論
#### スクリプト8-60: データセット作成関数をニューラルネットワーク用にカスタマイズ

In [None]:
def makedataset_for_predict(input_x, input_prediction):
    """Build the feature table and the key table for one inference batch.

    The prediction frame supplies the rows to score. It is enriched with date
    features and joined with the module-level ``df_players`` and
    ``df_agg_target`` tables and with the rosters unpacked from ``input_x``.

    Args:
        input_x: Raw test frame with a ``rosters`` column, unpacked through
            ``extract_data``.
        input_prediction: Frame with a ``date`` column in ``%Y%m%d`` form and a
            ``date_playerId`` column whose first 8 characters are a date and
            whose characters from position 9 onward are the player id.

    Returns:
        Tuple ``(x_test, id_test)``: the explanatory variables and the
        identifying key columns, both taken from the same merged frame.
    """
    raw = input_x.copy()
    pred = input_prediction.copy()

    # Parse the batch date
    pred["date"] = pd.to_datetime(pred["date"], format="%Y%m%d")

    # Split date_playerId into engagementMetricsDate and playerId
    metrics_date = pred["date_playerId"].apply(lambda s: s[:8])
    pred["engagementMetricsDate"] = pd.to_datetime(metrics_date, format="%Y%m%d")
    pred["playerId"] = pred["date_playerId"].apply(lambda s: int(s[9:]))

    # Features derived from date
    pred["dayofweek"] = pred["date"].dt.dayofweek
    pred["yearmonth"] = pred["date"].astype(str).apply(lambda s: s[:7])

    # Rosters: expose gameDate as a datetime "date" column so it can be joined
    rosters = extract_data(raw, col="rosters").rename(columns={"gameDate": "date"})
    rosters["date"] = pd.to_datetime(rosters["date"], format="%Y-%m-%d")

    # Left-join everything onto the prediction rows
    merged = (
        pred
        .merge(df_players, on=["playerId"], how="left")
        .merge(rosters, on=["date", "playerId"], how="left")
        .merge(df_agg_target, on=["playerId", "yearmonth"], how="left")
    )

    # Explanatory variables
    feature_cols = [
        "playerId", "dayofweek",
        "birthCity", "birthStateProvince", "birthCountry", "heightInches", "weight",
        "primaryPositionCode", "primaryPositionName", "playerForTestSetAndFuturePreds",
    ] + col_rosters + col_agg_target
    # Key columns that identify each row
    id_cols = [
        "engagementMetricsDate", "playerId", "date_playerId",
        "date", "yearmonth", "playerForTestSetAndFuturePreds",
    ]
    x_test = merged[feature_cols]
    id_test = merged[id_cols]

    # NOTE: categorical columns are deliberately not cast to the pandas
    # "category" dtype here (that conversion step is disabled in this version).
    return x_test, id_test

#### スクリプト8-61: 推論用関数をニューラルネットワーク用にカスタマイズ

In [None]:
def predict_tf(input_x,
               input_id,
               list_nfold=[0,1,2],
              ):
    """Predict the four targets as the average of the per-fold networks.

    For every fold a fresh model is built with ``create_model`` and its weights
    are loaded from ``model_tf_fold{nfold}.h5``. Feature columns are selected
    with the module-level ``col_num`` / ``col_cat`` lists.

    Args:
        input_x: Feature DataFrame containing the ``col_num`` and ``col_cat``
            columns.
        input_id: Key DataFrame with one row per row of ``input_x``, in the
            same order.
        list_nfold: Fold numbers whose saved weights are averaged.

    Returns:
        ``input_id`` with the columns ``target1`` .. ``target4`` appended.

    Raises:
        ValueError: If ``input_id`` and ``input_x`` differ in length.
    """
    target_names = ["target1", "target2", "target3", "target4"]

    # Accumulator for the fold-averaged predictions
    test_pred = np.zeros((len(input_x), len(target_names)))

    # Split into numeric / categorical inputs as plain arrays, the same form
    # the networks are fitted on in train_tf.
    x_num_test, x_cat_test = input_x[col_num].values, input_x[col_cat].values

    for nfold in list_nfold:
        # Rebuild the architecture and load this fold's weights
        filepath = "model_tf_fold{}.h5".format(nfold)
        model = create_model(col_num=col_num, col_cat=col_cat, show=False)
        model.load_weights(filepath)

        # Test predictions of this fold, weighted for the average
        pred = model.predict([x_num_test, x_cat_test], batch_size=512, verbose=0)
        test_pred += pred / len(list_nfold)

    # Attach the predictions to the keys. The prediction frame reuses
    # input_id's index so the column-wise concat pairs rows by position even
    # when input_id does not carry a default 0..n-1 index.
    df_test_pred = pd.concat([
        input_id,
        pd.DataFrame(test_pred, columns=target_names, index=input_id.index),
    ], axis=1)

    return df_test_pred

#### スクリプト8-62: 推論処理の実行
- mlbライブラリは一度しか実行できません。再度実行したい場合はカーネルを再起動する必要があります。
- 本notebookではmlbライブラリを3ヶ所で実行しています。実行したいセル以外はコメントアウトしてから実行してください。
    - 8.3 ベースライン作成: スクリプト8-37
    - 8.4 特徴量エンジニアリング: スクリプト8-47
    - 8.5 モデルチューニング: スクリプト8-62

In [None]:
# import mlb

# env = mlb.make_env()
# iter_test = env.iter_test()

# for (test_df, sample_prediction_df) in iter_test:
#     test = test_df.copy()
#     prediction = sample_prediction_df.copy()
#     prediction = prediction.reset_index(drop=False)
    
#     print("date:", prediction["date"][0])
    
#     # データセット作成
#     x_test, id_test = makedataset_for_predict(test, prediction)
    
#     # 欠損値補間・正規化
#     x_test = transform_data(x_test)
        
#     # 推論処理
#     df_test_pred = predict_tf(x_test, id_test)
    
#     # 提出データの作成
#     df_submit = df_test_pred[["date_playerId", "target1","target2","target3","target4"]]
    
#     # 後処理：欠損値埋め，0-100の範囲外のデータをクリッピング
#     for i,col in enumerate(["target1","target2","target3","target4"]):
#         df_submit[col] = df_submit[col].fillna(0.)
#         df_submit[col] = df_submit[col].clip(0, 100)
    
#     # 予測値データの提出
#     env.predict(df_submit)
# print("Done.")