# exp084

xgbのハイパラチューニング

In [1]:
import os
import sys
import traceback
import gc
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
import polars as pl
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import warnings
warnings.simplefilter('ignore')



In [2]:
@dataclass
class Cfg:
    mode = "local_cv" # "local_cv" or "kaggle_inf" 
    exp_name = "exp084"
    input_dir = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir = "/mnt/predict-student-performance-from-game-play/prep/"
    seed = 42
    n_splits = 5
    best_threshold = 0.630 # local_cvの結果を入れる
    base_exp = None # 特徴量重要度を使う元のexp
    n_features = 500 # 特徴量削減の数
cfg = Cfg()

if cfg.mode == "local_cv":
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name), exist_ok=True)
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name, "cache"), exist_ok=True)
    import cudf

elif cfg.mode == "kaggle_inf":
    import jo_wilder_310

In [3]:
params = {
        'objective' : 'binary:logistic',
        'tree_method': 'gpu_hist', 
        'eval_metric':'logloss',
        'learning_rate': 0.02,
        'alpha': 8,
        'max_depth': 4,
        'n_estimators': 100000, 
        'early_stopping_rounds': 100,
        'subsample':0.8,
        'colsample_bytree': 0.4,
        'seed': cfg.seed,
        "enable_categorical": True
    }

In [4]:
level_group_list = ['0-4', '5-12', '13-22']
level_group_map = {
    "q1":"0-4", "q2":"0-4", "q3":"0-4",
    "q4":"5-12", "q5":"5-12", "q6":"5-12", "q7":"5-12", "q8":"5-12", "q9":"5-12", "q10":"5-12", "q11":"5-12", "q12":"5-12", "q13":"5-12",
    "q14":"13-22", "q15":"13-22", "q16":"13-22", "q17":"13-22", "q18":"13-22"  
}

In [5]:
if cfg.mode == "local_cv":
    with open(cfg.prep_dir + 'cat_col_lists_v2.pkl', 'rb') as f:
        cat_col_lists = pickle.load(f) 

elif cfg.mode == "kaggle_inf":
    with open("/kaggle/input/psp-cat-col-lists/cat_col_lists_v2.pkl", 'rb') as f:
        cat_col_lists = pickle.load(f) 

In [6]:
def transform_labels_df_train(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    # trainの特徴量と結合するためにquestionに対応するlabel_groupを列として設けておく
    labels["level_group"] = ""
    labels.loc[labels["question"]<=3, "level_group"] = "0-4"
    labels.loc[(labels["question"]>=4)&(labels["question"]<=13), "level_group"] = "5-12"
    labels.loc[labels["question"]>=14, "level_group"] = "13-22"

    return labels


def transform_labels_df_inf(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    return labels

In [7]:
class Features:
    def __init__(self, sessions_df, need_create_features=None):
        self.sessions_df = pl.from_pandas(sessions_df).sort(["session_id", "index"])
        self.group = sessions_df["level_group"].values[0]
        self.need_create_features = need_create_features

    def prep(self):
        self.sessions_df = self.sessions_df.with_columns(
            [(pl.col("elapsed_time") - pl.col("elapsed_time").shift(1)).clip(0, 1e9).fill_null(0).over(["session_id"]).alias("time_diff"),
             (pl.col("event_name") + "_" + pl.col("name")).alias("event_name+name"),
             (pl.col("event_name") + "_" + pl.col("room_fqid")).alias("event_name+room_fqid"),
             (pl.col("event_name") + "_" + pl.col("fqid")).alias("event_name+fqid")
             ]
        )

    def get_aggs(self):
        g = self.group
        cats = cat_col_lists[g]
        aggs = []
        feats = []

        # トータルレコード数
        feats += [f"{g}_record_cnt"]
        aggs += [pl.col("index").count().alias(f"{g}_record_cnt")]

        # グループ全体の経過時間
        feats += [f"{g}_elapsed_time"]
        aggs += [pl.col("elapsed_time").apply(lambda s:s.max() - s.min()).alias(f"{g}_elapsed_time")]

        # 各categoryごとのレコード数
        for c in ["event_name", "name", "page", "level", "room_fqid", "fqid", "event_name+name", "event_name+room_fqid", "event_name+fqid"]:
            feats += [f"{g}_{c}_{str(v)}_record_cnt" for v in cats[c]]
            aggs += [pl.col("index").filter(pl.col(c)==v).count().alias(f"{g}_{c}_{str(v)}_record_cnt") for v in cats[c]]
        
        # 各categoryごとのユニーク数
        for c in ["event_name", "name", "page", "level", "room_fqid", "fqid", "event_name+name", "event_name+room_fqid", "event_name+fqid"]:
            feats += [f"{g}_{c}_nunique"]
            aggs += [pl.col(c).drop_nulls().n_unique().alias(f"{g}_{c}_nunique")]

        # 集計量
        for v in ["elapsed_time", "index"]:
            feats += [f"{g}_{v}_max", f"{g}_{v}_min"]
            aggs += [pl.col(v).max().alias(f"{g}_{v}_max").cast(pl.Float32), pl.col(v).max().alias(f"{g}_{v}_min").cast(pl.Float32)]

        for v in ["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"]:
            feats += [f"{g}_{v}_mean"]
            aggs += [pl.col(v).mean().alias(f"{g}_{v}_mean").cast(pl.Float32)]

        for v in ["time_diff", "hover_duration"]:
            feats += [f"{g}_{v}_max", f"{g}_{v}_min", f"{g}_{v}_std", f"{g}_{v}_mean", f"{g}_{v}_sum", f"{g}_{v}_median"]
            aggs += [pl.col(v).max().alias(f"{g}_{v}_max").cast(pl.Float32), pl.col(v).min().alias(f"{g}_{v}_min").cast(pl.Float32), pl.col(v).std().alias(f"{g}_{v}_std").cast(pl.Float32),
                     pl.col(v).mean().alias(f"{g}_{v}_mean").cast(pl.Float32), pl.col(v).sum().alias(f"{g}_{v}_sum").cast(pl.Float32), pl.col(v).mean().alias(f"{g}_{v}_median").cast(pl.Float32)]
            
            #feats += [f"{g}_{v}_quantile01", f"{g}_{v}_quantile02", f"{g}_{v}_quantile03", f"{g}_{v}_quantile04"
            #          f"{g}_{v}_quantile06", f"{g}_{v}_quantile07", f"{g}_{v}_quantile08", f"{g}_{v}_quantile09"]
            #aggs += [pl.col(v).quantile(0.1, "nearest").alias(f"{g}_{v}_quantile01"),
            #         pl.col(v).quantile(0.2, "nearest").alias(f"{g}_{v}_quantile02"),
            #         pl.col(v).quantile(0.3, "nearest").alias(f"{g}_{v}_quantile03"),
            #         pl.col(v).quantile(0.4, "nearest").alias(f"{g}_{v}_quantile04"),
            #         pl.col(v).quantile(0.6, "nearest").alias(f"{g}_{v}_quantile06"),
            #         pl.col(v).quantile(0.7, "nearest").alias(f"{g}_{v}_quantile07"),
            #         pl.col(v).quantile(0.8, "nearest").alias(f"{g}_{v}_quantile08"),
            #         pl.col(v).quantile(0.9, "nearest").alias(f"{g}_{v}_quantile09")]

        # カテゴリ×集計量
        cs = ["event_name", "room_fqid", "fqid", "text_fqid", "level", "name", "event_name+name", "event_name+fqid", "event_name+room_fqid"]
        vs = ["time_diff", "elapsed_time"]
        for c, v in itertools.product(cs, vs):
            feats += [f"{g}_{c}_{cat}_{v}_max" for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).max().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_max").cast(pl.Float32) for cat in cats[c]]
            feats += [f"{g}_{c}_{cat}_{v}_min" for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).min().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_min").cast(pl.Float32) for cat in cats[c]]
            feats += [f"{g}_{c}_{cat}_{v}_std" for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).std().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_std").cast(pl.Float32) for cat in cats[c]]
            feats += [f"{g}_{c}_{cat}_{v}_median" for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).median().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_median").cast(pl.Float32) for cat in cats[c]]

        # カテゴリの変化回数
        for c in ["room_fqid", "text_fqid"]:
            feats += [f"{g}_{c}_change_cnt"]
            aggs += [(pl.col(c) != pl.col(c).shift(1)).sum().alias(f"{g}_{c}_change_cnt")]

        # 道中のminiゲーム特徴量
        
        # 生成する特徴量を限定
        if self.need_create_features is not None:
            aggs = [aggs[i] for i, f in enumerate(feats) if f in self.need_create_features]

        return aggs

    def get_features(self):
        self.prep()
        aggs = self.get_aggs()
        features = self.sessions_df.groupby(["session_id"], maintain_order=True).agg(aggs)
        return features.to_pandas()

In [8]:
def get_train_dataset(sessions, labels):
    # labelデータの整形
    labels = transform_labels_df_train(labels)

    # 特徴量生成
    features = Features(sessions).get_features()
    train = labels.merge(features, on=["session_id"], how="left")
    train["question"] = train["question"].astype("category")

    return train

def get_test_dataset(sessions, labels, feature_select=False, need_create_features=None):
    # labelデータの整形
    labels = transform_labels_df_inf(labels)

    # 特徴量生成
    features = Features(sessions, need_create_features).get_features()
    test = labels.merge(features, on=["session_id"], how="left")
    test["question"] = test["question"].astype("category")

    return test   

In [9]:
def calc_metrics(oof):
    logloss = log_loss(oof["correct"], oof["pred"])

    # find best th
    scores = []; thresholds = []
    best_score = 0; best_threshold = 0

    for threshold in np.arange(0.4,0.81,0.01):
        preds = (oof["pred"].values>threshold).astype(int)
        m = f1_score(oof["correct"].values, preds, average='macro')   
        scores.append(m)
        thresholds.append(threshold)
        if m>best_score:
            best_score = m
            best_threshold = threshold
    print("logloss", format(logloss, ".6f"))
    print("best_score", format(best_score, ".6f"))
    print("best_threshold", format(best_threshold, ".3f"))

    # Q別スコア
    print("---"*10)
    for q in range(18):
        q = q + 1
        preds = (oof[oof["question"]==q]["pred"].values>threshold).astype(int)
        m = f1_score(oof[oof["question"]==q]["correct"].values, preds, average='macro')
        print(f"Q{q} : F1 = {format(m, '.6f')}")
    return best_threshold

In [10]:
class FeaturesSelect:
    def __init__(self, df, init_features, corr_th=0.99):
        self.init_features = init_features
        self.df = cudf.from_pandas(df)
        self.corr_th = corr_th
        self.drop_cols = []
    
    def _high_corr_features_drop(self):
        num_cols = self.df[self.init_features].select_dtypes(include="number").columns

        # 特徴量間の相関行列を計算
        corr_matrix = self.df[num_cols].fillna(-1).corr().abs().to_pandas()
        # 相関行列の上三角行列を取得します。（相関行列が対称であるため、重複する相関を取り除くため）
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        drop_cols = []
        for c in num_cols:
            if any(upper[c] > self.corr_th):
                drop_cols.append(c)
                upper = upper.drop(index=c)
        print(f"特徴量間の相関性が高い特徴量を{str(len(drop_cols))}個抽出")
        self.df = self.df.drop(columns=drop_cols)
        self.drop_cols = list(set(self.drop_cols + drop_cols))

    def features_select(self):
        self._high_corr_features_drop()
        selected_features = list(set(self.init_features) - set(self.drop_cols))
        print(f"{str(len(self.init_features))} -> {str(len(selected_features))}")

        return selected_features

In [11]:
oofs = []
prev_features_df = None # 次のlevel_groupで特徴量を使うための保持データ。0-4は前のlevel_groupがないので初期値はNone
for group in level_group_list:
    print(group)
    # データ読み込み
    train_sessions = pd.read_csv(cfg.prep_dir + f"train{group}_cleaned.csv")
    labels = pd.read_csv(cfg.prep_dir + f"train_labels{group}.csv")
    train = get_train_dataset(train_sessions, labels)

    # 一つ前のlevel_groupの特徴量を追加
    if prev_features_df is not None:
        train = train.merge(prev_features_df, on=["session_id"], how="left")
    else:
        pass

    # 前のlevel_groupのquestionパートの経過時間特徴量
    if group == "5-12":
        train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
        train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
    elif group == "13-22":
        train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
        train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]

    target = "correct"
    not_use_cols = [target, "session_id", "level_group"]
    features = [c for c in train.columns if c not in not_use_cols]

    # 特徴量選択
    if cfg.base_exp is None:
        features = FeaturesSelect(train, features).features_select()
    else:
        # 使用する特徴量の抽出
        features = pd.read_csv(cfg.output_dir + f"{cfg.base_exp}/fi_{group}.csv").head(cfg.n_features)["feature"].tolist()


    # 次のlevel_groupで使う用に特徴量を保持
    prev_features_df = train.groupby("session_id").head(1).drop(columns=["question", "correct", "level_group"])

    if group == "5-12":
        break
gkf = GroupKFold(n_splits=cfg.n_splits)
for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
    tr_x, tr_y = train.iloc[tr_idx][features], train.iloc[tr_idx][target]
    vl_x, vl_y = train.iloc[vl_idx][features], train.iloc[vl_idx][target]
    break

0-4
特徴量間の相関性が高い特徴量を1164個抽出
2106 -> 942
5-12
特徴量間の相関性が高い特徴量を2910個抽出
5614 -> 2704


In [12]:
import optuna

def objective(trial):

    params = {
        'objective' : 'binary:logistic',
        'tree_method': 'gpu_hist', 
        'eval_metric':'logloss',
        'n_estimators': 100000, 
        'early_stopping_rounds': 100,
        'learning_rate': 0.02,
        'seed': cfg.seed,
        "enable_categorical": True,
        'max_depth': trial.suggest_int("max_depth", 1, 9),
        "min_child_weight": trial.suggest_loguniform("min_child_weight", 0.1, 10.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.2, 0.95),
        'subsample': trial.suggest_float("subsample", 0.2, 0.95),
        "alpha": trial.suggest_loguniform("alpha", 1e-5, 100),
        "lambda": trial.suggest_loguniform("lambda", 1e-5, 100)
    }

    model = XGBClassifier(**params)
    model.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)], verbose=0)
    return model.best_score

study = optuna.create_study(direction="minimize")
study.optimize(objective, timeout=60*60*4)

[32m[I 2023-06-10 02:06:18,244][0m A new study created in memory with name: no-name-07f58c68-5021-4ffb-9ecb-de1a0f8e7471[0m
[32m[I 2023-06-10 02:12:15,127][0m Trial 0 finished with value: 0.533478978902213 and parameters: {'max_depth': 9, 'min_child_weight': 0.11005567814844323, 'gamma': 1.1112942482127768e-07, 'colsample_bytree': 0.8376411395542278, 'subsample': 0.6255749420459336, 'alpha': 4.7609718676302644e-05, 'lambda': 0.0026482586431938434}. Best is trial 0 with value: 0.533478978902213.[0m
[32m[I 2023-06-10 02:18:23,824][0m Trial 1 finished with value: 0.5319427119505151 and parameters: {'max_depth': 9, 'min_child_weight': 0.3350250728472122, 'gamma': 1.6275125686935526e-08, 'colsample_bytree': 0.285176159911557, 'subsample': 0.9109787057229395, 'alpha': 10.664486176933273, 'lambda': 0.1322323519824662}. Best is trial 1 with value: 0.5319427119505151.[0m
[32m[I 2023-06-10 02:21:59,066][0m Trial 2 finished with value: 0.531780328431459 and parameters: {'max_depth': 6,

In [13]:
study.best_params

{'max_depth': 8,
 'min_child_weight': 0.633766964312668,
 'gamma': 0.1299699438623672,
 'colsample_bytree': 0.7992922523509169,
 'subsample': 0.7061042367364462,
 'alpha': 2.0781568344639023,
 'lambda': 4.600879934143353}

In [14]:
study.best_value

0.5305948065927399