# exp085

特徴量検討 5-12のfold0のみで

|  取り組み  | logloss |　採否 |
| ---- |  ---- | ----- |
| init |  0.532449 | ----- |  
| object_hoverのduration関連の特徴量（各fqidごと） |  0.532201 | ----- |
| agg修正・追加 |  0.532384 | ----- |
| quantile025,075追加 |  0.532052 | ----- |
| 相関係数の足切りを0.95に変更 |  0.532390 | ----- |  
| カテゴリ集計からevent_name+fqid等を削除 | 0.532147  | ----- |
| quantileを0.1刻みに変更 | 0.532385 | ----- |

|  取り組み  | 元の特徴量数 | logloss | 特徴量100 | 特徴量250 | 特徴量500 | 採否 |
|  ----  | ---- | ---- | ---- | ---- | ---- | ---- |  
quantileを0.1刻みに変更|3229|0.532385|0.535384|0.532597|0.531612|
quantileを0.25,0.75に戻す|2454|0.532302|0.534048|0.532634|0.531285| 〇
座標の集計特徴量を削除|2446|0.532329|0.534069|0.532793|0.531133| 〇
minigame特徴量|2458|0.532352|0.534219|0.532282|0.531338|
minigameのクリック座標特徴量追加|2494|0.532106|0.534204|0.532412|0.530986|〇
map hoverの特徴量追加|2561|0.532412|0.533874|0.532410|0.531198|
minigameクリックの正解時の座標|2506|0.532053|0.534096|0.532521|0.531153|
session_idの分解特徴量を追加|2498|0.532178|0.534549|0.532432|0.531023|
group跨ぎの特徴量加工|2496|0.532015|0.534258|0.532286|0.531266|

|  取り組み  | 特徴量数 | logloss | 採否 |
|  ----  | ---- | ---- | ---- |
|init|2494|0.532105|
|カテゴリの変化回数にfqidを追加|2496|0.532127|
|level+fqidの集計特徴量追加|4070|0.532216|
|level+room_fqidの集計特徴量追加|3095|0.531910|〇
|event_name+levelの集計特徴量追加|3589|0.532076|
|level+room_fqidの変化量|3095|0.531910|
|room_fqidの系列|3097|0.534676|

In [59]:
memo = "room_fqidの系列"

In [60]:
import os
import sys
import traceback
import gc
import re
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
import polars as pl
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import warnings
warnings.simplefilter('ignore')

In [61]:
@dataclass
class Cfg:
    mode = "local_cv" # "local_cv" or "kaggle_inf" 
    exp_name = "exp085"
    input_dir = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir = "/mnt/predict-student-performance-from-game-play/prep/"
    seed = 42
    n_splits = 5
    best_threshold = 0.630 # local_cvの結果を入れる
    base_exp = None # 特徴量重要度を使う元のexp
    n_features = 500 # 特徴量削減の数
cfg = Cfg()

if cfg.mode == "local_cv":
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name), exist_ok=True)
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name, "cache"), exist_ok=True)
    import cudf

elif cfg.mode == "kaggle_inf":
    import jo_wilder_310

In [62]:
params = {
    'objective' : 'binary:logistic',
    'tree_method': 'gpu_hist', 
    'eval_metric':'logloss',
    'n_estimators': 100000, 
    'early_stopping_rounds': 100,
    'learning_rate': 0.02,
    'seed': cfg.seed,
    "enable_categorical": True,
    'max_depth': 8,
    'min_child_weight': 0.633766964312668,
    'gamma': 0.1299699438623672,
    'colsample_bytree': 0.7992922523509169,
    'subsample': 0.7061042367364462,
    'alpha': 2.0781568344639023,
    'lambda': 4.600879934143353
}

In [63]:
level_group_list = ['0-4', '5-12', '13-22']
level_group_map = {
    "q1":"0-4", "q2":"0-4", "q3":"0-4",
    "q4":"5-12", "q5":"5-12", "q6":"5-12", "q7":"5-12", "q8":"5-12", "q9":"5-12", "q10":"5-12", "q11":"5-12", "q12":"5-12", "q13":"5-12",
    "q14":"13-22", "q15":"13-22", "q16":"13-22", "q17":"13-22", "q18":"13-22"  
}

In [64]:
if cfg.mode == "local_cv":
    with open(cfg.prep_dir + 'cat_col_lists_v3.pkl', 'rb') as f:
        cat_col_lists = pickle.load(f) 
    with open(cfg.prep_dir + 'sequence_encoders.pkl', 'rb') as f:
        sequence_encoders = pickle.load(f) 
    with open(cfg.prep_dir + 'room_fqid_encoder.pkl', 'rb') as f:
        room_fqid_encoder = pickle.load(f) 

elif cfg.mode == "kaggle_inf":
    with open("/kaggle/input/psp-cat-col-lists/cat_col_lists_v3.pkl", 'rb') as f:
        cat_col_lists = pickle.load(f) 

In [65]:
def transform_labels_df_train(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    # trainの特徴量と結合するためにquestionに対応するlabel_groupを列として設けておく
    labels["level_group"] = ""
    labels.loc[labels["question"]<=3, "level_group"] = "0-4"
    labels.loc[(labels["question"]>=4)&(labels["question"]<=13), "level_group"] = "5-12"
    labels.loc[labels["question"]>=14, "level_group"] = "13-22"

    return labels


def transform_labels_df_inf(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    return labels

In [66]:
def diff_maxmin(s):
    try:
        return s.max() - s.min()
    except:
        return -1

In [67]:
class Features:
    def __init__(self, sessions_df, need_create_features=None):
        self.sessions_df = pl.from_pandas(sessions_df).sort(["session_id", "index"])
        self.group = sessions_df["level_group"].values[0]
        self.need_create_features = need_create_features
        self.room_fqid_seq_encoder = sequence_encoders[group]["room_fqid"]

    def prep(self):
        self.sessions_df = self.sessions_df.with_columns(
            [(pl.col("elapsed_time") - pl.col("elapsed_time").shift(1)).clip(0, 1e9).fill_null(0).over(["session_id"]).alias("time_diff"),
             (pl.col("event_name") + "_" + pl.col("name")).alias("event_name+name"),
             (pl.col("event_name") + "_" + pl.col("room_fqid")).alias("event_name+room_fqid"),
             (pl.col("event_name") + "_" + pl.col("fqid")).alias("event_name+fqid"),
             (pl.col("level").cast(pl.Utf8) + "_" + pl.col("room_fqid")).alias("level+room_fqid"),
             (pl.col("event_name") + "_" + pl.col("level").cast(pl.Utf8)).alias("event_name+level"),
             (pl.col("room_fqid").map_dict(room_fqid_encoder).alias("room_fqid_encode"))
             ]
        )

    def get_aggs(self):
        g = self.group
        cats = cat_col_lists[g]
        aggs = []

        # トータルレコード数
        aggs += [pl.col("index").count().alias(f"{g}_record_cnt")]

        # グループ全体の経過時間
        aggs += [pl.col("elapsed_time").apply(lambda s:s.max() - s.min()).alias(f"{g}_elapsed_time")]

        # 各categoryごとのレコード数
        for c in ["event_name", "name", "page", "level", "room_fqid", "fqid", "event_name+name", "event_name+room_fqid", "event_name+fqid", "level+room_fqid"]:
            aggs += [pl.col("index").filter(pl.col(c)==v).count().alias(f"{g}_{c}_{str(v)}_record_cnt") for v in cats[c]]
        
        # 各categoryごとのユニーク数
        for c in ["event_name", "name", "page", "level", "room_fqid", "fqid", "event_name+name", "event_name+room_fqid", "event_name+fqid", "level+room_fqid"]:
            aggs += [pl.col(c).drop_nulls().n_unique().alias(f"{g}_{c}_nunique")]

        # 集計量
        for v in ["elapsed_time", "index"]:
            aggs += [pl.col(v).max().alias(f"{g}_{v}_max").cast(pl.Float32), 
                     pl.col(v).max().alias(f"{g}_{v}_min").cast(pl.Float32)]

        for v in ["time_diff", "hover_duration"]:
            aggs += [pl.col(v).max().alias(f"{g}_{v}_max").cast(pl.Float32), 
                     pl.col(v).min().alias(f"{g}_{v}_min").cast(pl.Float32), 
                     pl.col(v).std().alias(f"{g}_{v}_std").cast(pl.Float32),
                     pl.col(v).mean().alias(f"{g}_{v}_mean").cast(pl.Float32), 
                     pl.col(v).sum().alias(f"{g}_{v}_sum").cast(pl.Float32), 
                     pl.col(v).median().alias(f"{g}_{v}_median").cast(pl.Float32)]
            
            aggs += [pl.col(v).quantile(0.25, "nearest").alias(f"{g}_{v}_quantile025"),
                     pl.col(v).quantile(0.75, "nearest").alias(f"{g}_{v}_quantile075")
            ]
            
        # カテゴリ×集計量
        cs = ["event_name", "room_fqid", "fqid", "text_fqid", "level", "name", "event_name+name", "event_name+room_fqid", "level+room_fqid"]
        vs = ["time_diff"]
        for c, v in itertools.product(cs, vs):
            aggs += [pl.col(v).filter(pl.col(c)==cat).max().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_max").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).min().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_min").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).std().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_std").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).mean().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_mean").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).median().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_median").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).sum().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_sum").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).quantile(0.25, "nearest").fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_quantile025").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).quantile(0.75, "nearest").fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_quantile075").cast(pl.Float32) for cat in cats[c]]

        cs = ["event_name", "room_fqid", "fqid", "text_fqid", "level", "name", "event_name+name", "event_name+room_fqid", "level+room_fqid"]
        vs = ["elapsed_time"]
        for c, v in itertools.product(cs, vs):
            aggs += [pl.col(v).filter(pl.col(c)==cat).max().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_max").cast(pl.Float32) for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).min().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_min").cast(pl.Float32) for cat in cats[c]]

        # カテゴリの変化回数
        for c in ["room_fqid", "text_fqid"]:
            aggs += [(pl.col(c) != pl.col(c).shift(1)).sum().alias(f"{g}_{c}_change_cnt")]

        # object_hoverのduration関連の特徴量（各fqidごと）
        fqids = [c.removeprefix("object_hover_") for c in cats["event_name+fqid"] if "object_hover" in c]
        for fqid in fqids:
            aggs += [pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).max().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_max"),
                     pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).min().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_min"),
                     pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).std().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_std"),
                     pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).mean().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_mean"),
                     pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).median().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_median"),
                     pl.col("hover_duration").filter((pl.col("event_name")=="object_hover") & (pl.col("fqid")==fqid)).sum().fill_null(-1).cast(pl.Float32).alias(f"{g}_object_hover_{fqid}_hover_duration_sum")
                    ]

        # miniゲーム中のクリック座標
        if g == "0-4":
            fqids = ["tunic", "plaque"]
        elif g == "5-12":
            fqids = ["businesscards", "logbook", "reader", "journals"]
        elif g == "13-22":
            fqids = ["tracks", "reader_flag", "journals_flag", "directory"]

        for fqid in fqids:
            aggs += [
                pl.col("room_coor_x").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).first().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_first_click_room_coor_x"),
                pl.col("room_coor_y").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).first().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_first_click_room_coor_y"),
                pl.col("room_coor_x").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).last().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_last_click_room_coor_x"),
                pl.col("room_coor_y").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).last().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_last_click_room_coor_y"),
                pl.col("room_coor_x").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).mean().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_click_room_coor_x_mean"),
                pl.col("room_coor_y").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).mean().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_click_room_coor_y_mean"),
                pl.col("room_coor_x").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).std().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_click_room_coor_x_std"),
                pl.col("room_coor_y").filter((pl.col("event_name")=="object_click")&(pl.col("fqid")==fqid)).std().fill_null(-1).cast(pl.Float32).alias(f"{g}_{fqid}_click_room_coor_y_std"),
            ]
      
        aggs += [pl.col("room_fqid_encode").filter(pl.col("room_fqid_encode")!=pl.col("room_fqid_encode").shift(1)).str.concat("").map_dict(self.room_fqid_seq_encoder).alias(f"{g}_room_fqid_seq")]


        # 生成する特徴量を限定
        if self.need_create_features is not None:
            feats = [re.findall(r'alias\("(.*)"\)', str(a))[0] for a in aggs]
            aggs = [aggs[i] for i, f in enumerate(feats) if f in self.need_create_features]

        return aggs

    def get_features(self):
        self.prep()
        aggs = self.get_aggs()
        features = self.sessions_df.groupby(["session_id"], maintain_order=True).agg(aggs)
        features = features.to_pandas()
        features[f"{self.group}_room_fqid_seq"] = features[f"{self.group}_room_fqid_seq"].astype("category")
        return features

In [68]:
def get_train_dataset(sessions, labels):
    # labelデータの整形
    labels = transform_labels_df_train(labels)

    # 特徴量生成
    features = Features(sessions).get_features()
    train = labels.merge(features, on=["session_id"], how="left")
    train["question"] = train["question"].astype("category")

    return train

def get_test_dataset(sessions, labels, feature_select=False, need_create_features=None):
    # labelデータの整形
    labels = transform_labels_df_inf(labels)

    # 特徴量生成
    features = Features(sessions, need_create_features).get_features()
    test = labels.merge(features, on=["session_id"], how="left")
    test["question"] = test["question"].astype("category")

    return test   

In [69]:
def calc_metrics(oof):
    logloss = log_loss(oof["correct"], oof["pred"])

    # find best th
    scores = []; thresholds = []
    best_score = 0; best_threshold = 0

    for threshold in np.arange(0.4,0.81,0.01):
        preds = (oof["pred"].values>threshold).astype(int)
        m = f1_score(oof["correct"].values, preds, average='macro')   
        scores.append(m)
        thresholds.append(threshold)
        if m>best_score:
            best_score = m
            best_threshold = threshold
    print("logloss", format(logloss, ".6f"))
    print("best_score", format(best_score, ".6f"))
    print("best_threshold", format(best_threshold, ".3f"))

    # Q別スコア
    print("---"*10)
    for q in range(18):
        q = q + 1
        preds = (oof[oof["question"]==q]["pred"].values>threshold).astype(int)
        m = f1_score(oof[oof["question"]==q]["correct"].values, preds, average='macro')
        print(f"Q{q} : F1 = {format(m, '.6f')}")
    return best_threshold

In [70]:
class FeaturesSelect:
    def __init__(self, df, init_features, corr_th=0.99):
        self.init_features = init_features
        self.df = cudf.from_pandas(df)
        self.corr_th = corr_th
        self.drop_cols = []
    
    def _high_corr_features_drop(self):
        num_cols = self.df[self.init_features].select_dtypes(include="number").columns

        # 特徴量間の相関行列を計算
        corr_matrix = self.df[num_cols].fillna(-1).corr().abs().to_pandas()
        # 相関行列の上三角行列を取得します。（相関行列が対称であるため、重複する相関を取り除くため）
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        drop_cols = []
        for c in num_cols:
            if any(upper[c] > self.corr_th):
                drop_cols.append(c)
                upper = upper.drop(index=c)
        print(f"特徴量間の相関性が高い特徴量を{str(len(drop_cols))}個抽出")
        self.df = self.df.drop(columns=drop_cols)
        self.drop_cols = list(set(self.drop_cols + drop_cols))

    def features_select(self):
        self._high_corr_features_drop()
        selected_features = list(set(self.init_features) - set(self.drop_cols))
        print(f"{str(len(self.init_features))} -> {str(len(selected_features))}")

        return selected_features

In [71]:
def run_train():
    oofs = []
    prev_features_df = None # 次のlevel_groupで特徴量を使うための保持データ。0-4は前のlevel_groupがないので初期値はNone
    for group in level_group_list:
        print(group)
        # データ読み込み
        train_sessions = pd.read_csv(cfg.prep_dir + f"train{group}_cleaned.csv")
        labels = pd.read_csv(cfg.prep_dir + f"train_labels{group}.csv")
        train = get_train_dataset(train_sessions, labels)

        # 一つ前のlevel_groupの特徴量を追加
        if prev_features_df is not None:
            train = train.merge(prev_features_df, on=["session_id"], how="left")
        else:
            pass

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if group == "5-12":
            train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
            train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
        elif group == "13-22":
            train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
            train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]
        
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in train.columns if c not in not_use_cols]

        # 特徴量選択
        if cfg.base_exp is None:
            features = FeaturesSelect(train, features).features_select()
        else:
            # 使用する特徴量の抽出
            features = pd.read_csv(cfg.output_dir + f"{cfg.base_exp}/fi_{group}.csv").head(cfg.n_features)["feature"].tolist()

        gkf = GroupKFold(n_splits=cfg.n_splits)
        fis = []
        
        oof_groups = []
        for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
            model_path = cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.json"
            
            print(f"fold : {i}")
            tr_x, tr_y = train.iloc[tr_idx][features], train.iloc[tr_idx][target]
            vl_x, vl_y = train.iloc[vl_idx][features], train.iloc[vl_idx][target]


            model = XGBClassifier(**params)
            model.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)], verbose=500)
            # モデル出力
            model.save_model(cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.json")
        
            # valid_pred
            oof_fold = train.iloc[vl_idx].copy()
            oof_fold["pred"] = model.predict_proba(vl_x)[:,1]
            oof_groups.append(oof_fold)

            # 特徴量重要度
            fi_fold = pd.DataFrame()
            fi_fold["feature"] = model.feature_names_in_
            fi_fold["importance"] = model.feature_importances_
            fi_fold["fold"] = i
            fis.append(fi_fold)

        fi = pd.concat(fis)    
        fi = fi.groupby("feature")["importance"].mean().reset_index()
        fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
        fi.to_csv(cfg.output_dir + f"{cfg.exp_name}/fi_{group}.csv", index=False)

        oof_group = pd.concat(oof_groups)
        oofs.append(oof_group)

        # 次のlevel_groupで使う用に特徴量を保持
        prev_features_df = train.groupby("session_id").head(1).drop(columns=["question", "correct", "level_group"])

        # meta_featureの付与
        meta_df = oof_group.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{group}_pred_mean", "max":f"{group}_pred_max", "min":f"{group}_pred_min", "std":f"{group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

    # cv
    oof = pd.concat(oofs)
    best_threshold = calc_metrics(oof)
    cfg.best_threshold = best_threshold
    oof[["session_id", "question", "pred", "correct"]].to_csv(cfg.output_dir + f"{cfg.exp_name}/oof.csv.gz", compression="gzip", index=False)

In [72]:
def get_mock_iter_train():
    """trainデータのiter分割を適用したtest_sample
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")
    sub["level_group"] = sub["session_level"].apply(lambda x: x.split("_")[-1])
    
    # groupbyでiter作るときにgroup_levelの順番が崩れないように
    test["level_group2"] = test["level_group"].str.replace("13-22", "6")
    sub["level_group2"] = sub["level_group"].str.replace("13-22", "6")

    tests = [df[1].drop(columns=["session_level", "level_group2"]).reset_index(drop=True) for df in test.groupby("level_group2")]
    subs = [df[1].drop(columns=["session_level", "level_group2"]).reset_index(drop=True) for df in sub.groupby("level_group2")]
    return zip(tests, subs)

def get_mock_iter_test():
    """testデータのiter分割を適用したtest_sample
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")
    
    # groupbyでiter作るときにgroup_levelの順番が崩れないように
    test["session_level"] = test["session_level"].str.replace("13-22", "6")
    sub["session_level"] = sub["session_level"].str.replace("13-22", "6")

    tests = [df[1].drop(columns="session_level").reset_index(drop=True) for df in test.groupby("session_level")]
    subs = [df[1].drop(columns="session_level").reset_index(drop=True) for df in sub.groupby("session_level")]
    return zip(tests, subs)

In [73]:
def inference(mode):
    if mode == "local_cv":
        # time series apiを模したiterをモックとして用意する
        iter_test = get_mock_iter_test()
        start_time = time.time()
    elif mode == "kaggle_inf":
        env = jo_wilder_310.make_env()
        iter_test = env.iter_test()
        
    model_dict = {}
    features_dict = {}
    for g in level_group_list:
        if mode == "local_cv":
            model_paths = [cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.json" for i in range(cfg.n_splits)]
        elif mode == "kaggle_inf":
            model_paths = [f"/kaggle/input/jo-wilder-{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.json" for i in range(cfg.n_splits)]
        models = []
        for i in range(cfg.n_splits):
            model = XGBClassifier()
            model.load_model(model_paths[i])
            # 推論はCPU
            model.get_booster().set_param({'tree_method': 'hist'})
            models.append(model)
        model_dict[g] = models
        features_dict[g] = list(model.feature_names_in_)
    need_create_features = features_dict["0-4"] + features_dict["5-12"] + features_dict["13-22"]
    not_drop_cols = ["0-4_elapsed_time_max", "0-4_index_max", "5-12_elapsed_time_max", "5-12_index_max", "13-22_elapsed_time_max", "13-22_index_max",
                     "0-4_elapsed_time_min", "0-4_index_min", "5-12_elapsed_time_min", "5-12_index_min", "13-22_elapsed_time_min", "13-22_index_min"]
    need_create_features = need_create_features + not_drop_cols
    need_create_features = list(set(need_create_features))
    
    prev_features_df = None
    for (test_sessions, sample_submission) in iter_test:
        level_group = test_sessions["level_group"].values[0]
        test = get_test_dataset(test_sessions, sample_submission, feature_select=True, need_create_features=need_create_features)
        features = features_dict[level_group]
        preds = np.zeros(len(test))

        if level_group == "0-4":
            pass
        else:
            test = test.merge(prev_features_df, on=["session_id"], how="left")

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if level_group == "5-12":
            test["0-4_question_duration_time"] = test["5-12_elapsed_time_min"] - test["0-4_elapsed_time_max"]
            test["0-4_question_duration_index"] = test["5-12_index_min"] - test["0-4_index_max"]
        elif level_group == "13-22":
            test["5-12_question_duration_time"] = test["13-22_elapsed_time_min"] - test["5-12_elapsed_time_max"]
            test["5-12_question_duration_index"] = test["13-22_index_min"] - test["5-12_index_max"]

        prev_features_df = test.groupby("session_id").head(1).drop(columns=["question", "correct"])

        for i in range(cfg.n_splits):
            model = model_dict[level_group][i]
            preds += model.predict_proba(test[features])[:,1] / cfg.n_splits
        test["pred"] = preds
        preds = (preds>cfg.best_threshold).astype(int)
        sample_submission["correct"] = preds

        # meta_featureの付与
        meta_df = test.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{level_group}_pred_mean", "max":f"{level_group}_pred_max", "min":f"{level_group}_pred_min", "std":f"{level_group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

        if mode == "local_cv":
            print(sample_submission["correct"].values)
        elif mode == "kaggle_inf":
            env.predict(sample_submission)
    if mode == "local_cv":
        process_time = format(time.time() - start_time, ".1f")
        print("sample_inf処理時間 : ", process_time, "秒")

In [74]:
def valid_train_test_process_identity():
    iter_train = get_mock_iter_train()
    iter_test = get_mock_iter_test()

    print("train_iter")
    train_df_dict = {}
    train_features_dict = {}
    prev_features_df = None
    for (sessions, sub) in iter_train:
        group = sessions["level_group"].values[0]
        print(group)
        train = get_train_dataset(sessions, sub)
        if prev_features_df is not None:
            train = train.merge(prev_features_df, on=["session_id"], how="left")
        else:
            pass
            # 前のlevel_groupのquestionパートの経過時間特徴量
        if group == "5-12":
            train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
            train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
        elif group == "13-22":
            train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
            train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in train.columns if c not in not_use_cols]
        train_df_dict[group] = train[["session_id"]+features].sort_values(["session_id", "question"], ignore_index=True)
        prev_features_df = train[["session_id"]+features].groupby("session_id").head(1).drop(columns="question")
        train_features_dict[group] = features


    print("test_iter")
    test_dfs_0_4 = []
    test_dfs_5_12 = []
    test_dfs_13_22 = []
    prev_features_df = None
    for (test_sessions, sample_submission) in iter_test:
        level_group = test_sessions["level_group"].values[0]
        session_id = test_sessions["session_id"].values[0]
        print(session_id, level_group)
        features = train_features_dict[level_group]
        test = get_test_dataset(test_sessions, sample_submission)

        if level_group == "0-4":
            pass
        else:
            test = test.merge(prev_features_df, on=["session_id"], how="left")

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if level_group == "5-12":
            test["0-4_question_duration_time"] = test["5-12_elapsed_time_min"] - test["0-4_elapsed_time_max"]
            test["0-4_question_duration_index"] = test["5-12_index_min"] - test["0-4_index_max"]
        elif level_group == "13-22":
            test["5-12_question_duration_time"] = test["13-22_elapsed_time_min"] - test["5-12_elapsed_time_max"]
            test["5-12_question_duration_index"] = test["13-22_index_min"] - test["5-12_index_max"]
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in test.columns if c not in not_use_cols]
        prev_features_df = test[["session_id"]+features].groupby("session_id").head(1).drop(columns="question")
        if level_group == "0-4":
            test_dfs_0_4.append(test[["session_id"]+features])
        elif level_group == "5-12":
            test_dfs_5_12.append(test[["session_id"]+features])
        elif level_group == "13-22":
            test_dfs_13_22.append(test[["session_id"]+features])
        

    test_dfs_0_4 = pd.concat(test_dfs_0_4, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)
    test_dfs_5_12 = pd.concat(test_dfs_5_12, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)
    test_dfs_13_22 = pd.concat(test_dfs_13_22, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)

    assert train_df_dict["0-4"][train_features_dict["0-4"]].equals(test_dfs_0_4[train_features_dict["0-4"]])
    assert train_df_dict["5-12"][train_features_dict["5-12"]].equals(test_dfs_5_12[train_features_dict["5-12"]])
    assert train_df_dict["13-22"][train_features_dict["13-22"]].equals(test_dfs_13_22[train_features_dict["13-22"]])

In [75]:
oofs = []
prev_features_df = None # 次のlevel_groupで特徴量を使うための保持データ。0-4は前のlevel_groupがないので初期値はNone
for group in level_group_list:
    print(group)
    # データ読み込み
    train_sessions = pd.read_csv(cfg.prep_dir + f"train{group}_cleaned.csv")
    labels = pd.read_csv(cfg.prep_dir + f"train_labels{group}.csv")
    train = get_train_dataset(train_sessions, labels)

    # 一つ前のlevel_groupの特徴量を追加
    if prev_features_df is not None:
        train = train.merge(prev_features_df, on=["session_id"], how="left")
    else:
        pass

    # 前のlevel_groupのquestionパートの経過時間特徴量
    if group == "5-12":
        train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
        train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
    elif group == "13-22":
        train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
        train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]

    target = "correct"
    not_use_cols = [target, "session_id", "level_group"]
    features = [c for c in train.columns if c not in not_use_cols]
    # 特徴量選択
    if cfg.base_exp is None:
        features = FeaturesSelect(train, features).features_select()
    else:
        # 使用する特徴量の抽出
        features = pd.read_csv(cfg.output_dir + f"{cfg.base_exp}/fi_{group}.csv").head(cfg.n_features)["feature"].tolist()

    # 次のlevel_groupで使う用に特徴量を保持
    prev_features_df = train.groupby("session_id").head(1).drop(columns=["question", "correct", "level_group"])

    if group == "5-12":
        break

gkf = GroupKFold(n_splits=cfg.n_splits)
fis = []
total_score = 0.0
for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
    model_path = cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.json"
    
    print(f"fold : {i}")
    tr_x, tr_y = train.iloc[tr_idx][features], train.iloc[tr_idx][target]
    vl_x, vl_y = train.iloc[vl_idx][features], train.iloc[vl_idx][target]


    model = XGBClassifier(**params)
    model.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)], verbose=500)
    print(f"fold{i} score", format(model.best_score, ".6f"))
    total_score += model.best_score / cfg.n_splits

    # 特徴量重要度
    fi_fold = pd.DataFrame()
    fi_fold["feature"] = model.feature_names_in_
    fi_fold["importance"] = model.feature_importances_
    fi_fold["fold"] = i
    fis.append(fi_fold)

fi = pd.concat(fis)    
fi = fi.groupby("feature")["importance"].mean().reset_index()
fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
fi.to_csv(cfg.output_dir + f"{cfg.exp_name}/fi_{group}.csv", index=False)

print(f"total score", format(total_score, ".6f"))
org_score = format(total_score, ".6f")

0-4


特徴量間の相関性が高い特徴量を1145個抽出
2089 -> 944
5-12
特徴量間の相関性が高い特徴量を2894個抽出
5991 -> 3097
fold : 0
[0]	validation_0-logloss:0.68809
[500]	validation_0-logloss:0.53543
[1000]	validation_0-logloss:0.53486
[1046]	validation_0-logloss:0.53486
fold0 score 0.534654
fold : 1
[0]	validation_0-logloss:0.68815
[500]	validation_0-logloss:0.53630
[863]	validation_0-logloss:0.53611
fold1 score 0.535984
fold : 2
[0]	validation_0-logloss:0.68809
[500]	validation_0-logloss:0.53451
[665]	validation_0-logloss:0.53444
fold2 score 0.534396
fold : 3
[0]	validation_0-logloss:0.68812
[500]	validation_0-logloss:0.53373
[854]	validation_0-logloss:0.53343
fold3 score 0.533312
fold : 4
[0]	validation_0-logloss:0.68812
[500]	validation_0-logloss:0.53556
[942]	validation_0-logloss:0.53519
fold4 score 0.535033
total score 0.534676


In [76]:
null_imp_params = {
    'objective' : 'binary:logistic',
    'tree_method': 'gpu_hist', 
    'eval_metric':'logloss',
    'n_estimators': 100, 
    'learning_rate': 0.1,
    'seed': cfg.seed,
    "enable_categorical": True,
    'max_depth': 8,
    'min_child_weight': 0.633766964312668,
    'gamma': 0.1299699438623672,
    'colsample_bytree': 0.7992922523509169,
    'subsample': 0.7061042367364462,
    'alpha': 2.0781568344639023,
    'lambda': 4.600879934143353
}

features = fi["feature"].tolist()
model = XGBClassifier(**null_imp_params)
model.fit(train[features], train[target].values, verbose=100)
fi_org = pd.DataFrame()
fi_org["feature"] = model.feature_names_in_
fi_org["importance"] = model.feature_importances_
fi_org = fi_org.sort_values("feature", ignore_index=True)

null_imps = []
for i in range(10):
    model = XGBClassifier(**null_imp_params)
    model.fit(train[features], train[target].sample(frac=1).values, verbose=100)
    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = model.feature_names_in_
    fi_tmp["importance"] = model.feature_importances_
    null_imps.append(fi_tmp)
null_imp = pd.concat(null_imps)
null_imp_mean = null_imp.groupby("feature")["importance"].mean().reset_index().sort_values("feature", ignore_index=True)
fi_org["null_imp"] = null_imp_mean["importance"]
selected_features = fi_org[fi_org["importance"]>fi_org["null_imp"]]["feature"].tolist()

null_imp_mean = null_imp.groupby("feature")["importance"].quantile(0.8).reset_index().sort_values("feature", ignore_index=True)
fi_org["null_imp"] = null_imp_mean["importance"]
selected_features = fi_org[fi_org["importance"]>fi_org["null_imp"]]["feature"].tolist()

null_imp_score = 0.0
for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
    tr_x, tr_y = train.iloc[tr_idx][selected_features], train.iloc[tr_idx][target]
    vl_x, vl_y = train.iloc[vl_idx][selected_features], train.iloc[vl_idx][target]

    model = XGBClassifier(**params)
    model.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)], verbose=0)
    print(f"fold{i} score", format(model.best_score, ".6f"))
    null_imp_score += model.best_score / cfg.n_splits
print("null_imp", format(total_score, ".6f"))

In [77]:
print(f"|{memo}|{str(len(fi))}|{org_score}|")

|room_fqidの系列|3097|0.534676|


In [78]:
#if cfg.mode == "local_cv":
#    valid_train_test_process_identity()
#    run_train()
#inference(cfg.mode)