# exp083

polarsで書き換え

In [16]:
import os
import sys
import traceback
import gc
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
import polars as pl
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import warnings
warnings.simplefilter('ignore')

In [17]:
@dataclass
class Cfg:
    mode = "local_cv" # "local_cv" or "kaggle_inf" 
    exp_name = "exp083"
    input_dir = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir = "/mnt/predict-student-performance-from-game-play/prep/"
    seed = 42
    n_splits = 5
    best_threshold = 0.630 # local_cvの結果を入れる
    base_exp = None # 特徴量重要度を使う元のexp
    n_features = 500 # 特徴量削減の数
cfg = Cfg()

if cfg.mode == "local_cv":
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name), exist_ok=True)
    os.makedirs(os.path.join(cfg.output_dir, cfg.exp_name, "cache"), exist_ok=True)
    import cudf

elif cfg.mode == "kaggle_inf":
    import jo_wilder_310

In [18]:
params = {
    'objective': 'binary', 
    'boosting': 'gbdt', 
    'learning_rate': 0.01, 
    'metric': 'binary_logloss', 
    'seed': cfg.seed, 
    'feature_pre_filter': False, 
    'lambda_l1': 4.134488140102331, 
    'lambda_l2': 0.007775200046481757, 
    'num_leaves': 75, 
    'feature_fraction': 0.5, 
    'bagging_fraction': 0.7036110805680353, 
    'bagging_freq': 3, 
    'min_data_in_leaf': 50, 
    'min_child_samples': 100
} 

In [19]:
level_group_list = ['0-4', '5-12', '13-22']
level_group_map = {
    "q1":"0-4", "q2":"0-4", "q3":"0-4",
    "q4":"5-12", "q5":"5-12", "q6":"5-12", "q7":"5-12", "q8":"5-12", "q9":"5-12", "q10":"5-12", "q11":"5-12", "q12":"5-12", "q13":"5-12",
    "q14":"13-22", "q15":"13-22", "q16":"13-22", "q17":"13-22", "q18":"13-22"  
}

In [20]:
if cfg.mode == "local_cv":
    with open(cfg.prep_dir + 'cat_col_lists_v2.pkl', 'rb') as f:
        cat_col_lists = pickle.load(f) 

elif cfg.mode == "kaggle_inf":
    with open("/kaggle/input/psp-cat-col-lists/cat_col_lists_v2.pkl", 'rb') as f:
        cat_col_lists = pickle.load(f) 

In [21]:
def transform_labels_df_train(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    # trainの特徴量と結合するためにquestionに対応するlabel_groupを列として設けておく
    labels["level_group"] = ""
    labels.loc[labels["question"]<=3, "level_group"] = "0-4"
    labels.loc[(labels["question"]>=4)&(labels["question"]<=13), "level_group"] = "5-12"
    labels.loc[labels["question"]>=14, "level_group"] = "13-22"

    return labels


def transform_labels_df_inf(labels_):
    """
    labelsデータを整形する
    """
    labels = labels_.copy()
    labels["question"] = labels["session_id"].apply(lambda x: x.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = labels["session_id"].apply(lambda x: x.split("_")[0]).astype(int)

    return labels

In [22]:
class Features:
    def __init__(self, sessions_df, need_create_features=None):
        self.sessions_df = pl.from_pandas(sessions_df).sort(["session_id", "index"])
        self.group = sessions_df["level_group"].values[0]
        self.need_create_features = need_create_features

    def prep(self):
        self.sessions_df = self.sessions_df.with_columns(
            [(pl.col("elapsed_time") - pl.col("elapsed_time").shift(1)).clip(0, 1e9).fill_null(0).over(["session_id"]).alias("time_diff"),
             (pl.col("event_name") + "_" + pl.col("name")).alias("event_name+name"),
             (pl.col("event_name") + "_" + pl.col("room_fqid")).alias("event_name+room_fqid"),
             (pl.col("event_name") + "_" + pl.col("fqid")).alias("event_name+fqid")
             ]
        )

    def get_aggs(self):
        g = self.group
        cats = cat_col_lists[g]
        aggs = []
        feats = []

        # トータルレコード数
        feats += [f"{g}_record_cnt"]
        aggs += [pl.col("index").count().alias(f"{g}_record_cnt")]

        # グループ全体の経過時間
        feats += [f"{g}_elapsed_time"]
        aggs += [pl.col("elapsed_time").apply(lambda s:s.max() - s.min()).alias(f"{g}_elapsed_time")]

        # 各categoryごとのレコード数
        for c in ["event_name"]:
            feats += [f"{g}_{c}_{str(v)}_record_cnt" for v in cats[c]]
            aggs += [pl.col("index").filter(pl.col(c)==v).count().alias(f"{g}_{c}_{str(v)}_record_cnt") for v in cats[c]]
        
        # 各categoryごとのユニーク数
        for c in ["event_name"]:
            feats += [f"{g}_{c}_nunique"]
            aggs += [pl.col(c).drop_nulls().n_unique().alias(f"{g}_{c}_nunique")]

        # 集計量
        for v in ["elapsed_time", "index"]:
            feats += [f"{g}_{v}_max", f"{g}_{v}_min"]
            aggs += [pl.col(v).max().alias(f"{g}_{v}_max"), pl.col(v).max().alias(f"{g}_{v}_min")]
        # for v in ["room_coor_x", "room_coor_y", "screen_coor_x", "screen_coor_y"]:
        #    feats += [f"{group}_{v}_mean"]
        #    aggs += [pl.col(v).mean().alias(f"{group}_{v}_mean")]
        #for v in ["time_diff", "hover_duration"]:
        #    feats += [f"{group}_{v}_max", f"{group}_{v}_min", f"{group}_{v}_std", f"{group}_{v}_mean", f"{group}_{v}_sum", f"{group}_{v}_median"]
        #    aggs += [pl.col(v).max().alias(f"{group}_{v}_max"), pl.col(v).min().alias(f"{group}_{v}_min"), pl.col(v).std().alias(f"{group}_{v}_std"),
        #             pl.col(v).mean().alias(f"{group}_{v}_mean"), pl.col(v).sum().alias(f"{group}_{v}_sum"), pl.col(v).mean().alias(f"{group}_{v}_mean")]
            
        #    feats += [f"{group}_{v}_quantile01", f"{group}_{v}_quantile02", f"{group}_{v}_quantile03", f"{group}_{v}_quantile04"
        #              f"{group}_{v}_quantile06", f"{group}_{v}_quantile07", f"{group}_{v}_quantile08", f"{group}_{v}_quantile09"]
        #    aggs += [pl.col(v).quantile(0.1, "nearest").alias(f"{group}_{v}_quantile01"),
        #             pl.col(v).quantile(0.2, "nearest").alias(f"{group}_{v}_quantile02"),
        #             pl.col(v).quantile(0.3, "nearest").alias(f"{group}_{v}_quantile03"),
        #             pl.col(v).quantile(0.4, "nearest").alias(f"{group}_{v}_quantile04"),
        #             pl.col(v).quantile(0.6, "nearest").alias(f"{group}_{v}_quantile06"),
        #             pl.col(v).quantile(0.7, "nearest").alias(f"{group}_{v}_quantile07"),
        #             pl.col(v).quantile(0.8, "nearest").alias(f"{group}_{v}_quantile08"),
        #             pl.col(v).quantile(0.9, "nearest").alias(f"{group}_{v}_quantile09")]

        # カテゴリ×集計量
        for c, v in itertools.product(["fqid"], ["elapsed_time"]):
            feats += [f"{g}_{c}_{cat}_{v}_max" for cat in cats[c]]
            aggs += [pl.col(v).filter(pl.col(c)==cat).max().fill_null(-1).alias(f"{g}_{c}_{cat}_{v}_max") for cat in cats[c]]

        # カテゴリの変化回数
        for c in ["room_fqid"]:
            feats += [f"{g}_{c}_change_cnt"]
            aggs += [(pl.col(c) != pl.col(c).shift(1)).sum().alias(f"{g}_{c}_change_cnt")]

        # 道中のminiゲーム特徴量
        
        # 生成する特徴量を限定
        if self.need_create_features is not None:
            aggs = [aggs[i] for i, f in enumerate(feats) if f in self.need_create_features]

        return aggs

    def get_features(self):
        self.prep()
        aggs = self.get_aggs()
        features = self.sessions_df.groupby(["session_id"], maintain_order=True).agg(aggs)
        return features.to_pandas()

In [23]:
def get_train_dataset(sessions, labels):
    # labelデータの整形
    labels = transform_labels_df_train(labels)

    # 特徴量生成
    features = Features(sessions).get_features()
    train = labels.merge(features, on=["session_id"], how="left")
    train["question"] = train["question"].astype("category")

    return train

def get_test_dataset(sessions, labels, feature_select=False, need_create_features=None):
    # labelデータの整形
    labels = transform_labels_df_inf(labels)

    # 特徴量生成
    features = Features(sessions, need_create_features).get_features()
    test = labels.merge(features, on=["session_id"], how="left")
    test["question"] = test["question"].astype("category")

    return test   

In [24]:
def calc_metrics(oof):
    logloss = log_loss(oof["correct"], oof["pred"])

    # find best th
    scores = []; thresholds = []
    best_score = 0; best_threshold = 0

    for threshold in np.arange(0.4,0.81,0.01):
        preds = (oof["pred"].values>threshold).astype(int)
        m = f1_score(oof["correct"].values, preds, average='macro')   
        scores.append(m)
        thresholds.append(threshold)
        if m>best_score:
            best_score = m
            best_threshold = threshold
    print("logloss", format(logloss, ".6f"))
    print("best_score", format(best_score, ".6f"))
    print("best_threshold", format(best_threshold, ".3f"))

    # Q別スコア
    print("---"*10)
    for q in range(18):
        q = q + 1
        preds = (oof[oof["question"]==q]["pred"].values>threshold).astype(int)
        m = f1_score(oof[oof["question"]==q]["correct"].values, preds, average='macro')
        print(f"Q{q} : F1 = {format(m, '.6f')}")
    return best_threshold

In [25]:
class FeaturesSelect:
    def __init__(self, df, init_features, corr_th=0.99):
        self.init_features = init_features
        self.df = cudf.from_pandas(df)
        self.corr_th = corr_th
        self.drop_cols = []
    
    def _high_corr_features_drop(self):
        num_cols = self.df[self.init_features].select_dtypes(include="number").columns

        # 特徴量間の相関行列を計算
        corr_matrix = self.df[num_cols].fillna(-1).corr().abs().to_pandas()
        # 相関行列の上三角行列を取得します。（相関行列が対称であるため、重複する相関を取り除くため）
        upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

        drop_cols = []
        for c in num_cols:
            if any(upper[c] > self.corr_th):
                drop_cols.append(c)
                upper = upper.drop(index=c)
        print(f"特徴量間の相関性が高い特徴量を{str(len(drop_cols))}個抽出")
        self.df = self.df.drop(columns=drop_cols)
        self.drop_cols = list(set(self.drop_cols + drop_cols))

    def features_select(self):
        self._high_corr_features_drop()
        selected_features = list(set(self.init_features) - set(self.drop_cols))
        print(f"{str(len(self.init_features))} -> {str(len(selected_features))}")

        return selected_features

In [26]:
def run_train():
    oofs = []
    prev_features_df = None # 次のlevel_groupで特徴量を使うための保持データ。0-4は前のlevel_groupがないので初期値はNone
    for group in level_group_list:
        print(group)
        # データ読み込み
        train_sessions = pd.read_csv(cfg.prep_dir + f"train{group}_cleaned.csv")
        labels = pd.read_csv(cfg.prep_dir + f"train_labels{group}.csv")
        train = get_train_dataset(train_sessions, labels)

        # 一つ前のlevel_groupの特徴量を追加
        if prev_features_df is not None:
            train = train.merge(prev_features_df, on=["session_id"], how="left")
        else:
            pass

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if group == "5-12":
            train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
            train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
        elif group == "13-22":
            train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
            train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]
    
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in train.columns if c not in not_use_cols]

        # 特徴量選択
        if cfg.base_exp is None:
            features = FeaturesSelect(train, features).features_select()
        else:
            # 使用する特徴量の抽出
            features = pd.read_csv(cfg.output_dir + f"{cfg.base_exp}/fi_{group}.csv").head(cfg.n_features)["feature"].tolist()

        gkf = GroupKFold(n_splits=cfg.n_splits)
        fis = []
        
        oof_groups = []
        for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
            model_path = cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.lgb"
            
            print(f"fold : {i}")
            tr_x, tr_y = train.iloc[tr_idx][features], train.iloc[tr_idx][target]
            vl_x, vl_y = train.iloc[vl_idx][features], train.iloc[vl_idx][target]
            tr_data = lgb.Dataset(tr_x, label=tr_y)
            vl_data = lgb.Dataset(vl_x, label=vl_y)

            if os.path.exists(model_path):
                print(f"modelが既に存在するのでロード : {model_path}")
                model = lgb.Booster(model_file=model_path)
            else:
                model = lgb.train(params, tr_data, valid_sets=[tr_data, vl_data],
                                num_boost_round=20000, early_stopping_rounds=100, verbose_eval=100)
            # モデル出力
            model.save_model(cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{group}_{i}.lgb")
        
            # valid_pred
            oof_fold = train.iloc[vl_idx].copy()
            oof_fold["pred"] = model.predict(vl_x, num_iteration=model.best_iteration)
            oof_groups.append(oof_fold)

            # 特徴量重要度
            fi_fold = pd.DataFrame()
            fi_fold["feature"] = model.feature_name()
            fi_fold["importance"] = model.feature_importance(importance_type="gain")
            fi_fold["fold"] = i
            fis.append(fi_fold)

        fi = pd.concat(fis)    
        fi = fi.groupby("feature")["importance"].mean().reset_index()
        fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
        fi.to_csv(cfg.output_dir + f"{cfg.exp_name}/fi_{group}.csv", index=False)

        oof_group = pd.concat(oof_groups)
        oofs.append(oof_group)

        # 次のlevel_groupで使う用に特徴量を保持
        prev_features_df = train.groupby("session_id").head(1).drop(columns=["question", "correct", "level_group"])

        # meta_featureの付与
        meta_df = oof_group.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{group}_pred_mean", "max":f"{group}_pred_max", "min":f"{group}_pred_min", "std":f"{group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

    # cv
    oof = pd.concat(oofs)
    best_threshold = calc_metrics(oof)
    cfg.best_threshold = best_threshold
    oof[["session_id", "question", "pred", "correct"]].to_csv(cfg.output_dir + f"{cfg.exp_name}/oof.csv.gz", compression="gzip", index=False)

In [27]:
def get_mock_iter_train():
    """trainデータのiter分割を適用したtest_sample
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")
    sub["level_group"] = sub["session_level"].apply(lambda x: x.split("_")[-1])
    
    # groupbyでiter作るときにgroup_levelの順番が崩れないように
    test["level_group2"] = test["level_group"].str.replace("13-22", "6")
    sub["level_group2"] = sub["level_group"].str.replace("13-22", "6")

    tests = [df[1].drop(columns=["session_level", "level_group2"]).reset_index(drop=True) for df in test.groupby("level_group2")]
    subs = [df[1].drop(columns=["session_level", "level_group2"]).reset_index(drop=True) for df in sub.groupby("level_group2")]
    return zip(tests, subs)

def get_mock_iter_test():
    """testデータのiter分割を適用したtest_sample
    """
    test = pd.read_csv(cfg.input_dir + "_old/test.csv")
    sub = pd.read_csv(cfg.input_dir + "_old/sample_submission.csv")
    
    # groupbyでiter作るときにgroup_levelの順番が崩れないように
    test["session_level"] = test["session_level"].str.replace("13-22", "6")
    sub["session_level"] = sub["session_level"].str.replace("13-22", "6")

    tests = [df[1].drop(columns="session_level").reset_index(drop=True) for df in test.groupby("session_level")]
    subs = [df[1].drop(columns="session_level").reset_index(drop=True) for df in sub.groupby("session_level")]
    return zip(tests, subs)

In [28]:
def inference(mode):
    if mode == "local_cv":
        # time series apiを模したiterをモックとして用意する
        iter_test = get_mock_iter_test()
        start_time = time.time()
    elif mode == "kaggle_inf":
        env = jo_wilder_310.make_env()
        iter_test = env.iter_test()
        
    model_dict = {}
    features_dict = {}
    for g in level_group_list:
        if mode == "local_cv":
            model_paths = [cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.lgb" for i in range(cfg.n_splits)]
        elif mode == "kaggle_inf":
            model_paths = [f"/kaggle/input/jo-wilder-{cfg.exp_name}/{cfg.exp_name}_model_{g}_{i}.lgb" for i in range(cfg.n_splits)]
        model_dict[g] = [lgb.Booster(model_file=p) for p in model_paths]
        features_dict[g] = model_dict[g][0].feature_name()
    need_create_features = features_dict["0-4"] + features_dict["5-12"] + features_dict["13-22"]
    not_drop_cols = ["0-4_elapsed_time_max", "0-4_index_max", "5-12_elapsed_time_max", "5-12_index_max", "13-22_elapsed_time_max", "13-22_index_max",
                     "0-4_elapsed_time_min", "0-4_index_min", "5-12_elapsed_time_min", "5-12_index_min", "13-22_elapsed_time_min", "13-22_index_min"]
    need_create_features = need_create_features + not_drop_cols
    need_create_features = list(set(need_create_features))
    
    prev_features_df = None
    for (test_sessions, sample_submission) in iter_test:
        level_group = test_sessions["level_group"].values[0]
        test = get_test_dataset(test_sessions, sample_submission, feature_select=True, need_create_features=need_create_features)
        features = features_dict[level_group]
        preds = np.zeros(len(test))

        if level_group == "0-4":
            pass
        else:
            test = test.merge(prev_features_df, on=["session_id"], how="left")

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if level_group == "5-12":
            test["0-4_question_duration_time"] = test["5-12_elapsed_time_min"] - test["0-4_elapsed_time_max"]
            test["0-4_question_duration_index"] = test["5-12_index_min"] - test["0-4_index_max"]
        elif level_group == "13-22":
            test["5-12_question_duration_time"] = test["13-22_elapsed_time_min"] - test["5-12_elapsed_time_max"]
            test["5-12_question_duration_index"] = test["13-22_index_min"] - test["5-12_index_max"]

        prev_features_df = test.groupby("session_id").head(1).drop(columns=["question", "correct"])

        for i in range(cfg.n_splits):
            model = model_dict[level_group][i]
            preds += model.predict(test[features], num_iteration=model.best_iteration) / cfg.n_splits
        test["pred"] = preds
        preds = (preds>cfg.best_threshold).astype(int)
        sample_submission["correct"] = preds

        # meta_featureの付与
        meta_df = test.groupby("session_id")["pred"].agg(["mean", "max", "min", "std"]).reset_index()
        meta_df = meta_df.rename(columns={"mean":f"{level_group}_pred_mean", "max":f"{level_group}_pred_max", "min":f"{level_group}_pred_min", "std":f"{level_group}_pred_std"})
        prev_features_df = prev_features_df.merge(meta_df, on="session_id", how="left")

        if mode == "local_cv":
            print(sample_submission["correct"].values)
        elif mode == "kaggle_inf":
            env.predict(sample_submission)
    if mode == "local_cv":
        process_time = format(time.time() - start_time, ".1f")
        print("sample_inf処理時間 : ", process_time, "秒")

In [29]:
def valid_train_test_process_identity():
    iter_train = get_mock_iter_train()
    iter_test = get_mock_iter_test()

    print("train_iter")
    train_df_dict = {}
    train_features_dict = {}
    prev_features_df = None
    for (sessions, sub) in iter_train:
        group = sessions["level_group"].values[0]
        print(group)
        train = get_train_dataset(sessions, sub)
        if prev_features_df is not None:
            train = train.merge(prev_features_df, on=["session_id"], how="left")
        else:
            pass
            # 前のlevel_groupのquestionパートの経過時間特徴量
        if group == "5-12":
            train["0-4_question_duration_time"] = train["5-12_elapsed_time_min"] - train["0-4_elapsed_time_max"]
            train["0-4_question_duration_index"] = train["5-12_index_min"] - train["0-4_index_max"]
        elif group == "13-22":
            train["5-12_question_duration_time"] = train["13-22_elapsed_time_min"] - train["5-12_elapsed_time_max"]
            train["5-12_question_duration_index"] = train["13-22_index_min"] - train["5-12_index_max"]
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in train.columns if c not in not_use_cols]
        train_df_dict[group] = train[["session_id"]+features].sort_values(["session_id", "question"], ignore_index=True)
        prev_features_df = train[["session_id"]+features].groupby("session_id").head(1).drop(columns="question")
        train_features_dict[group] = features


    print("test_iter")
    test_dfs_0_4 = []
    test_dfs_5_12 = []
    test_dfs_13_22 = []
    prev_features_df = None
    for (test_sessions, sample_submission) in iter_test:
        level_group = test_sessions["level_group"].values[0]
        session_id = test_sessions["session_id"].values[0]
        print(session_id, level_group)
        features = train_features_dict[level_group]
        test = get_test_dataset(test_sessions, sample_submission)

        if level_group == "0-4":
            pass
        else:
            test = test.merge(prev_features_df, on=["session_id"], how="left")

        # 前のlevel_groupのquestionパートの経過時間特徴量
        if level_group == "5-12":
            test["0-4_question_duration_time"] = test["5-12_elapsed_time_min"] - test["0-4_elapsed_time_max"]
            test["0-4_question_duration_index"] = test["5-12_index_min"] - test["0-4_index_max"]
        elif level_group == "13-22":
            test["5-12_question_duration_time"] = test["13-22_elapsed_time_min"] - test["5-12_elapsed_time_max"]
            test["5-12_question_duration_index"] = test["13-22_index_min"] - test["5-12_index_max"]
        target = "correct"
        not_use_cols = [target, "session_id", "level_group"]
        features = [c for c in test.columns if c not in not_use_cols]
        prev_features_df = test[["session_id"]+features].groupby("session_id").head(1).drop(columns="question")
        if level_group == "0-4":
            test_dfs_0_4.append(test[["session_id"]+features])
        elif level_group == "5-12":
            test_dfs_5_12.append(test[["session_id"]+features])
        elif level_group == "13-22":
            test_dfs_13_22.append(test[["session_id"]+features])
        

    test_dfs_0_4 = pd.concat(test_dfs_0_4, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)
    test_dfs_5_12 = pd.concat(test_dfs_5_12, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)
    test_dfs_13_22 = pd.concat(test_dfs_13_22, ignore_index=True).sort_values(["session_id", "question"], ignore_index=True)

    assert train_df_dict["0-4"][train_features_dict["0-4"]].equals(test_dfs_0_4[train_features_dict["0-4"]])
    assert train_df_dict["5-12"][train_features_dict["5-12"]].equals(test_dfs_5_12[train_features_dict["5-12"]])
    assert train_df_dict["13-22"][train_features_dict["13-22"]].equals(test_dfs_13_22[train_features_dict["13-22"]])

In [30]:
if cfg.mode == "local_cv":
    valid_train_test_process_identity()
    run_train()
inference(cfg.mode)

train_iter
0-4
5-12
13-22
test_iter
20090109393214576 0-4
20090109393214576 5-12
20090109393214576 13-22
20090312143683264 0-4
20090312143683264 5-12
20090312143683264 13-22
20090312331414616 0-4
20090312331414616 5-12
20090312331414616 13-22
0-4
特徴量間の相関性が高い特徴量を12個抽出
55 -> 43
fold : 0
[LightGBM] [Info] Number of positive: 49821, number of negative: 6726
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8046
[LightGBM] [Info] Number of data points in the train set: 56547, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.881055 -> initscore=2.002456
[LightGBM] [Info] Start training from score 2.002456
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.308265	valid_1's binary_logloss: 0.324131
[200]	training's binary_logloss: 0.283154	valid_1's binary_logloss: 0.304663
[300]	training's binary_logloss: 0.268829	valid_1's binary_logloss: 0.295844
[400]	training's binary_logloss: 0