# exp094

exp094のblending

各level_groupに閉じた形 : 0.699077  
level_group跨ぎ : 0.698151  
questionをcategoryに : 0.698698  
class_weight : 0.698593

In [26]:
import os
import sys
import traceback
import gc
import re
import time
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass
from collections import defaultdict

import pandas as pd
import numpy as np
import polars as pl
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GroupKFold
import lightgbm as lgb
from xgboost import XGBClassifier

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import itertools

import Levenshtein

import warnings
warnings.simplefilter('ignore')

In [27]:
@dataclass
class Cfg:
    """Settings for the exp094 blending experiment.

    Every attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field: values can be overridden per instance through the
    constructor (e.g. ``Cfg(mode="kaggle_inf")``) and show up in ``repr``.
    ``Cfg()`` with no arguments yields the same values as before.
    """

    mode: str = "local_cv"  # "local_cv" or "kaggle_inf"
    exp_name: str = "exp094-blending"
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"
    seed: int = 42
    n_splits: int = 5
    best_threshold: float = 0.630  # filled in from the local_cv result
    base_exp: "str | None" = None  # source exp whose feature importance is used
    n_features: int = 500  # number of features kept by feature reduction
cfg = Cfg()

if cfg.mode == "local_cv":
    # Make sure the experiment output directory and its cache sub-directory exist.
    _exp_out_dir = os.path.join(cfg.output_dir, cfg.exp_name)
    for _dir in (_exp_out_dir, os.path.join(_exp_out_dir, "cache")):
        os.makedirs(_dir, exist_ok=True)
    import cudf

elif cfg.mode == "kaggle_inf":
    import jo_wilder_310

In [28]:
def calc_metrics(oof):
    """Print OOF metrics and return the best global decision threshold.

    Prints the log loss, the best macro-F1 over a threshold grid, the
    threshold that achieves it, and the macro-F1 of questions 1..18 evaluated
    at that best threshold.

    Args:
        oof: DataFrame with "correct" (label), "pred" (predicted probability)
            and "question" columns.

    Returns:
        The threshold from ``np.arange(0.4, 0.81, 0.01)`` with the highest
        overall macro-F1 (0 if no threshold scores above 0).
    """
    logloss = log_loss(oof["correct"], oof["pred"])

    y_true = oof["correct"].values
    y_prob = oof["pred"].values

    # Find the best threshold.
    best_score = 0
    best_threshold = 0
    for threshold in np.arange(0.4, 0.81, 0.01):
        preds = (y_prob > threshold).astype(int)
        m = f1_score(y_true, preds, average='macro')
        if m > best_score:
            best_score = m
            best_threshold = threshold
    print("logloss", format(logloss, ".6f"))
    print("best_score", format(best_score, ".6f"))
    print("best_threshold", format(best_threshold, ".3f"))

    # Per-question score, evaluated at the selected best threshold (not at
    # the last grid value left over in the loop variable).
    print("---"*10)
    for q in range(1, 19):
        q_mask = (oof["question"] == q).values
        preds = (y_prob[q_mask] > best_threshold).astype(int)
        m = f1_score(y_true[q_mask], preds, average='macro')
        print(f"Q{q} : F1 = {format(m, '.6f')}")
    return best_threshold

In [29]:
def transform_labels_df_train(labels_):
    """
    Reshape the labels data for training.

    Splits the "<session>_q<n>" strings in "session_id" into an integer
    "question" column and an integer "session_id" column, then adds the
    "level_group" that each question belongs to. The input frame is not
    modified; a transformed copy is returned.
    """
    labels = labels_.copy()
    raw_ids = labels["session_id"]
    labels["question"] = raw_ids.map(lambda s: s.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = raw_ids.map(lambda s: s.split("_")[0]).astype(int)

    # Add the level_group matching each question as a column so the labels
    # can be joined with the train features.
    question = labels["question"].to_numpy()
    labels["level_group"] = np.select(
        [question <= 3, question <= 13],
        ["0-4", "5-12"],
        default="13-22",
    )

    return labels


def transform_labels_df_inf(labels_):
    """
    Reshape the labels data for inference.

    Splits the "<session>_q<n>" strings in "session_id" into an integer
    "question" column and an integer "session_id" column. The input frame is
    not modified; a transformed copy is returned.
    """
    labels = labels_.copy()
    raw_ids = labels["session_id"]
    labels["question"] = raw_ids.map(lambda s: s.split("_")[1].replace("q", "")).astype(int)
    labels["session_id"] = raw_ids.map(lambda s: s.split("_")[0]).astype(int)

    return labels

In [30]:
# Load the out-of-fold predictions of the three experiments being blended.
oof1, oof2, oof3 = (
    pd.read_csv(cfg.output_dir + rel_path)
    for rel_path in ("exp094/oof.csv.gz", "exp094-2/oof.csv.gz", "exp094-3/oof.csv.gz")
)

In [31]:
# Read the raw training labels, then split session_id/question and attach level_group.
labels_csv = cfg.input_dir + "train_labels.csv"
labels = pd.read_csv(labels_csv)
train = transform_labels_df_train(labels_=labels)

In [32]:
# One frame per level group, each re-indexed from 0.
train0_4, train5_12, train13_22 = (
    train.loc[train["level_group"] == group_name].reset_index(drop=True)
    for group_name in ("0-4", "5-12", "13-22")
)

In [33]:
# Attach each base experiment's per-question OOF predictions as features.
# A level group receives the predictions for every question from Q1 up to the
# last question of that group: Q1-3 for "0-4", Q1-13 for "5-12", Q1-18 for "13-22".
group_frames = [train0_4, train5_12, train13_22]
last_questions = [3, 13, 18]

for exp_name, oof_df in zip(["exp094", "exp094-2", "exp094-3"], [oof1, oof2, oof3]):
    # Wide table: one row per session, one prediction column per question.
    oof_pivot = pd.pivot_table(oof_df, index="session_id", columns="question", values="pred")
    new_cols = [f"{exp_name}_pred_q{q}" for q in oof_pivot.columns]
    oof_pivot.columns = new_cols

    group_frames = [
        frame.merge(
            oof_pivot[[f"{exp_name}_pred_q{q}" for q in range(1, last_q + 1)]],
            on="session_id",
            how="left",
        )
        for frame, last_q in zip(group_frames, last_questions)
    ]

train0_4, train5_12, train13_22 = group_frames

train = pd.concat(group_frames, ignore_index=True)
train["question"] = train["question"].astype("category")

In [34]:
# Weight for class 0: total row count divided by the number of rows with correct == 0.
w0 = len(train) / int((train["correct"] == 0).sum())

In [35]:
# Weight for class 1: total row count divided by the number of rows with correct == 1.
w1 = len(train) / int((train["correct"] == 1).sum())

In [36]:
# Per-row sample weight: w0 for correct == 0, w1 for correct == 1, 0.0 otherwise.
train["w"] = np.select(
    [train["correct"] == 0, train["correct"] == 1],
    [w0, w1],
    default=0.0,
)

In [37]:
# LightGBM training parameters passed to lgb.train.
params = dict(
    objective='binary',
    boosting='gbdt',
    learning_rate=0.01,
    metric='binary_logloss',
    seed=cfg.seed,
)

In [38]:
target = "correct"
# Columns that must not be fed to the model: the label, ids/grouping keys and the sample weight.
not_use_cols = [target, "session_id", "level_group", "w"]
excluded_cols = set(not_use_cols)
features = [col for col in train.columns if col not in excluded_cols]

In [39]:
# Grouped K-fold CV of the blending model: rows of one session_id never end up
# in both the training and the validation part of a fold.
gkf = GroupKFold(n_splits=cfg.n_splits)
fis = []

oofs = []
for i, (tr_idx, vl_idx) in enumerate(gkf.split(train[features], train[target], train["session_id"])):
    model_path = cfg.output_dir + f"{cfg.exp_name}/{cfg.exp_name}_model_{i}.lgb"

    print(f"fold : {i}")
    tr_df = train.iloc[tr_idx]
    vl_df = train.iloc[vl_idx]
    tr_x, tr_y = tr_df[features], tr_df[target]
    vl_x, vl_y = vl_df[features], vl_df[target]
    tr_data = lgb.Dataset(tr_x, label=tr_y, weight=tr_df["w"])
    vl_data = lgb.Dataset(vl_x, label=vl_y, weight=vl_df["w"])

    # Early stopping and periodic logging go through callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments of lgb.train
    # were removed in LightGBM 4.0.
    model = lgb.train(
        params,
        tr_data,
        num_boost_round=20000,
        valid_sets=[tr_data, vl_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )
    # Save the fold model.
    model.save_model(model_path)

    # Predictions on the validation part of this fold.
    oof_fold = vl_df.copy()
    oof_fold["pred"] = model.predict(vl_x, num_iteration=model.best_iteration)
    oofs.append(oof_fold)

    # Feature importance (gain) of this fold.
    fi_fold = pd.DataFrame()
    fi_fold["feature"] = model.feature_name()
    fi_fold["importance"] = model.feature_importance(importance_type="gain")
    fi_fold["fold"] = i
    fis.append(fi_fold)

# Average the importance over folds and store it, most important first.
fi = pd.concat(fis)
fi = fi.groupby("feature")["importance"].mean().reset_index()
fi = fi.sort_values("importance", ascending=False).reset_index(drop=True)
fi.to_csv(cfg.output_dir + f"{cfg.exp_name}/fi.csv", index=False)


# cv: score the stacked OOF predictions and keep the best threshold in cfg.
oof = pd.concat(oofs)
best_threshold = calc_metrics(oof)
cfg.best_threshold = best_threshold
oof[["session_id", "question", "pred", "correct"]].to_csv(cfg.output_dir + f"{cfg.exp_name}/oof.csv.gz", compression="gzip", index=False)

fold : 0
[LightGBM] [Info] Number of positive: 239789, number of negative: 99493
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13789
[LightGBM] [Info] Number of data points in the train set: 339282, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501349 -> initscore=0.005396
[LightGBM] [Info] Start training from score 0.005396
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.573053	valid_1's binary_logloss: 0.573039
[200]	training's binary_logloss: 0.546361	valid_1's binary_logloss: 0.5469
[300]	training's binary_logloss: 0.539044	valid_1's binary_logloss: 0.540131
[400]	training's binary_logloss: 0.536213	valid_1's binary_logloss: 0.538117
[500]	training's binary_logloss: 0.534508	valid_1's binary_logloss: 0.537589
[600]	training's binary_logloss: 0.533153	valid_1's binary_logloss: 0.537383
[700]	training's binary_logloss: 0.531995	valid_1's binary_logloss: 0.537275
[