# iter_train

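Train-data counterpart of `iter_test`: builds (session DataFrame, labels DataFrame) pairs, one per session and level group, and pickles them as an iterator.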

In [1]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm



In [2]:
@dataclass
class Cfg:
    """Directory locations used by this notebook.

    The attributes carry type annotations so that ``@dataclass`` registers
    them as real fields: without annotations they are plain class attributes,
    the generated ``__init__`` accepts no arguments, and ``repr``/``==`` ignore
    them. ``Cfg()`` still yields the same defaults as before; individual paths
    can now also be overridden, e.g. ``Cfg(input_dir="/tmp/input/")``.

    Paths are joined by plain string concatenation elsewhere in the file, so
    each value must keep its trailing slash.
    """

    # Directory the input CSV files are read from.
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    # Output directory (not referenced in the visible part of this file).
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    # Directory the pickled iterator is written to.
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"


cfg = Cfg()

In [3]:
def transform_labels_df(labels_):
    """Return a reshaped copy of the labels table.

    Each ``session_id`` value is split on ``"_"``: the first piece becomes the
    integer ``session`` column and the second piece, with ``"q"`` removed,
    becomes the integer ``question`` column. A ``level_group`` column
    ("0-4", "5-12" or "13-22") is derived from the question number.

    The input frame is not modified; the three new columns are appended to
    a copy, which is returned.
    """

    def _question_part(session_id):
        # Second "_"-separated piece, without the "q" marker.
        return session_id.split("_")[1].replace("q", "")

    def _session_part(session_id):
        # First "_"-separated piece.
        return session_id.split("_")[0]

    result = labels_.copy()
    result["question"] = result["session_id"].apply(_question_part).astype(int)
    result["session"] = result["session_id"].apply(_session_part).astype(int)

    # Add the level_group matching each question so the labels can be joined
    # with the train features.
    result["level_group"] = ""
    question = result["question"]
    group_masks = (
        ("0-4", question <= 3),
        ("5-12", (question >= 4) & (question <= 13)),
        ("13-22", question >= 14),
    )
    for group_name, mask in group_masks:
        result.loc[mask, "level_group"] = group_name

    return result

In [4]:
# Read train.csv and train_labels.csv from the configured input directory.
train_sessions = pd.read_csv(f"{cfg.input_dir}train.csv")
labels = pd.read_csv(f"{cfg.input_dir}train_labels.csv")

In [5]:
# Build a "<session_id>_<code>" key per row, where the code is "0", "1" or "2"
# for the level groups "0-4", "5-12" and "13-22" respectively.
train_sessions["session_level"] = (
    train_sessions["session_id"]
    .astype(str)
    .str.cat(
        train_sessions["level_group"].map({"0-4": "0", "5-12": "1", "13-22": "2"}),
        sep="_",
    )
)

In [6]:
# Split each label id on "_": the first piece is the session (kept as a string),
# the second piece without "q" is the question number.
labels["session"] = labels["session_id"].map(lambda sid: sid.split("_")[0]).astype(str)
labels["question"] = (
    labels["session_id"].map(lambda sid: sid.split("_")[1].replace("q", "")).astype(int)
)

# Level-group code per question: up to 3 -> "0", 4 through 13 -> "1", 14 and up -> "2".
labels["level_group"] = ""
labels.loc[labels["question"].le(3), "level_group"] = "0"
labels.loc[labels["question"].between(4, 13), "level_group"] = "1"
labels.loc[labels["question"].ge(14), "level_group"] = "2"

# Same "<session>_<code>" key format as the session_level column built for train_sessions.
labels["session_level"] = labels["session"].str.cat(labels["level_group"], sep="_")

# The helper columns are only needed to build the key.
labels = labels.drop(["session", "question"], axis=1)

In [7]:
# One frame per session_level key, with the key column(s) removed and the index reset.
# NOTE(review): the two lists are paired positionally when zipped below, which
# presumes both frames contain exactly the same set of session_level keys — confirm.
train_session_dfs = [
    group.drop(columns="session_level").reset_index(drop=True)
    for _, group in train_sessions.groupby("session_level")
]
labels_dfs = [
    group.drop(columns=["session_level", "level_group"]).reset_index(drop=True)
    for _, group in labels.groupby("session_level")
]

In [8]:
# The API specification changed, so the order of session_df and label was
# swapped to match it.
# NOTE(review): a zip object can be consumed only once and stops at the shorter
# list without raising — confirm that downstream code expects this.
iter_train = zip(train_session_dfs, labels_dfs)
with open(f"{cfg.prep_dir}iter_train.pkl", mode="wb") as f:
    pickle.dump(iter_train, f)