# train_group_cleaned

level_group別に分割したtrainデータを作成する。  
２周目（リプレイ）のデータは除外する。

In [1]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

# Let pandas display up to 10,000 rows before truncating printed frames.
pd.set_option('display.max_rows', 10000)



In [2]:
@dataclass
class Cfg:
    """
    Directory settings used by this notebook.

    Every attribute carries a type annotation so that it is a real dataclass
    field: it can be overridden per instance (e.g. ``Cfg(prep_dir="...")``)
    and it shows up in the generated ``repr`` / ``==``.

    All paths end with "/" because file names are appended to them by plain
    string concatenation.
    """
    # Directory the raw train.csv / train_labels.csv are read from.
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    # Output directory (not referenced by the cells visible in this notebook).
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    # Directory the per-level_group cleaned CSV files are written to.
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"
cfg = Cfg()

In [3]:
def transform_labels_df(labels_):
    """
    Return a copy of the labels frame with ``question`` and ``level_group`` added.

    The input frame is not modified. ``session_id`` values must be strings of
    the form ``"<session>_q<N>"``; the column itself is left unchanged.

    Parameters
    ----------
    labels_ : pandas.DataFrame
        Labels table containing a ``session_id`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``labels_`` with two extra columns:
        ``question``    - N parsed from ``session_id`` as an integer;
        ``level_group`` - "0-4" for N <= 3, "5-12" for 4 <= N <= 13,
                          "13-22" for N >= 14.
    """
    def _question_number(session_id):
        # "<session>_q<N>" -> "<N>" (still a string; cast to int by the caller)
        return session_id.split("_")[1].replace("q", "")

    result = labels_.copy()
    result["question"] = result["session_id"].map(_question_number).astype(int)

    # level_group is attached so the labels can later be joined with the
    # train features of the matching level group.
    question = result["question"]
    group_masks = {
        "0-4": question <= 3,
        "5-12": question.between(4, 13),  # inclusive on both ends
        "13-22": question >= 14,
    }
    result["level_group"] = ""
    for group_name, in_group in group_masks.items():
        result.loc[in_group, "level_group"] = group_name

    return result

In [4]:
# Read the raw event log and the labels from the input directory; the labels
# are passed through transform_labels_df to gain question / level_group columns.
train_sessions = pd.read_csv(f"{cfg.input_dir}train.csv")
labels = transform_labels_df(pd.read_csv(f"{cfg.input_dir}train_labels.csv"))

In [6]:
# Drop replay data.
# Each row gets a per-session segment number that goes up by one whenever its
# level_group differs from the previous row of the same session. The first row
# of a session always counts as a change because shift(1) yields NaN there.
# Rows whose segment number is greater than 3 are discarded.
# NOTE(review): presumably relies on rows being in play order within each
# session - confirm against the raw train.csv.
previous_level_group = train_sessions.groupby("session_id")["level_group"].shift(1)
is_group_change = (train_sessions["level_group"] != previous_level_group).astype(int)
segment_number = is_group_change.groupby(train_sessions["session_id"]).cumsum()

train_sessions = train_sessions[segment_number <= 3].reset_index(drop=True)

# The helper Series are only needed for the filter above.
del previous_level_group, is_group_change, segment_number

In [6]:
# Write one events file and one labels file per level_group into cfg.prep_dir.
# Only session_id and correct are kept in the labels files.
# After the loop, train_sessions_group / labels_group still hold the last
# group ("13-22"); the preview cells below rely on that.
for group in ("0-4", "5-12", "13-22"):
    train_sessions_group = (
        train_sessions
        .loc[train_sessions["level_group"] == group]
        .reset_index(drop=True)
    )
    labels_group = (
        labels
        .loc[labels["level_group"] == group, ["session_id", "correct"]]
        .reset_index(drop=True)
    )

    train_sessions_group.to_csv(f"{cfg.prep_dir}train{group}_cleaned.csv", index=False)
    labels_group.to_csv(f"{cfg.prep_dir}train_labels{group}_cleaned.csv", index=False)

In [7]:
# Preview the events frame left over from the last loop iteration (level_group "13-22").
train_sessions_group.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,512,836732,navigate_click,undefined,13,,290.153549,-204.499365,651.0,445.0,,,,tunic.capitol_1.hall,,0,0,1,13-22
1,20090312431273200,513,837245,navigate_click,undefined,13,,353.805607,-210.332061,672.0,445.0,,,,tunic.capitol_1.hall,,0,0,1,13-22
2,20090312431273200,514,837779,navigate_click,undefined,13,,587.680024,-280.706245,780.0,489.0,,,,tunic.capitol_1.hall,,0,0,1,13-22
3,20090312431273200,515,838446,navigate_click,undefined,13,,751.496869,-102.153292,823.0,365.0,,,toentry,tunic.capitol_1.hall,,0,0,1,13-22
4,20090312431273200,516,839629,map_hover,basic,13,,,,,,67.0,,tunic.drycleaner,tunic.capitol_1.hall,,0,0,1,13-22


In [8]:
# Preview the labels frame left over from the last loop iteration (level_group "13-22").
labels_group.head()

Unnamed: 0,session_id,correct
0,20090312431273200_q14,1
1,20090312433251036_q14,1
2,20090312455206810_q14,1
3,20090313091715820_q14,1
4,20090313571836404_q14,0
