# train_group

train.csv と train_labels.csv を level_group(0-4 / 5-12 / 13-22)ごとに分割し、prep ディレクトリに CSV として保存するノートブック。

In [1]:
import os
import sys
import traceback
import gc
import random
import pickle
import pathlib
import subprocess
from dataclasses import dataclass

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
@dataclass
class Cfg:
    """Run configuration for this notebook.

    Each attribute carries a type annotation so that ``@dataclass`` registers
    it as a real field. Without annotations the decorator generates no fields,
    and overriding a value at construction time (``Cfg(mode="kaggle_inf")``)
    raises ``TypeError``. Calling ``Cfg()`` with no arguments still yields the
    same defaults as before.
    """
    # Execution mode: "local_cv" or "kaggle_inf".
    mode: str = "local_cv"
    # Experiment identifier.
    exp_name: str = "exp015"
    # Directory the raw train.csv / train_labels.csv are read from.
    # Kept with a trailing slash because file paths are built by concatenation.
    input_dir: str = "/mnt/predict-student-performance-from-game-play/input/"
    # Output directory (not referenced by the cells visible in this notebook).
    output_dir: str = "/mnt/predict-student-performance-from-game-play/output/"
    # Directory the per-level-group CSV splits are written to.
    prep_dir: str = "/mnt/predict-student-performance-from-game-play/prep/"


cfg = Cfg()

In [3]:
def transform_labels_df(labels_):
    """
    Return a copy of the labels frame with ``question`` and ``level_group`` columns added.

    Parameters
    ----------
    labels_ : pandas.DataFrame
        Must contain a ``session_id`` column of strings shaped like
        ``"<session>_q<N>"``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``labels_`` plus:
        - ``question``: the integer ``N`` parsed from ``session_id``;
        - ``level_group``: ``"0-4"`` for questions <= 3, ``"5-12"`` for
          questions 4..13, ``"13-22"`` for questions >= 14.
        ``session_id`` itself is left untouched (it keeps its ``_q<N>`` suffix).
    """
    def _question_token(session_key):
        # "<session>_q<N>" -> "<N>"
        return session_key.split("_")[1].replace("q", "")

    out = labels_.copy()
    out["question"] = out["session_id"].map(_question_token).astype(int)

    # Attach the level_group matching each question so the labels can later be
    # joined with the train features of the same group.
    question_no = out["question"]
    out["level_group"] = ""
    out.loc[question_no.le(3), "level_group"] = "0-4"
    out.loc[question_no.between(4, 13), "level_group"] = "5-12"
    out.loc[question_no.ge(14), "level_group"] = "13-22"

    return out

In [4]:
# Read the raw event log and the per-question labels from the configured input directory.
train_sessions, labels = (
    pd.read_csv(cfg.input_dir + csv_name)
    for csv_name in ("train.csv", "train_labels.csv")
)

In [5]:
# Preview the loaded event log (one row per in-game event).
train_sessions

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,,,,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,,,,0-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13174206,22100221145014656,1600,5483231,navigate_click,undefined,22,,343.887291,36.701026,483.0,273.0,,,,tunic.capitol_2.hall,,,,,13-22
13174207,22100221145014656,1601,5485166,navigate_click,undefined,22,,332.696070,141.493178,545.0,221.0,,,chap4_finale_c,tunic.capitol_2.hall,,,,,13-22
13174208,22100221145014656,1602,5485917,navigate_click,undefined,22,,369.912859,140.569205,611.0,217.0,,,,tunic.capitol_2.hall,,,,,13-22
13174209,22100221145014656,1603,5486753,navigate_click,undefined,22,,252.299653,123.805889,526.0,232.0,,,chap4_finale_c,tunic.capitol_2.hall,,,,,13-22


In [6]:
# Preview the raw labels: session_id has the form "<session>_q<N>".
labels

Unnamed: 0,session_id,correct
0,20090312431273200_q1,1
1,20090312433251036_q1,0
2,20090314121766812_q1,1
3,20090314363702160_q1,1
4,20090314441803444_q1,1
...,...,...
212017,22100215342220508_q18,1
212018,22100215460321130_q18,1
212019,22100217104993650_q18,1
212020,22100219442786200_q18,1


In [7]:
# Column dtypes of the labels frame.
labels.dtypes

session_id    object
correct        int64
dtype: object

In [8]:
# Add the parsed `question` number and its `level_group` to the labels.
# Rebinds `labels`; transform_labels_df works on a copy of its input.
labels = transform_labels_df(labels)

In [9]:
# Preview the labels with the added `question` and `level_group` columns.
labels

Unnamed: 0,session_id,correct,question,level_group
0,20090312431273200_q1,1,1,0-4
1,20090312433251036_q1,0,1,0-4
2,20090314121766812_q1,1,1,0-4
3,20090314363702160_q1,1,1,0-4
4,20090314441803444_q1,1,1,0-4
...,...,...,...,...
212017,22100215342220508_q18,1,18,13-22
212018,22100215460321130_q18,1,18,13-22
212019,22100217104993650_q18,1,18,13-22
212020,22100219442786200_q18,1,18,13-22


In [10]:
# Column dtypes of the labels frame.
labels.dtypes

session_id     object
correct         int64
question        int64
level_group    object
dtype: object

In [11]:
# Number of event rows per level group.
train_sessions["level_group"].value_counts()

13-22    6746397
5-12     4433127
0-4      1994687
Name: level_group, dtype: int64

In [12]:
# Ensure the destination directory exists so a fresh environment does not fail
# on the first to_csv call.
prep_dir = pathlib.Path(cfg.prep_dir)
prep_dir.mkdir(parents=True, exist_ok=True)

# Split the event log and the labels by level group and write each part as CSV.
# `train_sessions_group` / `labels_group` stay bound after the loop (holding the
# last group, "13-22") because later cells preview them.
for group in ["0-4", "5-12", "13-22"]:
    # Boolean-mask selection already returns a new frame, so an extra .copy()
    # would only duplicate millions of rows in memory.
    train_sessions_group = train_sessions[train_sessions["level_group"] == group].reset_index(drop=True)
    # Keep only the columns needed downstream; `question` and `level_group` are dropped.
    labels_group = labels.loc[labels["level_group"] == group, ["session_id", "correct"]].reset_index(drop=True)
    train_sessions_group.to_csv(prep_dir / f"train{group}.csv", index=False)
    labels_group.to_csv(prep_dir / f"train_labels{group}.csv", index=False)

In [13]:
# `train_sessions_group` is whatever the split loop assigned last, i.e. the "13-22" group.
train_sessions_group.head()

Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,512,836732,navigate_click,undefined,13,,290.153549,-204.499365,651.0,445.0,,,,tunic.capitol_1.hall,,,,,13-22
1,20090312431273200,513,837245,navigate_click,undefined,13,,353.805607,-210.332061,672.0,445.0,,,,tunic.capitol_1.hall,,,,,13-22
2,20090312431273200,514,837779,navigate_click,undefined,13,,587.680024,-280.706245,780.0,489.0,,,,tunic.capitol_1.hall,,,,,13-22
3,20090312431273200,515,838446,navigate_click,undefined,13,,751.496869,-102.153292,823.0,365.0,,,toentry,tunic.capitol_1.hall,,,,,13-22
4,20090312431273200,516,839629,map_hover,basic,13,,,,,,67.0,,tunic.drycleaner,tunic.capitol_1.hall,,,,,13-22


In [14]:
# `labels_group` is whatever the split loop assigned last, i.e. the "13-22" group.
labels_group.head()

Unnamed: 0,session_id,correct
0,20090312431273200_q14,1
1,20090312433251036_q14,1
2,20090314121766812_q14,1
3,20090314363702160_q14,1
4,20090314441803444_q14,1
