In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import pickle
import polars as pl
from sklearn.model_selection import KFold, GroupKFold, train_test_split
from xgboost import XGBClassifier
# from catboost import CatBoostClassifier, Pool
from sklearn.metrics import f1_score, accuracy_score
from tqdm.notebook import tqdm
from collections import defaultdict
import warnings
from itertools import combinations

# Notebook-wide settings: suppress every warning, and let pandas show all
# columns and up to 200 rows when a frame is displayed.
warnings.simplefilter('ignore')
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

In [11]:
# Kaggle location of the same file (kept for reference):
# targets = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')

# ----------------Locally load data
targets = pd.read_csv('./Data/train_labels.csv')

# Derive two integer columns from the '_'-separated session_id string
# (presumably shaped like "<session>_q<question>" -- confirm against the file):
#   session -> the first piece, as an integer
#   q       -> the last piece with its first character dropped, as an integer
# The split is done once with the vectorised .str accessor instead of two
# per-row Python lambdas.
session_id_parts = targets['session_id'].str.split('_')
targets['session'] = session_id_parts.str[0].astype('int64')
targets['q'] = session_id_parts.str[-1].str[1:].astype('int64')
print(targets.shape)

(424116, 4)


In [12]:
# Scratch exploration kept disabled inside a bare string literal (nothing in it
# is executed). Because the literal is the cell's last expression, the notebook
# echoes its repr as the cell output.
"""# See each user got how many questions right
targets.groupby('session')['correct'].agg('sum').sort_values()

# Save user #ID answers
targets.loc[targets['session']==20100514144222530].to_csv('20100514144222530_answer.csv')"""

"# See each user got how many questions right\ntargets.groupby('session')['correct'].agg('sum').sort_values()\n\n# Save user #ID answers\ntargets.loc[targets['session']==20100514144222530].to_csv('20100514144222530_answer.csv')"

In [14]:
# Explicit polars dtype for every column of train.csv; handed to pl.read_csv
# through its `dtypes` argument when the training data is loaded.
train_dtypes = {
    "session_id": pl.Int64,
    "elapsed_time": pl.Int64,
    "event_name": pl.Categorical,
    "name": pl.Categorical,
    "level": pl.Int8,
    "page": pl.Float32,
    "room_coor_x": pl.Float32,
    "room_coor_y": pl.Float32,
    "screen_coor_x": pl.Float32,
    "screen_coor_y": pl.Float32,
    "hover_duration": pl.Float32,
    "text": pl.Utf8,
    "fqid": pl.Utf8,
    "room_fqid": pl.Categorical,
    "text_fqid": pl.Utf8,
    "fullscreen": pl.Int8,
    "hq": pl.Int8,
    "music": pl.Int8,
    "level_group": pl.Categorical,
}

In [15]:
# Upper bound for a single step's duration: 1 hour (presumably elapsed_time is
# in milliseconds -- confirm); anything longer is treated as an outlier.
time_up_lim = 3.6e6

# Expressions applied with `with_columns` when the training data is loaded.
columns = [
    # Keep `page` as Float32.
    pl.col("page").cast(pl.Float32),
    # elapsed_time_diff: time used for each action, i.e. the difference to the
    # previous row's elapsed_time. Missing differences become 0, the value is
    # clipped to [0, time_up_lim], and the expression is windowed over
    # (session_id, level_group).
    (pl.col("elapsed_time") - pl.col("elapsed_time").shift(1))
    .fill_null(0)
    .clip(0, time_up_lim)
    .over(["session_id", "level_group"])
    .alias("elapsed_time_diff"),
    # Replace missing ids with the placeholders "fqid_None" / "text_fqid_None".
    *(
        pl.col(id_column).fill_null(f"{id_column}_None")
        for id_column in ("fqid", "text_fqid")
    ),
]

In [16]:
%%time

# Prepare the training dataset: read the raw events with the explicit schema,
# drop the three settings columns, then apply the derived `columns` expressions.
#
# Kaggle variant of the same load (disabled, kept for reference):
#   df = (pl.read_csv("/kaggle/input/predict-student-performance-from-game-play/train.csv", dtypes=train_dtypes)
#         .drop(["fullscreen", "hq", "music"])
#         .with_columns(columns))
#   df = reduce_mem_usage_pl(df)

# ----------------Locally load data
df = pl.read_csv("./Data/train.csv", dtypes=train_dtypes)
df = df.drop(["fullscreen", "hq", "music"])
df = df.with_columns(columns)

print(f'df loaded with shape: {df.shape}')

df loaded with shape: (26296946, 18)
CPU times: user 24.6 s, sys: 11.3 s, total: 36 s
Wall time: 7.41 s


In [17]:
# Split the events into one frame per level_group value, in the order
# '0-4' -> df1, '5-12' -> df2, '13-22' -> df3.
df1, df2, df3 = (
    df.filter(pl.col("level_group") == group_label)
    for group_label in ('0-4', '5-12', '13-22')
)

# Report the size of each split, one line per frame.
print(
    f'df1 loaded with shape: {df1.shape}',
    f'df2 loaded with shape: {df2.shape}',
    f'df3 loaded with shape: {df3.shape}',
    sep='\n',
)

df1 loaded with shape: (3981005, 18)
df2 loaded with shape: (8844238, 18)
df3 loaded with shape: (13471703, 18)


In [25]:
# Reload the raw events with pandas (no dtype overrides, no dropped columns)
# and keep only the rows of level group '0-4'.
# NOTE(review): this rebinds `df` and `df1`, which the polars cells earlier in
# the notebook bound to polars frames. Cells that later call polars-style
# methods on `df1` (e.g. `df1.filter(pl.col(...))`) will presumably not work
# against these pandas objects -- confirm the intended execution order or give
# the pandas frames their own names.
df = pd.read_csv('./Data/train.csv')
is_first_group = df["level_group"] == '0-4'
df1 = df.loc[is_first_group]

print(f'df1 loaded with shape: {df1.shape}')



df1 loaded with shape: (3981005, 20)


In [41]:
# Rows of level group '0-4' from the pandas frame, indexed by session_id so
# that whole sessions can be selected with .loc; displayed as the cell output.
df1 = df.loc[df["level_group"] == '0-4'].set_index('session_id')
df1

Unnamed: 0_level_0,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090312431273200,0,0,cutscene_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
20090312431273200,1,1323,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,2,831,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,3,1147,person_click,basic,0,,-413.991405,-159.314686,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
20090312431273200,4,1863,person_click,basic,0,,-412.991405,-159.314686,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100221145014656,206,352635,navigate_click,undefined,4,,-914.204218,-50.000000,26.0,380.0,,,toentry,tunic.kohlcenter.halloffame,,0,0,1,0-4
22100221145014656,207,353336,map_hover,basic,4,,,,,,84.0,,tunic.kohlcenter,tunic.kohlcenter.halloffame,,0,0,1,0-4
22100221145014656,208,354020,map_click,undefined,4,,-549.217711,-2.000000,449.0,332.0,,,tunic.capitol_0,tunic.kohlcenter.halloffame,,0,0,1,0-4
22100221145014656,209,356238,navigate_click,undefined,4,,313.584260,107.913147,463.0,226.0,,,chap1_finale,tunic.capitol_0.hall,,0,0,1,0-4


In [56]:
# Number of label rows for question 1, split by outcome.
print(targets.query("q == 1 and correct == 1").shape[0])  # correct ones

print(targets.query("q == 1 and correct == 0").shape[0])  # wrong ones


17141
6421


In [59]:
# Session ids whose question-1 label is 0 (answered incorrectly).
q1_wrong_sessions = targets.loc[
    (targets['q'] == 1) & (targets['correct'] == 0), 'session'
].to_numpy()

# All level-group '0-4' events of those sessions (df1 is indexed by session_id),
# then only the events whose text is exactly 'Better check back later.'.
df1_temp = df1.loc[q1_wrong_sessions]
df1_temp.loc[df1_temp['text'] == 'Better check back later.']

Unnamed: 0_level_0,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090315081004164,129,195230,observation_click,basic,3,,32.873385,151.547441,467.0,180.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
20090317080721164,92,179700,observation_click,basic,3,,112.782111,123.557993,543.0,208.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
20090317080721164,94,182005,observation_click,basic,3,,-377.215412,-273.427730,53.0,605.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
20090318021527530,123,153964,observation_click,basic,3,,-13.385638,134.827478,418.0,190.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
20090318021527530,125,155915,observation_click,basic,3,,-69.247561,-205.613026,361.0,537.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22090614531427040,199,94657,observation_click,basic,3,,138.545748,55.817875,588.0,275.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
22090615160053548,100,165342,observation_click,basic,3,,251.739638,145.313195,682.0,186.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,0,0-4
22100215460321130,109,230353,observation_click,basic,3,,136.946999,-6.352391,606.0,360.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4
22100221145014656,142,285335,observation_click,basic,3,,16.732242,147.270564,447.0,184.0,,Better check back later.,outtolunch,tunic.historicalsociety.stacks,tunic.historicalsociety.stacks.outtolunch,0,0,1,0-4


In [30]:
# Max and min of each screen coordinate, named "<column>_max" / "<column>_min",
# in the order x_max, x_min, y_max, y_min.
aggs = [
    stat_expr
    for coord in ("screen_coor_x", "screen_coor_y")
    for stat_expr in (
        pl.col(coord).max().alias(f"{coord}_max"),
        pl.col(coord).min().alias(f"{coord}_min"),
    )
]

# One row per session of df2 with the four aggregates, sorted by session_id.
(
    df2.groupby(["session_id"], maintain_order=True)
    .agg(aggs)
    .sort("session_id")
)


session_id,screen_coor_x_max,screen_coor_x_min,screen_coor_y_max,screen_coor_y_min
i64,f32,f32,f32,f32
20090312431273200,872.0,15.0,640.0,50.0
20090312433251036,878.0,2.0,637.0,59.0
20090312455206810,1196.0,13.0,880.0,50.0
20090313091715820,1023.0,17.0,752.0,32.0
20090313571836404,879.0,12.0,654.0,25.0
20090314035813970,877.0,11.0,634.0,50.0
20090314121766812,870.0,14.0,643.0,33.0
20090314221187252,861.0,32.0,654.0,58.0
20090314363702160,869.0,17.0,637.0,52.0
20090314441803444,878.0,60.0,637.0,45.0


In [None]:
# Write the events of one session to CSV, one file per level-group frame
# (df1 -> *_part1.csv, df2 -> *_part2.csv, df3 -> *_part3.csv).
# NOTE(review): `df1` is rebound to a pandas DataFrame by the pd.read_csv cell
# earlier in the notebook; this polars-style filter presumably needs the polars
# `df1` -- confirm the execution order before running.
session_to_dump = 20100514144222530
for part_no, part_df in enumerate((df1, df2, df3), start=1):
    part_df.filter(pl.col("session_id") == session_to_dump).write_csv(
        f'{session_to_dump}_part{part_no}.csv'
    )

In [None]:
# For every level-group frame, group by (fqid, text_fqid), take the per-group
# unique counts and write them to ./fqid_text_interact<N>.csv (N = 1, 2, 3).
# NOTE(review): `df1` is rebound to a pandas DataFrame by the pd.read_csv cell
# earlier in the notebook; this polars-style call chain presumably needs the
# polars `df1` -- confirm the execution order before running.
for part_no, part_df in enumerate((df1, df2, df3), start=1):
    part_df.groupby(['fqid', 'text_fqid']).n_unique().write_csv(
        f'./fqid_text_interact{part_no}.csv'
    )