This notebook reuses the preprocessing from https://www.kaggle.com/code/takanashihumbert/magic-bingo-train-part-lb-0-687
and from https://www.kaggle.com/code/leehomhuang/catboost-baseline-with-lots-features-inference,
and adds some new features.

It also borrows ideas from https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-676.

In [1]:
import gc
import os

import pandas as pd
import numpy as np
import warnings
import pickle
import polars as pl

from collections import defaultdict
from itertools import combinations
import pyarrow as pa

import lightgbm as lgb
from lightgbm import LGBMClassifier
from lightgbm import Booster
from lightgbm import early_stopping

from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.metrics import roc_auc_score, f1_score


import matplotlib.pyplot as plt
from colorama import Fore, Back, Style

In [2]:
# Explicit polars dtypes used when reading the training CSV.
train_dtypes = {
    "session_id": pl.Int64,
    "elapsed_time": pl.Int64,
    "event_name": pl.Categorical,
    "name": pl.Categorical,
    "level": pl.Int8,
    "page": pl.Float32,
    "room_coor_x": pl.Float32,
    "room_coor_y": pl.Float32,
    "screen_coor_x": pl.Float32,
    "screen_coor_y": pl.Float32,
    "hover_duration": pl.Float32,
    "text": pl.Utf8,
    "fqid": pl.Utf8,
    "room_fqid": pl.Categorical,
    "text_fqid": pl.Utf8,
    "fullscreen": pl.Int8,
    "hq": pl.Int8,
    "music": pl.Int8,
    "level_group": pl.Categorical,
}

# Same schema as the training data plus the extra `session_level` column.
test_dtypes = {**train_dtypes, "session_level": pl.Int8}

In [3]:
def reduce_mem_usage_pl(df):
    """Downcast the columns of a polars DataFrame to smaller dtypes.

    * Signed integer columns are cast to the smallest of Int8/Int16/Int32/Int64
      whose range contains the column's min and max.
    * Float columns are cast to Float32 when min and max fit in the Float32 range.
    * Utf8 columns are cast to Categorical.
    * Every other dtype is left untouched.

    The memory footprint before and after is printed.

    Parameters
    ----------
    df : polars.DataFrame
        Frame to shrink. It is not modified; a new frame is returned.

    Returns
    -------
    polars.DataFrame
        Frame with the same columns and the reduced dtypes.
    """
    start_mem = df.estimated_size("mb")
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Unsigned types (pl.UInt8 .. pl.UInt64) are deliberately not handled.
    Numeric_Int_types = [pl.Int8, pl.Int16, pl.Int32, pl.Int64]
    Numeric_Float_types = [pl.Float32, pl.Float64]
    # Candidate integer targets, narrowest first.
    int_targets = [(pl.Int8, np.int8), (pl.Int16, np.int16),
                   (pl.Int32, np.int32), (pl.Int64, np.int64)]

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == pl.Utf8:
            df = df.with_columns(df[col].cast(pl.Categorical))
            continue

        is_int = col_type in Numeric_Int_types
        is_float = col_type in Numeric_Float_types
        if not (is_int or is_float):
            continue

        # min/max are only needed (and only meaningful) for numeric columns.
        c_min = df[col].min()
        c_max = df[col].max()
        if c_min is None or c_max is None:
            # Column holds no non-null value: nothing to measure, and comparing
            # None with a number would raise a TypeError.
            continue

        if is_int:
            for pl_type, np_type in int_targets:
                bounds = np.iinfo(np_type)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if bounds.min <= c_min and c_max <= bounds.max:
                    df = df.with_columns(df[col].cast(pl_type))
                    break
        else:
            bounds = np.finfo(np.float32)
            # NaN extremes fail both comparisons, so such columns are left as-is.
            if bounds.min <= c_min and c_max <= bounds.max:
                df = df.with_columns(df[col].cast(pl.Float32))

    mem_usg = df.estimated_size("mb")
    print("Memory usage became: ",mem_usg," MB")

    return df

# Data preprocessing

In [4]:
# Categorical columns: one "number of distinct values" feature is built per column.
CATS = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
]

# Numeric columns: mean / std / min / max / median features are built per column.
NUMS = [
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
    'elapsed_time_diff',
]

# Patterns searched for in the `text` column (case-sensitive, hence both
# 'found'/'Found' and 'Wells'/'wells').
DIALOGS = [
    'that', 'this', 'it', 'you',
    'find', 'found', 'Found',
    'notebook', 'Wells', 'wells',
    'help', 'need',
    'Oh', 'Ooh', 'Jo',
    'flag', 'can', 'and', 'is', 'the', 'to',
]

# Values of the `name` column that get their own count / elapsed-time features.
name_feature = [
    'basic',
    'undefined',
    'close',
    'open',
    'prev',
    'next',
]

# Values of the `event_name` column that get their own count / elapsed-time features.
event_name_feature = [
    'cutscene_click',
    'person_click',
    'navigate_click',
    'observation_click',
    'notification_click',
    'object_click',
    'object_hover',
    'map_hover',
    'map_click',
    'checkpoint',
    'notebook_click',
]

# The lists below are taken from
# https://www.kaggle.com/code/leehomhuang/catboost-baseline-with-lots-features-inference
#
# fqid_lists: values of the `fqid` column for which feature_engineer_pl builds a
# row count plus std / mean / sum / median / max of `elapsed_time_diff`.
fqid_lists = ['worker', 'archivist', 'gramps', 'wells', 'toentry', 'confrontation', 'crane_ranger', 'groupconvo', 'flag_girl', 'tomap', 'tostacks', 'tobasement', 'archivist_glasses', 'boss', 'journals', 'seescratches', 'groupconvo_flag', 'cs', 'teddy', 'expert', 'businesscards', 'ch3start', 'tunic.historicalsociety', 'tofrontdesk', 'savedteddy', 'plaque', 'glasses', 'tunic.drycleaner', 'reader_flag', 'tunic.library', 'tracks', 'tunic.capitol_2', 'trigger_scarf', 'reader', 'directory', 'tunic.capitol_1', 'journals.pic_0.next', 'unlockdoor', 'tunic', 'what_happened', 'tunic.kohlcenter', 'tunic.humanecology', 'colorbook', 'logbook', 'businesscards.card_0.next', 'journals.hub.topics', 'logbook.page.bingo', 'journals.pic_1.next', 'journals_flag', 'reader.paper0.next', 'tracks.hub.deer', 'reader_flag.paper0.next', 'trigger_coffee', 'wellsbadge', 'journals.pic_2.next', 'tomicrofiche', 'journals_flag.pic_0.bingo', 'plaque.face.date', 'notebook', 'tocloset_dirty', 'businesscards.card_bingo.bingo', 'businesscards.card_1.next', 'tunic.wildlife', 'tunic.hub.slip', 'tocage', 'journals.pic_2.bingo', 'tocollectionflag', 'tocollection', 'chap4_finale_c', 'chap2_finale_c', 'lockeddoor', 'journals_flag.hub.topics', 'tunic.capitol_0', 'reader_flag.paper2.bingo', 'photo', 'tunic.flaghouse', 'reader.paper1.next', 'directory.closeup.archivist', 'intro', 'businesscards.card_bingo.next', 'reader.paper2.bingo', 'retirement_letter', 'remove_cup', 'journals_flag.pic_0.next', 'magnify', 'coffee', 'key', 'togrampa', 'reader_flag.paper1.next', 'janitor', 'tohallway', 'chap1_finale', 'report', 'outtolunch', 'journals_flag.hub.topics_old', 'journals_flag.pic_1.next', 'reader.paper2.next', 'chap1_finale_c', 'reader_flag.paper2.next', 'door_block_talk', 'journals_flag.pic_1.bingo', 'journals_flag.pic_2.next', 'journals_flag.pic_2.bingo', 'block_magnify', 'reader.paper0.prev', 'block', 'reader_flag.paper0.prev', 'block_0', 'door_block_clean', 'reader.paper2.prev', 'reader.paper1.prev', 'doorblock', 
'tocloset', 'reader_flag.paper2.prev', 'reader_flag.paper1.prev', 'block_tomap2', 'journals_flag.pic_0_old.next', 'journals_flag.pic_1_old.next', 'block_tocollection', 'block_nelson', 'journals_flag.pic_2_old.next', 'block_tomap1', 'block_badge', 'need_glasses', 'block_badge_2', 'fox', 'block_1']
# text_lists: values of the `text_fqid` column for which feature_engineer_pl builds
# a row count plus std / mean / sum / median / max of `elapsed_time_diff`.
text_lists = ['tunic.historicalsociety.cage.confrontation', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.entry.groupconvo', 'tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.drycleaner.frontdesk.worker.hub', 'tunic.historicalsociety.closet_dirty.gramps.news', 'tunic.humanecology.frontdesk.worker.intro', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'tunic.historicalsociety.basement.seescratches', 'tunic.historicalsociety.collection.cs', 'tunic.flaghouse.entry.flag_girl.hello', 'tunic.historicalsociety.collection.gramps.found', 'tunic.historicalsociety.basement.ch3start', 'tunic.historicalsociety.entry.groupconvo_flag', 'tunic.library.frontdesk.worker.hello', 'tunic.library.frontdesk.worker.wells', 'tunic.historicalsociety.collection_flag.gramps.flag', 'tunic.historicalsociety.basement.savedteddy', 'tunic.library.frontdesk.worker.nelson', 'tunic.wildlife.center.expert.removed_cup', 'tunic.library.frontdesk.worker.flag', 'tunic.historicalsociety.frontdesk.archivist.hello', 'tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'tunic.historicalsociety.entry.boss.flag', 'tunic.flaghouse.entry.flag_girl.symbol', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.drycleaner.frontdesk.worker.done', 'tunic.historicalsociety.closet_dirty.what_happened', 'tunic.wildlife.center.wells.animals', 'tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.historicalsociety.cage.teddy.trapped', 'tunic.historicalsociety.cage.unlockdoor', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.entry.wells.flag', 'tunic.humanecology.frontdesk.worker.badger', 'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'tunic.historicalsociety.closet.intro', 'tunic.historicalsociety.closet.retirement_letter.hub', 
'tunic.historicalsociety.entry.directory.closeup.archivist', 'tunic.historicalsociety.collection.tunic.slip', 'tunic.kohlcenter.halloffame.plaque.face.date', 'tunic.historicalsociety.closet_dirty.trigger_coffee', 'tunic.drycleaner.frontdesk.logbook.page.bingo', 'tunic.library.microfiche.reader.paper2.bingo', 'tunic.kohlcenter.halloffame.togrampa', 'tunic.capitol_2.hall.boss.haveyougotit', 'tunic.wildlife.center.wells.nodeer_recap', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.historicalsociety.closet_dirty.gramps.helpclean', 'tunic.wildlife.center.expert.recap', 'tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'tunic.historicalsociety.cage.lockeddoor', 'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'tunic.historicalsociety.collection.gramps.lost', 'tunic.historicalsociety.closet.notebook', 'tunic.historicalsociety.frontdesk.magnify', 'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'tunic.wildlife.center.remove_cup', 'tunic.library.frontdesk.wellsbadge.hub', 'tunic.wildlife.center.tracks.hub.deer', 'tunic.historicalsociety.frontdesk.key', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.flaghouse.entry.colorbook', 'tunic.wildlife.center.coffee', 'tunic.capitol_1.hall.boss.haveyougotit', 'tunic.historicalsociety.basement.janitor', 'tunic.historicalsociety.collection_flag.gramps.recap', 'tunic.wildlife.center.wells.animals2', 'tunic.flaghouse.entry.flag_girl.symbol_recap', 'tunic.historicalsociety.closet_dirty.photo', 'tunic.historicalsociety.stacks.outtolunch', 'tunic.library.frontdesk.worker.wells_recap', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'tunic.capitol_0.hall.boss.talktogramps', 'tunic.historicalsociety.closet.photo', 'tunic.historicalsociety.collection.tunic', 'tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'tunic.historicalsociety.closet_dirty.gramps.archivist', 
'tunic.historicalsociety.closet_dirty.door_block_talk', 'tunic.historicalsociety.entry.boss.flag_recap', 'tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'tunic.historicalsociety.entry.wells.talktogramps', 'tunic.historicalsociety.frontdesk.block_magnify', 'tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'tunic.historicalsociety.closet_dirty.gramps.nothing', 'tunic.historicalsociety.closet_dirty.door_block_clean', 'tunic.capitol_1.hall.boss.writeitup', 'tunic.library.frontdesk.worker.nelson_recap', 'tunic.library.frontdesk.worker.hello_short', 'tunic.historicalsociety.stacks.block', 'tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'tunic.historicalsociety.entry.boss.talktogramps', 'tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'tunic.historicalsociety.entry.wells.flag_recap', 'tunic.drycleaner.frontdesk.worker.done2', 'tunic.library.frontdesk.worker.flag_recap', 'tunic.humanecology.frontdesk.block_0', 'tunic.library.frontdesk.worker.preflag', 'tunic.historicalsociety.basement.gramps.seeyalater', 'tunic.flaghouse.entry.flag_girl.hello_recap', 'tunic.historicalsociety.closet.doorblock', 'tunic.drycleaner.frontdesk.worker.takealook', 'tunic.historicalsociety.basement.gramps.whatdo', 'tunic.library.frontdesk.worker.droppedbadge', 'tunic.historicalsociety.entry.block_tomap2', 'tunic.library.frontdesk.block_nelson', 'tunic.library.microfiche.block_0', 'tunic.historicalsociety.entry.block_tocollection', 'tunic.historicalsociety.entry.block_tomap1', 'tunic.historicalsociety.collection.gramps.look_0', 'tunic.library.frontdesk.block_badge', 'tunic.historicalsociety.cage.need_glasses', 'tunic.library.frontdesk.block_badge_2', 'tunic.kohlcenter.halloffame.block_0', 'tunic.capitol_0.hall.chap1_finale_c', 'tunic.capitol_1.hall.chap2_finale_c', 'tunic.capitol_2.hall.chap4_finale_c', 'tunic.wildlife.center.fox.concern', 'tunic.drycleaner.frontdesk.block_0', 'tunic.historicalsociety.entry.gramps.hub', 
'tunic.humanecology.frontdesk.block_1', 'tunic.drycleaner.frontdesk.block_1']
# Values of the `room_fqid` column that get their own count / elapsed-time features.
room_lists = [
    'tunic.historicalsociety.entry',
    'tunic.wildlife.center',
    'tunic.historicalsociety.cage',
    'tunic.library.frontdesk',
    'tunic.historicalsociety.frontdesk',
    'tunic.historicalsociety.stacks',
    'tunic.historicalsociety.closet_dirty',
    'tunic.humanecology.frontdesk',
    'tunic.historicalsociety.basement',
    'tunic.kohlcenter.halloffame',
    'tunic.library.microfiche',
    'tunic.drycleaner.frontdesk',
    'tunic.historicalsociety.collection',
    'tunic.historicalsociety.closet',
    'tunic.flaghouse.entry',
    'tunic.historicalsociety.collection_flag',
    'tunic.capitol_1.hall',
    'tunic.capitol_0.hall',
    'tunic.capitol_2.hall',
]

# Game levels that get their own count / elapsed-time features: 1 through 22.
# NOTE(review): level 0 is not in this list although the "0-4" group suggests it
# exists in the data - confirm whether leaving it out is intentional.
LEVELS = list(range(1, 23))

# Labels of the three level groups found in the `level_group` column.
level_groups = ["0-4", "5-12", "13-22"]

In [5]:
# Row-level derived columns, lightly adapted from
# https://www.kaggle.com/code/takanashihumbert/magic-bingo-train-part-lb-0-687

columns = [
    pl.col("page").cast(pl.Float32),

    # Time since the previous event of the same session and level group;
    # the first event gets 0 and the value is clipped to [0, 1e9].
    (pl.col("elapsed_time") - pl.col("elapsed_time").shift(1))
    .fill_null(0)
    .clip(0, 1e9)
    .over(["session_id", "level_group"])
    .alias("elapsed_time_diff"),

    # Absolute change of the screen coordinates since the previous event of the
    # same session and level group (x first, then y).
    *[
        (pl.col(f"screen_coor_{axis}") - pl.col(f"screen_coor_{axis}").shift(1))
        .abs()
        .over(["session_id", "level_group"])
        .alias(f"location_{axis}_diff")
        for axis in ("x", "y")
    ],

    # Replace missing identifiers by explicit placeholder values.
    pl.col("fqid").fill_null("fqid_None"),
    pl.col("text_fqid").fill_null("text_fqid_None"),
]

In [6]:
def feature_engineer_pl(x, grp, use_extra, feature_suffix):
    """Aggregate event rows into one feature row per session.

    Parameters
    ----------
    x : polars.DataFrame
        Event-level data. The expressions below read the columns `session_id`,
        `index`, `text`, `elapsed_time`, `elapsed_time_diff`, `level`,
        `level_group` and every column named in CATS and NUMS.
    grp : str
        Level group being processed. Only '5-12' and '13-22' add extra features.
    use_extra : bool
        When true, the group-specific "span" features are joined to the result.
    feature_suffix : str
        Text appended to the generated column names (the `word_*` columns and the
        extra span columns do not carry it).

    Returns
    -------
    pandas.DataFrame
        One row per `session_id`, sorted by `session_id`. Column names and column
        order are the same as in the previous, fully spelled-out implementation.
    """
    elapsed_stats = ("std", "mean", "sum", "median", "max")

    def _per_value_features(column, values, count_tag):
        # For every value of `column`: a row count, then one elapsed_time_diff
        # statistic at a time (std, mean, sum, median, max) across all values.
        features = [
            pl.col(column).filter(pl.col(column) == v).count()
            .alias(f"{v}_{count_tag}{feature_suffix}")
            for v in values
        ]
        for stat in elapsed_stats:
            features += [
                getattr(pl.col("elapsed_time_diff").filter(pl.col(column) == v), stat)()
                .alias(f"{v}_ET_{stat}_{feature_suffix}")
                for v in values
            ]
        return features

    def _span(column, mask, name):
        # max - min of `column` over the rows selected by `mask`; 0 when nothing
        # is selected, so an empty selection can never evaluate None - None.
        return (
            pl.col(column)
            .filter(mask)
            .apply(lambda s: s.max() - s.min() if s.len() > 0 else 0)
            .alias(name)
        )

    def _span_pair(mask, duration_name, index_name):
        # Elapsed-time span followed by event-index span for the same selection.
        return [_span("elapsed_time", mask, duration_name), _span("index", mask, index_name)]

    aggs = [
        pl.col("index").count().alias(f"session_number_{feature_suffix}"),

        # Dialogue words: number of matching rows, then elapsed-time statistics.
        *[pl.col('index').filter(pl.col('text').str.contains(c)).count().alias(f'word_{c}') for c in DIALOGS],
        *[
            getattr(pl.col("elapsed_time_diff").filter(pl.col('text').str.contains(c)), stat)()
            .alias(f'word_{stat}_{c}')
            for stat in ("mean", "std", "max", "sum", "median")
            for c in DIALOGS
        ],

        # Number of distinct non-null values per categorical column.
        *[pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}") for c in CATS],

        # Summary statistics per numeric column.
        *[
            getattr(pl.col(c), stat)().alias(f"{c}_{stat}_{feature_suffix}")
            for stat in ("mean", "std", "min", "max", "median")
            for c in NUMS
        ],

        # Per-value counts and elapsed-time statistics.
        *_per_value_features("fqid", fqid_lists, "fqid_counts"),
        *_per_value_features("text_fqid", text_lists, "text_fqid_counts"),
        *_per_value_features("room_fqid", room_lists, "room_fqid_counts"),
        *_per_value_features("event_name", event_name_feature, "event_name_counts"),
        *_per_value_features("name", name_feature, "name_counts"),
        *_per_value_features("level", LEVELS, "LEVEL_count"),
        *_per_value_features("level_group", level_groups, "LEVEL_group_count"),
    ]

    df = x.groupby(['session_id'], maintain_order=True).agg(aggs).sort("session_id")

    if use_extra:
        is_navigate = pl.col("event_name") == 'navigate_click'
        extra = []

        if grp == '5-12':
            extra = [
                *_span_pair(
                    (pl.col("text") == "Here's the log book.") | (pl.col("fqid") == 'logbook.page.bingo'),
                    "logbook_bingo_duration", "logbook_bingo_indexCount"),
                *_span_pair(
                    (is_navigate & (pl.col("fqid") == 'reader')) | (pl.col("fqid") == "reader.paper2.bingo"),
                    "reader_bingo_duration", "reader_bingo_indexCount"),
                *_span_pair(
                    (is_navigate & (pl.col("fqid") == 'journals')) | (pl.col("fqid") == "journals.pic_2.bingo"),
                    "journals_bingo_duration", "journals_bingo_indexCount"),
            ]
        elif grp == '13-22':
            extra = [
                # NOTE(review): the right-hand value looks like a `text_fqid`
                # (it is listed in text_lists, not fqid_lists) but is compared
                # against `fqid` - confirm whether 'reader_flag.paper2.bingo'
                # was intended. Left unchanged to keep the feature values stable.
                *_span_pair(
                    (is_navigate & (pl.col("fqid") == 'reader_flag'))
                    | (pl.col("fqid") == "tunic.library.microfiche.reader_flag.paper2.bingo"),
                    "reader_flag_duration", "reader_flag_indexCount"),
                *_span_pair(
                    (is_navigate & (pl.col("fqid") == 'journals_flag'))
                    | (pl.col("fqid") == "journals_flag.pic_0.bingo"),
                    "journalsFlag_bingo_duration", "journalsFlag_bingo_indexCount"),
            ]

        if extra:
            tmp = x.groupby(["session_id"], maintain_order=True).agg(extra).sort("session_id")
            df = df.join(tmp, on="session_id", how='left')

    return df.to_pandas()

In [7]:
%%time

# we prepare the dataset for the training by level :
df = (pl.read_csv("./train.csv",dtypes=train_dtypes)
      .drop(["fullscreen", "hq", "music"])
      .with_columns(columns))
df = reduce_mem_usage_pl(df)

Memory usage of dataframe is 3819.28 MB
Memory usage became:  2009.4658946990967  MB
CPU times: user 30.5 s, sys: 11.7 s, total: 42.3 s
Wall time: 10.6 s


In [8]:
# reduce_mem_usage_pl turned `text` into a Categorical; cast it back to Utf8
# because the feature expressions apply `.str.contains` to it.
df = df.with_columns(pl.col("text").cast(pl.Utf8))

In [9]:
# Preview the first rows, including the derived *_diff columns.
df.head()

session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,level_group,elapsed_time_diff,location_x_diff,location_y_diff
i64,i16,i32,cat,cat,i8,f32,f32,f32,f32,f32,f32,str,cat,cat,cat,cat,i32,f32,f32
20090312431273200,0,0,"""cutscene_click...","""basic""",0,,-413.991394,-159.314682,380.0,494.0,,"""undefined""","""intro""","""tunic.historic...","""tunic.historic...","""0-4""",0,,
20090312431273200,1,1323,"""person_click""","""basic""",0,,-413.991394,-159.314682,380.0,494.0,,"""Whatcha doing ...","""gramps""","""tunic.historic...","""tunic.historic...","""0-4""",1323,0.0,0.0
20090312431273200,2,831,"""person_click""","""basic""",0,,-413.991394,-159.314682,380.0,494.0,,"""Just talking t...","""gramps""","""tunic.historic...","""tunic.historic...","""0-4""",0,0.0,0.0
20090312431273200,3,1147,"""person_click""","""basic""",0,,-413.991394,-159.314682,380.0,494.0,,"""I gotta run to...","""gramps""","""tunic.historic...","""tunic.historic...","""0-4""",316,0.0,0.0
20090312431273200,4,1863,"""person_click""","""basic""",0,,-412.991394,-159.314682,381.0,494.0,,"""Can I come, Gr...","""gramps""","""tunic.historic...","""tunic.historic...","""0-4""",716,1.0,0.0


In [10]:
# One event frame per level group, in game order.
df1, df2, df3 = (
    df.filter(pl.col("level_group") == group_label)
    for group_label in ('0-4', '5-12', '13-22')
)
df1.shape, df2.shape, df3.shape

((3981005, 20), (8844238, 20), (13471703, 20))

In [11]:
# The full event frame is no longer needed once it has been split per level
# group; drop the reference and run a garbage collection pass.
del df
gc.collect()

110

In [12]:
%%time
# Build the per-session feature table of each level group.
# Each name is rebound from the polars event frame to the resulting pandas
# feature frame, so this cell cannot be re-run without re-running the split
# cell first. Rebinding one frame at a time lets each event frame be released
# as soon as its features are computed.
df1 = feature_engineer_pl(df1, grp='0-4', use_extra=True, feature_suffix='')
print('df1 done',df1.shape)
df2 = feature_engineer_pl(df2, grp='5-12', use_extra=True, feature_suffix='')
print('df2 done',df2.shape)
df3 = feature_engineer_pl(df3, grp='13-22', use_extra=True, feature_suffix='')
print('df3 done',df3.shape)

df1 done (23562, 2052)
df2 done (23562, 2058)
df3 done (23562, 2056)
CPU times: user 2min 55s, sys: 3.56 s, total: 2min 59s
Wall time: 48.4 s


In [13]:
# Cleaning: for each level-group frame, collect the columns to leave out of the
# feature set - columns that are more than 90% null and columns that hold a
# single distinct value.

# Share of missing values per column, highest first. Each frame is normalised by
# its own row count (previously df2 and df3 were divided by len(df1)).
null1 = df1.isnull().sum().sort_values(ascending=False) / len(df1)
null2 = df2.isnull().sum().sort_values(ascending=False) / len(df2)
null3 = df3.isnull().sum().sort_values(ascending=False) / len(df3)

drop1 = list(null1[null1>0.9].index)
drop2 = list(null2[null2>0.9].index)
drop3 = list(null3[null3>0.9].index)
print(len(drop1), len(drop2), len(drop3))

# Constant columns. `nunique()` ignores NaN, so a column with one distinct
# value plus missing values is also treated as constant.
for label, frame, drop in (("df1", df1, drop1), ("df2", df2, drop2), ("df3", df3, drop3)):
    for col in frame.columns:
        if frame[col].nunique()==1:
            print(col)
            drop.append(col)
    print(f"*********{label} DONE*********")

print(len(drop1), len(drop2), len(drop3))


1200 925 800
word_found
word_wells
word_Oh
word_flag
elapsed_time_diff_min_
worker_fqid_counts
archivist_fqid_counts
confrontation_fqid_counts
crane_ranger_fqid_counts
flag_girl_fqid_counts
archivist_glasses_fqid_counts
journals_fqid_counts
seescratches_fqid_counts
groupconvo_flag_fqid_counts
expert_fqid_counts
businesscards_fqid_counts
ch3start_fqid_counts
tofrontdesk_fqid_counts
savedteddy_fqid_counts
glasses_fqid_counts
tunic.drycleaner_fqid_counts
reader_flag_fqid_counts
tunic.library_fqid_counts
tracks_fqid_counts
tunic.capitol_2_fqid_counts
trigger_scarf_fqid_counts
reader_fqid_counts
tunic.capitol_1_fqid_counts
journals.pic_0.next_fqid_counts
unlockdoor_fqid_counts
what_happened_fqid_counts
tunic.humanecology_fqid_counts
colorbook_fqid_counts
logbook_fqid_counts
businesscards.card_0.next_fqid_counts
journals.hub.topics_fqid_counts
logbook.page.bingo_fqid_counts
journals.pic_1.next_fqid_counts
journals_flag_fqid_counts
reader.paper0.next_fqid_counts
tracks.hub.deer_fqid_counts
re

In [14]:
def time_feature(train):
    """Add calendar columns decoded from the digits of ``session_id``.

    The decimal string of every ``session_id`` is cut into two-character
    pieces: characters 0-1 become ``year``, 2-3 ``month`` (stored value plus
    one), 4-5 ``day``, 6-7 ``hour``, 8-9 ``minute`` and 10-11 ``second``.
    Each new column is stored as ``uint8``.

    The frame is modified in place and the same object is returned.
    """
    # (new column, slice start, slice stop, value added after int conversion)
    digit_fields = (
        ("year", 0, 2, 0),
        ("month", 2, 4, 1),
        ("day", 4, 6, 0),
        ("hour", 6, 8, 0),
        ("minute", 8, 10, 0),
        ("second", 10, 12, 0),
    )

    session_ids = train["session_id"]
    for column, start, stop, offset in digit_fields:
        # Bind the slice bounds as defaults so each decoder keeps its own values.
        def decode(sid, start=start, stop=stop, offset=offset):
            return int(str(sid)[start:stop]) + offset

        train[column] = session_ids.apply(decode).astype(np.uint8)

    return train

In [15]:
# Decode the session_id digits into calendar columns on each of the three frames.
df1, df2, df3 = [time_feature(frame) for frame in (df1, df2, df3)]

In [16]:
# Preview the first rows of df1 after time_feature() added the calendar columns.
df1.head()

Unnamed: 0,session_id,session_number_,word_that,word_this,word_it,word_you,word_find,word_found,word_Found,word_notebook,...,13-22_ET_median_,0-4_ET_max_,5-12_ET_max_,13-22_ET_max_,year,month,day,hour,minute,second
0,20090312431273200,165,4,2,12,11,1,0,1,3,...,,30837,,,20,10,3,12,43,12
1,20090312433251036,139,4,2,10,9,1,0,1,3,...,,37409,,,20,10,3,12,43,32
2,20090312455206810,149,3,3,11,10,1,0,1,3,...,,209421,,,20,10,3,12,45,52
3,20090313091715820,176,4,2,11,12,2,0,1,3,...,,47849,,,20,10,3,13,9,17
4,20090313571836404,112,4,2,10,9,1,0,1,3,...,,31920,,,20,10,3,13,57,18


In [18]:
# Index every frame by session id so that row labels can be used to look up targets.
# The guards keep the cell re-runnable: once 'session_id' has become the index it is
# no longer a column, and an unconditional set_index('session_id') raises KeyError.
if 'session_id' in df1.columns:
    df1 = df1.set_index('session_id')
if 'session_id' in df2.columns:
    df2 = df2.set_index('session_id')
if 'session_id' in df3.columns:
    df3 = df3.set_index('session_id')


def _feature_columns(frame, dropped):
    """Return the columns of `frame`, in order, minus `dropped` and 'level_group'."""
    # A set gives constant-time membership tests; concatenating and scanning a list
    # for every column is quadratic in the number of columns.
    excluded = set(dropped) | {'level_group'}
    return [c for c in frame.columns if c not in excluded]


FEATURES1 = _feature_columns(df1, drop1)
FEATURES2 = _feature_columns(df2, drop2)
FEATURES3 = _feature_columns(df3, drop3)
print('We will train with', len(FEATURES1), len(FEATURES2), len(FEATURES3) ,'features')
ALL_USERS = df1.index.unique()
print('We will train with', len(ALL_USERS) ,'users info')

We will train with 629 970 1137 features
We will train with 23562 users info


In [19]:
# Persist the three feature lists as pickles (the '.txt' suffix is kept as-is, even
# though the content is binary). Each file is written inside a `with` block so the
# handle is flushed and closed before the next cell runs.
for path, features in (('FEATURES1.txt', FEATURES1),
                       ('FEATURES2.txt', FEATURES2),
                       ('FEATURES3.txt', FEATURES3)):
    with open(path, 'wb') as handle:
        pickle.dump(features, handle)

In [20]:
# Boosting rounds per question: entry q - 1 is used as n_estimators for question q.
# Values come from a previous training notebook (KFold with 20 folds, as performed in
# other notebooks). Rows below follow the three level groups.
estimators_lgb = [
    498, 448, 378,                                      # questions 1-3
    364, 405, 495, 456, 249, 384, 405, 356, 262, 484,   # questions 4-13
    381, 392, 248, 248, 345,                            # questions 14-18
]

In [21]:
# Shared LightGBM hyper-parameters; 'n_estimators' is filled in per question by the
# training loops.
# NOTE(review): in LightGBM `alpha` is the Huber/quantile-loss parameter, not L1
#   regularisation (that is `reg_alpha` / `lambda_l1`) - confirm the intent.
# NOTE(review): `subsample` presumably has no effect while `subsample_freq` is left at
#   its default of 0 - verify against the LightGBM parameter docs.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    learning_rate=0.05,
    alpha=8,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.5,
    random_state=42,
)

## We fit and store the models for predictions

In [22]:
warnings.filterwarnings("ignore")

# Labels: one row per "<session>_q<question>" id with its 'correct' flag.
targets = pd.read_csv('./train_labels.csv')
id_parts = targets['session_id'].str.split('_')
# Text before the first underscore is the numeric session id.
targets['session'] = id_parts.str[0].astype('int64')
# The last piece looks like "q7"; strip the leading letter to get the question number.
targets['q'] = id_parts.str[-1].str[1:].astype('int64')

# One row per session in df1, one column per question.
pred_lgb = np.zeros((len(df1), 18))

# Plain (unshuffled) K-fold over the per-session feature rows.
n_splits = 5
kf = KFold(n_splits=n_splits)



In [23]:
# Class balance check: share of correct answers for each of the 18 questions.
for i in range(1, 19):
    answers_i = targets.loc[targets['q'] == i]
    n_correct = answers_i.loc[answers_i['correct'] == 1].shape[0]
    print(f'Correct rate for question {i}: ')
    print(n_correct / answers_i.shape[0])

Correct rate for question 1: 
0.7274849333672863
Correct rate for question 2: 
0.9788218317630082
Correct rate for question 3: 
0.9340039045921399
Correct rate for question 4: 
0.7982344452932688
Correct rate for question 5: 
0.5482556659027247
Correct rate for question 6: 
0.7759528053645701
Correct rate for question 7: 
0.7360580595874714
Correct rate for question 8: 
0.6172226466344113
Correct rate for question 9: 
0.7362702656820304
Correct rate for question 10: 
0.5054324760207113
Correct rate for question 11: 
0.6436210847975554
Correct rate for question 12: 
0.8629573041337747
Correct rate for question 13: 
0.27510398098633393
Correct rate for question 14: 
0.7076648841354723
Correct rate for question 15: 
0.4810287751464222
Correct rate for question 16: 
0.7348697054579407
Correct rate for question 17: 
0.6878023936847466
Correct rate for question 18: 
0.9506408624055683


In [24]:
# Per-question K-fold training: for every question, fit one model per fold, save it,
# and record the fold's validation F1. Scores are appended question by question, so
# every run of n_splits consecutive entries in val_scores belongs to one question.
val_scores=[]
for q in range(1, 19):
    # USE THIS TRAIN DATA WITH THESE QUESTIONS
    if q <= 3:
        grp = '0-4'
        df = df1
        FEATURES = FEATURES1
    elif q <= 13:
        grp = '5-12'
        df = df2
        FEATURES = FEATURES2
    elif q <= 22:
        grp = '13-22'
        df = df3
        FEATURES = FEATURES3

    lgb_params['n_estimators'] = estimators_lgb[q - 1]

    # Labels of this question only, keyed by session id so that they can be put in
    # exactly the row order of the feature frame below.
    q_targets = targets.loc[targets['q'] == q].set_index('session')

    # TRAIN DATA
    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        df_train = df.iloc[train_idx]
        train_users = df_train.index.values
        # LightGBM pairs X and y by position, not by index. Selecting with
        # .loc[users] guarantees the labels follow the feature rows; filtering with
        # isin() would keep whatever order the labels file happens to have.
        train_y = q_targets.loc[train_users]

        df_val = df.iloc[val_idx]
        val_users = df_val.index.values
        val_y = q_targets.loc[val_users]

        clf = LGBMClassifier(**lgb_params)
        # `verbose` is no longer passed to fit(): the argument was removed in
        # LightGBM 4.0 and, with no eval_set, it had no effect in older releases.
        clf.fit(df_train[FEATURES].astype('float32'), train_y['correct'])

        clf.booster_.save_model(f'LGBM_question{q}_fold{fold}.lgb')
        print(f'Model saved for question {q} fold {fold} with iterations = {estimators_lgb[q-1]}')

        # NOTE(review): predict() returns hard class labels and f1_score defaults to
        # the binary F1 of class 1 - confirm this is the metric you want to track.
        pred_y=clf.predict(df_val[FEATURES].astype('float32'))
        score = f1_score(val_y['correct'],pred_y)
        print(f'Current Fold val f1 score: {score}')

        val_scores.append(score)

Model saved for question 1 fold 0 with iterations = 498
Current Fold val f1 score: 0.8310911808669657
Model saved for question 1 fold 1 with iterations = 498
Current Fold val f1 score: 0.8462053272654263
Model saved for question 1 fold 2 with iterations = 498
Current Fold val f1 score: 0.8461332619748463
Model saved for question 1 fold 3 with iterations = 498
Current Fold val f1 score: 0.8493801107887101
Model saved for question 1 fold 4 with iterations = 498
Current Fold val f1 score: 0.8518126488489018
Model saved for question 2 fold 0 with iterations = 448
Current Fold val f1 score: 0.991112538815719
Model saved for question 2 fold 1 with iterations = 448
Current Fold val f1 score: 0.989060489060489
Model saved for question 2 fold 2 with iterations = 448
Current Fold val f1 score: 0.9887326966412704
Model saved for question 2 fold 3 with iterations = 448
Current Fold val f1 score: 0.989924973204716
Model saved for question 2 fold 4 with iterations = 448
Current Fold val f1 score: 0.

In [28]:
# Mean of every per-question, per-fold F1 score appended to val_scores by the training loop.
print('Averaged F1 score across all question: \n')
np.mean(val_scores)

Averaged F1 score across all question: 



0.8031110565453702

In [30]:
# val_scores holds n_splits consecutive fold scores per question, so walking it in
# steps of n_splits yields one mean per question. The bounds are derived from the data
# and from n_splits instead of the literals 90 and 5, so the report stays correct if
# the number of folds changes.
Qe=1  # 1-based number of the question whose mean is printed
for i in range(0, len(val_scores), n_splits):
    print(np.mean(val_scores[i:i+n_splits]))
    Qe+=1

0.84492450594897
0.9892295961222549
0.9657435958894673
0.8949146227524473
0.7075481687491324
0.8772722542991381
0.8448902797197786
0.7500009357333776
0.8481287165585851
0.6484596039559574
0.7726229226598432
0.9256927725583749
0.2772402780490246
0.8315073899843078
0.6460442702232112
0.8457954077171351
0.8116020220235864
0.9743816748720704


In [31]:
# Train with the same train and val set for all 18 questions
# Scores are appended fold by fold, so val_scores_s2[fold * 18 + (q - 1)] is the score
# of question q in that fold.
val_scores_s2=[]

# Per-question labels keyed by session id, built once so each fold can pull them in
# the row order of its feature frame.
targets_by_q = {q: targets.loc[targets['q'] == q].set_index('session') for q in range(1, 19)}

# TRAIN DATA
# The folds are generated from df1 explicitly instead of from whatever `df` was left
# behind by an earlier cell. KFold only looks at the number of rows.
# NOTE(review): re-using the positions on df2/df3 assumes all three frames list the
# same sessions in the same row order - confirm.
for fold, (train_idx, val_idx) in enumerate(kf.split(df1)):

    for q in range(1, 19):
        # USE THIS TRAIN DATA WITH THESE QUESTIONS
        if q <= 3:
            grp = '0-4'
            df = df1
            FEATURES = FEATURES1
        elif q <= 13:
            grp = '5-12'
            df = df2
            FEATURES = FEATURES2
        elif q <= 22:
            grp = '13-22'
            df = df3
            FEATURES = FEATURES3

        lgb_params['n_estimators'] = estimators_lgb[q - 1]
        q_targets = targets_by_q[q]

        df_train = df.iloc[train_idx]
        train_users = df_train.index.values
        # LightGBM pairs X and y by position, so the labels are selected with
        # .loc[users] to follow the feature rows exactly.
        train_y = q_targets.loc[train_users]

        df_val = df.iloc[val_idx]
        val_users = df_val.index.values
        val_y = q_targets.loc[val_users]

        clf = LGBMClassifier(**lgb_params)
        # `verbose` is no longer passed to fit(): the argument was removed in
        # LightGBM 4.0 and, with no eval_set, it had no effect in older releases.
        clf.fit(df_train[FEATURES].astype('float32'), train_y['correct'])

        clf.booster_.save_model(f'LGBM_question{q}_fold{fold}.lgb')
        print(f'Model saved for question {q} fold {fold} with iterations = {estimators_lgb[q-1]}')

        pred_y=clf.predict(df_val[FEATURES].astype('float32'))
        score = f1_score(val_y['correct'],pred_y)
        print(f'Current Fold val f1 score: {score}')

        val_scores_s2.append(score)

Model saved for question 1 fold 0 with iterations = 498
Current Fold val f1 score: 0.8310911808669657
Model saved for question 2 fold 0 with iterations = 448
Current Fold val f1 score: 0.991112538815719
Model saved for question 3 fold 0 with iterations = 378
Current Fold val f1 score: 0.9646670335718216
Model saved for question 4 fold 0 with iterations = 364
Current Fold val f1 score: 0.887
Model saved for question 5 fold 0 with iterations = 405
Current Fold val f1 score: 0.688839615668884
Model saved for question 6 fold 0 with iterations = 495
Current Fold val f1 score: 0.8698082052584784
Model saved for question 7 fold 0 with iterations = 456
Current Fold val f1 score: 0.8374756018217306
Model saved for question 8 fold 0 with iterations = 249
Current Fold val f1 score: 0.7314761631246411
Model saved for question 9 fold 0 with iterations = 384
Current Fold val f1 score: 0.8363157894736842
Model saved for question 10 fold 0 with iterations = 405
Current Fold val f1 score: 0.64595257563

In [32]:
# Mean of every per-fold, per-question F1 score appended to val_scores_s2 by the shared-fold loop.
print('Averaged F1 score across all question: \n')
np.mean(val_scores_s2)

Averaged F1 score across all question: 



0.8031110565453702

In [47]:
# val_scores_s2 is ordered fold by fold with 18 question scores per fold, so the
# scores of one question sit 18 entries apart. A strided slice collects them for any
# number of folds, replacing the hard-coded offsets i+18 ... i+72.
Qe=1  # 1-based number of the question whose mean is printed
n_questions = 18
for i in range(0, n_questions):
    print(np.mean(val_scores_s2[i::n_questions]))
    Qe+=1

0.84492450594897
0.9892295961222549
0.9657435958894673
0.8949146227524473
0.7075481687491324
0.8772722542991381
0.8448902797197786
0.7500009357333776
0.8481287165585851
0.6484596039559574
0.7726229226598432
0.9256927725583749
0.2772402780490246
0.8315073899843078
0.6460442702232112
0.8457954077171351
0.8116020220235864
0.9743816748720704


# Submission

In [None]:
# Create the competition environment and the iterator that the submission loop
# consumes as (test, sample_submission) pairs.
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [None]:
# models_list[q - 1][fold] is the booster trained for question q on that fold.
# The files are read from the same relative path the training cells saved them to
# (instead of a hard-coded '/kaggle/working/' prefix), and the fold count follows
# n_splits rather than the literal 5.
models_list = [
    [Booster(model_file=f"LGBM_question{q}_fold{fold}.lgb") for fold in range(n_splits)]
    for q in range(1, 19)
]

In [None]:
# Half-open range of question numbers answered after each level group.
limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}
# Feature list the models of each level group were trained on.
features_by_group = {'0-4': FEATURES1, '5-12': FEATURES2, '13-22': FEATURES3}
# A question is predicted "correct" when the fold-averaged probability exceeds this.
thresh = 0.63

for (test, sample_submission) in iter_test:
    grp = test.level_group.values[0]
    a, b = limits[grp]
    FEATURES = features_by_group[grp]

    # The preprocessing is identical for every level group; only the feature list
    # differs, so a single code path replaces the three copy-pasted branches.
    test = (pl.from_pandas(test)
            .drop(["fullscreen", "hq", "music"])
            .with_columns(columns))
    test = feature_engineer_pl(test, grp, use_extra=True, feature_suffix='')
    test = time_feature(test)
    # Select and cast the model inputs once per batch instead of once per model.
    test = test[FEATURES]
    model_input = test.astype(np.float32)

    preds = []
    for q in range(a,b):
        fold_models = models_list[q-1]
        # Average the predicted probability over all fold models of this question.
        pred = sum(model.predict(model_input) for model in fold_models) / len(fold_models)
        # predict() returns one value per input row; take the single row explicitly,
        # because int() on a 1-element array is deprecated in recent NumPy releases.
        preds.append(int(pred[0] > thresh))

    sample_submission["correct"] = preds
    env.predict(sample_submission)

In [None]:
# Show the first ten rows of submission.csv as a quick check after the prediction loop.
pd.read_csv('submission.csv').head(10)