In [30]:
import os
# Configure CUDA device visibility through environment variables. This cell runs
# before the cell that imports torch / tensorflow, so the settings are already in
# place when those libraries are imported.
# NOTE(review): "-1" is presumably meant to hide every GPU so the notebook runs on
# CPU — confirm this is still intended.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [31]:
import pandas as pd
import numpy as np
import polars as pl
from sklearn.model_selection import KFold,GroupKFold,StratifiedKFold,StratifiedGroupKFold
import warnings
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import Pool, CatBoostRegressor
from collections import defaultdict
from sklearn.metrics import mean_squared_error
from copy import deepcopy
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import regex as re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import joblib
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
# Silence every warning for the rest of the session (applies process-wide).
# NOTE(review): this also hides deprecation warnings from the libraries imported above.
warnings.filterwarnings("ignore")


In [32]:
# Categorical event columns; feature_engineer_for_index counts their distinct values.
CATS = ['activations', 'text_change']

# Lagged-difference columns. Each name is "<base>_diff_<lag>" and must match an
# alias produced in the `columns` expression list.
NUMS_1 = (
    [f"elapsed_time_diff_{lag}" for lag in (2, 3, 5, 10, 20, 50)]
    + [f"cursor_position_diff_{lag}" for lag in (1, 2, 3, 5, 10, 20, 50)]
    + [f"word_count_diff_{lag}" for lag in (1, 2, 3, 5, 10, 20, 50)]
)

# Numeric columns that receive the quantile / mean / std / ... aggregates:
# the five base columns first, then every lagged difference.
NUMS = ['action_time', 'cursor_position', 'word_count', 'elapsed_time_diff_1', "down_time"]
NUMS.extend(NUMS_1)
# Values of the `activations` column that get their own per-activity aggregates.
name_feature = ['Nonproduction', 'Input', 'Remove/Cut', 'Replace', 'Move', 'Paste']

# `text_change` values of interest. In the visible part of the notebook this list
# is only referenced from commented-out code.
text_changes = [
    'q', ' ', 'NoChange',
    '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':',
]

# Single-character punctuation marks, one list entry per character. In the visible
# part of the notebook this list is only referenced from commented-out code.
punctuations = list("\".,'-;:?!<>/@#$%^&*()_+")

# Values of the `down_event` column that get their own per-key aggregate features.
# A much longer list of key names used to be assigned to this variable and was then
# overwritten on the very next statement by this shortlist. That dead assignment has
# been removed, so the effective value is unchanged.
down_event_feature = [
    'q', 'Space', 'Backspace', 'Shift',
    'ArrowRight', 'Leftclick', 'ArrowLeft',
    '.', ',',
    'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock',
    "'", 'Delete', 'Unidentified',
]
# NOTE(review): the name looks like a typo for "pause_gaps"; it is left unchanged
# because other cells may reference it.
# Presumably a pause threshold in seconds (the aggregates in
# feature_engineer_for_index compare elapsed_time_diff_1 against 8000) — TODO confirm.
pasue_gaps = [8]

# Values of the `up_event` column that get their own per-key aggregate features.
# A much longer list of key names used to be assigned to this variable and was then
# overwritten on the very next statement by this shortlist. That dead assignment has
# been removed, so the effective value is unchanged.
up_event_feature = [
    'q', 'Space', 'Backspace', 'Shift',
    'ArrowRight', 'Leftclick', 'ArrowLeft',
    '.', ',',
    'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock',
    "'", 'Delete', 'Unidentified',
]

In [33]:
def rename_act(act_name):
    """Collapse every activity label beginning with "Move" into plain "Move".

    Any other label is returned unchanged.
    """
    return "Move" if act_name.startswith("Move") else act_name

In [34]:
def _lagged_diff(current, previous, lag, alias):
    """Build `current - previous.shift(lag)` windowed by "id".

    Nulls are filled with 0 and the result is clipped to [-1e9, 1e9] before the
    window is applied; the expression is named `alias`.
    """
    return (
        (pl.col(current) - pl.col(previous).shift(lag))
        .fill_null(0)
        .clip(-1e9, 1e9)
        .over(["id"])
        .alias(alias)
    )


# Expressions applied to the raw log with `with_columns`. The order is kept exactly,
# because it determines the column order of the resulting frame.
columns = [
    pl.col("action_time").cast(pl.Float32),
    pl.col("cursor_position").cast(pl.Float32),
    _lagged_diff("cursor_position", "cursor_position", 1, "cursor_position_diff"),
    pl.col("word_count").cast(pl.Float32),
    # Every "Move ..." activity label is folded into a single "Move" category.
    pl.col("activity").apply(rename_act).alias("activations"),

    # down_time of this event minus up_time of the event `lag` rows earlier.
    *[
        _lagged_diff("down_time", "up_time", lag, f"elapsed_time_diff_{lag}")
        for lag in (1, 2, 3, 5, 10, 20, 50)
    ],

    # Change in cursor position over `lag` events.
    *[
        _lagged_diff("cursor_position", "cursor_position", lag, f"cursor_position_diff_{lag}")
        for lag in (1, 2, 3, 5, 10, 20, 50)
    ],

    # Change in word count over `lag` events.
    *[
        _lagged_diff("word_count", "word_count", lag, f"word_count_diff_{lag}")
        for lag in (1, 2, 3, 5, 10, 20, 50)
    ],

    # NOTE(review): unlike every expression above, this one has no .over(["id"]) —
    # confirm that taking the maxima without a per-id window is intended.
    (pl.col('word_count').max()/pl.col('action_time').max()/1000).alias('word_count_pre_step'),
]

In [35]:
# Read the raw log, add the derived columns defined in `columns`, then drop the
# original `activity` column (its collapsed form is kept as `activations`).
df = pl.read_csv("train_logs.csv").with_columns(columns).drop(["activity"])

## Build text features

In [36]:
# Load corpus.csv; its 'text' column is read later inside feature_engineer_for_index.
text_df = pd.read_csv('corpus.csv')
# The commented-out lines below fit a CountVectorizer + TfidfTransformer pipeline on
# the corpus and dump it to ./xgb_5fold/pipeline.pkl, the same path a later cell
# loads with joblib.load.
# NOTE(review): they reference `tmp_df`, which is not defined in any earlier visible
# cell, so they would not run as written on a fresh kernel.
# corpus = text_df['text'].tolist()
# pipe = Pipeline([('count', CountVectorizer()),
#                  ('tfid', TfidfTransformer())]).fit(corpus)
# text_feature = pipe.transform(corpus).toarray()
# text_feature_df = pd.DataFrame(text_feature, columns=[f'text_features_{i}' for i in range(text_feature.shape[1])])
# text_feature_df['id'] = tmp_df['id']
# text_feature_df = text_feature_df.set_index('id')
# joblib.dump(pipe, './xgb_5fold/pipeline.pkl')

In [37]:
# Load bert_predict.csv, index it by `id`, and keep only the `bert` column
# (the double brackets keep the result a one-column DataFrame).
bert_result = pd.read_csv('bert_predict.csv').set_index('id')[['bert']]

In [38]:
# Load the previously saved object from ./xgb_5fold/pipeline.pkl.
# Presumably the CountVectorizer + TfidfTransformer pipeline whose fitting code is
# commented out in an earlier cell — TODO confirm.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
pipe = joblib.load('./xgb_5fold/pipeline.pkl')

In [39]:
# Load the pretrained model

# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    """Average `last_hidden_state` over dim 1, counting only positions where the mask is set."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of `last_hidden_state` along dim 1.

        `attention_mask` gets a trailing axis, is expanded to the shape of
        `last_hidden_state`, and is cast to float. The denominator is clamped to
        at least 1e-9 so a fully masked row yields zeros instead of a division
        by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        weighted_total = (last_hidden_state * mask).sum(dim=1)
        mask_total = mask.sum(dim=1).clamp(min=1e-9)
        return weighted_total / mask_total

class CustomModel(nn.Module):
    """Backbone loaded with `AutoModel.from_pretrained`, followed by `MeanPooling`.

    `forward` returns the pooled feature tensor. `fc_dropout` is created in
    `__init__` but is not applied in `forward`.
    """

    def __init__(self, model_name):
        super().__init__()

        self.model = AutoModel.from_pretrained(model_name)
        self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        """Run the backbone on `inputs` and pool its first output with the attention mask.

        `inputs` is unpacked as keyword arguments to the backbone and must
        contain an 'attention_mask' entry.
        """
        token_states = self.model(**inputs)[0]
        # feature = F.normalize(feature, p=2, dim=1)  (normalisation step left disabled)
        return self.pool(token_states, inputs['attention_mask'])

    
# model_name = './pretrained_model'
# device = 'cuda'
# model = CustomModel(model_name)
# model.to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)


class TestDataset(Dataset):
    """Map-style dataset that tokenizes one text per index.

    Relies on a module-level `tokenizer` being defined before items are read.
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize `self.texts[item]` and return the encoding with every field as a long tensor."""
        encoded = tokenizer(
            self.texts[item],
            max_length=256,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_offsets_mapping=False,
        )

        # Convert each field in place so the tokenizer's own mapping object is returned.
        for field, values in list(encoded.items()):
            encoded[field] = torch.tensor(values, dtype=torch.long)
        return encoded

def get_model_feature(model, texts):
    """Run `model` over `texts` in batches and return the stacked outputs as a NumPy array.

    The texts are wrapped in `TestDataset` and served in their original order
    (batch size 32, no shuffling, no dropped remainder). Each collated batch is
    moved to `device`, passed to `model`, and the per-batch outputs are
    concatenated along dim 0.

    Relies on the module-level names `tokenizer` and `device` being defined
    before the call.

    Inference runs in eval mode: `torch.no_grad()` only turns off gradient
    tracking, so dropout would still be active if the model were in train mode.
    The train/eval flag of every submodule is restored afterwards, so the
    caller's model state is left as it was.

    Args:
        model: module called with one collated batch mapping; must return a tensor.
        texts: indexable collection of texts.

    Returns:
        NumPy array of the per-batch model outputs concatenated along dim 0.

    Raises:
        RuntimeError: from `torch.cat` when `texts` is empty (nothing to concatenate).
    """
    test_loader = DataLoader(TestDataset(texts),
                             batch_size=32,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
                             num_workers=0, pin_memory=True, drop_last=False)

    # Remember each submodule's own flag: a wrapper and its backbone can be in
    # different modes, so a single model.train(flag) could not restore them.
    saved_modes = []
    if isinstance(model, nn.Module):
        saved_modes = [(module, module.training) for module in model.modules()]
        model.eval()

    feature_outs_all = []
    try:
        with torch.no_grad():
            for inputs in tqdm(test_loader):
                for k, v in inputs.items():
                    inputs[k] = v.to(device)
                # Move each result to CPU so batches do not accumulate on the device.
                feature_outs_all.append(model(inputs).cpu())
    finally:
        for module, was_training in saved_modes:
            module.training = was_training

    return torch.cat(feature_outs_all, dim=0).numpy()



In [40]:
# Small constant.
# NOTE(review): not referenced in the visible part of this cell — confirm it is still needed.
eps = 1e-3
def feature_engineer_for_index(x, feature_suffix):

    aggs = [
        (pl.col('word_count').max()/ pl.col("up_event").filter(pl.col("up_event") == '.').count()).clip(0, 500).alias('every_sentence_word_count'),
        (pl.col('cursor_position').max()/ pl.col("word_count").max()).alias('every_char_word'),
        (pl.col('activations').filter(pl.col("activations").is_in(['Remove/Cut', 'Replace'])).count()/ pl.col("up_event").filter(pl.col("up_event") == '.').count()).clip(0, 500).alias('every_sentence_change'),
        ( pl.col("cursor_position").max() / (pl.col('down_time').max() - pl.col('down_time').min()) * 1000 ).alias('char_per_min'),
        ( pl.col("word_count").max() / (pl.col('down_time').max() - pl.col('down_time').min()) * 1000 ).alias('word_per_min'),
        
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).sum() / pl.col("elapsed_time_diff_1").sum()).alias('proportion_of_pause_time'),
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).count()).alias('count_of_pause_time'),
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).sum() / pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 10000 ).count()).alias('pause_length'),
        (pl.col("elapsed_time_diff_1").filter((pl.col("elapsed_time_diff_1") > 8000) & (pl.col("text_change") == ' ' )).count()).alias('freq_of_pause_time'),
        #(pl.col("activations").filter(pl.col("elapsed_time_diff_1") > 2000 ).count() / pl.col("activations").count()).alias('proportion_of_pause'),
        # (pl.col('down_event').filter(pl.col('down_event').is_in(punctuations)).count() ).alias('punct_cnt'),
        #*[(pl.col('text_change').filter(pl.col('text_change') == c).count() ).alias(f'text_change_{c}_cnt') for c in text_changes],
        #pl.col('event_id').count().alias('event_count'),
        *[pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}") for c in CATS],
        *[pl.col(c).quantile(0.1, "nearest").alias(f"{c}_quantile1_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.2, "nearest").alias(f"{c}_quantile2_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.3, "nearest").alias(f"{c}_quantile3_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.4, "nearest").alias(f"{c}_quantile4_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.5, "nearest").alias(f"{c}_quantile5_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.6, "nearest").alias(f"{c}_quantile6_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.7, "nearest").alias(f"{c}_quantile7_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.8, "nearest").alias(f"{c}_quantile8_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.9, "nearest").alias(f"{c}_quantile9_{feature_suffix}") for c in NUMS],

        *[pl.col(c).mean().alias(f"{c}_mean_{feature_suffix}") for c in NUMS],
        *[pl.col(c).std().alias(f"{c}_std_{feature_suffix}") for c in NUMS],
        *[pl.col(c).min().alias(f"{c}_min_{feature_suffix}") for c in NUMS],
        *[pl.col(c).max().alias(f"{c}_max_{feature_suffix}") for c in NUMS],
        *[pl.col(c).median().alias(f"{c}_median_{feature_suffix}") for c in NUMS],
        *[pl.col(c).sum().alias(f"{c}_sum_{feature_suffix}") for c in NUMS],
        *[pl.col(c).skew().alias(f"{c}_skew_{feature_suffix}") for c in NUMS],
        *[pl.col(c).kurtosis().alias(f"{c}_kurtosis_{feature_suffix}") for c in NUMS],
        # adding rank features
        *[pl.col(c).last().alias(f"{c}_rank_{feature_suffix}") for c in NUMS],

        
        
        
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_ET_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_ET_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_ET_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_ET_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_ET_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_ET_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_ET_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_ET_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_ET_quantile9_{feature_suffix}") for c in name_feature],
        
        *[pl.col("activations").filter(pl.col("activations") == c).count().alias(f"{c}_name_counts{feature_suffix}")for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).mean().alias(f"{c}_ET_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).max().alias(f"{c}_ET_max_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).min().alias(f"{c}_ET_min_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).std().alias(f"{c}_ET_std_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).median().alias(f"{c}_ET_median_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).last().alias(f"{c}_ET_rank_{feature_suffix}") for c in name_feature],
        #*[pl.col("elapsed_time_diff_1").filter(pl.col("name")==c).mode().alias(f"{c}_ET_mode_{feature_suffix}") for c in name_feature],
        
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_ETD_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_ETD_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_ETD_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_ETD_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_ETD_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_ETD_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_ETD_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_ETD_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_ETD_quantile9_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).mean().alias(f"{c}_ETD_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).max().alias(f"{c}_ETD_max_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).min().alias(f"{c}_ETD_min_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).std().alias(f"{c}_ETD_std_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).median().alias(f"{c}_ETD_median_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).last().alias(f"{c}_ETD_rank_{feature_suffix}") for c in name_feature],
        
        
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_CP_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_CP_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_CP_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_CP_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_CP_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_CP_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_CP_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_CP_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_CP_quantile9_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).mean().alias(f"{c}_CP_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).max().alias(f"{c}_CP_max_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).min().alias(f"{c}_CP_min_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).std().alias(f"{c}_CP_std_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).median().alias(f"{c}_CP_median_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).last().alias(f"{c}_CP_rank_{feature_suffix}") for c in name_feature],
        
        
        *[pl.col("activations").filter(pl.col("down_event") == c).count().alias(f"{c}_event_name_counts{feature_suffix}")for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_ET_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_ET_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_ET_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_ET_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_ET_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_ET_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_ET_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_ET_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_ET_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_ET_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_ET_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_ET_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_ET_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_ET_median_{feature_suffix}") for c in down_event_feature],
        # adding rank features
        *[pl.col("action_time").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_ET_rank_{feature_suffix}") for c in down_event_feature],
        
        
        
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_ETD_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_ETD_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_ETD_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_ETD_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_ETD_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_ETD_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_ETD_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_ETD_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_ETD_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_ETD_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_ETD_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_ETD_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_ETD_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_ETD_median_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_ETD_rank_{feature_suffix}") for c in down_event_feature],
        
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_DT_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_DT_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_DT_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_DT_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_DT_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_DT_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_DT_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_DT_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_DT_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_DT_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_DT_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_DT_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_DT_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_DT_median_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_DT_rank_{feature_suffix}") for c in down_event_feature],
        
        
        
        
        *[pl.col("activations").filter(pl.col("up_event") == c).count().alias(f"UE_{c}_event_name_counts{feature_suffix}")for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.1, "nearest").alias(f"UE_{c}_ET_quantile1_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.2, "nearest").alias(f"UE_{c}_ET_quantile2_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.3, "nearest").alias(f"UE_{c}_ET_quantile3_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.4, "nearest").alias(f"UE_{c}_ET_quantile4_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.5, "nearest").alias(f"UE_{c}_ET_quantile5_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.6, "nearest").alias(f"UE_{c}_ET_quantile6_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.7, "nearest").alias(f"UE_{c}_ET_quantile7_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.8, "nearest").alias(f"UE_{c}_ET_quantile8_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.9, "nearest").alias(f"UE_{c}_ET_quantile9_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).mean().alias(f"UE_{c}_ET_mean_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).std().alias(f"UE_{c}_ET_std_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).max().alias(f"UE_{c}_ET_max_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).min().alias(f"UE_{c}_ET_min_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).median().alias(f"UE_{c}_ET_median_{feature_suffix}") for c in up_event_feature],
        # adding rank features
        *[pl.col("action_time").filter(pl.col("up_event")==c).last().alias(f"UE_{c}_ET_rank_{feature_suffix}") for c in up_event_feature],


        ]

    df = x.groupby(["id"], maintain_order=True).agg(aggs).sort("id")
    tmp_df = x.filter((~x['text_change'].str.contains('=>'))&(x['text_change'] != 'NoChange'))
    tmp_df = tmp_df.groupby('id').agg(pl.col("text_change"))
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: ''.join(x)).alias("text_change")])
    corpus = text_df['text'].tolist()
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: re.findall(r'q+', x)).alias("text_change")])
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: len(x)).alias("input_word_count")])
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_mean")])
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_max")])
    tmp_df = tmp_df.with_columns([pl.col("text_change").apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_std")])
    tmp_df = tmp_df.drop(["text_change"])
    df = df.join(tmp_df, on='id')
    df = df.to_pandas()
    df['word_time_ratio'] = df[f"word_count_max_{feature_suffix}"] / df[f"down_time_max_{feature_suffix}"]
    #df['word_event_ratio'] = df[f"word_count_max_{feature_suffix}"] / df['event_count']
    #df['event_time_ratio'] = df['event_count']  / df[f"down_time_max_{feature_suffix}"]
    df['idle_time_ratio'] = df[f"elapsed_time_diff_1_sum_{feature_suffix}"] / df[f"down_time_max_{feature_suffix}"]
    
    text_feature = pipe.transform(corpus).toarray() # pipe.transform(corpus).toarray() # get_model_feature(model, texts)
    text_feature_df = pd.DataFrame(text_feature, columns=[f'text_features_{i}' for i in range(text_feature.shape[1])])
    text_feature_df['id'] = text_df['id']
    text_feature_df = text_feature_df.set_index('id')

    df = df.join(text_feature_df, on='id')
    #df = df.join(bert_result, on='id')
    return df

In [41]:
# Build the per-essay feature table; every aggregate column gets the 'index_' suffix.
df = feature_engineer_for_index(df, 'index_')


def _sanitize_column(column_name):
    """Spell out the characters '[', ']' and '<' so the column name contains none of them."""
    return (
        column_name
        .replace('[', 'left_brackets')
        .replace(']', 'right_brackets')
        .replace('<', 'less')
    )


# NOTE(review): presumably these characters are rejected as feature names by a downstream model - confirm.
invalid_columns = [c for c in df.columns if any(ch in c for ch in ('[', ']', '<'))]
rename_dict = dict(zip(invalid_columns, map(_sanitize_column, invalid_columns)))
df = df.rename(columns=rename_dict)

In [42]:
# Column screening: collect in `drop1` every column that is mostly null or constant.

# Fraction of null rows per column, largest first.
null1 = df.isnull().sum().sort_values(ascending=False) / len(df)

# Columns that are null in more than 90% of the rows.
drop1 = null1[null1 > 0.9].index.tolist()
print(len(drop1))

# Columns holding a single distinct non-null value (nunique ignores NaN).
for col in df.columns:
    if df[col].nunique() != 1:
        continue
    print(col)
    drop1.append(col)
print("*********df DONE*********")


105
word_count_diff_1_quantile1_index_
word_count_diff_1_quantile2_index_
word_count_diff_2_quantile2_index_
word_count_diff_3_quantile2_index_
word_count_diff_1_quantile3_index_
word_count_diff_2_quantile3_index_
word_count_diff_3_quantile3_index_
word_count_diff_1_quantile4_index_
word_count_diff_2_quantile4_index_
word_count_diff_3_quantile4_index_
word_count_diff_1_quantile5_index_
word_count_diff_2_quantile5_index_
word_count_diff_1_quantile6_index_
word_count_diff_1_quantile7_index_
cursor_position_min_index_
word_count_diff_1_median_index_
word_count_diff_2_median_index_
Input_CP_quantile1_index_
Input_CP_quantile2_index_
Input_CP_quantile3_index_
Input_CP_quantile4_index_
Input_CP_quantile5_index_
Input_CP_quantile6_index_
Input_CP_quantile7_index_
Input_CP_quantile8_index_
Input_CP_quantile9_index_
Input_CP_median_index_
Input_CP_rank_index_
DE_Shift_ET_quantile1_index_
DE_Shift_ET_quantile2_index_
DE_Shift_ET_quantile3_index_
DE_Shift_ET_quantile4_index_
DE_Shift_ET_quantile5

In [43]:
#drop1 += not_important

In [44]:
# Model inputs: every column except the identifier and the columns flagged in `drop1`.
# The exclusions are collected once in a set instead of rebuilding `drop1 + ['id']`
# (and scanning it linearly) for each column; the resulting list and its order are unchanged.
_excluded_columns = set(drop1) | {'id'}
FEATURES = [c for c in df.columns if c not in _excluded_columns]

In [45]:
# Number of columns kept as model inputs after dropping `id` and the `drop1` columns.
len(FEATURES)

1620

In [46]:
# Key the table by essay id and keep only the selected feature columns, then display it.
df = df.set_index('id')[FEATURES]
df

Unnamed: 0_level_0,every_sentence_word_count,every_char_word,every_sentence_change,char_per_min,word_per_min,proportion_of_pause_time,count_of_pause_time,pause_length,freq_of_pause_time,activations_unique_index_,...,text_features_38,text_features_39,text_features_40,text_features_41,text_features_42,text_features_43,text_features_44,text_features_45,text_features_46,text_features_47
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
fde20dd8,12.596491,5.126741,25.403509,2.085945,0.406875,0.168502,13,20855.111111,1,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
842d59a3,18.403226,5.192813,4.677419,3.232087,0.622415,0.221349,16,22094.888889,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
d06feb60,22.222222,6.067500,31.388889,1.366254,0.225176,0.554864,34,29721.545455,6,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3b5cd2b2,21.500000,5.754611,25.672414,4.010514,0.696922,0.355067,11,38380.125000,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1dc5440c,20.363636,6.183036,10.181818,0.815264,0.131855,0.408144,38,25269.962963,2,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9f3cc888,9.434783,6.230415,5.434783,0.881389,0.141466,0.431006,23,31197.789474,7,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dfbc5b5e,18.069767,5.962677,31.976744,2.599682,0.435992,0.285243,16,31745.636364,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31ead610,19.187500,5.993485,15.437500,1.034373,0.172583,0.814255,19,67820.944444,9,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
394c1342,13.863636,5.642623,42.681818,0.954745,0.169202,0.534655,31,31291.000000,3,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
# Summary statistics with one row per feature (features down, statistics across).
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
every_sentence_word_count,2471.0,17.818535,13.062090,2.197802,13.431677,16.409091,19.841667,500.000000
every_char_word,2471.0,5.764699,0.555682,3.544469,5.506235,5.739910,5.985366,14.500000
every_sentence_change,2471.0,17.737873,21.296764,0.181818,8.343137,14.095238,22.153846,500.000000
char_per_min,2471.0,1.311744,0.581970,0.128895,0.877846,1.171892,1.605255,4.974864
word_per_min,2471.0,0.229235,0.103489,0.021794,0.149384,0.206804,0.281960,0.885433
...,...,...,...,...,...,...,...,...
text_features_43,2471.0,0.000028,0.001410,0.000000,0.000000,0.000000,0.000000,0.070085
text_features_44,2471.0,0.000028,0.001392,0.000000,0.000000,0.000000,0.000000,0.069196
text_features_45,2471.0,0.000050,0.002465,0.000000,0.000000,0.000000,0.000000,0.122540
text_features_46,2471.0,0.000023,0.001121,0.000000,0.000000,0.000000,0.000000,0.055724


In [48]:
# Load the training scores and key them by essay id so they can be joined onto the features.
target = pd.read_csv('train_scores.csv').set_index('id')

In [49]:
# 5-fold stratified splitter with scikit-learn defaults for everything else.
skf = StratifiedKFold(n_splits=5)

# Out-of-fold prediction table: one row per id in `target`, one zero-initialised
# float column per model name listed here.
oof_xgb = pd.DataFrame(0.0, index=target.index, columns=['xgb', 'lgbm', 'cat'])

# Bookkeeping containers, empty at this point.
best_iteration_xgb = defaultdict(list)
importance_dict = dict()



# Hyper-parameter dictionaries for the three gradient-boosting libraries.

# NOTE(review): 'gpu_hist' requests a GPU tree builder, while the first notebook cell sets
# CUDA_VISIBLE_DEVICES to "-1" - confirm the intended device before using these params.
xgb_params = dict(
    booster='gbtree',
    tree_method='gpu_hist',
    objective='reg:squarederror',
    eval_metric='rmse',
    learning_rate=0.02,
    alpha=8,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.5,
    seed=42,
    n_estimators=500,
)

lgbm_params = dict(
    objective='regression',
    learning_rate=0.02,
    max_depth=4,
    num_iterations=500,
    random_state=42,
)

cat_params = dict(
    learning_rate=0.02,
    depth=8,
    iterations=500,
    random_state=42,
)

# Empty containers for feature importances and fitted models.
feature_importance_df, models = pd.DataFrame(), []

# Attach the target columns to the feature table (index-on-index join, both keyed by id).
df = df.join(target)

In [50]:
# Scores run from 0.5 to 6.0 in half-point steps; each one maps to a class index 0..11.
_SCORE_STEP = 0.5
_N_SCORE_LEVELS = 12

# score -> class index
convert_target = {
    _SCORE_STEP * (level + 1): level
    for level in range(_N_SCORE_LEVELS)
}

# class index -> score (exact inverse of convert_target)
reverse_convert_target = {level: score for score, level in convert_target.items()}
# Encode each score as its integer class index via convert_target;
# a score that is not a key of the mapping becomes NaN.
df['new_label'] = df['score'].map(convert_target)

In [51]:
def create_ae_mlp(num_columns, hidden_units=[256,512,1024,1024,512,256], lr = 1e-3):
    """Build and compile a small fully-connected regression network.

    The network is Input(num_columns) -> Dense(512, tanh) -> Dense(512, tanh)
    -> Dense(256, tanh) -> Dense(1), compiled with AdamW, MSE loss and an RMSE metric.

    Args:
        num_columns: number of input features per sample.
        hidden_units: accepted for call-site compatibility but never read by the
            body; the layer widths above are fixed.
        lr: learning rate handed to the AdamW optimizer.

    Returns:
        The compiled ``tf.keras`` model.
    """
    features_in = tf.keras.layers.Input(shape=(num_columns,))

    # Three tanh hidden layers followed by a single linear output unit.
    dense_stack = [
        keras.layers.Dense(width, activation="tanh")
        for width in (512, 512, 256)
    ]
    dense_stack.append(keras.layers.Dense(1))
    prediction = keras.Sequential(dense_stack)(features_in)

    model = tf.keras.models.Model(inputs=features_in, outputs=[prediction])
    model.compile(
        optimizer=tf.keras.optimizers.AdamW(learning_rate=lr),
        loss="MSE",
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

In [52]:
# Replace missing values with 0 and cap extreme magnitudes before computing statistics.
df = df.fillna(0).clip(-1e9, 1e9)

# Per-feature statistics. NOTE(review): they are computed over every row of df,
# i.e. before the cross-validation split - confirm that is intended.
feature_frame = df[FEATURES]
train_mean, train_std = feature_frame.mean(), feature_frame.std()
train_min, train_max = feature_frame.min(), feature_frame.max()

# Min-max scale each feature column.
feature_range = train_max - train_min
df[FEATURES] = (feature_frame - train_min) / feature_range

# A zero range yields NaN (0/0); map those back to 0 and clip again.
df = df.fillna(0).clip(-1e9, 1e9)

In [26]:
# Display the table: min-max scaled feature columns plus the score and new_label columns.
df

Unnamed: 0_level_0,every_sentence_word_count,every_char_word,every_sentence_change,char_per_min,word_per_min,proportion_of_pause_time,count_of_pause_time,pause_length,freq_of_pause_time,activations_unique_index_,...,text_features_40,text_features_41,text_features_42,text_features_43,text_features_44,text_features_45,text_features_46,text_features_47,score,new_label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
791be024,0.016728,0.126137,0.021494,0.227704,0.264414,0.464379,0.236364,0.000029,0.051282,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7
955ea377,0.024354,0.223226,0.054728,0.257321,0.240724,0.452721,0.190909,0.000032,0.128205,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,8
9d8ba1be,0.025230,0.217764,0.040439,0.305569,0.289060,0.425016,0.300000,0.000025,0.153846,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,9
8adb949f,0.029735,0.121707,0.021644,0.101095,0.121654,0.503640,0.290909,0.000031,0.051282,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,4
1fbf2800,0.024681,0.175143,0.025452,0.280205,0.289872,0.619117,0.127273,0.000067,0.025641,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1343a4d2,0.021079,0.170912,0.027210,0.413461,0.430601,0.401843,0.190909,0.000023,0.025641,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,11
5cdb9757,0.027152,0.190778,0.044110,0.334360,0.334222,0.610403,0.181818,0.000041,0.025641,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,8
f4080033,0.026341,0.068922,0.012675,0.198500,0.268529,0.499112,0.109091,0.000074,0.025641,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,8
f79335cf,0.031045,0.169729,0.014946,0.235181,0.246580,0.264479,0.190909,0.000023,0.102564,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,7


In [27]:
# Cross-validated training of the MLP regressor.
#
# For every fold: fit a fresh network on the training rows, keep the weights of the
# epoch with the lowest validation loss (written to ./mlp/best_weights_<fold>.hdf5),
# reload them, and store the predictions for the held-out rows in oof_xgb['mlp'].
# After the loop, `final_result` holds the out-of-fold predictions next to the scores.

# Stop a fit once val_loss has not improved for 10 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                    patience=10,
                                                    mode='min')

target_name = 'score'
mlp_seed = 42  # same seed value the tree-model parameter dicts in this notebook use

# Stratify on score*10 so the half-point scores become integer-valued class labels.
for i, (train_index, test_index) in enumerate(skf.split(X=df, y=df.score*10,)):
    # Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
    # are repeatable; without a seed, reruns of this cell give different OOF scores.
    tf.keras.utils.set_random_seed(mlp_seed + i)

    # Persist only the best epoch (lowest val_loss) of this fold.
    best_weights_filepath = f'./mlp/best_weights_{i}.hdf5'
    checkpoint = tf.keras.callbacks.ModelCheckpoint(best_weights_filepath, monitor='val_loss',
                                                    verbose=1, save_best_only=True, mode='auto')

    # TRAIN DATA
    train_x = df.iloc[train_index][FEATURES]
    train_y = df[[target_name]].iloc[train_index]

    # VALID DATA
    valid_x = df.iloc[test_index][FEATURES]
    valid_users = valid_x.index.values
    valid_y = df[[target_name]].iloc[test_index]

    print(i+1, ', ', end='')
    clf = create_ae_mlp(num_columns=len(FEATURES), hidden_units=[256,512,1024,1024,512,256], lr = 1e-3)

    history = clf.fit(train_x.astype('float32').values, train_y[target_name].values, epochs=50,
                      validation_data=(valid_x.astype('float32').values, valid_y[target_name].values),
                      callbacks=[early_stopping, checkpoint],
                      batch_size=64,
                      )

    # Score the validation rows with the best checkpoint, not the last epoch.
    clf.load_weights(best_weights_filepath)
    # predict() returns shape (n, 1); flatten so it is assigned as a plain 1-D column.
    oof_xgb.loc[valid_users, 'mlp'] = clf.predict(valid_x.astype('float32').values).ravel()

final_result = oof_xgb.join(target)


1 , Epoch 1/50


2023-10-26 17:12:57.153875: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2023-10-26 17:12:57.153911: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: will-Super-Server
2023-10-26 17:12:57.153918: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: will-Super-Server
2023-10-26 17:12:57.154023: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 525.125.6
2023-10-26 17:12:57.154040: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 525.125.6
2023-10-26 17:12:57.154045: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 525.125.6


Epoch 1: val_loss improved from inf to 1.06850, saving model to ./mlp/best_weights_0.hdf5
Epoch 2/50
Epoch 2: val_loss improved from 1.06850 to 0.62002, saving model to ./mlp/best_weights_0.hdf5
Epoch 3/50
Epoch 3: val_loss improved from 0.62002 to 0.49486, saving model to ./mlp/best_weights_0.hdf5
Epoch 4/50
Epoch 4: val_loss did not improve from 0.49486
Epoch 5/50
Epoch 5: val_loss improved from 0.49486 to 0.47273, saving model to ./mlp/best_weights_0.hdf5
Epoch 6/50
Epoch 6: val_loss did not improve from 0.47273
Epoch 7/50
Epoch 7: val_loss did not improve from 0.47273
Epoch 8/50
Epoch 8: val_loss did not improve from 0.47273
Epoch 9/50
Epoch 9: val_loss improved from 0.47273 to 0.45770, saving model to ./mlp/best_weights_0.hdf5
Epoch 10/50
Epoch 10: val_loss improved from 0.45770 to 0.44355, saving model to ./mlp/best_weights_0.hdf5
Epoch 11/50
Epoch 11: val_loss improved from 0.44355 to 0.43457, saving model to ./mlp/best_weights_0.hdf5
Epoch 12/50
Epoch 12: val_loss did not impro

In [53]:
# Out-of-fold RMSE of the MLP. The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(final_result['score'], final_result['mlp']))

0.6477864066578247

In [34]:
# Out-of-fold RMSE of the MLP (same metric as the preceding cell, recorded from another run).
# The square root is taken explicitly because the `squared=False` flag of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(final_result['score'], final_result['mlp']))

0.6475833321892487

In [29]:
# Archive the mlp/ directory (the per-fold best_weights_*.hdf5 checkpoints) into mlp.zip.
!zip -r mlp.zip mlp

updating: mlp/ (stored 0%)
updating: mlp/best_weights_3.hdf5 (deflated 7%)
updating: mlp/.ipynb_checkpoints/ (stored 0%)
updating: mlp/best_weights_2.hdf5 (deflated 7%)
updating: mlp/best_weights_4.hdf5 (deflated 7%)
updating: mlp/best_weights_1.hdf5 (deflated 7%)
updating: mlp/best_weights_0.hdf5 (deflated 7%)


In [20]:
# 0.6381 BASELINE - v1
# 0.6372807396907196 removing up event
# 0.6363905859568488 adding every_word_count with up event
# 0.6388692848476841 adding every_word_count without up event
# 0.6375898226286834 removing every_word_count with up event + word_count_pre_step
# 0.634864097139824 -v4
# 0.6324737799704642 adding every_char_word
# 0.6312740919756227 adding every_char_word & every_sentence_change
# 0.6301625913548855 - v9
# 0.624126755429523 -v17
# 0.6228860242274676 -v19
# cv 0.616 lb 0.602 -v28
# cv 0.6159972729277812 lb 0.601 -v29