In [1]:
import pandas as pd
import numpy as np
import polars as pl
from sklearn.model_selection import KFold,GroupKFold,StratifiedKFold,StratifiedGroupKFold
import warnings
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import Pool, CatBoostRegressor
from collections import defaultdict
from sklearn.metrics import mean_squared_error
from copy import deepcopy
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
import regex as re
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
import joblib
from transformers import AutoModel, AutoTokenizer
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from tqdm import tqdm
import optuna
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm
2023-12-10 18:27:18.987214: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
CATS = ['activations', 'text_change']
NUMS = ['action_time', 'cursor_position', 'word_count', 'elapsed_time_diff_1', "down_time" ]

NUMS_1 = [ "elapsed_time_diff_2",
            "elapsed_time_diff_3",
            "elapsed_time_diff_5",
            "elapsed_time_diff_10",
            "elapsed_time_diff_20",
            "elapsed_time_diff_50",
            "cursor_position_diff_1",
            "cursor_position_diff_2",
            "cursor_position_diff_3",
            "cursor_position_diff_5",
            "cursor_position_diff_10",
            "cursor_position_diff_20",
            "cursor_position_diff_50",
            "word_count_diff_1",
            "word_count_diff_2",
            "word_count_diff_3",
            "word_count_diff_5",
            "word_count_diff_10",
            "word_count_diff_20",
            "word_count_diff_50",


]

NUMS += NUMS_1
name_feature = ['Nonproduction',
                 'Input',
                 'Remove/Cut',
                 'Replace',
                 'Move',
                 'Paste',
               ]

text_changes = ['q', ' ', 'NoChange', '.', ',', '\n', "'", '"', '-', '?', ';', '=', '/', '\\', ':']

punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']

down_event_feature = [
            'Dead','F3','{','Meta','F15','ArrowDown','z','F6','Pause','S','\x80',')','u','Cancel','Backspace','End','¡','MediaTrackNext','/','Tab','C','i','Process','F11','Home','~','%','1','q','F','s','5','Clear','l','OS','h','o','.','2','n','Control','*','Escape','#','`','â\x80\x93','Â´','MediaTrackPrevious','Alt','Ä±','a','@','PageUp','A','NumLock','ModeChange',',','^','PageDown','ContextMenu','F12','F2','Unknownclick','b','\x9b','-','"',"'",';','Delete','F1','$','T','0','CapsLock','M','ArrowRight','Ë\x86','AudioVolumeDown','?','p','Insert','\x96','y','w','AudioVolumeUp','Enter','Leftclick','V','¿','MediaPlayPause','}','AltGraph','_','I',':','AudioVolumeMute','Rightclick','>','ArrowLeft','c','Middleclick','(','ScrollLock','r','ArrowUp','Shift','Unidentified','&','|','g','!','v','F10','x','+','=','j','t','d','e','\x97','Space','Å\x9f','m','f','\\','ä'

]


down_event_feature = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
pasue_gaps = [8]

up_event_feature = [
    'AudioVolumeDown','h','MediaTrackNext','ArrowRight','l','Ë\x86','S',"'",'f','Â´','>','F12','~',']','Space','Escape','r','o','F10','Leftclick','[','g','e','=','¿','T','-','MediaPlayPause','k','Backspace','Enter','C','{','i','AudioVolumeUp','MediaTrackPrevious','<','`','F15','q','F1','Å\x9f','Delete','AltGraph','(','^','Process','0','CapsLock',')','Alt','ArrowUp','ScrollLock','Clear','c','ModeChange','_','|','ArrowDown','Ä±','+','PageUp','\x80','NumLock','"','Middleclick','M','Cancel','#',':','â\x80\x93','a','ä','Home','\x9b','1','V','Unidentified','b','\\','Insert','Shift','Rightclick','\x97','?','F11','5','Unknownclick','2','n','\x96','%','t','$','j','ContextMenu','p','y','End','d','@','m','v','Pause','.','!','OS','ArrowLeft','PageDown','s','Control','F3','}','F6','w','&','Dead','¡','u','x','A','Meta',',','z','/','*',';','Tab','AudioVolumeMute','F2'
    ]

up_event_feature = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']

In [3]:
def rename_act(act_name):
    if act_name.startswith("Move"):
        return "Move"
    else:
        return act_name

In [4]:
columns = [
    pl.col("action_time").cast(pl.Float32),
    pl.col("cursor_position").cast(pl.Float32),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(1)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff")),
    pl.col("word_count").cast(pl.Float32),
    pl.col("activity").apply(rename_act).alias("activations"),
    
    ((pl.col("down_time") - pl.col("up_time").shift(1)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_1")),
    ((pl.col("down_time") - pl.col("up_time").shift(2)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_2")),
    ((pl.col("down_time") - pl.col("up_time").shift(3)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_3")),
    ((pl.col("down_time") - pl.col("up_time").shift(5)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_5")),
    ((pl.col("down_time") - pl.col("up_time").shift(10)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_10")),
    ((pl.col("down_time") - pl.col("up_time").shift(20)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_20")),
    ((pl.col("down_time") - pl.col("up_time").shift(50)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("elapsed_time_diff_50")),
    
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(1)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_1")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(2)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_2")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(3)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_3")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(5)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_5")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(10)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_10")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(20)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_20")),
    ((pl.col("cursor_position") - pl.col("cursor_position").shift(50)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("cursor_position_diff_50")),

    ((pl.col("word_count") - pl.col("word_count").shift(1)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_1")),
    ((pl.col("word_count") - pl.col("word_count").shift(2)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_2")),
    ((pl.col("word_count") - pl.col("word_count").shift(3)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_3")),
    ((pl.col("word_count") - pl.col("word_count").shift(5)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_5")),
    ((pl.col("word_count") - pl.col("word_count").shift(10)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_10")),
    ((pl.col("word_count") - pl.col("word_count").shift(20)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_20")),
    ((pl.col("word_count") - pl.col("word_count").shift(50)).fill_null(0).clip(-1e9, 1e9).over(
        ["id"]).alias("word_count_diff_50")),
    
    (pl.col('word_count').max()/pl.col('action_time').max()/1000).alias('word_count_pre_step'),
]

In [5]:
df = (pl.read_csv("train_logs.csv")
      .with_columns(columns)
      .drop(["activity"])
     )

## build text feature

In [6]:
train_essays = pd.read_csv('train_essays_02.csv')
train_essays['text'] = train_essays['essay']
train_essays.index = train_essays["Unnamed: 0"]
train_essays.index.name = None
train_essays.drop(columns=["Unnamed: 0"], inplace=True)
train_essays.head()
corpus = train_essays['text'].tolist()
pipe = Pipeline([('count', CountVectorizer(ngram_range=(1,2), max_features=128)),
                 ('tfid', TfidfTransformer())]).fit(corpus)
text_feature = pipe.transform(corpus).toarray()
joblib.dump(pipe, './xgb_5fold/pipeline.pkl')

['./xgb_5fold/pipeline.pkl']

In [7]:
def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

In [33]:
AGGREGATIONS = ['count', 'mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', pd.DataFrame.kurt, 'sum']

def split_essays_into_sentences(df):
    essay_df = df
    essay_df['id'] = essay_df.index
    essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
    essay_df = essay_df.explode('sent')
    essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n','').strip())
    # Number of characters in sentences
    essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
    # Number of words in sentences
    essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.sent_len!=0].reset_index(drop=True)
    return essay_df

def compute_sentence_aggregations(df):
    sent_agg_df = pd.concat(
        [df[['id','sent_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','sent_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    )
    sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
    sent_agg_df['id'] = sent_agg_df.index
    sent_agg_df = sent_agg_df.reset_index(drop=True)
    sent_agg_df.drop(columns=["sent_word_count_count"], inplace=True)
    sent_agg_df = sent_agg_df.rename(columns={"sent_len_count":"sent_count"})
    return sent_agg_df

def split_essays_into_paragraphs(df):
    essay_df = df
    essay_df['id'] = essay_df.index
    essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
    essay_df = essay_df.explode('paragraph')
    # Number of characters in paragraphs
    essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
    # Number of words in paragraphs
    essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
    essay_df = essay_df[essay_df.paragraph_len!=0].reset_index(drop=True)
    return essay_df

def compute_paragraph_aggregations(df):
    paragraph_agg_df = pd.concat(
        [df[['id','paragraph_len']].groupby(['id']).agg(AGGREGATIONS), df[['id','paragraph_word_count']].groupby(['id']).agg(AGGREGATIONS)], axis=1
    ) 
    paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
    paragraph_agg_df['id'] = paragraph_agg_df.index
    paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
    paragraph_agg_df.drop(columns=["paragraph_word_count_count"], inplace=True)
    paragraph_agg_df = paragraph_agg_df.rename(columns={"paragraph_len_count":"paragraph_count"})
    return paragraph_agg_df

In [9]:
train_sent_df = split_essays_into_sentences(train_essays)
train_sent_agg_df = compute_sentence_aggregations(train_sent_df)

In [10]:
train_paragraph_df = split_essays_into_paragraphs(train_essays)
train_paragraph_agg_df = compute_paragraph_aggregations(train_paragraph_df)

In [11]:
bert_result = pd.read_csv('bert_predict.csv').set_index('id')[['bert']]

In [12]:
pipe = joblib.load('./xgb_5fold/pipeline.pkl')

In [13]:
# 加载预训练模型

# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class CustomModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model_name)
        self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        #feature = F.normalize(feature, p=2, dim=1)
        return feature

    
# model_name = './pretrained_model'
# device = 'cuda'
# model = CustomModel(model_name)
# model.to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)


class TestDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
       
        inputs = tokenizer(text,
                           max_length=256,
                           pad_to_max_length=True,
                           add_special_tokens=True,
                           return_offsets_mapping=False)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

def get_model_feature(model, texts):
    feature_outs_all = []
    test_dataset = TestDataset(texts)
    test_loader = DataLoader(test_dataset,
                             batch_size=32,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
                             num_workers=0, pin_memory=True, drop_last=False)

    # tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tqdm(test_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            feature_outs = model(inputs)
            feature_outs_all.append(feature_outs.cpu())

    feature_outs_all_final = torch.cat(feature_outs_all, dim=0).numpy()
    #print(feature_outs_all_final.shape)

    return feature_outs_all_final



In [14]:
# 加载预训练模型

# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class CustomModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        
        self.model = AutoModel.from_pretrained(model_name)
        self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(0.1)

    def forward(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        #feature = F.normalize(feature, p=2, dim=1)
        return feature

    
# model_name = './pretrained_model'
# device = 'cuda'
# model = CustomModel(model_name)
# model.to(device)
# tokenizer = AutoTokenizer.from_pretrained(model_name)


class TestDataset(Dataset):
    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
       
        inputs = tokenizer(text,
                           max_length=256,
                           pad_to_max_length=True,
                           add_special_tokens=True,
                           return_offsets_mapping=False)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

def get_model_feature(model, texts):
    feature_outs_all = []
    test_dataset = TestDataset(texts)
    test_loader = DataLoader(test_dataset,
                             batch_size=32,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
                             num_workers=0, pin_memory=True, drop_last=False)

    # tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tqdm(test_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            feature_outs = model(inputs)
            feature_outs_all.append(feature_outs.cpu())

    feature_outs_all_final = torch.cat(feature_outs_all, dim=0).numpy()
    #print(feature_outs_all_final.shape)

    return feature_outs_all_final



In [15]:
eps = 1e-3
def feature_engineer_for_index(x, feature_suffix):

    aggs = [
        (pl.col('word_count').max()/ pl.col("up_event").filter(pl.col("up_event") == '.').count()).clip(0, 500).alias('every_sentence_word_count'),
        (pl.col('cursor_position').max()/ pl.col("word_count").max()).alias('every_char_word'),
        (pl.col('activations').filter(pl.col("activations").is_in(['Remove/Cut', 'Replace'])).count()/ pl.col("up_event").filter(pl.col("up_event") == '.').count()).clip(0, 500).alias('every_sentence_change'),
        ( pl.col("cursor_position").max() / (pl.col('down_time').max() - pl.col('down_time').min()) * 1000 ).alias('char_per_min'),
        ( pl.col("word_count").max() / (pl.col('down_time').max() - pl.col('down_time').min()) * 1000 ).alias('word_per_min'),
        
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).sum() / pl.col("elapsed_time_diff_1").sum()).alias('proportion_of_pause_time'),
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).count()).alias('count_of_pause_time'),
        (pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 8000 ).sum() / pl.col("elapsed_time_diff_1").filter(pl.col("elapsed_time_diff_1") > 10000 ).count()).alias('pause_length'),
        (pl.col("elapsed_time_diff_1").filter((pl.col("elapsed_time_diff_1") > 8000) & (pl.col("text_change") == ' ' )).count()).alias('freq_of_pause_time'),
        #(pl.col("activations").filter(pl.col("elapsed_time_diff_1") > 2000 ).count() / pl.col("activations").count()).alias('proportion_of_pause'),
        # (pl.col('down_event').filter(pl.col('down_event').is_in(punctuations)).count() ).alias('punct_cnt'),
        #*[(pl.col('text_change').filter(pl.col('text_change') == c).count() ).alias(f'text_change_{c}_cnt') for c in text_changes],
        #pl.col('event_id').count().alias('event_count'),
        *[pl.col(c).drop_nulls().n_unique().alias(f"{c}_unique_{feature_suffix}") for c in CATS],
        *[pl.col(c).quantile(0.1, "nearest").alias(f"{c}_quantile1_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.2, "nearest").alias(f"{c}_quantile2_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.3, "nearest").alias(f"{c}_quantile3_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.4, "nearest").alias(f"{c}_quantile4_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.5, "nearest").alias(f"{c}_quantile5_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.6, "nearest").alias(f"{c}_quantile6_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.7, "nearest").alias(f"{c}_quantile7_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.8, "nearest").alias(f"{c}_quantile8_{feature_suffix}") for c in NUMS],
        *[pl.col(c).quantile(0.9, "nearest").alias(f"{c}_quantile9_{feature_suffix}") for c in NUMS],

        *[pl.col(c).mean().alias(f"{c}_mean_{feature_suffix}") for c in NUMS],
        *[pl.col(c).std().alias(f"{c}_std_{feature_suffix}") for c in NUMS],
        *[pl.col(c).min().alias(f"{c}_min_{feature_suffix}") for c in NUMS],
        *[pl.col(c).max().alias(f"{c}_max_{feature_suffix}") for c in NUMS],
        *[pl.col(c).median().alias(f"{c}_median_{feature_suffix}") for c in NUMS],
        *[pl.col(c).sum().alias(f"{c}_sum_{feature_suffix}") for c in NUMS],
        *[pl.col(c).skew().alias(f"{c}_skew_{feature_suffix}") for c in NUMS],
        *[pl.col(c).kurtosis().alias(f"{c}_kurtosis_{feature_suffix}") for c in NUMS],
        # adding rank features
        *[pl.col(c).last().alias(f"{c}_rank_{feature_suffix}") for c in NUMS],

        
        
        
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_ET_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_ET_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_ET_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_ET_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_ET_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_ET_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_ET_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_ET_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_ET_quantile9_{feature_suffix}") for c in name_feature],
        
        *[pl.col("activations").filter(pl.col("activations") == c).count().alias(f"{c}_name_counts{feature_suffix}")for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).mean().alias(f"{c}_ET_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).max().alias(f"{c}_ET_max_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).min().alias(f"{c}_ET_min_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).std().alias(f"{c}_ET_std_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).median().alias(f"{c}_ET_median_{feature_suffix}") for c in name_feature],
        *[pl.col("action_time").filter(pl.col("activations")==c).last().alias(f"{c}_ET_rank_{feature_suffix}") for c in name_feature],
        #*[pl.col("elapsed_time_diff_1").filter(pl.col("name")==c).mode().alias(f"{c}_ET_mode_{feature_suffix}") for c in name_feature],
        
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_ETD_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_ETD_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_ETD_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_ETD_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_ETD_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_ETD_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_ETD_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_ETD_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_ETD_quantile9_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).mean().alias(f"{c}_ETD_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).max().alias(f"{c}_ETD_max_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).min().alias(f"{c}_ETD_min_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).std().alias(f"{c}_ETD_std_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).median().alias(f"{c}_ETD_median_{feature_suffix}") for c in name_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("activations")==c).last().alias(f"{c}_ETD_rank_{feature_suffix}") for c in name_feature],
        
        
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.1, "nearest").alias(f"{c}_CP_quantile1_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.2, "nearest").alias(f"{c}_CP_quantile2_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.3, "nearest").alias(f"{c}_CP_quantile3_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.4, "nearest").alias(f"{c}_CP_quantile4_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.5, "nearest").alias(f"{c}_CP_quantile5_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.6, "nearest").alias(f"{c}_CP_quantile6_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.7, "nearest").alias(f"{c}_CP_quantile7_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.8, "nearest").alias(f"{c}_CP_quantile8_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).quantile(0.9, "nearest").alias(f"{c}_CP_quantile9_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).mean().alias(f"{c}_CP_mean_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).max().alias(f"{c}_CP_max_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).min().alias(f"{c}_CP_min_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).std().alias(f"{c}_CP_std_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).median().alias(f"{c}_CP_median_{feature_suffix}") for c in name_feature],
        *[pl.col("cursor_position_diff").filter(pl.col("activations")==c).last().alias(f"{c}_CP_rank_{feature_suffix}") for c in name_feature],
        
        
        *[pl.col("activations").filter(pl.col("down_event") == c).count().alias(f"{c}_event_name_counts{feature_suffix}")for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_ET_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_ET_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_ET_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_ET_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_ET_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_ET_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_ET_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_ET_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_ET_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_ET_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_ET_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_ET_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_ET_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("action_time").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_ET_median_{feature_suffix}") for c in down_event_feature],
        # adding rank features
        *[pl.col("action_time").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_ET_rank_{feature_suffix}") for c in down_event_feature],
        
        
        
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_ETD_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_ETD_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_ETD_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_ETD_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_ETD_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_ETD_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_ETD_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_ETD_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_ETD_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_ETD_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_ETD_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_ETD_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_ETD_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_ETD_median_{feature_suffix}") for c in down_event_feature],
        *[pl.col("elapsed_time_diff_1").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_ETD_rank_{feature_suffix}") for c in down_event_feature],
        
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.1, "nearest").alias(f"DE_{c}_DT_quantile1_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.2, "nearest").alias(f"DE_{c}_DT_quantile2_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.3, "nearest").alias(f"DE_{c}_DT_quantile3_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.4, "nearest").alias(f"DE_{c}_DT_quantile4_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.5, "nearest").alias(f"DE_{c}_DT_quantile5_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.6, "nearest").alias(f"DE_{c}_DT_quantile6_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.7, "nearest").alias(f"DE_{c}_DT_quantile7_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.8, "nearest").alias(f"DE_{c}_DT_quantile8_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).quantile(0.9, "nearest").alias(f"DE_{c}_DT_quantile9_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).mean().alias(f"DE_{c}_DT_mean_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).max().alias(f"DE_{c}_DT_max_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).min().alias(f"DE_{c}_DT_min_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).std().alias(f"DE_{c}_DT_std_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).median().alias(f"DE_{c}_DT_median_{feature_suffix}") for c in down_event_feature],
        *[pl.col("down_time").filter(pl.col("down_event")==c).last().alias(f"DE_{c}_DT_rank_{feature_suffix}") for c in down_event_feature],
        
        
        
        
        *[pl.col("activations").filter(pl.col("up_event") == c).count().alias(f"UE_{c}_event_name_counts{feature_suffix}")for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.1, "nearest").alias(f"UE_{c}_ET_quantile1_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.2, "nearest").alias(f"UE_{c}_ET_quantile2_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.3, "nearest").alias(f"UE_{c}_ET_quantile3_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.4, "nearest").alias(f"UE_{c}_ET_quantile4_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.5, "nearest").alias(f"UE_{c}_ET_quantile5_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.6, "nearest").alias(f"UE_{c}_ET_quantile6_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.7, "nearest").alias(f"UE_{c}_ET_quantile7_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.8, "nearest").alias(f"UE_{c}_ET_quantile8_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).quantile(0.9, "nearest").alias(f"UE_{c}_ET_quantile9_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).mean().alias(f"UE_{c}_ET_mean_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).std().alias(f"UE_{c}_ET_std_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).max().alias(f"UE_{c}_ET_max_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).min().alias(f"UE_{c}_ET_min_{feature_suffix}") for c in up_event_feature],
        *[pl.col("action_time").filter(pl.col("up_event")==c).median().alias(f"UE_{c}_ET_median_{feature_suffix}") for c in up_event_feature],
        # adding rank features
        *[pl.col("action_time").filter(pl.col("up_event")==c).last().alias(f"UE_{c}_ET_rank_{feature_suffix}") for c in up_event_feature],


        ]

    df = x.groupby(["id"], maintain_order=True).agg(aggs).sort("id")
    tmp_df = pl.from_pandas(train_essays)
    corpus = train_essays['text'].tolist()
    tmp_df = tmp_df.with_columns([pl.col("text").apply(lambda x: re.findall(r'q+', x)).alias("text_change")])
    tmp_df = tmp_df.with_columns([pl.col("text").apply(lambda x: len(x)).alias("input_word_count")])
    tmp_df = tmp_df.with_columns([pl.col("text").apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_mean")])
    tmp_df = tmp_df.with_columns([pl.col("text").apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_max")])
    tmp_df = tmp_df.with_columns([pl.col("text").apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0) ).alias("input_word_length_std")])
    tmp_df = tmp_df.drop(["text_change"])
    df = df.join(tmp_df, on='id')
    df = df.to_pandas()
    df['word_time_ratio'] = df[f"word_count_max_{feature_suffix}"] / df[f"down_time_max_{feature_suffix}"]
    #df['word_event_ratio'] = df[f"word_count_max_{feature_suffix}"] / df['event_count']
    #df['event_time_ratio'] = df['event_count']  / df[f"down_time_max_{feature_suffix}"]
    df['idle_time_ratio'] = df[f"elapsed_time_diff_1_sum_{feature_suffix}"] / df[f"down_time_max_{feature_suffix}"]
    
    text_feature = pipe.transform(corpus).toarray() # pipe.transform(corpus).toarray() # get_model_feature(model, texts)
    text_feature_df = pd.DataFrame(text_feature, columns=[f'text_features_{i}' for i in range(text_feature.shape[1])])
    text_feature_df['id'] = train_essays.index
    text_feature_df = text_feature_df.set_index('id')
    
    
#     x.with_clumns([pl.col("up_time").shift(1).fill_null(pl.col("down_time")).clip(-1e9, 1e9).over(
#         ["id"]).alias("up_time_lagged"),
#                   pl.abs(pl.col('down_time') - pl.col('up_time_lagged')) / 1000
                  
#                   ])
    
    
    df = df.join(text_feature_df, on='id')
    
    df = df.merge(train_sent_agg_df, on='id', how='left')
    df = df.merge(train_paragraph_agg_df, on='id', how='left')

    #df = df.join(bert_result, on='id')
    return df

In [16]:
# df = df.with_columns([pl.col("up_time").shift(1).fill_null(pl.col("down_time")).clip(-1e9, 1e9).over(["id"]).alias("up_time_lagged"),  
#                   ])
# df = df.with_columns([
#                  ((pl.col('down_time') - pl.col('up_time_lagged')) / 1000).abs().alias("time_diff"), 
#                   ])
# aggs = [pl.col('time_diff').max().alias("time_diff_max_"),
#         pl.col('time_diff').min().alias("time_diff_min_"),
#         pl.col('time_diff').median().alias("time_diff_median_"),
#        ]
# df = x.groupby(["id"], maintain_order=True).agg(aggs).sort("id")

In [17]:
train_logs=pd.read_csv("train_logs.csv")
data = []
for logs in [train_logs]:
    #up_time向后移动并且用down_time填充缺失的位置
    logs['up_time_lagged'] = logs.groupby('id')['up_time'].shift(1).fillna(logs['down_time'])
    #(down_time减上一个时刻的up_time) /1000是单位转换
    logs['time_diff'] = abs(logs['down_time'] - logs['up_time_lagged']) / 1000

    #按照id打包time_diff
    group = logs.groupby('id')['time_diff']
    #延迟时间的max,min,median
    largest_lantency = group.max()
    smallest_lantency = group.min()
    median_lantency = group.median()
    #down_time的first /1000是做单位转换吧
    initial_pause = logs.groupby('id')['down_time'].first() / 1000
    #分层次求和
    pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x <= 1)).sum())
    pauses_1_sec = group.apply(lambda x: ((x > 1) & (x <= 1.5)).sum())
    pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x <= 2)).sum())
    pauses_2_sec = group.apply(lambda x: ((x > 2) & (x <= 3)).sum())
    pauses_3_sec = group.apply(lambda x: (x > 3).sum())

    data.append(pd.DataFrame({
        'id': logs['id'].unique(),
        #延迟
        'largest_lantency': largest_lantency,
        'smallest_lantency': smallest_lantency,
        'median_lantency': median_lantency,
        'initial_pause': initial_pause,
        'pauses_half_sec': pauses_half_sec,
        'pauses_1_sec': pauses_1_sec,
        'pauses_1_half_sec': pauses_1_half_sec,
        'pauses_2_sec': pauses_2_sec,
        'pauses_3_sec': pauses_3_sec,
    }).reset_index(drop=True))

train_eD592674 = data[0]

In [18]:
df = feature_engineer_for_index(df, 'index_')
df = df.merge(train_eD592674, on='id', how='left')
df.drop(columns=['essay', 'text', 'sent', 'paragraph'], inplace=True)
invalid_columns = [c for c in df.columns if '[' in c or ']' in c or '<' in c]
rename_dict = {}
for name in invalid_columns:
    rename_dict[name] = name.replace('[', 'left_brackets').replace(']', 'right_brackets').replace('<', 'less')
df = df.rename(columns=rename_dict)

In [19]:
# some cleaning...
null1 = df.isnull().sum().sort_values(ascending=False) / len(df)

drop1 = list(null1[null1>0.9].index)

print(len(drop1))

for col in df.columns:
    if df[col].nunique()==1:
        print(col)
        drop1.append(col)
print("*********df DONE*********")


105
word_count_diff_1_quantile1_index_
word_count_diff_1_quantile2_index_
word_count_diff_2_quantile2_index_
word_count_diff_3_quantile2_index_
word_count_diff_1_quantile3_index_
word_count_diff_2_quantile3_index_
word_count_diff_3_quantile3_index_
word_count_diff_1_quantile4_index_
word_count_diff_2_quantile4_index_
word_count_diff_3_quantile4_index_
word_count_diff_1_quantile5_index_
word_count_diff_2_quantile5_index_
word_count_diff_1_quantile6_index_
word_count_diff_1_quantile7_index_
cursor_position_min_index_
word_count_diff_1_median_index_
word_count_diff_2_median_index_
Input_CP_quantile1_index_
Input_CP_quantile2_index_
Input_CP_quantile3_index_
Input_CP_quantile4_index_
Input_CP_quantile5_index_
Input_CP_quantile6_index_
Input_CP_quantile7_index_
Input_CP_quantile8_index_
Input_CP_quantile9_index_
Input_CP_median_index_
Input_CP_rank_index_
DE_Shift_ET_quantile1_index_
DE_Shift_ET_quantile2_index_
DE_Shift_ET_quantile3_index_
DE_Shift_ET_quantile4_index_
DE_Shift_ET_quantile5

In [20]:
#drop1 += not_important

In [21]:
FEATURES = [c for c in df.columns if c not in drop1 + ['id']]

In [22]:
len(FEATURES)

1759

In [23]:
df = df.set_index('id')
df = df[FEATURES]
df

Unnamed: 0_level_0,every_sentence_word_count,every_char_word,every_sentence_change,char_per_min,word_per_min,proportion_of_pause_time,count_of_pause_time,pause_length,freq_of_pause_time,activations_unique_index_,...,paragraph_word_count_kurt,paragraph_word_count_sum,largest_lantency,median_lantency,initial_pause,pauses_half_sec,pauses_1_sec,pauses_1_half_sec,pauses_2_sec,pauses_3_sec
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001519c8,12.190476,6.011719,20.190476,0.856260,0.142432,0.527536,34,35973.181818,2,5,...,,269,154.136,0.062,4.526,116,51,30,21,103
0022f953,21.533333,5.188855,17.400000,0.953237,0.183709,0.686187,38,35089.103448,0,5,...,2.342703,355,145.899,0.061,30.623,141,37,14,19,61
0042269b,19.238095,5.670792,21.238095,1.296711,0.228665,0.691334,26,46527.700000,1,4,...,-1.536764,410,153.886,0.040,4.441,83,46,25,25,52
0059420b,15.846154,5.082524,11.692308,0.768159,0.151137,0.458725,21,31664.941176,5,5,...,,208,101.690,0.131,41.395,178,81,34,32,55
0075873a,10.956522,5.563492,22.478261,0.885146,0.159099,0.645042,34,29264.178571,3,3,...,0.722916,256,110.688,0.059,78.470,65,24,11,17,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ffb8c745,10.720930,3.544469,22.372093,0.923626,0.260582,0.694018,18,51827.294118,3,4,...,-3.798053,308,128.570,0.034,22.467,117,41,18,11,30
ffbef7e5,14.129032,4.285388,1.967742,1.056042,0.246428,0.469328,29,36683.200000,3,4,...,-0.942230,443,267.869,0.172,21.732,121,43,24,24,66
ffccd6fd,40.200000,13.736319,17.600000,1.426290,0.103834,0.553205,28,40992.521739,10,3,...,,1846,229.804,0.116,23.482,168,83,37,29,58
ffec5b38,13.322581,5.164649,8.903226,1.433034,0.277470,0.461450,16,55336.200000,1,3,...,-1.192696,417,127.733,0.091,19.885,116,35,27,15,48


In [24]:
target = pd.read_csv('train_scores.csv')
target = target.set_index('id')

In [25]:
oof_xgb = pd.DataFrame(data=np.zeros((len(target),3)), index=target.index, columns=['xgb', 'lgbm', 'cat'])
#models = {}
best_iteration_xgb = defaultdict(list)
importance_dict = {}



xgb_params = {
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',
        'objective': 'reg:squarederror',
        'eval_metric':'rmse',
        'learning_rate': 0.02,
        'alpha': 8,
        'max_depth': 4,
        'subsample':0.8,
        'colsample_bytree': 0.5,
        'seed': 42
        }



cat_params = {
    
    'learning_rate': 0.02,
    'depth': 8,
    'iterations': 500,
    'random_state': 42
}



feature_importance_df = pd.DataFrame()
df = df.join(target)
models = []

In [26]:
convert_target = {0.5: 0,
                    1.0: 1,
                    1.5: 2,
                    2.0: 3,
                    2.5: 4,
                    3.0: 5,
                    3.5: 6,
                    4.0: 7,
                    4.5: 8,
                    5.0: 9,
                    5.5: 10,
                    6.0: 11,}

reverse_convert_target = {0:0.5,
                            1:1.0,
                            2:1.5,
                            3:2.0,
                            4:2.5,
                            5:3.0,
                            6:3.5,
                            7:4.0,
                            8:4.5,
                            9:5.0,
                            10:5.5,
                            11:6.0,}
df['new_label'] = df['score'].map(convert_target)

In [27]:
df = df.fillna(0).clip(-1e9, 1e9)

In [38]:
def objective(trial):
    model_name = 'lgbm'
    target_name = 'score'
    task_type = 'reg'
    final_score = []
    if model_name == 'lgbm':
        params = {
                'metric': 'rmse', 
                'random_state': 42,
                'n_estimators': trial.suggest_int('n_estimators', 100, 500, log=True),
                'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
                'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
                'subsample': trial.suggest_float('subsample', 0.5, 1),
                'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
                'num_leaves' : trial.suggest_int('num_leaves', 2, 32),
                #'max_depth' : trial.suggest_int('max_depth', 4, 16),
                'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
            }
    elif model_name == 'xgb':
        params = {
                'booster': 'gbtree',
                'tree_method': 'hist',
                'objective': 'reg:squarederror',
                'eval_metric':'rmse',
                'random_state': 42,
                'n_estimators': trial.suggest_int('n_estimators', 100, 500, log=True),
                'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
                'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1),
                'subsample': trial.suggest_float('subsample', 0.5, 1),
                'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
                'max_leaves' : trial.suggest_int('max_leaves', 2, 32),
                'max_depth': trial.suggest_int('max_depth', 1, 32),
            }
    else:
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.1, log=True),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
            "depth": trial.suggest_int("depth", 1, 12),
            'iterations': trial.suggest_int('iterations', 100, 500, log=True),
            'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-3, 10.0),
            'subsample': trial.suggest_float('subsample', 0.5, 1),
            'eval_metric':'RMSE',
            'random_state': 42,
            }
    
    skf = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    for i, (train_index, test_index) in enumerate(skf.split(X=df, y=df.score*10,)):
         # TRAIN DATA
        train_x = df.iloc[train_index][FEATURES]
        train_users = train_x.index.values
        train_y = df[[target_name]].iloc[train_index]

        # VALID DATA
        valid_x = df.iloc[test_index][FEATURES]
        valid_users = valid_x.index.values
        valid_y = df[[target_name]].iloc[test_index]


        if model_name == 'lgbm' and task_type != 'class':
            clf = LGBMRegressor(**params)#XGBRegressor(**xgb_params)
        elif model_name == 'lgbm' and task_type == 'class':
            clf = LGBMClassifier(**params)#XGBRegressor(**xgb_params)
        elif model_name == 'xgb':
            clf = XGBRegressor(**params)
        else:
            clf = CatBoostRegressor(**params)
        if model_name != 'cat':
            clf.fit(train_x.astype('float32'), train_y[target_name],
                    eval_set=[(valid_x.astype('float32'), valid_y[target_name])],
                    verbose=0
                    )
        else:
            train_pool = Pool(train_x.astype('float32'), train_y[target_name])
            test_pool = Pool(valid_x.astype('float32'), valid_y[target_name])
            clf.fit(train_pool,
                    eval_set=test_pool,
                    verbose=0)

        oof_xgb.loc[valid_users, model_name] = clf.predict(valid_x.astype('float32')) 

    final_result = oof_xgb.join(target)
    rmse = mean_squared_error(final_result['score'], final_result[model_name], squared=False)
    final_score.append(rmse)
    return np.mean(final_score)

In [None]:
study = optuna.create_study(direction='minimize', study_name='Optimize boosting hyperparameters')
study.optimize(objective, n_trials=100)

[32m[I 2023-12-11 10:33:28,535][0m A new study created in memory with name: Optimize boosting hyperparameters[0m
[32m[I 2023-12-11 10:54:18,182][0m Trial 0 finished with value: 0.6239449600639966 and parameters: {'n_estimators': 111, 'reg_alpha': 8.470380606220221, 'reg_lambda': 6.60194071607205, 'colsample_bytree': 0.7701745056326487, 'subsample': 0.6910953127309167, 'learning_rate': 0.034669768734086205, 'num_leaves': 18, 'min_child_samples': 94}. Best is trial 0 with value: 0.6239449600639966.[0m
[32m[I 2023-12-11 11:48:17,997][0m Trial 1 finished with value: 0.6870895214546545 and parameters: {'n_estimators': 232, 'reg_alpha': 2.550060514101067, 'reg_lambda': 7.920996059263239, 'colsample_bytree': 0.7481698664485007, 'subsample': 0.9088073752387767, 'learning_rate': 0.005643788347624591, 'num_leaves': 23, 'min_child_samples': 75}. Best is trial 0 with value: 0.6239449600639966.[0m
[32m[I 2023-12-11 12:45:12,069][0m Trial 2 finished with value: 0.6228124590482226 and para

In [30]:
print('Best trial:', study.best_trial.params)

Best trial: {'n_estimators': 230, 'reg_alpha': 0.011637601667639394, 'reg_lambda': 0.0040057167198726314, 'colsample_bytree': 0.6440699798660601, 'subsample': 0.6135016901133, 'learning_rate': 0.0213326106169659, 'num_leaves': 21, 'min_child_samples': 15}


In [None]:
model_name = 'lgbm'
target_name = 'score'
task_type = 'reg'
final_results = []
if model_name == 'lgbm':
    params = {
            'metric': 'rmse', 
            'random_state': 42,
        }
    params.update(study.best_trial.params)
elif model_name == 'xgb':
    params = {
                'booster': 'gbtree',
                'tree_method': 'gpu_hist',
                'objective': 'reg:squarederror',
                'eval_metric':'rmse',
                'random_state': 42,}
    params.update(study.best_trial.params)
else:
    params = {
             'eval_metric':'RMSE',
             'random_state': 42,
             }
    params.update(study.best_trial.params)
for j in range(5): 
    skf = StratifiedKFold(n_splits=10, random_state=42 + j, shuffle=True)
    for i, (train_index, test_index) in enumerate(skf.split(X=df, y=df.score*10,)):
         # TRAIN DATA
        train_x = df.iloc[train_index][FEATURES]
        train_users = train_x.index.values
        train_y = df[[target_name]].iloc[train_index]

        # VALID DATA
        valid_x = df.iloc[test_index][FEATURES]
        valid_users = valid_x.index.values
        valid_y = df[[target_name]].iloc[test_index]


        if model_name == 'lgbm' and task_type != 'class':
            clf = LGBMRegressor(**params)#XGBRegressor(**xgb_params)
        elif model_name == 'lgbm' and task_type == 'class':
            clf = LGBMClassifier(**params)#XGBRegressor(**xgb_params)
        elif model_name == 'xgb':
            clf = XGBRegressor(**params)
        else:
            clf = CatBoostRegressor(**params)
        if model_name != 'cat':
            clf.fit(train_x.astype('float32'), train_y[target_name],
                    eval_set=[(valid_x.astype('float32'), valid_y[target_name])],
                    verbose=0)
        else:
            train_pool = Pool(train_x.astype('float32'), train_y[target_name])
            test_pool = Pool(valid_x.astype('float32'), valid_y[target_name])
            clf.fit(train_pool,
                    eval_set=test_pool,
                    verbose=0)
        #print(i+1, ', ', end='')
        if model_name == 'lgbm':
            clf.booster_.save_model(f'./lgbm_5fold/lgbm_question_{j}_{i}.xgb')
        elif model_name == 'xgb':
            clf.save_model(f'./xgb_5fold/XGB_question_{j}_{i}.xgb')
        else:
            clf.save_model(f'./cat_5fold/cat_question_{j}_{i}.xgb')

        oof_xgb.loc[valid_users, model_name] = clf.predict(valid_x.astype('float32'))
    
    final_result = oof_xgb.join(target)
    rmse = mean_squared_error(final_result['score'], final_result[model_name], squared=False)
    final_results.append(rmse)
print(np.mean(final_results))

In [None]:
mean_squared_error(final_result['score'], final_result['lgbm'], squared=False)

In [108]:
mean_squared_error(final_result['score'], final_result['xgb'], squared=False)

3.8501236400545635

In [112]:
mean_squared_error(final_result['score'], final_result['cat'], squared=False)

0.6108285895791181

In [30]:
best_score = 100
best_weight = 0
for w_0 in np.arange(0, 1.05, 0.05):
    for w_1 in np.arange(0, 1.05-w_0, 0.05): 
        final_result['predict'] = w_0 * final_result['xgb'] + w_1*final_result['lgbm'] + (1-w_0-w_1) * final_result['cat']
        score = mean_squared_error(final_result['score'], final_result['predict'], squared=False)
        if best_score > score:
            best_score = score
            best_weight = (w_0, w_1)
print(best_score)
print(best_weight)

0.6148173295521092
(1.0, 0.0)


In [31]:
final_result['predict'] = (final_result['xgb'] + final_result['lgbm'] + final_result['cat']) / 3
mean_squared_error(final_result['score'], final_result['predict'], squared=False)

2.6056496849770823

In [32]:
for i in sorted([3.5, 6.0, 2.0, 4.0, 4.5, 2.5, 5.0, 3.0, 1.5, 5.5, 1.0, 0.5]):
    sub = final_result[final_result['score'] == i]
    score = mean_squared_error(sub['score'], sub['lgbm'], squared=False)
    print(i)
    print(score)

0.5
0.5
1.0
1.0
1.5
1.5
2.0
2.0
2.5
2.5
3.0
3.0
3.5
3.5
4.0
4.0
4.5
4.5
5.0
5.0
5.5
5.5
6.0
6.0


In [102]:
np.save('./xgb_5fold/columns_names', FEATURES)
np.save('./xgb_5fold/rename_dict', rename_dict)

In [113]:
!zip -r cat_5fold.zip cat_5fold

  adding: cat_5fold/ (stored 0%)
  adding: cat_5fold/cat_question_3_8.xgb (deflated 58%)
  adding: cat_5fold/cat_question_0_9.xgb (deflated 57%)
  adding: cat_5fold/cat_question_2_4.xgb (deflated 58%)
  adding: cat_5fold/cat_question_0_8.xgb (deflated 57%)
  adding: cat_5fold/cat_question_0_0.xgb (deflated 58%)
  adding: cat_5fold/cat_question_3_7.xgb (deflated 58%)
  adding: cat_5fold/cat_question_2_6.xgb (deflated 57%)
  adding: cat_5fold/cat_question_3_5.xgb (deflated 59%)
  adding: cat_5fold/cat_question_4_6.xgb (deflated 58%)
  adding: cat_5fold/cat_question_3_9.xgb (deflated 59%)
  adding: cat_5fold/.ipynb_checkpoints/ (stored 0%)
  adding: cat_5fold/cat_question_3_1.xgb (deflated 59%)
  adding: cat_5fold/cat_question_2_7.xgb (deflated 57%)
  adding: cat_5fold/cat_question_3_0.xgb (deflated 58%)
  adding: cat_5fold/cat_question_2_5.xgb (deflated 58%)
  adding: cat_5fold/cat_question_3_2.xgb (deflated 58%)
  adding: cat_5fold/cat_question_1_2.xgb (deflated 58%)
  adding: cat_5fold

In [103]:
!zip -r xgb_5fold.zip xgb_5fold

updating: xgb_5fold/ (stored 0%)
updating: xgb_5fold/rename_dict.npy (deflated 30%)
updating: xgb_5fold/.ipynb_checkpoints/ (stored 0%)
updating: xgb_5fold/pipeline.pkl (deflated 45%)
updating: xgb_5fold/columns_names.npy (deflated 97%)


In [104]:
!zip -r lgbm_5fold.zip lgbm_5fold

updating: lgbm_5fold/ (stored 0%)
updating: lgbm_5fold/lgbm_question_4_4.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_0_8.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_2_0.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_1_3.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_1_5.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_1_1.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_2_5.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_0_7.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_3_5.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_2_8.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_2_9.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_3_6.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_3_8.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_3_2.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_0_5.xgb (deflated 65%)
updating: lgbm_5fold/lgbm_question_2_1.xgb (deflated 65%)
updating: lgbm_5fold/.ipynb_checkpoint

In [37]:
# 0.6381 BASELINE - v1
# 0.6372807396907196 remving up event
# 0.6363905859568488 adding every_word_count with up event
# 0.6388692848476841 adding every_word_count without up event
# 0.6375898226286834 removing every_word_count with up event + word_count_pre_step
# 0.634864097139824 -v4
# 0.6324737799704642 adding every_char_word
# 0.6312740919756227 adding every_char_word & every_sentence_change
# 0.6301625913548855 - v9
# 0.624126755429523 -v17
# 0.6228860242274676 -v19
# cv 0.616 lb 0.602 -v28
# cv 0.6159972729277812 lb 0.601 -v29
# cv 0.6139405 lb -v34