In [1]:
import pandas as pd
import numpy as np
import polars as pl
from m3_model_params import lgb_params_1
from m4_feats_polars import *
from m5_sb_models import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# SECURITY: a GitHub personal access token used to be pasted in a comment here.
# Credentials must never live in a notebook; revoke that token and read secrets
# from the environment (e.g. os.environ) if one is ever needed.

# Root folder of the competition data (already ends with a path separator).
data_path    = 'kaggle/input/linking-writing-processes-to-writing-quality/'

# pl.scan_csv builds lazy frames: nothing is read until .collect() is called.
train_logs   = pl.scan_csv(f'{data_path}train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}train_scores.csv')

# train_logs, test_logs = amend_event_id_order(train_logs, test_logs)     worsens

In [4]:
# Distinct values of `down_event`, in order of first appearance.
# The de-duplication runs in polars on the single column, so the full log is
# never converted to pandas just to list its unique keys.
train_logs.select(pl.col('down_event').unique(maintain_order=True)).collect().get_column('down_event').to_numpy()

array(['Leftclick', 'Shift', 'q', 'Space', 'Backspace', '.', ',', 'Enter',
       'ArrowLeft', "'", ';', 'ArrowRight', '-', '?', 'Tab', '"',
       'ArrowUp', 'ArrowDown', 'Rightclick', '=', 'CapsLock', 'Control',
       'c', 'v', '/', 'Delete', ':', 'z', '[', '$', '(', ')', '+', 'Home',
       'End', '\\', 'Meta', '*', '&', 'AudioVolumeMute', 'x', '!',
       'Insert', 'MediaPlayPause', 'NumLock', '%', 'V', '>', 'Alt',
       'AudioVolumeUp', 'ContextMenu', 'AudioVolumeDown', 'a', '<',
       'PageDown', ']', 'Middleclick', '@', 'F12', 'j', '\x96', 'Dead',
       't', 's', 'n', 'y', '{', 'ScrollLock', '¿', 'Process', '}',
       'MediaTrackPrevious', 'MediaTrackNext', 'F3', '^', 'Unidentified',
       'Cancel', '2', 'i', 'd', 'r', 'e', '`', '\x9b', 'm', '#', '~',
       'PageUp', 'T', 'A', 'b', 'S', 'ModeChange', '_', 'Escape', 'F11',
       'Unknownclick', 'AltGraph', 'F10', 'h', 'F15', 'Clear', 'OS', 'F',
       'C', 'o', 'Ä±', 'f', 'u', 'w', 'p', 'g', 'M', 'l', '|',
       'â\x80\x

In [7]:
# Lazy working copy of the training logs: keep only the columns inspected
# below and order events chronologically within each essay id.
logs = (
    train_logs.clone()
    .select(pl.col(['id', 'down_time', 'event_id', 'down_event', 'action_time', 'cursor_position']))
    .sort('id', 'event_id')
)

In [8]:
logs.head(15).collect()

id,down_time,event_id,down_event,action_time,cursor_position
str,i64,i64,str,i64,i64
"""001519c8""",4526,1,"""Leftclick""",31,0
"""001519c8""",4558,2,"""Leftclick""",404,0
"""001519c8""",106571,3,"""Shift""",0,0
"""001519c8""",106686,4,"""q""",91,1
"""001519c8""",107196,5,"""q""",127,2
"""001519c8""",107296,6,"""q""",104,3
"""001519c8""",107469,7,"""q""",127,4
"""001519c8""",107659,8,"""q""",107,5
"""001519c8""",107743,9,"""q""",109,6
"""001519c8""",107840,10,"""Space""",138,7


In [3]:
# Rebuild the working frame from the raw training logs (lazy), ordered by
# essay id and event sequence. The cursor column is named `cursor_position`
# in the logs.
logs = (
    train_logs.clone()
    .select(pl.col(['id', 'down_time', 'event_id', 'down_event', 'action_time', 'cursor_position']))
    .sort('id', 'event_id')
)

# Per-event contribution to a running counter:
#   0  for "." and "Shift"
#  -1  for "Backspace" and "ArrowLeft" (the key name as it appears in the logs)
#  +1  for everything else
# A negative running total inside a sentence group means more backward
# keystrokes than forward ones were seen since the group started.
logs = logs.with_columns(
    pl.when(pl.col('down_event').is_in(['.', 'Shift']))
    .then(0)
    .when(pl.col('down_event').is_in(['Backspace', 'ArrowLeft']))
    .then(-1)
    .otherwise(1)
    .alias('removed_sent_interm')
)

# Provisional sentence number: count of "." typed so far within the essay ...
logs = logs.with_columns(
    (pl.col('down_event') == '.').cum_sum().over('id').alias('sentence_number')
)
# ... shifted by one row so the "." itself closes its sentence and the next
# event opens the following one.
logs = logs.with_columns(pl.col('sentence_number').shift(1).over('id').fill_null(0))

# Candidate sentence terminators, and the running forward/backward counter
# restarted for every (essay, provisional sentence).
logs = logs.with_columns(pl.col('down_event').is_in(['.', '?', '!']).alias('is_sent'))
logs = logs.with_columns(pl.col('removed_sent_interm').cum_sum().over('id', 'sentence_number'))

# FIND REMOVED "." WITH CONSECUTIVE BACKSPACES > removed_sent_interm will be negative
removed_stops = (
    logs.group_by('id', 'sentence_number')
    .agg((pl.col('removed_sent_interm') < 0).any().alias('has_negative'))
    .sort('id', 'sentence_number')
)

# The backspaces that delete a "." are recorded in the *following* provisional
# sentence, so pull that flag back onto the sentence the "." belongs to.
removed_stops = removed_stops.with_columns(pl.col('has_negative').shift(-1).over('id'))

logs = logs.join(removed_stops, on=('id', 'sentence_number'), how='left')

# A terminator inside a flagged sentence is treated as deleted.
logs = logs.with_columns(
    pl.when(pl.col('has_negative') & pl.col('is_sent'))
    .then(False)
    .otherwise(pl.col('is_sent'))
    .alias('is_sent')
)

logs = logs.drop('has_negative')


In [7]:
# RE-ESTABLISH SENTENCE STARTING POINTS
# NOTE: this cell consumes (and then drops) `is_sent` / `removed_sent_interm`
# created by the previous cell, so it cannot be re-run on its own.

# Final sentence number: count of surviving terminators so far in the essay ...
logs = logs.with_columns(pl.col('is_sent').cum_sum().over('id').alias('sentence_number'))

# ... shifted by one row so a terminator closes its own sentence. The null
# produced by the shift is filled on this column only (not on the whole frame).
logs = logs.with_columns(pl.col('sentence_number').shift(1).over('id').fill_null(0))

# Running total of action_time inside each (essay, sentence).
logs = logs.with_columns(
    sent_time=pl.cum_sum('action_time').over('id', 'sentence_number').fill_null(0)
)

logs = logs.drop('removed_sent_interm', 'is_sent')

# `sent_time_diff` holds the *next* event's sent_time within the essay
# (0 for the essay's last event); despite the name it is not a subtraction.
logs = logs.with_columns(
    pl.col('sent_time')
    .shift(-1)
    .over('id')
    .fill_null(0)
    .alias('sent_time_diff')
)


In [22]:
logs.filter(pl.col('id')=='0022f953').slice(offset=80, length=25).collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""0022f953""",60634,81,"""q""",160,1,3612,3732
"""0022f953""",60778,82,"""Space""",120,1,3732,3852
"""0022f953""",60874,83,"""q""",120,1,3852,3947
"""0022f953""",60979,84,"""q""",95,1,3947,4043
"""0022f953""",61058,85,"""q""",96,1,4043,4187
"""0022f953""",61138,86,"""Space""",144,1,4187,4291
"""0022f953""",61298,87,"""q""",104,1,4291,4411
"""0022f953""",61362,88,"""q""",120,1,4411,4515
"""0022f953""",61458,89,"""q""",104,1,4515,4627
"""0022f953""",61626,90,"""q""",112,1,4627,4747


In [12]:
logs.filter((pl.col('id')=='10989e65') & (pl.col('event_id')>=3190)).head(15).collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""10989e65""",1676300,3190,""".""",115,18,54872,55025
"""10989e65""",1684383,3191,"""Backspace""",153,18,55025,55161
"""10989e65""",1684570,3192,"""Space""",136,18,55161,55329
"""10989e65""",1684717,3193,"""q""",168,18,55329,55463
"""10989e65""",1684881,3194,"""q""",134,18,55463,55628
"""10989e65""",1685009,3195,"""q""",165,18,55628,55763
"""10989e65""",1685125,3196,"""Space""",135,18,55763,55880
"""10989e65""",1687447,3197,"""Backspace""",117,18,55880,55988
"""10989e65""",1687640,3198,"""Backspace""",108,18,55988,56088
"""10989e65""",1687821,3199,"""Backspace""",100,18,56088,56192


In [8]:
logs.slice(offset=348, length=15).collect() # 0022f953

id,down_time,event_id,down_event,action_time,removed_sent_interm,sentence_number,is_sent,sent_time
str,i64,i64,str,i64,i32,u32,bool,i64
"""001519c8""",305590,349,""".""",113,68,1,False,22015
"""001519c8""",305833,350,"""Space""",97,1,1,False,22112
"""001519c8""",305996,351,"""Space""",148,2,1,False,22260
"""001519c8""",312325,352,"""Backspace""",117,1,1,False,22377
"""001519c8""",312518,353,"""Backspace""",91,0,1,False,22468
"""001519c8""",312882,354,"""Backspace""",113,-1,1,False,22581
"""001519c8""",315524,355,""",""",197,0,1,False,22778
"""001519c8""",315673,356,"""Space""",119,1,1,False,22897
"""001519c8""",316808,357,"""Backspace""",78,0,1,False,22975
"""001519c8""",316969,358,"""Backspace""",81,-1,1,False,23056


In [9]:
logs.filter(pl.col('id')=="10989e65").sort('sentence_number').head(25).collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""0022f953""",47538,45,"""Space""",136,0,5814,5918
"""0022f953""",41866,31,"""Backspace""",128,0,3901,4021
"""0022f953""",32706,3,"""q""",136,0,136,256
"""0022f953""",32826,4,"""q""",120,0,256,384
"""0022f953""",32922,5,"""q""",128,0,384,504
"""0022f953""",33010,6,"""q""",120,0,504,728
"""0022f953""",33106,7,"""Space""",224,0,728,872
"""0022f953""",33370,8,"""q""",144,0,872,1032
"""0022f953""",33634,9,"""q""",160,0,1032,1176
"""0022f953""",33810,10,"""Space""",144,0,1176,1176


In [10]:
logs.sort('id','event_id').slice(length=20, offset=559_455).collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""10989e65""",99219,149,"""q""",116,0,20491,20687
"""10989e65""",99269,150,"""q""",196,0,20687,20849
"""10989e65""",99384,151,"""q""",162,0,20849,20987
"""10989e65""",99645,152,"""q""",138,0,20987,21129
"""10989e65""",99803,153,""".""",142,0,21129,152
"""10989e65""",99927,154,"""Space""",152,1,152,152
"""10989e65""",102259,155,"""Shift""",0,1,152,338
"""10989e65""",102463,156,"""q""",186,1,338,502
"""10989e65""",102596,157,"""Space""",164,1,502,619
"""10989e65""",103251,158,"""q""",117,1,619,740


In [13]:
# For every (essay, sentence) keep the `sent_time_diff` of the sentence's
# last event, ordered by essay and sentence.
in_between_sent_pause = (
    logs.group_by('id', 'sentence_number')
    .agg(pl.col('sent_time_diff').last().alias('sent_last_time'))
    .sort('id', 'sentence_number')
)
# TODO: remove the 0s, i.e. the last sentence of each essay

In [14]:
in_between_sent_pause.filter(pl.col('id')=="0022f953").head(25).collect()

id,sentence_number,sent_last_time
str,i64,i64
"""0022f953""",-9223372036818063310,160
"""0022f953""",-9223372032523096014,200
"""0022f953""",-9223372028228128718,120
"""0022f953""",-9223372023933161422,120
"""0022f953""",-9223372019638194126,103
"""0022f953""",-9223372015343226830,135
"""0022f953""",-9223372011048259534,117
"""0022f953""",-9223372006753292238,116
"""0022f953""",-9223372002458324942,157
"""0022f953""",-9223371998163357646,144


In [None]:

# Same aggregation as `in_between_sent_pause`: the `sent_time_diff` of the
# last event of every (essay, sentence), ordered by essay and sentence.
sent_pause_duration = (
    logs.group_by('id', 'sentence_number')
    .agg(pl.col('sent_time_diff').last().alias('sent_last_time'))
    .sort('id', 'sentence_number')
)

In [4]:
sent_pause_duration.filter(pl.col('id')=="0022f953").head(25).collect()

id,sentence_number,sent_last_time
str,i64,i64
"""0022f953""",-9223371963803619278,160
"""0022f953""",-9223371959508651982,200
"""0022f953""",-9223371955213684686,120
"""0022f953""",-9223371950918717390,120
"""0022f953""",-9223371946623750094,103
"""0022f953""",-9223371942328782798,135
"""0022f953""",-9223371938033815502,117
"""0022f953""",-9223371933738848206,116
"""0022f953""",-9223371929443880910,157
"""0022f953""",-9223371925148913614,144


In [None]:
logs.sort('id','event_id').slice(length=20, offset=559_455).collect()

In [4]:
def sent_pauses(train_logs, test_logs):
    """Per-essay statistics of the time accumulated in each sentence run.

    For each input frame the events are ordered by ``id`` / ``event_id`` and
    split into runs delimited by the sentence terminators ``.``, ``?`` and
    ``!`` (a terminator is the last event of its run). ``action_time`` is
    accumulated inside each run, the maximum of that running sum is taken as
    the run's value, and the runs of one essay are summarised.

    Args:
        train_logs: polars frame with at least ``id``, ``event_id``,
            ``down_event`` and ``action_time`` columns.
        test_logs: frame with the same columns for the test set.

    Returns:
        ``(train_feats, test_feats)``: one row per ``id`` with the
        ``sen_pause_*`` / ``sen_pasuse_*`` statistics. The frames are built
        with the same polars API as the inputs (lazy when the inputs are the
        notebook's ``scan_csv`` frames).
    """
    print("< sentences pauses >")    
    feats = []

    for data in [train_logs, test_logs]:
        logs = (
            data.clone()
            .select(pl.col(['id', 'event_id', 'down_event', 'action_time']))
            .sort(['id', 'event_id'])
        )

        # From here on `down_event` is a boolean: True on a sentence terminator.
        logs = logs.with_columns(pl.col('down_event').is_in(['.', '?', '!']))

        # Run id = number of terminators seen *before* this event in the same
        # essay. The shift has to happen inside the `over('id')` window,
        # otherwise the first event of every essay inherits the final count
        # of the previous essay.
        sents = logs.with_columns(
            id_runs=pl.col('down_event').cum_sum().shift(1).over('id').fill_null(0)
        )

        # Running action_time within each (essay, run) ...
        sents = sents.with_columns(
            pl.cum_sum('action_time')
            .over('id', 'id_runs')
            .alias('sent_cum_sum'))

        # ... reduced to a single value per run.
        sents = (
            sents.group_by('id', 'id_runs')
            .agg(pl.col('sent_cum_sum').max())
            .sort('id', 'id_runs')
        )

        run_time = pl.col('sent_cum_sum')
        # The misspelt `sen_pasuse_*` names are kept as-is: they are the
        # output column names, which downstream code may key on.
        pause_stats = sents.group_by(['id']).agg(
            sen_pause_mean=run_time.mean(),
            sen_pause_sum=run_time.sum(),
            sen_pause_std=run_time.std(),
            sen_pause_max=run_time.max(),
            sen_pause_min=run_time.min(),
            sen_pause_median=run_time.median(),
            sen_pasuse_q1=run_time.quantile(0.25),
            sen_pasuse_q3=run_time.quantile(0.75),
            sen_pasuse_kurt=run_time.kurtosis(),
            sen_pasuse_skew=run_time.skew(),
        )
        feats.append(pause_stats)
    return feats[0], feats[1]

In [7]:
logs.slice(offset=348, length=15).collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""001519c8""",305590,349,""".""",113,1,22015,22112
"""001519c8""",305833,350,"""Space""",97,1,22112,22260
"""001519c8""",305996,351,"""Space""",148,1,22260,22377
"""001519c8""",312325,352,"""Backspace""",117,1,22377,22468
"""001519c8""",312518,353,"""Backspace""",91,1,22468,22581
"""001519c8""",312882,354,"""Backspace""",113,1,22581,22778
"""001519c8""",315524,355,""",""",197,1,22778,22897
"""001519c8""",315673,356,"""Space""",119,1,22897,22975
"""001519c8""",316808,357,"""Backspace""",78,1,22975,23056
"""001519c8""",316969,358,"""Backspace""",81,1,23056,23179


In [12]:
# Highest event_id reached inside every (essay, sentence) group.
max_event_id = (
    logs.group_by('id', 'sentence_number')
    .agg(pl.col('event_id').max().alias('max_event_id'))
)

In [19]:
max_event_id.head(15).collect()

id,sentence_number,max_event_id
str,u32,i64
"""0042269b""",33,800
"""0042269b""",44,3327
"""0075873a""",64,68
"""0093f095""",98,706
"""009e23ab""",122,1891
"""009e23ab""",124,2065
"""00e1f05a""",165,5546
"""00e1f05a""",173,7759
"""00fc9a6a""",256,2249
"""0144e4d5""",262,253


In [18]:
max_event_id.join(logs, on=('id','sentence_number'), how='inner').collect()

id,sentence_number,max_event_id,down_time,event_id,down_event,action_time,sent_time,sent_time_diff
str,u32,i64,i64,i64,str,i64,i64,i64
"""001519c8""",0,158,4526,1,"""Leftclick""",31,31,435
"""001519c8""",0,158,4558,2,"""Leftclick""",404,435,435
"""001519c8""",0,158,106571,3,"""Shift""",0,435,526
"""001519c8""",0,158,106686,4,"""q""",91,526,653
"""001519c8""",0,158,107196,5,"""q""",127,653,757
"""001519c8""",0,158,107296,6,"""q""",104,757,884
"""001519c8""",0,158,107469,7,"""q""",127,884,991
"""001519c8""",0,158,107659,8,"""q""",107,991,1100
"""001519c8""",0,158,107743,9,"""q""",109,1100,1238
"""001519c8""",0,158,107840,10,"""Space""",138,1238,1425


In [15]:
max_event_id.sort('id','max_event_id').head(20).collect()

id,sentence_number,max_event_id
str,u32,i64
"""001519c8""",0,158
"""001519c8""",1,359
"""001519c8""",2,395
"""001519c8""",3,459
"""001519c8""",4,721
"""001519c8""",5,880
"""001519c8""",6,1071
"""001519c8""",7,1323
"""001519c8""",8,1395
"""001519c8""",9,1624


In [10]:
logs.collect()

id,down_time,event_id,down_event,action_time,sentence_number,sent_time,sent_time_diff
str,i64,i64,str,i64,u32,i64,i64
"""001519c8""",4526,1,"""Leftclick""",31,0,31,435
"""001519c8""",4558,2,"""Leftclick""",404,0,435,435
"""001519c8""",106571,3,"""Shift""",0,0,435,526
"""001519c8""",106686,4,"""q""",91,0,526,653
"""001519c8""",107196,5,"""q""",127,0,653,757
"""001519c8""",107296,6,"""q""",104,0,757,884
"""001519c8""",107469,7,"""q""",127,0,884,991
"""001519c8""",107659,8,"""q""",107,0,991,1100
"""001519c8""",107743,9,"""q""",109,0,1100,1238
"""001519c8""",107840,10,"""Space""",138,0,1238,1425


In [19]:
# NOTE(review): unfinished expression. It only builds a group-by object (no
# .agg(...) / .collect()), and 'sent' is not one of the columns shown in the
# previews of `logs` (they show `sentence_number` / `sent_time`). The cell
# that follows performs the complete per-sentence aggregation; confirm this
# one can be deleted.
logs.group_by('id','sent')

id,event_id,action_time,sentence_number,sent_time
str,i64,i64,u32,i64
"""001519c8""",1,31,0,31
"""001519c8""",2,404,0,435
"""001519c8""",3,0,0,435
"""001519c8""",4,91,0,526
"""001519c8""",5,127,0,653
"""001519c8""",6,104,0,757
"""001519c8""",7,127,0,884
"""001519c8""",8,107,0,991
"""001519c8""",9,109,0,1100
"""001519c8""",10,138,0,1238


In [20]:
logs.group_by('id','sentence_number').agg(pl.col('sent_time').max()).sort('id','sentence_number').collect()


id,sentence_number,sent_time
str,u32,i64
"""001519c8""",0,19617
"""001519c8""",1,23179
"""001519c8""",2,4509
"""001519c8""",3,8418
"""001519c8""",4,37652
"""001519c8""",5,19106
"""001519c8""",6,21065
"""001519c8""",7,28682
"""001519c8""",8,8417
"""001519c8""",9,24442


In [6]:
def sentences_timing(train_logs, test_logs):
    """Per-essay statistics of the total ``action_time`` spent on each sentence.

    Sentences are delimited by ``.``, ``?`` and ``!``. A terminator is
    discarded when the provisional, "."-delimited segment it belongs to shows
    more ``Backspace`` events than other keystrokes at some point (running
    counter below zero), which is taken to mean the full stop was deleted.

    Args:
        train_logs: polars frame with at least ``id``, ``event_id``,
            ``down_event`` and ``action_time`` columns.
        test_logs: frame with the same columns for the test set.

    Returns:
        ``(train_feats, test_feats)``: one row per ``id`` with the
        ``sent_timings_*`` / ``sent_timingse_*`` statistics.
    """
    print("< sentences timing >")    
    feats = []
    for data in [train_logs, test_logs]:

        logs = (
            data.clone()
            .select(pl.col(['id', 'event_id', 'down_event', 'action_time']))
            .sort('id', 'event_id')
        )

        # Running-counter contribution: "." resets nothing but adds 0,
        # Backspace counts -1, any other event +1.
        logs = logs.with_columns(
            pl.when(pl.col('down_event') == ".")
            .then(0)
            .when(pl.col('down_event') == "Backspace")
            .then(-1)
            .otherwise(1)
            .alias('removed_sent_interm')
        )

        # Provisional segment id: number of "." typed so far in this essay
        # (the "." opens its own segment here, so the backspaces that follow
        # it fall in the same segment).
        logs = logs.with_columns(
            (pl.col('down_event') == '.').cum_sum().over('id').alias('sentence_number')
        )
        logs = logs.with_columns(pl.col('down_event').is_in(['.', '?', '!']).alias('is_sent'))
        logs = logs.with_columns(
            pl.col('removed_sent_interm').cum_sum().over('id', 'sentence_number')
        )

        # FIND REMOVED "." WITH CONSECUTIVE BACKSPACES > removed_sent_interm will be negative
        removed_stops = logs.group_by('id', 'sentence_number').agg(
            (pl.col('removed_sent_interm') < 0)
            .any()
            .alias('has_negative')
        )

        logs = logs.join(removed_stops, on=('id', 'sentence_number'), how='left')

        # Terminators inside a flagged segment are treated as deleted.
        logs = logs.with_columns(
            pl.when(pl.col('has_negative') & pl.col('is_sent'))
            .then(False)
            .otherwise(pl.col('is_sent'))
            .alias('is_sent')
        )

        logs = logs.drop('has_negative')

        # RE-ESTABLISH SENTENCE STARTING POINTS
        # Numbering restarts for every essay; the shift makes a terminator the
        # last event of its sentence, and the null it creates on an essay's
        # first event is filled with 0 so that event is not put into a
        # separate null-keyed group.
        logs = logs.with_columns(
            pl.col('is_sent').cum_sum().over('id').alias('sentence_number')
        )
        logs = logs.with_columns(
            pl.col('sentence_number').shift(1).over('id').fill_null(0)
        )
        logs = logs.with_columns(
            sent_time=pl.cum_sum('action_time').over('id', 'sentence_number').fill_null(0)
        )

        # One value per sentence: the maximum of the running action_time.
        sentences = logs.group_by('id', 'sentence_number').agg(
            pl.max('sent_time')
            .alias('total_sentence_time')
        ).sort('id', 'sentence_number')

        sentence_time = pl.col('total_sentence_time')
        # The `sent_timingse_*` spellings are kept as-is: they are the output
        # column names, which downstream code may key on.
        sentences = sentences.group_by(['id']).agg(
            sent_timings_mean=sentence_time.mean(),
            sent_timings_sum=sentence_time.sum(),
            sent_timings_std=sentence_time.std(),
            sent_timings_max=sentence_time.max(),
            sent_timings_min=sentence_time.min(),
            sent_timings_median=sentence_time.median(),
            sent_timingse_q1=sentence_time.quantile(0.25),
            sent_timingse_q3=sentence_time.quantile(0.75),
            sent_timingse_kurt=sentence_time.kurtosis(),
            sent_timingse_skew=sentence_time.skew(),
        )
        feats.append(sentences)

    return feats[0], feats[1]

In [None]:
# between paragraph pauses ?
# backspace pauses
# edit pauses

In [None]:
# bad: sent_by_par, par_words, word_pauses_basic,

# neutral: cursor_pos_acceleration, countvectorize_two_one (slightly better than cursor_pos_acc + 4),  sent_timings

# good: full sentences, full paragraphs, down_events, one_grams, create_pauses -  r-burst, nunique, words feats

In [None]:
# Best Feature Set: ['train_essay_sentences.pkl', 'train_create_pauses.pkl', 
# 'train_vector_one_gram.pkl', 'train_essay_paragraphs.pkl', 
# 'train_categorical_nunique.pkl', 'train_word_pauses.pkl', 
# 'train_events_counts_rate_of_change.pkl', 'train_word_counts_rate_of_change.pkl', 
# 'train_r_burst_feats.pkl', 'train_vector_two_gram.pkl', 'train_events_counts_acceleration.pkl']

# Best_Feature_Set = ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
# 'train_create_pauses.pkl', 'train_sentences_per_paragraph.pkl', 
# 'train_add_word_pauses_basic.pkl', 'train_cursor_pos_acceleration.pkl', 
# 'train_remove_word_pauses_adv.pkl', 'train_paragraph_length.pkl', 
# 'train_categorical_nunique.pkl', 'train_paragraph_words.pkl', 
# 'train_events_counts_time_based.pkl', 'train_vector_two_gram.pkl', 
# 'train_sentences_words.pkl', 'train_add_word_pauses_adv.pkl']

# Best_Feature_Set = ['train_down_events_counts_one.pkl', 'train_essay_par_length.pkl', 
# 'train_create_pauses.pkl', 'train_countvectorize_one_two.pkl', 
# 'train_down_events_counts_two.pkl', 'train_cursor_pos_acceleration_basic.pkl', 
# 'train_essay_sents_per_par_basic.pkl', 'train_countvectorize_two_one.pkl', 
# 'train_remove_word_pauses_adv.pkl', 'train_add_word_pauses_adv.pkl', 
# 'train_word_count_acceleration_adv.pkl']

In [None]:
# best_feature_set_1 - PARTIAL
#
# Builds the train/test feature tables for the model cells below:
#   1. feature helpers are called on the lazy logs (or on the essay frames),
#   2. their results are left-joined on `id`, train and test separately,
#   3. everything is collected to pandas, paragraph/sentence features are
#      merged in, and the target scores are attached to the train table.
# The helper functions are not defined in this notebook; presumably they come
# from the star imports at the top (m4_feats_polars / m5_sb_models) -- verify.
#
# NOTE(review): this cell is not re-runnable. Near the end it rebinds
# `train_logs`, `test_logs` and `train_scores` from polars lazy frames to
# pandas DataFrames, so running it a second time fails on the
# `train_logs.collect()` call below, and any earlier cell that calls
# `train_logs.clone()` fails after this cell has run.

# Essay frames derived from the fully materialised logs (pandas).
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

# Feature blocks in use; every helper returns a (train, test) pair.
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_word_pause, ts_word_pause = word_pauses(train_logs, test_logs)
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
#tr_p_burst, ts_p_burst = p_burst_feats(train_logs, test_logs, 2)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
#tr_event_acc, ts_event_acc = events_counts_acceleration(train_logs, test_logs)
# tr_nunique, ts_nunique = categorical_nunique(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
# tr_time_by_act, ts_time_by_act = action_time_by_activity(train_logs, test_logs)
# tr_cursor_pos_roc, ts_cursor_pos_roc = cursor_pos_rate_of_change(train_logs, test_logs)
#
# tr_act_count, ts_act_count = count_of_activities(train_logs, test_logs)
# tr_get_keys, ts_get_keys = get_keys_pressed_per_second(train_logs.collect().to_pandas(),
#                                                        test_logs.collect().to_pandas())
#
# tr_input_change, ts_input_change = input_text_change_feats(train_logs, test_logs)
# tr_wc_roc, ts_wc_roc =  word_counts_rate_of_change(train_logs, test_logs)

# Train table: left joins on `id`, starting from the down-event counts.
train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_word_pause, on='id', how='left')
train_feats = train_feats.join(tr_word_count_acc, on='id', how='left')
#train_feats = train_feats.join(tr_p_burst, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
# train_feats = train_feats.join(tr_event_acc, on='id', how='left')
# train_feats = train_feats.join(tr_nunique, on='id', how='left')
# train_feats = train_feats.join(tr_wc_roc, on='id', how='left')
# train_feats = train_feats.join(tr_act_count, on='id', how='left')
# train_feats = train_feats.join(tr_cursor_pos_roc, on='id', how='left')

# train_feats = train_feats.join(tr_get_keys, on='id', how='left')
# train_feats = train_feats.join(tr_input_change, on='id', how='left')
# train_feats = train_feats.join(tr_time_by_act, on='id', how='left')

# Test table: must mirror the train joins above, block for block.
test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_word_pause, on='id', how='left')
test_feats = test_feats.join(ts_word_count_acc, on='id', how='left')
# test_feats = test_feats.join(ts_p_burst, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
# (the disabled line below previously joined the *train* frame `tr_event_acc`)
# test_feats = test_feats.join(ts_event_acc, on='id', how='left')
# test_feats = test_feats.join(ts_nunique, on='id', how='left')
# test_feats = test_feats.join(ts_wc_roc, on='id', how='left')
# test_feats = test_feats.join(ts_act_count, on='id', how='left')
# test_feats = test_feats.join(ts_cursor_pos_roc, on='id', how='left')


# test_feats = test_feats.join(ts_get_keys, on='id', how='left')
# test_feats = test_feats.join(ts_input_change, on='id', how='left')
# test_feats = test_feats.join(ts_time_by_act, on='id', how='left')


# Materialise everything as pandas. These three assignments overwrite the
# lazy frames created by the loading cell (see the re-run note at the top).
train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
# Only the train table is sorted by id before collection; test keeps join order.
train_feats = train_feats.sort('id')
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

# Paragraph- and sentence-level features computed from the essay frames (pandas merges).
train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')

train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')

# Attach the scores from train_scores.csv to the train table only.
train_feats = train_feats.merge(train_scores, on='id', how='left')
print(f'train feats shape {train_feats.shape}')


In [None]:
from m5_sb_models import lgb_pipeline

# LightGBM settings used by the shuffle loop below. Defining the name here
# replaces the `lgb_params_1` imported from m3_model_params at the top of
# the notebook.
lgb_params_1 = dict(
    boosting_type='gbdt',
    metric='rmse',
    reg_alpha=0.0031,
    reg_lambda=0.001,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.75,
    learning_rate=0.017,
    num_leaves=19,
    min_child_samples=46,
    n_estimators=350,
    verbosity=-1,
)

# Alternative, slower-learning configuration; only referenced from a
# commented-out call in the training loop.
param = dict(
    n_estimators=1024,
    learning_rate=0.005,
    metric='rmse',
    force_col_wise=True,
    verbosity=0,
)

# train_feats = train_feats[['id', 'score'] + feat_select]
# test_feats = test_feats[['id'] + feat_select]

print(f'train feats shape {train_feats.shape}')


In [None]:
# Cross-validation RMSE of each shuffled run (the list stores scores, not
# predictions, despite its name).
shuffle_preds = []

for i in range(15):
    # Re-shuffle the training rows before each run. The run index is used as
    # the seed so every run sees a different, but reproducible, row order.
    train_feats = train_feats.sample(frac=1, random_state=i).reset_index(drop=True)
    test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, lgb_params_1)
    shuffle_preds.append(rmse)
    #test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, param)

# Average RMSE over the 15 shuffles; the variables left over from the loop
# (oof_preds, model, ...) belong to the last run only.
np.mean(shuffle_preds)

In [None]:
# One averaged out-of-fold prediction per essay.
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# Per-essay error magnitude (for a single row sqrt(err**2) is just |err|).
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)
# RMSE per true score: the square root is taken *after* averaging the squared
# errors. Averaging the per-row values above would give the mean absolute
# error instead.
(
    (oof_res['score'] - oof_res['preds']).pow(2)
    .groupby(oof_res['score'])
    .mean()
    .pow(0.5)
    .rename('RMSE')
    .reset_index()
    .sort_values('RMSE', ascending=False)
)

In [None]:
# One averaged out-of-fold prediction per essay.
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# Per-essay error magnitude (for a single row sqrt(err**2) is just |err|).
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)
# RMSE per true score: the square root is taken *after* averaging the squared
# errors. Averaging the per-row values above would give the mean absolute
# error instead.
(
    (oof_res['score'] - oof_res['preds']).pow(2)
    .groupby(oof_res['score'])
    .mean()
    .pow(0.5)
    .rename('RMSE')
    .reset_index()
    .sort_values('RMSE', ascending=False)
)

In [None]:
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# oof_res['rmse'] = oof_res.apply(lambda x: np.sqrt((x['score']-x['preds'])**2))
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)