In [1]:
import pandas as pd
import numpy as np
import polars as pl

from m4_feats_polars import *
from m5_sb_models import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

# train_logs, test_logs = amend_event_id_order(train_logs, test_logs)     worsens

In [3]:
def word_pauses(train_logs, test_logs):
    print("word pauses")    
    feats = []

    for data in [train_logs, test_logs]:
        logs = data.clone()
        logs = logs.with_columns(pl.col('word_count')
            .diff().over('id')
            .alias('word_diff'))

        logs = logs.filter(
            pl.col('word_diff')>0).select(pl.col(['id','action_time']))

        word_pause = logs.group_by(['id']).agg(
                        word_pause_count = pl.col('action_time').count(),
                        word_pause_mean = pl.col('action_time').mean(),
                        word_pause_sum = pl.col('action_time').sum(),
                        word_pause_std = pl.col('action_time').std(),
                        word_pause_max = pl.col('action_time').max(),
                        word_pause_min = pl.col('action_time').min(),
                        word_pause_median = pl.col('action_time').median(),
                        word_pasuse_q1 = pl.col('action_time').quantile(0.25),
                        word_pasuse_q3 = pl.col('action_time').quantile(0.75),
                        word_pasuse_kurt = pl.col('action_time').kurtosis(),
                        word_pasuse_skew = pl.col('action_time').skew(),
        )
        feats.append(word_pause)
    return feats[0], feats[1]

In [4]:
def sent_pauses(train_logs, test_logs):
    print("sentences pauses")    
    feats = []

    for data in [train_logs, test_logs]:
        logs = data.clone()
        logs = logs.filter(pl.col(['down_event']).is_in(['.', '?', '!']))

        sent_pauses = logs.group_by(['id']).agg(
                        sen_pause_count = pl.col('action_time').count(),
                        sen_pause_mean = pl.col('action_time').mean(),
                        sen_pause_sum = pl.col('action_time').sum(),
                        sen_pause_std = pl.col('action_time').std(),
                        sen_pause_max = pl.col('action_time').max(),
                        sen_pause_min = pl.col('action_time').min(),
                        sen_pause_median = pl.col('action_time').median(),
                        sen_pasuse_q1 = pl.col('action_time').quantile(0.25),
                        sen_pasuse_q3 = pl.col('action_time').quantile(0.75),
                        sen_pasuse_kurt = pl.col('action_time').kurtosis(),
                        sen_pasuse_skew = pl.col('action_time').skew(),
        )
        feats.append(sent_pauses)
    return feats[0], feats[1]

In [None]:
def par_pauses(train_logs, test_logs):
    print("paragraph pauses")    
    feats = []

    for data in [train_logs, test_logs]:
        logs = data.clone()
        logs = logs.filter(pl.col(['text_change']).is_in(['\n'])).collect()

        par_pauses = logs.group_by(['id']).agg(
                        par_pause_count = pl.col('action_time').count(),
                        par_pause_mean = pl.col('action_time').mean(),
                        par_pause_sum = pl.col('action_time').sum(),
                        par_pause_std = pl.col('action_time').std(),
                        par_pause_max = pl.col('action_time').max(),
                        par_pause_min = pl.col('action_time').min(),
                        par_pause_median = pl.col('action_time').median(),
                        par_pasuse_q1 = pl.col('action_time').quantile(0.25),
                        par_pasuse_q3 = pl.col('action_time').quantile(0.75),
                        par_pasuse_kurt = pl.col('action_time').kurtosis(),
                        par_pasuse_skew = pl.col('action_time').skew(),
        )
        feats.append(par_pauses)
    return feats[0], feats[1]

In [5]:
# TEST INDIVIDUAL FEATURES
from m3_model_params import lgb_params_1
tr_sen_pauses, ts_sen_pauses = sent_pauses(train_logs, test_logs)
train_feats = tr_sen_pauses.join(train_scores, on='id', how='left')
test_feats = train_feats.clone()

train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()
test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, train_feats.drop('score', axis=1), lgb_params_1)

sentences pauses
Final RMSE over 50: 0.937467. Std 0.6724
RMSE by fold 0.937410. Std 0.0105


In [22]:
train_logs.filter(pl.col(['text_change']).is_in(['\n'])).collect()

id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,i64,i64,i64,i64,str,str,str,str,i64,i64
"""001519c8""",722,589449,589547,98,"""Input""","""Enter""","""Enter""",""" """,370,63
"""001519c8""",723,589592,589693,101,"""Input""","""Enter""","""Enter""",""" """,371,63
"""001519c8""",1717,1084341,1084477,136,"""Input""","""Enter""","""Enter""",""" """,1018,166
"""001519c8""",1718,1084716,1084834,118,"""Input""","""Enter""","""Enter""",""" """,1019,166
"""0022f953""",322,137096,137278,182,"""Input""","""Enter""","""Enter""",""" """,237,51
"""0022f953""",325,142046,142181,135,"""Input""","""Enter""","""Enter""",""" """,237,51
"""0022f953""",887,329286,329466,180,"""Input""","""Enter""","""Enter""",""" """,731,149
"""0022f953""",1674,661719,661818,99,"""Input""","""Enter""","""Enter""",""" """,1070,248
"""0022f953""",1675,663383,663546,163,"""Input""","""Enter""","""Enter""",""" """,1071,248
"""0022f953""",1676,665518,665602,84,"""Remove/Cut""","""Backspace""","""Backspace""",""" """,1070,248


In [24]:
train_logs.filter(pl.col(['down_event']).is_in(['.', '?', '!'])).collect()

id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,i64,i64,i64,i64,str,str,str,str,i64,i64
"""001519c8""",158,172362,172451,89,"""Input""",""".""",""".""",""".""",105,18
"""001519c8""",349,305590,305703,113,"""Input""",""".""",""".""",""".""",158,27
"""001519c8""",359,319001,319124,123,"""Input""",""".""",""".""",""".""",158,27
"""001519c8""",395,389063,389177,114,"""Input""",""".""",""".""",""".""",32,32
"""001519c8""",459,425623,425781,158,"""Input""",""".""",""".""",""".""",237,41
"""001519c8""",721,589069,589160,91,"""Input""",""".""",""".""",""".""",369,63
"""001519c8""",880,684433,684540,107,"""Input""",""".""",""".""",""".""",501,86
"""001519c8""",1001,737829,737935,106,"""Input""",""".""",""".""",""".""",608,104
"""001519c8""",1023,745108,745214,106,"""Input""",""".""",""".""",""".""",624,108
"""001519c8""",1071,755142,755243,101,"""Input""",""".""",""".""",""".""",636,109


In [6]:
sent_pause

Unnamed: 0,id,essay,sent,sent_len,sent_word_count
0,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,qqqqqqqqq qq qqqqq qq qqqq qqqq,31,6
1,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,qqqqqq qqq qqqq qqqqqq qq qq qqqqq qq qqqq qqq...,107,19
2,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,qqqqqq qqq qqqqq qqq qqqqqqqqqqq qq qqq qqqqqq...,123,21
3,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,qqqqqqqq qq qqqqqqqqqq qqqq qqqq qqqqqqqqq qqq...,119,22
4,001519c8,qqqqqqqqq qq qqqqq qq qqqq qqqq. qqqqqq qqq q...,qq qq qqqq qqqq qqq qqqqqqqqq qqq qqqqqqq qq q...,129,24
...,...,...,...,...,...
52790,fff05981,qq qqqq qqqqqqq qqqqqqqq qq qqqqqqqqqqq qq qq ...,qqqq qq qqqqqqqqq qq qq qqqqqqqqqqq qqqqqqqq q...,104,18
52791,fff05981,qq qqqq qqqqqqq qqqqqqqq qq qqqqqqqqqqq qq qq ...,qq qqq qq qqqqq qq qq qqqqqq q qqq qq qq qqqqq...,411,66
52792,fff05981,qq qqqq qqqqqqq qqqqqqqq qq qqqqqqqqqqq qq qq ...,"qq q qqqqqqqqqqq qqqqqqqqqqq, qqqqqq qqqqqq, q...",140,25
52793,fff05981,qq qqqq qqqqqqq qqqqqqqq qq qqqqqqqqqqq qq qq ...,qqq qqqqqqqq qq qqqqq qqqqqq qq qqqq q qqqqqqq...,92,20


In [3]:
# everything is logged - DONE
# bursts = 2/3 of a second - input only - DONE
# inter word pauses
# between sentence pauses ?
# between paragraph pauses ?
# backspace pauses
# edit pauses

In [4]:
# # TEST INDIVIDUAL FEATURES
# from m3_model_params import lgb_params_1
# tr_word_pause, ts_word_pause = word_pauses(train_logs, test_logs)
# train_feats = tr_word_pause.join(train_scores, on='id', how='left')
# test_feats = ts_word_pause.clone()

# train_feats = train_feats.collect().to_pandas()
# test_feats = test_feats.collect().to_pandas()
# test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, lgb_params_1)

In [5]:
# [('train_down_events_counts.pkl', 26),
#  ('train_vector_one_gram.pkl', 26),
#  ('train_create_pauses.pkl', 26),
#  ('train_essay_paragraphs.pkl', 26),
#  ('train_cursor_pos_acceleration.pkl', 11),
#  ('train_word_count_acceleration.pkl', 6),
#  ('train_p_burst_feats.pkl', 5),
#  ('train_r_burst_feats.pkl', 3),
#  ('train_events_counts_acceleration.pkl', 3),
#  ('train_essay_sentences.pkl', 3),
#  ('train_categorical_nunique.pkl', 3),
#  ('train_vector_two_gram.pkl', 2),
#  ('train_cursor_pos_rate_of_change.pkl', 2),
#  ('train_word_counts_rate_of_change.pkl', 2),
#  ('train_count_of_activities.pkl', 2),
#  ('train_action_time_by_activity.pkl', 1),
#  ('train_product_to_keys.pkl', 1),
#  ('train_IKI_based_fractals.pkl', 1),
#  ('train_events_counts_baseline.pkl', 1)]

In [6]:
# best_feature_set_1 - PARTIAL
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_word_pause, ts_word_pause = word_pauses(train_logs, test_logs)
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
#tr_p_burst, ts_p_burst = p_burst_feats(train_logs, test_logs, 2)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
#tr_event_acc, ts_event_acc = events_counts_acceleration(train_logs, test_logs)
# tr_nunique, ts_nunique = categorical_nunique(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
# tr_time_by_act, ts_time_by_act = action_time_by_activity(train_logs, test_logs)
# tr_cursor_pos_roc, ts_cursor_pos_roc = cursor_pos_rate_of_change(train_logs, test_logs)
# 
# tr_act_count, ts_act_count = count_of_activities(train_logs, test_logs)
# tr_get_keys, ts_get_keys = get_keys_pressed_per_second(train_logs.collect().to_pandas(), 
#                                                        test_logs.collect().to_pandas())
# 
# tr_input_change, ts_input_change = input_text_change_feats(train_logs, test_logs)
# tr_wc_roc, ts_wc_roc =  word_counts_rate_of_change(train_logs, test_logs)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_word_pause, on='id', how='left')
train_feats = train_feats.join(tr_word_count_acc, on='id', how='left')
#train_feats = train_feats.join(tr_p_burst, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
# train_feats = train_feats.join(tr_event_acc, on='id', how='left')
# train_feats = train_feats.join(tr_nunique, on='id', how='left')
# train_feats = train_feats.join(tr_wc_roc, on='id', how='left')
# train_feats = train_feats.join(tr_act_count, on='id', how='left')
# train_feats = train_feats.join(tr_cursor_pos_roc, on='id', how='left')

# train_feats = train_feats.join(tr_get_keys, on='id', how='left')
# train_feats = train_feats.join(tr_input_change, on='id', how='left')
# train_feats = train_feats.join(tr_time_by_act, on='id', how='left')

test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_word_pause, on='id', how='left')
test_feats = test_feats.join(ts_word_count_acc, on='id', how='left')
# test_feats = test_feats.join(ts_p_burst, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
# test_feats = test_feats.join(tr_event_acc, on='id', how='left')
# test_feats = test_feats.join(ts_nunique, on='id', how='left')
# test_feats = test_feats.join(ts_wc_roc, on='id', how='left')
# test_feats = test_feats.join(ts_act_count, on='id', how='left')
# test_feats = test_feats.join(ts_cursor_pos_roc, on='id', how='left')


# test_feats = test_feats.join(ts_get_keys, on='id', how='left')
# test_feats = test_feats.join(ts_input_change, on='id', how='left')
# test_feats = test_feats.join(ts_time_by_act, on='id', how='left')


train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.sort('id')
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')

train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')

train_feats = train_feats.merge(train_scores, on='id', how='left')
print(f'train feats shape {train_feats.shape}')


< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
word pauses
< word count acceleration >
< R-burst features >
< Count vectorize bi-grams >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
train feats shape (2471, 144)


In [7]:
from m5_sb_models import lgb_pipeline
lgb_params_1 = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 350,
    'verbosity': -1
    }

param = {'n_estimators': 1024,
        'learning_rate': 0.005,
        'metric': 'rmse',
        'force_col_wise': True,
        'verbosity': 0,}

# train_feats = train_feats[['id', 'score'] + feat_select]
# test_feats = test_feats[['id'] + feat_select]

print(f'train feats shape {train_feats.shape}')


train feats shape (2471, 144)


In [8]:
shuffle_preds = []

for i in range(15):
    train_feats = train_feats.sample(frac=1).reset_index(drop=True)
    test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, lgb_params_1)
    shuffle_preds.append(rmse)
    #test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, param)

np.mean(shuffle_preds)

Final RMSE over 50: 0.603617. Std 0.8270
RMSE by fold 0.603563. Std 0.0083
Final RMSE over 50: 0.601545. Std 0.8255
RMSE by fold 0.601473. Std 0.0091
Final RMSE over 50: 0.602574. Std 0.8276
RMSE by fold 0.602372. Std 0.0151
Final RMSE over 50: 0.601950. Std 0.8275
RMSE by fold 0.601897. Std 0.0080
Final RMSE over 50: 0.604154. Std 0.8274
RMSE by fold 0.603977. Std 0.0149
Final RMSE over 50: 0.602163. Std 0.8266
RMSE by fold 0.602061. Std 0.0108
Final RMSE over 50: 0.603044. Std 0.8271
RMSE by fold 0.602868. Std 0.0148
Final RMSE over 50: 0.602455. Std 0.8275
RMSE by fold 0.602338. Std 0.0117
Final RMSE over 50: 0.602454. Std 0.8279
RMSE by fold 0.602405. Std 0.0079
Final RMSE over 50: 0.602837. Std 0.8266
RMSE by fold 0.602635. Std 0.0155
Final RMSE over 50: 0.602299. Std 0.8262
RMSE by fold 0.602235. Std 0.0083
Final RMSE over 50: 0.602076. Std 0.8286
RMSE by fold 0.601953. Std 0.0120
Final RMSE over 50: 0.602816. Std 0.8265
RMSE by fold 0.602714. Std 0.0110
Final RMSE over 50: 0.602

0.6026620396931698

In [9]:
#  0.6046250116470197 - baseline
#  0.6034392851713597 - word_pauses
#  0.6031532703206033 - word_count_acc
#  0.6030545803940613 - r_burst
#  0.60246384515886   - bigrams
# sentences

- M4 + cursor_pos_acc + word_count_acc + bigrams <br />
0.6044949502480581


- M4 + cursor_pos_acc + word_count_acc  <br />
0.6047963953877546 <br />


In [10]:
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# oof_res['rmse'] = oof_res.apply(lambda x: np.sqrt((x['score']-x['preds'])**2))
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)
oof_res.groupby(['score'])['RMSE'].mean().reset_index().sort_values('RMSE', ascending=False)

Unnamed: 0,score,RMSE
0,0.5,1.528224
1,1.0,1.283878
11,6.0,1.132529
2,1.5,0.879977
10,5.5,0.738216
3,2.0,0.573584
9,5.0,0.474352
4,2.5,0.461494
5,3.0,0.460465
6,3.5,0.402527


In [11]:
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)
oof_res.groupby(['score'])['RMSE'].mean().reset_index().sort_values('RMSE', ascending=False)

Unnamed: 0,score,RMSE
0,0.5,1.528224
1,1.0,1.283878
11,6.0,1.132529
2,1.5,0.879977
10,5.5,0.738216
3,2.0,0.573584
9,5.0,0.474352
4,2.5,0.461494
5,3.0,0.460465
6,3.5,0.402527


In [12]:
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# oof_res['rmse'] = oof_res.apply(lambda x: np.sqrt((x['score']-x['preds'])**2))
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)

In [13]:
import lightgbm as lgb

weights = [1.0 for _ in range(len(y))]  # Default weights
weights[some_specific_index] = 1.5  # Higher weight for a specific instance

train_data = lgb.Dataset(X_train, label=y_train, weight=weights)
# Proceed with setting up parameters and training the model


NameError: name 'y' is not defined

train feats shape (2471, 151)
Final RMSE over 50: 0.605768. Std 0.8244
RMSE by fold 0.605635. Std 0.0128

reduced feats
Final RMSE over 50: 0.604472. Std 0.8245
RMSE by fold 0.604365. Std 0.0115

r burst only
Number of estimators: 350
Final RMSE over 50: 0.604057. Std 0.8244
RMSE by fold 0.603946. Std 0.0116

Number of estimators: 350
Final RMSE over 50: 0.604650. Std 0.8243
RMSE by fold 0.604527. Std 0.0122

In [None]:
test_ids = test_feats.id
y_pred = np.mean(test_preds, axis=0)

sub = pd.DataFrame({'id': test_ids, 'score': y_pred})
sub.to_csv('submission.csv', index=False)