In [1]:
import pandas as pd
import numpy as np
import polars as pl
from m3_model_params import lgb_params_1
from m4_feats_polars import *
from m5_sb_models import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# SECURITY: a hardcoded GitHub personal access token was removed from this line. Revoke that token and read credentials from an environment variable instead.
# Folder holding the competition CSVs, relative to the working directory.
# NOTE(review): the value already ends with '/', so the f-strings below yield a
# doubled slash ('...-quality//train_logs.csv').
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
# scan_csv only declares the sources; later cells call .collect() to read them.
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

# Disabled experiment; the original note says only "worsens"
# (presumably the validation score -- confirm).
# train_logs, test_logs = amend_event_id_order(train_logs, test_logs)     worsens

In [3]:
# Narrow working copy of the training log: only the columns used by the
# sentence-segmentation experiments, ordered by essay and then by event.
sentence_cols = ['id','event_id', 'down_time', 'down_event','action_time','cursor_position','activity']
logs = (
    train_logs
    .clone()
    .select(pl.col(sentence_cols))
    .sort('id', 'event_id')
)

In [4]:
# Signed change of the cursor position relative to the previous event of the
# same essay; the first event of an essay has no predecessor and gets 0.
cursor_step = pl.col('cursor_position').diff().over('id').fill_null(0)
logs = logs.with_columns(cursor_step.alias('cursor_pos_diff'))

In [5]:
# True where the cursor moved back by more than one position since the
# previous event (a single Backspace only moves it by -1). The column name
# treats such a jump as the writer moving to a different sentence.
# fill_null(False) keeps the flag non-null, as the when/otherwise form did.
jumped_back = (pl.col('cursor_pos_diff') < -1).fill_null(False)
logs = logs.with_columns(jumped_back.alias('cursor_moved_new_sentence'))

In [6]:
# Materialise 15 rows starting at row offset 345 to eyeball the new columns.
logs.slice(offset=345, length=15).collect()

id,event_id,down_time,down_event,action_time,cursor_position,activity,cursor_pos_diff,cursor_moved_new_sentence
str,i64,i64,str,i64,i64,str,i64,bool
"""001519c8""",346,303811,"""Space""",161,155,"""Input""",1,False
"""001519c8""",347,304059,"""q""",151,156,"""Input""",1,False
"""001519c8""",348,304137,"""q""",151,157,"""Input""",1,False
"""001519c8""",349,305590,""".""",113,158,"""Input""",1,False
"""001519c8""",350,305833,"""Space""",97,159,"""Input""",1,False
"""001519c8""",351,305996,"""Space""",148,160,"""Input""",1,False
"""001519c8""",352,312325,"""Backspace""",117,159,"""Remove/Cut""",-1,False
"""001519c8""",353,312518,"""Backspace""",91,158,"""Remove/Cut""",-1,False
"""001519c8""",354,312882,"""Backspace""",113,157,"""Remove/Cut""",-1,False
"""001519c8""",355,315524,""",""",197,158,"""Input""",1,False


In [7]:
# Keys that terminate a sentence.
logs = logs.with_columns(pl.col('down_event').is_in(['.','?','!']).alias('is_end_of_sentence_mark'))

# Running sentence index per essay: it increases after every terminator key or
# backwards cursor jump. The shift by one row keeps the terminator itself in
# the sentence it closes.
# The whole cum_sum -> shift -> fill_null chain is evaluated inside the 'id'
# window. Previously only cum_sum was windowed, so shift() ran over the full
# column and the first row of every essay inherited the previous essay's count.
logs = logs.with_columns(
    (pl.col('is_end_of_sentence_mark') | pl.col('cursor_moved_new_sentence'))
    .cum_sum()
    .shift(1)
    .fill_null(0)
    .over('id')
    .alias('sentence_number')
)

In [8]:
# Same 15-row window as before, now including the sentence columns.
logs.slice(offset=345, length=15).collect()

id,event_id,down_time,down_event,action_time,cursor_position,activity,cursor_pos_diff,cursor_moved_new_sentence,is_end_of_sentence_mark,sentence_number
str,i64,i64,str,i64,i64,str,i64,bool,bool,u32
"""001519c8""",346,303811,"""Space""",161,155,"""Input""",1,False,False,4
"""001519c8""",347,304059,"""q""",151,156,"""Input""",1,False,False,4
"""001519c8""",348,304137,"""q""",151,157,"""Input""",1,False,False,4
"""001519c8""",349,305590,""".""",113,158,"""Input""",1,False,True,4
"""001519c8""",350,305833,"""Space""",97,159,"""Input""",1,False,False,5
"""001519c8""",351,305996,"""Space""",148,160,"""Input""",1,False,False,5
"""001519c8""",352,312325,"""Backspace""",117,159,"""Remove/Cut""",-1,False,False,5
"""001519c8""",353,312518,"""Backspace""",91,158,"""Remove/Cut""",-1,False,False,5
"""001519c8""",354,312882,"""Backspace""",113,157,"""Remove/Cut""",-1,False,False,5
"""001519c8""",355,315524,""",""",197,158,"""Input""",1,False,False,5


In [9]:
# Net cursor displacement accumulated inside each (essay, sentence) segment.
sent_cursor_drift = pl.col('cursor_pos_diff').cum_sum().over('id','sentence_number')

# A running value of exactly -1 is flagged as "end of sentence removed".
hit_minus_one = (pl.col('calc_is_sent_removed') == -1).fill_null(False)

replace_end_of_sentence = (
    logs
    .with_columns(sent_cursor_drift.alias('calc_is_sent_removed'))
    .with_columns(hit_minus_one.alias('removed_end_of_sentence'))
)

In [10]:
# Collapse to one row per (essay, sentence): the lowest running displacement
# and whether that minimum is exactly -1.
# NOTE(review): this rebinds `replace_end_of_sentence` to the aggregated frame,
# so the per-event columns are gone afterwards and the cell cannot be re-run
# on its own output.
sentence_keys = ['id', 'sentence_number']
replace_end_of_sentence = (
    replace_end_of_sentence
    .group_by(sentence_keys)
    .agg(min_value=pl.col('calc_is_sent_removed').min())
    .with_columns(is_min_minus_one=(pl.col('min_value') == -1))
    .sort(sentence_keys)
)

In [12]:
# First 25 rows of the per-sentence aggregate.
replace_end_of_sentence.head(25).collect()

id,sentence_number,min_value,is_min_minus_one
str,u32,i64,bool
"""001519c8""",0,0,False
"""001519c8""",1,-91,False
"""001519c8""",2,-1,True
"""001519c8""",3,59,False
"""001519c8""",4,1,False
"""001519c8""",5,-1,True
"""001519c8""",6,-158,False
"""001519c8""",7,0,False
"""001519c8""",8,1,False
"""001519c8""",9,-66,False


In [11]:
# 15 rows of the per-sentence aggregate starting at row offset 345.
replace_end_of_sentence.slice(offset=345, length=15).collect()

id,sentence_number,min_value,is_min_minus_one
str,u32,i64,bool
"""00e1f05a""",13,-2,False
"""00e1f05a""",14,-4,False
"""00e1f05a""",15,1,False
"""00e1f05a""",16,-3,False
"""00e1f05a""",17,-4,False
"""00e1f05a""",18,1,False
"""00e1f05a""",19,1,False
"""00e1f05a""",20,1,False
"""00e1f05a""",21,-176,False
"""00e1f05a""",22,-40,False


In [11]:
# 15 rows starting at row offset 450.
# NOTE(review): the stored output of this cell shows per-event columns
# (event_id, calc_is_sent_removed, ...), not the aggregated columns built by the
# group_by cell -- presumably it was run before that cell; re-run to confirm.
replace_end_of_sentence.slice(offset=450, length=15).collect()

id,event_id,down_time,down_event,action_time,cursor_position,activity,cursor_pos_diff,cursor_moved_new_sentence,is_end_of_sentence_mark,sentence_number,calc_is_sent_removed,removed_end_of_sentence
str,i64,i64,str,i64,i64,str,i64,bool,bool,u32,i64,bool
"""001519c8""",451,424271,"""q""",248,229,"""Input""",1,False,False,8,197,False
"""001519c8""",452,424419,"""q""",169,230,"""Input""",1,False,False,8,198,False
"""001519c8""",453,424702,"""q""",93,231,"""Input""",1,False,False,8,199,False
"""001519c8""",454,424780,"""q""",152,232,"""Input""",1,False,False,8,200,False
"""001519c8""",455,424988,"""q""",140,233,"""Input""",1,False,False,8,201,False
"""001519c8""",456,425148,"""q""",99,234,"""Input""",1,False,False,8,202,False
"""001519c8""",457,425345,"""q""",103,235,"""Input""",1,False,False,8,203,False
"""001519c8""",458,425469,"""q""",157,236,"""Input""",1,False,False,8,204,False
"""001519c8""",459,425623,""".""",158,237,"""Input""",1,False,True,8,205,False
"""001519c8""",460,425748,"""Space""",151,238,"""Input""",1,False,False,9,1,False


In [None]:
# TODO: unfinished draft. The expression below was never completed -- it ends at
# a dangling `<`, which is a SyntaxError -- so it is kept as a comment only.
# NOTE(review): the intended threshold/aggregation is not recoverable from the
# notebook; finish or delete.
# replace_end_of_sentence.group_by('id','sentence_number').agg([pl.when('calc_is_sent_removed') <])

In [None]:
# Disabled: these four statements are a verbatim copy of the `logs = ...`
# statements in the next cell, but this copy runs before that cell assigns
# `end_of_sent`, so on a fresh kernel it fails with NameError (unless a star
# import happens to provide the name). Running both copies also applied the
# same transformations twice. Original statements kept for reference:
# logs = logs.with_columns(pl.col('sentence_number').shift(1).over('id').fill_null(0))
# logs = logs.with_columns(pl.col('down_event').is_in(end_of_sent).shift(1).fill_null(False).alias('is_new_sentence'))
# logs = logs.with_columns(pl.col('calc_is_sent_removed') < -1)
# logs = logs.with_columns((pl.col('down_event') == '.').cum_sum().over('id').alias('sentence_number'))

In [20]:
# Keys treated as sentence terminators.
end_of_sent = ['.','?','!']


# NOTE(review): `sentence_number` is overwritten by the last statement of this
# cell, so this shift appears to have no lasting effect -- confirm and drop.
logs = logs.with_columns(pl.col('sentence_number').shift(1).over('id').fill_null(0))
# True on the row that follows a terminator key.
# NOTE(review): this shift is not partitioned by `id`, so a terminator on the
# last row of one essay flags the first row of the next. The alias is
# `is_new_sentence`, while a later cell reads `is_new_sent` -- confirm the name.
logs = logs.with_columns(pl.col('down_event').is_in(end_of_sent).shift(1).fill_null(False).alias('is_new_sentence'))
# NOTE(review): in the visible cells `calc_is_sent_removed` is only added to
# `replace_end_of_sentence`, not to `logs` -- presumably this relied on an
# earlier kernel state; verify. The comparison has no alias, so it replaces the
# column of the same name with a boolean.
logs = logs.with_columns(pl.col('calc_is_sent_removed') < -1)
# Per-essay running count of '.' only (not the full `end_of_sent` list),
# counting the current row.
logs = logs.with_columns((pl.col('down_event') == '.').cum_sum().over('id').alias('sentence_number'))


In [27]:
# 15 rows of `logs` starting at row offset 450.
# NOTE(review): the stored output lists an `is_new_sent` column and an integer
# `calc_is_sent_removed`, which the visible cells above do not produce on
# `logs` -- presumably stale output from an earlier kernel state; re-run.
logs.slice(offset=450, length=15).collect()

id,event_id,down_event,action_time,cursor_position,activity,cursor_pos_diff,sentence_number,is_new_sent,calc_is_sent_removed
str,i64,str,i64,i64,str,i64,u32,bool,i64
"""001519c8""",451,"""q""",248,229,"""Input""",1,4,False,197
"""001519c8""",452,"""q""",169,230,"""Input""",1,4,False,198
"""001519c8""",453,"""q""",93,231,"""Input""",1,4,False,199
"""001519c8""",454,"""q""",152,232,"""Input""",1,4,False,200
"""001519c8""",455,"""q""",140,233,"""Input""",1,4,False,201
"""001519c8""",456,"""q""",99,234,"""Input""",1,4,False,202
"""001519c8""",457,"""q""",103,235,"""Input""",1,4,False,203
"""001519c8""",458,"""q""",157,236,"""Input""",1,4,False,204
"""001519c8""",459,""".""",158,237,"""Input""",1,4,False,205
"""001519c8""",460,"""Space""",151,238,"""Input""",1,5,True,1


In [26]:
# Preview only (the result is not assigned): where the running displacement is
# exactly -1, substitute the `is_new_sent` flag; otherwise keep the value.
# NOTE(review): `is_new_sent` is not created by any visible cell (the alias used
# above is `is_new_sentence`) -- confirm which name is intended.
logs.with_columns(
    pl.when(pl.col('calc_is_sent_removed') == -1)
    .then(pl.col('is_new_sent'))
    .otherwise(pl.col('calc_is_sent_removed'))
    .alias('calc_is_sent_removed')  # same name, so the original column is replaced
).slice(offset=345, length=15).collect()

id,event_id,down_event,action_time,cursor_position,activity,cursor_pos_diff,sentence_number,is_new_sent,calc_is_sent_removed
str,i64,str,i64,i64,str,i64,u32,bool,i64
"""001519c8""",346,"""Space""",161,155,"""Input""",1,1,False,50
"""001519c8""",347,"""q""",151,156,"""Input""",1,1,False,51
"""001519c8""",348,"""q""",151,157,"""Input""",1,1,False,52
"""001519c8""",349,""".""",113,158,"""Input""",1,1,False,53
"""001519c8""",350,"""Space""",97,159,"""Input""",1,2,True,1
"""001519c8""",351,"""Space""",148,160,"""Input""",1,2,False,2
"""001519c8""",352,"""Backspace""",117,159,"""Remove/Cut""",-1,2,False,1
"""001519c8""",353,"""Backspace""",91,158,"""Remove/Cut""",-1,2,False,0
"""001519c8""",354,"""Backspace""",113,157,"""Remove/Cut""",-1,2,False,0
"""001519c8""",355,""",""",197,158,"""Input""",1,2,False,0


In [5]:
# Idea: if the activity is NOT Remove/Cut, then it is another sentence.

In [None]:
def sentences_timing(train_logs, test_logs):
    """Build per-essay statistics of the typing time spent on each sentence.

    Events are ordered by ``id`` and ``event_id`` and split into sentences at
    the keys '.', '?' and '!'. A terminator is ignored when the events that
    follow it (up to the next terminator) contain, at some point, more
    Backspace presses than other key presses, which is taken to mean the mark
    was deleted again. ``action_time`` is then summed per sentence and
    summarised per essay.

    Args:
        train_logs: polars frame with at least the columns ``id``,
            ``event_id``, ``down_event`` and ``action_time`` (the notebook
            builds it with ``pl.scan_csv``).
        test_logs: same layout as ``train_logs``.

    Returns:
        Tuple ``(train_feats, test_feats)``; each has one row per ``id`` and
        the ``sent_timings*`` summary columns.
    """
    print("< sentences timing >")

    # One definition of "end of sentence" for every step. The original used
    # '.' alone for the segmentation but '.', '?', '!' for the sentence flag,
    # so '?'/'!' could never be detected as removed and were wrongly dropped
    # whenever a preceding '.' had been deleted.
    end_marks = ['.', '?', '!']
    is_mark = pl.col('down_event').is_in(end_marks)
    sentence_keys = ['id', 'sentence_number']

    def _per_essay_feats(data):
        # Summarise sentence typing times for one log frame.
        logs = (
            data.clone()
            .select(pl.col(['id','event_id','down_event','action_time']))
            .sort('id','event_id')
        )

        logs = logs.with_columns(
            # Terminator = 0, Backspace = -1, any other key = +1.
            pl.when(is_mark)
            .then(0)
            .when(pl.col('down_event') == "Backspace")
            .then(-1)
            .otherwise(1)
            .alias('removed_sent_interm'),
            is_mark.alias('is_sent'),
            # Provisional segments: each terminator opens a new one, per essay.
            is_mark.cum_sum().over('id').alias('sentence_number'),
        )

        # Running key balance inside each provisional segment.
        logs = logs.with_columns(
            pl.col('removed_sent_interm').cum_sum().over(sentence_keys)
        )

        # FIND REMOVED MARKS WITH CONSECUTIVE BACKSPACES > balance goes negative.
        # Computed as a window expression rather than group_by + join, so the
        # sorted row order that the cum_sum below depends on is never disturbed.
        mark_removed = (pl.col('removed_sent_interm') < 0).any().over(sentence_keys)
        logs = logs.with_columns(
            (pl.col('is_sent') & ~mark_removed).alias('is_sent')
        )

        # RE-ESTABLISH SENTENCE STARTING POINTS from the surviving marks. The
        # shift keeps each mark inside the sentence it ends. Shifting per
        # essay with a 0 fill avoids the null sentence number the first row
        # used to receive, which created an extra one-event "sentence".
        logs = logs.with_columns(
            pl.col('is_sent')
            .cum_sum()
            .shift(1)
            .fill_null(0)
            .over('id')
            .alias('sentence_number')
        )

        logs = logs.with_columns(
            sent_time = pl.cum_sum('action_time').over(sentence_keys).fill_null(0)
        )

        # Largest running total of a sentence = time spent on that sentence.
        sentences = logs.group_by(sentence_keys).agg(
            pl.max('sent_time')
            .alias('total_sentence_time')
        )

        # Output column names (including the 'timingse' spelling) are left
        # unchanged because stored feature sets may refer to them.
        return sentences.group_by(['id']).agg(
            sent_timings_mean = pl.col('total_sentence_time').mean(),
            sent_timings_sum = pl.col('total_sentence_time').sum(),
            sent_timings_std = pl.col('total_sentence_time').std(),
            sent_timings_max = pl.col('total_sentence_time').max(),
            sent_timings_min = pl.col('total_sentence_time').min(),
            sent_timings_median = pl.col('total_sentence_time').median(),
            sent_timingse_q1 = pl.col('total_sentence_time').quantile(0.25),
            sent_timingse_q3 = pl.col('total_sentence_time').quantile(0.75),
            sent_timingse_kurt = pl.col('total_sentence_time').kurtosis(),
            sent_timingse_skew = pl.col('total_sentence_time').skew(),
        )

    return _per_essay_feats(train_logs), _per_essay_feats(test_logs)

In [3]:
# between paragraph pauses ?
# backspace pauses
# edit pauses

In [4]:
# First five raw log rows, to recall the full column layout.
train_logs.head(5).collect()

id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
str,i64,i64,i64,i64,str,str,str,str,i64,i64
"""001519c8""",1,4526,4557,31,"""Nonproduction""","""Leftclick""","""Leftclick""","""NoChange""",0,0
"""001519c8""",2,4558,4962,404,"""Nonproduction""","""Leftclick""","""Leftclick""","""NoChange""",0,0
"""001519c8""",3,106571,106571,0,"""Nonproduction""","""Shift""","""Shift""","""NoChange""",0,0
"""001519c8""",4,106686,106777,91,"""Input""","""q""","""q""","""q""",1,1
"""001519c8""",5,107196,107323,127,"""Input""","""q""","""q""","""q""",2,1


In [None]:
# bad: sent_by_par, par_words, word_pauses_basic,

# neutral: cursor_pos_acceleration, countvectorize_two_one (slightly better than cursor_pos_acc + 4),  sent_timings, train_remove_word_pauses

# good: full sentences, full paragraphs, down_events, one_grams, create_pauses -  r-burst, nunique, words feats

In [None]:
# Best Feature Set: ['train_essay_sentences.pkl', 'train_create_pauses.pkl', 
# 'train_vector_one_gram.pkl', 'train_essay_paragraphs.pkl', 
# 'train_categorical_nunique.pkl', 'train_word_pauses.pkl', 
# 'train_events_counts_rate_of_change.pkl', 'train_word_counts_rate_of_change.pkl', 
# 'train_r_burst_feats.pkl', 'train_vector_two_gram.pkl', 'train_events_counts_acceleration.pkl']

# Best_Feature_Set = ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
# 'train_create_pauses.pkl', 'train_sentences_per_paragraph.pkl', 
# 'train_add_word_pauses_basic.pkl', 'train_cursor_pos_acceleration.pkl', 
# 'train_remove_word_pauses_adv.pkl', 'train_paragraph_length.pkl', 
# 'train_categorical_nunique.pkl', 'train_paragraph_words.pkl', 
# 'train_events_counts_time_based.pkl', 'train_vector_two_gram.pkl', 
# 'train_sentences_words.pkl', 'train_add_word_pauses_adv.pkl']

# Best_Feature_Set = ['train_down_events_counts_one.pkl', 'train_essay_par_length.pkl', 
# 'train_create_pauses.pkl', 'train_countvectorize_one_two.pkl', 
# 'train_down_events_counts_two.pkl', 'train_cursor_pos_acceleration_basic.pkl', 
# 'train_essay_sents_per_par_basic.pkl', 'train_countvectorize_two_one.pkl', 
# 'train_remove_word_pauses_adv.pkl', 'train_add_word_pauses_adv.pkl', 
# 'train_word_count_acceleration_adv.pkl']


# Best Feature Set: ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
# 'train_create_pauses.pkl', 'train_essay_paragraphs.pkl', 
# 'train_cursor_pos_acceleration.pkl', 'train_sentences_per_paragraph.pkl', 
# 'train_remove_word_pauses.pkl', 'train_vector_two_gram.pkl', 
# 'train_p_burst_feats.pkl', 'train_r_burst_feats.pkl']


# Best Feature Set: ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
#                    'train_create_pauses.pkl', 'train_sentences_per_paragraph.pkl', 
#                    'train_essay_paragraphs.pkl', 'train_cursor_pos_acceleration.pkl', 
#                    'train_p_burst_feats.pkl', 'train_remove_word_pauses.pkl', 
#                    'train_vector_two_gram.pkl', 'train_remove_words_time_spent.pkl', 
#                    'train_product_to_keys.pkl']


# Best Feature Set: ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
# 'train_create_pauses.pkl', 'train_essay_paragraphs.pkl', 
# 'train_word_count_acceleration.pkl', 'train_remove_words_time_spent.pkl']

# Best Feature Set SVR: ['train_essay_sentences.pkl', 'train_create_pauses.pkl', 
# 'train_essay_paragraphs.pkl', 'train_word_wait_10.pkl', 
# 'train_categorical_nunique.pkl', 'train_remove_words_time_spent.pkl', 
# 'train_essay_words.pkl', 'train_word_wait_1.pkl', 
# 'train_add_word_pauses.pkl', 'train_word_wait_25.pkl']

## UNFINISHED XGBOOST
# ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 
# 'train_create_pauses.pkl', 'train_essay_paragraphs.pkl', 
# 'train_essay_sentences.pkl', 'train_word_counts_rate_of_change.pkl', 
# 'train_vector_two_gram.pkl', 'train_text_changes_counts.pkl', 
# 'train_remove_word_pauses.pkl']


In [None]:
# best_feature_set_1 - PARTIAL
# Essay frames are built from the fully materialised (pandas) logs.
train_essays = get_essay_df(train_logs.collect().to_pandas())
test_essays = get_essay_df(test_logs.collect().to_pandas())

# Active feature blocks; each helper returns a (train, test) pair.
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_word_pause, ts_word_pause = word_pauses(train_logs, test_logs)
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)

# Feature blocks currently switched off (builder -> variable prefix):
#   p_burst_feats(train_logs, test_logs, 2)   -> p_burst
#   events_counts_acceleration                -> event_acc
#   categorical_nunique                       -> nunique
#   action_time_by_activity                   -> time_by_act
#   cursor_pos_rate_of_change                 -> cursor_pos_roc
#   count_of_activities                       -> act_count
#   get_keys_pressed_per_second (pandas logs) -> get_keys
#   input_text_change_feats                   -> input_change
#   word_counts_rate_of_change                -> wc_roc

# Left-join every block onto the down-event counts, in a fixed order.
train_feats = tr_down_events_counts
for feat_block in (
    tr_vect_one,
    tr_pauses,
    tr_cursor_pos_acc,
    tr_word_pause,
    tr_word_count_acc,
    tr_r_burst,
    tr_vect_two,
):
    train_feats = train_feats.join(feat_block, on='id', how='left')

test_feats = ts_down_events_counts
for feat_block in (
    ts_vect_one,
    ts_pauses,
    ts_cursor_pos_acc,
    ts_word_pause,
    ts_word_count_acc,
    ts_r_burst,
    ts_vect_two,
):
    test_feats = test_feats.join(feat_block, on='id', how='left')

# From here on everything is pandas. Note that `train_logs`, `test_logs` and
# `train_scores` are rebound from polars frames to pandas frames.
train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.sort('id')
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

# Essay-level paragraph features first, then sentence features.
for essay_feats in (parag_feats, sent_feats):
    train_feats = train_feats.merge(essay_feats(train_essays), on='id', how='left')
    test_feats = test_feats.merge(essay_feats(test_essays), on='id', how='left')

# Attach the target to the training table.
train_feats = train_feats.merge(train_scores, on='id', how='left')
print(f'train feats shape {train_feats.shape}')


In [None]:
from m5_sb_models import lgb_pipeline

# LightGBM settings used by the shuffle experiment below. This assignment
# replaces the `lgb_params_1` imported from m3_model_params in the first cell.
lgb_params_1 = dict(
    boosting_type='gbdt',
    metric='rmse',
    reg_alpha=0.0031,
    reg_lambda=0.001,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.75,
    learning_rate=0.017,
    num_leaves=19,
    min_child_samples=46,
    n_estimators=350,
    verbosity=-1,
)

# Alternative configuration; only referenced from a commented-out call.
param = dict(
    n_estimators=1024,
    learning_rate=0.005,
    metric='rmse',
    force_col_wise=True,
    verbosity=0,
)

# Optional feature selection (disabled):
# train_feats = train_feats[['id', 'score'] + feat_select]
# test_feats = test_feats[['id'] + feat_select]

print(f'train feats shape {train_feats.shape}')


In [None]:
# Run the LightGBM pipeline on 15 different row orders of the training table
# and average the resulting RMSE values.
shuffle_preds = []  # one RMSE per run (the name is historical)

for i in range(15):
    # Seed every shuffle with the run index: the 15 orderings still differ from
    # one another but are identical on every re-run. The shuffled copy gets its
    # own name so `train_feats` is not overwritten and the cell can be re-run.
    # Any randomness inside lgb_pipeline itself is not controlled here.
    shuffled_feats = train_feats.sample(frac=1, random_state=i).reset_index(drop=True)
    test_preds, oof_preds, rmse, model = lgb_pipeline(shuffled_feats, test_feats, lgb_params_1)
    shuffle_preds.append(rmse)
    #test_preds, oof_preds, rmse, model = lgb_pipeline(shuffled_feats, test_feats, param)

np.mean(shuffle_preds)

In [None]:
# Average the out-of-fold predictions per essay (an essay may appear more than
# once in `oof_preds`).
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# Per-essay error: for a single observation sqrt(err**2) is the absolute error.
oof_res['RMSE'] = (oof_res['score'] - oof_res['preds']).abs()
# RMSE per true score = sqrt(mean(squared error)). Averaging the per-essay
# absolute errors, as before, gives the MAE rather than the RMSE.
(
    oof_res.assign(sq_err=oof_res['RMSE'] ** 2)
    .groupby('score')['sq_err']
    .mean()
    .pow(0.5)
    .rename('RMSE')
    .reset_index()
    .sort_values('RMSE', ascending=False)
)

In [None]:
# NOTE(review): this cell repeats the analysis of the previous cell; keep one.
# Average the out-of-fold predictions per essay.
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# Per-essay error: for a single observation sqrt(err**2) is the absolute error.
oof_res['RMSE'] = (oof_res['score'] - oof_res['preds']).abs()
# RMSE per true score = sqrt(mean(squared error)); the mean of per-essay
# absolute errors would be the MAE instead.
(
    oof_res.assign(sq_err=oof_res['RMSE'] ** 2)
    .groupby('score')['sq_err']
    .mean()
    .pow(0.5)
    .rename('RMSE')
    .reset_index()
    .sort_values('RMSE', ascending=False)
)

In [None]:
oof_res = oof_preds.groupby(['id', 'score'])['preds'].mean().reset_index()
# oof_res['rmse'] = oof_res.apply(lambda x: np.sqrt((x['score']-x['preds'])**2))
oof_res['RMSE'] = np.sqrt((oof_res['score']-oof_res['preds'])**2)