In [1]:
import pandas as pd
import numpy as np
from m4_feats_polars import *
from m5_sb_models import lgb_pipeline
import polars as pl
import os

In [2]:
FEAT_STORE = 'feature_store'
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'force_col_wise': True,
         'verbosity': 0,}

feat_list = os.listdir(FEAT_STORE)
list_train_feats = [feat for feat in feat_list if feat.startswith('train')]
#test_feats = [feat for feat in feat_list if feat.startswith('test')]        

In [3]:
train_scores = train_scores.collect().to_pandas()
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()

for i in range(len(list_train_feats)):
    tr_feats_cand = list_train_feats[i]
    ts_feats_cand = tr_feats_cand.replace('train', 'test')
    tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
    ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')
    print(train_feats.empty)

    if not(train_feats.empty & test_feats.empty):
        train_feats = train_feats.merge(tr_feats, on='id', how='left')
        test_feats = test_feats.merge(ts_feats, on='id', how='left')
        assert train_feats.shape[1]==test_feats.shape[1]

        train_feats = train_feats.merge(train_scores, on='id', how='left')
    else:
        train_feats = tr_feats
        test_feats = ts_feats
        train_feats = train_feats.merge(train_scores, on='id', how='left')

    test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
    results = pd.concat([results, pd.DataFrame({'feat_name':ts_feats_cand, 'RMSE': final_rmse})])
    break

True




Final RMSE over 50: 0.768550. Std 0.7235
RMSE by fold 0.768344. Std 0.0174


In [4]:
tr_feats

Unnamed: 0,id,eid_stats_sum,eid_stats_mean,eid_stats_std,eid_stats_max,eid_stats_q1,eid_stats_median,eid_stats_q3,eid_stats_kurt,eid_stats_skew
0,adc30d84,4200,11.634349,13.978727,61,1.0,2.0,24.0,-0.397275,0.991375
1,b4988628,5525,18.055556,13.458139,58,3.0,21.0,25.0,-0.718968,0.398324
2,644ff57d,2581,6.738903,7.411589,43,1.0,3.0,13.0,1.877917,1.397889
3,87e2154e,3388,8.534005,7.345512,53,3.0,7.0,12.0,4.078040,1.555395
4,d8a426e4,7073,19.756983,14.613015,59,6.0,18.5,33.0,-1.175837,0.287185
...,...,...,...,...,...,...,...,...,...,...
2466,8c58e2db,2600,7.344633,5.140506,20,2.0,7.0,11.0,-1.035932,0.332132
2467,f802dd9a,3370,9.492958,7.354758,29,3.0,7.0,15.0,-0.940772,0.597301
2468,0ad15907,2602,7.477011,5.508918,36,3.0,7.0,12.0,1.089431,0.879204
2469,1ebb9b74,1877,5.086721,5.768926,20,1.0,2.0,10.0,-0.362831,1.111029


In [5]:

file_name = 'input_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts'
tr, ts = events_counts(train_logs, test_logs,)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = rate_of_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_stats'
tr, ts = words_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'event_id_stats'
tr, ts = events_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_stats'
tr, ts = action_time_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_stats'
tr, ts = cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'time_based_cursor_pos_stats'
tr, ts = time_based_cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = rate_of_change_events(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
# file_name = 'word_count_acceleration'
# tr, ts = wc_acceleration_feats(train_logs, test_logs)
# tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_logs, test_logs)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
train_essays = get_essay_df(train_logs)
test_essays = get_essay_df(test_logs)
file_name = 'count_vectorise'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >


KeyboardInterrupt: 