In [1]:
import pandas as pd
import numpy as np
from m4_feats_polars import *
from m5_sb_models import lgb_pipeline
import polars as pl
import os, warnings
warnings.filterwarnings('ignore')

In [2]:
FEAT_STORE = 'feature_store'
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')
train_scores = train_scores.collect().to_pandas()

param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'force_col_wise': True,
         'verbosity': 0,}

feat_list = os.listdir(FEAT_STORE)
list_train_feats = [feat for feat in feat_list if feat.startswith('train')]
#test_feats = [feat for feat in feat_list if feat.startswith('test')]        

In [None]:
train_essays           = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())
 
tr_feats, ts_feats = product_to_keys([train_logs.collect().to_pandas(),
                                      test_logs.collect().to_pandas()], 
                                      [train_essays,test_essays])
tr_feats, ts_feats = get_keys_pressed_per_second(train_logs.collect().to_pandas(), test_logs.collect().to_pandas())


In [3]:
train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_essays = get_essay_df(train_logs)
test_essays = get_essay_df(test_logs)
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise_bigrams'
tr, ts = countvectorize_two_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

            id                                              essay
0     001519c8  qqqqqqqqq qq qqqqq qq qqqq qqqq.  qqqqqq qqq q...
1     0022f953  qqqq qq qqqqqqqqqqq ? qq qq qqq qqq qqq, qqqqq...
2     0042269b  qqqqqqqqqqq qq qqqqq qqqqqqqqq qq qqqqqqqqqqq ...
3     0059420b  qq qqqqqqq qqqqqq qqqqqqqqqqqqq qqqq q qqqq qq...
4     0075873a  qqqqqqqqqqq qq qqq qqqqq qq qqqqqqqqqq, qqq qq...
...        ...                                                ...
2466  ffb8c745       qq qqqqq'q qqqqqqq, qqq'q qqqqq q qqqq qq...
2467  ffbef7e5  qqqq qqqqqq qqqqq qq qqqqq qqqqq, qq qq q qqqq...
2468  ffccd6fd  qqqqqq qqqq q qqqqqqq qqqqqqqqq qq qqqqqq qqqq...
2469  ffec5b38  qqqqqqqqqq qqqqqqq, qqqqqq qqqq qqqqq qqqq qqq...
2470  fff05981  qq qqqq qqqqqqq qqqqqqqq qq qqqqqqqqqqq qq qq ...

[2471 rows x 2 columns]
         id essay
0  0000aaaa      
1  2222bbbb    qq
2  4444cccc    q 
< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays para

In [3]:
# Initialize empty DataFrames
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
used_features = set()
added_feats = []
improved = True
results = pd.DataFrame()

while improved:
    print(f'Starting round {round} of training feats')
    improved = False
    

    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]

        # Skip if this feature set has already been used
        if tr_feats_cand in used_features:
            continue

        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        # Keep track of columns before adding new features
        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):
            print(f'feats not empty - merging with existing')
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            print(f'feats empty - setting up train_feats')
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        # Remove recently added features
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

        # Add the current feature set to the used features
        used_features.add(tr_feats_cand)
    print(results)
    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]
    print(f'Results: {results}')

    if  top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        used_features = set()
        print(f'Results improved!')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
        list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
        print(f'list_train_feats: {list_train_feats}')
        print(f'added_feats_list: {added_feats}')
        print(f'best feat: {top_feat}')

    else:
        print('Training Over!')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")

Starting round 0 of training feats
feats empty - setting up train_feats
Training... train_event_id_stats.pkl
Train feats cols Index(['id', 'eid_stats_sum', 'eid_stats_mean', 'eid_stats_std',
       'eid_stats_max', 'eid_stats_q1', 'eid_stats_median', 'eid_stats_q3',
       'eid_stats_kurt', 'eid_stats_skew', 'score'],
      dtype='object')
Final RMSE over 50: 0.767516. Std 0.7233
RMSE by fold 0.767447. Std 0.0103
Train feats  before merging Index(['id', 'score'], dtype='object')
feats not empty - merging with existing
Training... train_categorical_nunique.pkl
Train feats cols Index(['id', 'score', 'activity_nunique', 'down_event_nunique',
       'text_change_nunique'],
      dtype='object')
Final RMSE over 50: 0.906628. Std 0.5417
RMSE by fold 0.906537. Std 0.0123
Train feats  before merging Index(['id', 'score'], dtype='object')
feats not empty - merging with existing
Training... train_essay_sentences.pkl
Train feats cols Index(['id', 'score', 'sent_count', 'sent_len_mean', 'sent_len_

In [6]:
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()
best_rmse = 0.8
round = 0

for k in range(len(list_train_feats)):
    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]
        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        if not(train_feats.empty & test_feats.empty):
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
            assert train_feats.shape[1]==test_feats.shape[1]+1

        else:
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        tr_cols = train_feats.columns
        ts_cols = test_feats.columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name':tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)
        #train_feats, test_feats = pd.DataFrame(), pd.DataFrame()

    results = results.sort_values('RMSE', ascending=True)
    best_feat = results.loc[round].feat_name
    best_rmse = results.loc[round].RMSE
    list_train_feats = list(results.feat_name)
    round += 1

Training... train_word_counts_rate_of_change.pkl


In [5]:
list_train_feats

['train_word_counts_rate_of_change.pkl',
 'train_count_of_activities.pkl',
 'train_time_based_cursor_pos_stats.pkl']

In [8]:
results

Unnamed: 0,feat_name,RMSE
0,train_time_based_cursor_pos_stats.pkl,0.6914
0,train_r_burst.pkl,0.709241
0,train_word_counts_rate_of_change.pkl,0.711108
0,train_time_based_cursor_pos_stats.pkl,0.732918
0,train_word_count_stats.pkl,0.734315
0,train_r_burst.pkl,0.949428


In [5]:
tr_feats

Unnamed: 0,id,cursor_pos_mean,cursor_pos_std,cursor_pos_max,cursor_pos_q1,cursor_pos_median,cursor_pos_q3,cursor_pos_kurt,cursor_pos_skew
0,110982e4,1262.425000,561.058884,1804,925.0,1566.0,1722.0,-0.664586,-0.869034
1,fe2065cc,613.865385,355.768763,1245,344.0,621.5,881.0,-0.982589,0.179027
2,9ce1a51f,1106.324324,426.571347,1708,946.0,1127.0,1426.0,-0.049410,-0.823053
3,8387640c,922.872727,483.943257,1621,492.0,1083.0,1292.0,-1.078332,-0.430539
4,7ba082b4,2041.542373,1253.973675,3960,904.0,2013.0,3211.0,-1.348009,0.024787
...,...,...,...,...,...,...,...,...,...
2466,035f09fc,1036.129630,626.471642,2174,542.0,1029.0,1537.0,-1.037647,0.105922
2467,d68f5377,587.888889,361.275762,1300,281.0,586.5,903.0,-1.047864,0.052144
2468,5461650e,1013.368421,697.179948,2205,371.0,889.0,1733.0,-1.432414,0.153344
2469,21707506,778.227273,445.534060,1366,335.0,782.0,1303.0,-1.508519,-0.007312


In [6]:
train_feats

Unnamed: 0,id,roc_zro_count,pos_change_count,neg_change_count,roc_count,roc_mean,roc_std,roc_sum,roc_max,roc_q1,...,roc_skew,score,cursor_pos_mean,cursor_pos_std,cursor_pos_max,cursor_pos_q1,cursor_pos_median,cursor_pos_q3,cursor_pos_kurt,cursor_pos_skew
0,1c540433,103,115,5,224,0.197309,0.243458,44.0,1.0,0.0,...,0.866687,3.0,579.277778,407.281062,1208,220.0,571.0,1001.0,-1.359339,0.091134
1,b4e5001b,109,164,2,276,0.160727,0.199780,44.2,0.6,0.0,...,-1.360474,2.5,755.673913,427.909131,1441,368.0,739.5,1121.0,-1.184749,-0.001904
2,3c837aa3,215,116,1,333,0.128916,0.204499,42.8,1.0,0.0,...,1.465838,2.0,603.725000,391.308455,1164,242.0,571.0,1015.0,-1.425086,0.116625
3,bc3da4ab,187,172,10,370,0.260163,0.374109,96.0,1.4,0.0,...,0.763727,4.5,1116.937500,891.496862,2711,264.0,1003.5,1953.0,-1.288056,0.242083
4,497c5987,165,157,36,359,0.193855,0.629851,69.4,4.4,0.0,...,-0.556865,3.0,851.933333,456.773722,1766,567.0,720.0,1155.0,-0.682669,0.380586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,a96dc180,260,98,1,360,0.324791,0.612918,116.6,2.6,0.0,...,1.640228,5.0,2090.483871,984.694229,3540,1553.0,2167.0,3096.0,-0.817958,-0.500164
2467,607f216d,157,183,12,353,0.288068,0.420927,101.4,1.8,0.0,...,0.362688,5.0,1684.074074,1042.431444,3098,630.0,1824.0,2609.0,-1.355401,-0.182809
2468,f3b6b53e,118,212,2,333,0.432530,0.452615,143.6,2.0,0.0,...,0.873850,5.5,2090.654545,1306.115391,4109,893.0,2216.0,3329.0,-1.323111,-0.138195
2469,f01e26bb,162,201,6,370,0.308401,0.389481,113.8,1.8,0.0,...,1.053007,5.5,1733.836066,988.643788,3332,821.0,1775.0,2571.0,-1.263554,-0.037653


In [None]:
file_name = 'input_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts'
tr, ts = events_counts(train_logs, test_logs,)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = rate_of_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_stats'
tr, ts = words_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'event_id_stats'
tr, ts = events_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_stats'
tr, ts = action_time_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_stats'
tr, ts = cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'time_based_cursor_pos_stats'
tr, ts = time_based_cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = rate_of_change_events(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
# file_name = 'word_count_acceleration'
# tr, ts = wc_acceleration_feats(train_logs, test_logs)
# tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_logs, test_logs)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise_bigrams'
tr, ts = countvectorize_two_one(train_logs, test_logs)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >
< Action time by activities >
< Events counts features >
< Word counts rate of change features >
< Categorical # unique values features >
< word changes stats >
< Count of events feats >
< Cursor changes features >
< Cursor changes based on time >
< P-burst features >
< R-burst features >
< event_id rate of change >
< Idle time features >


100%|██████████| 2471/2471 [00:04<00:00, 610.13it/s]
100%|██████████| 3/3 [00:00<00:00, 3084.80it/s]
100%|██████████| 2471/2471 [00:03<00:00, 640.53it/s]
100%|██████████| 3/3 [00:00<00:00, 3256.45it/s]


< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >


In [None]:
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Action time by activities >
