In [1]:
import pandas as pd
import numpy as np
from m4_feats_polars import *
from m5_sb_models import xgb_pipeline, catboost_pipeline, svr_pipeline, lgb_pipeline
from m5_nn_models import automl_pipeline
import polars as pl
import os, warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

In [2]:
FEAT_STORE = 'feature_store'

data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')
train_scores = train_scores.collect().to_pandas()

lgb_params = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 250,
    'verbosity': -1
    }

xgb_param={
'reg_alpha': 0.00087,
'reg_lambda': 2.5428,
'colsample_bynode': 0.78390,
'subsample': 0.89942, 
'eta': 0.04730, 
'max_depth': 3, 
'n_estimators': 350,
'eval_metric': 'rmse',
'device': 'cuda',
}

catboost_params = {
    'iterations': 1000, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'od_wait': 20, 
    'od_type': 'Iter', 
    'verbose': False, 
    'metric_period': 50, 
    'eval_metric': 'RMSE', 
    'bagging_temperature': 0.2
}

In [3]:
file_name = 'words_rem_events_ratio'
tr, ts = words_rem_events_ratio(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_pauses_ratios'
tr, ts = word_pauses_ratios(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'activity_time_on_down_time' 
tr, ts = activity_time_on_down_time(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'sentences_timings'
tr, ts = sentences_timings(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'words_p_burst'
tr, ts = words_p_burst(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'words_duration_stats'
tr, ts = words_duration_stats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'remove_words_time_spent'
tr, ts = remove_words_time_spent(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'text_changes_counts'
tr, ts = text_changes_counts(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'punctuations'
tr, ts = punctuations(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'add_word_pauses'
tr, ts = add_word_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'remove_word_pauses'
tr, ts = remove_word_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_timings'
tr, ts = word_timings(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'input_text_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'down_events_counts'
tr, ts = down_events_counts(train_logs, test_logs, n_events=20)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = word_counts_rate_of_change(train_logs, test_logs, time_agg=5)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_acceleration'
tr, ts = word_count_acceleration(train_logs, test_logs, time_agg=8)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = events_counts_rate_of_change(train_logs, test_logs, time_agg=10)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_acceleration'
tr, ts = events_counts_acceleration(train_logs, test_logs, time_agg=4)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_rate_of_change'
tr, ts = cursor_pos_rate_of_change(train_logs, test_logs, time_agg=10)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_acceleration'
tr, ts = cursor_pos_acceleration(train_logs, test_logs, time_agg=6)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst_feats'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst_feats'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'create_pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'get_keys_pressed_per_second'
tr, ts = get_keys_pressed_per_second(train_logs.collect().to_pandas(), test_logs.collect().to_pandas())
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

for shift in [1]:
    file_name = f'word_wait_{shift}'
    tr,ts=word_wait_shift(train_logs, test_logs, shift)
    tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
    ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_essays = get_essay_df(train_logs.collect().to_pandas())
test_essays = get_essay_df(test_logs.collect().to_pandas())

file_name = 'vector_one_gram'
tr, ts = countvectorize_one_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'vector_two_gram'
tr, ts = countvectorize_two_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

file_name = 'product_to_keys'
tr, ts = product_to_keys([train_logs.collect().to_pandas(), test_logs.collect().to_pandas()],
                          [train_essays, test_essays])
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'sent_long_word_count'
tr = sent_long_word_count(train_essays)
ts = sent_long_word_count(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_long_word_count'
tr = word_long_word_count(train_essays)
ts = word_long_word_count(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

# file_name = 'sentences_per_paragraph'
# tr = essay_sents_per_par(train_essays)
# ts = essay_sents_per_par(test_essays)
# tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< words_rem_events_ratio >
< added words pauses basic
< removed words pauses basic
< Action time baseline stats >
< activity_time_on_down_time >
< sentences timings >
< words_p_burst >
< words_duration_stats >
< remove_words_time_spent >
< text chaanges counts features >
< punctuations features >
< added words pauses basic
< removed words pauses basic
< word timings advanced
< Input text change features >
< Action time by activities >
< Events counts features >
< Categorical # unique values features >
< Word counts rate of change features >
< word count acceleration >
< event_id rate of change >
< events counts acceleration >
< event_id rate of change >
< cursor position acceleration >
< P-burst features >
< R-burst features >
< Idle time features >
< get keys pressed per second >
< word_wait_shift >
< Count vectorize one-grams >
< Count vectorize bi-grams >
< product to keys >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >

In [4]:
feat_list = os.listdir(FEAT_STORE)
#train_scores = train_scores.collect().to_pandas()
list_train_feats = [feat for feat in feat_list if feat.startswith('train_')]

best_feats = []
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
added_feats = []
improved = True
results = pd.DataFrame()
list_train_feats = [feat for feat in feat_list if feat.startswith('train_')]

while improved:
    print(f'Starting round {round} of training feats')
    improved = False
    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]

    for tr_feats_cand in list_train_feats:
        ts_feats_cand = tr_feats_cand.replace('train', 'test')

        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):

            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            #print(f'feats empty - setting up train_feats')
            train_feats = tr_feats.copy()
            test_feats = ts_feats.copy()
            train_feats = train_feats.merge(train_scores, on='id', how='left')
        
        # print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        mean_rmse = []

        for i in range(1):
            train_feats = train_feats.sample(frac=1).reset_index(drop=True)
            test_feats = test_feats[train_feats.drop('score',axis=1).columns]
            valid_preds, test_preds, final_rmse = automl_pipeline(train_feats, test_feats)
            mean_rmse.append(final_rmse)

        print(f'Training... {tr_feats_cand}. Score: {np.mean(mean_rmse):.4f}')
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': np.mean(mean_rmse)}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]

    if top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        print(f'Results improved!: Selected feat {top_feat} - score {top_score:.4f}')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
    else:
        print('Training Over!')

    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
    print(f'list_train_feats: {list_train_feats}')
    print(f'added_feats_list: {added_feats}')
    print(f'best feat: {top_feat}')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")
best_feats.append(added_feats)
added_feats = []

Starting round 0 of training feats
Training... train_categorical_nunique.pkl. Score: 0.8994
Training... train_essay_sentences.pkl. Score: 0.6606
Training... train_r_burst_feats.pkl. Score: 0.9283
Training... train_action_time_by_activity.pkl. Score: 0.8167
Training... train_add_word_pauses.pkl. Score: 0.7278
Training... train_word_wait_1.pkl. Score: 0.6977
Training... train_cursor_pos_acceleration.pkl. Score: 0.8036
Training... train_word_counts_rate_of_change.pkl. Score: 0.7006
Training... train_word_timings.pkl. Score: 0.7263
Training... train_create_pauses.pkl. Score: 0.7265
Training... train_get_keys_pressed_per_second.pkl. Score: 0.7909
Training... train_activity_time_on_down_time.pkl. Score: 0.7699
Training... train_punctuations.pkl. Score: 0.9785
Training... train_events_counts_acceleration.pkl. Score: 0.7574
Training... train_sentences_timings.pkl. Score: 1.0024
Training... train_events_counts_rate_of_change.pkl. Score: 0.7783
Training... train_input_text_change.pkl. Score: 0.7

In [3]:
# 1. sentences
# 2. cursor_pos_acc
# 3. p_burst
# 4. r_burst
# 5. bi-grams
# 6. nunique

# PANDAS FEATS
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_word_c_acc, ts_word_c_acc = word_count_acceleration(train_logs, test_logs)
tr_rem_words_time_spent, ts_rem_words_time_spent = remove_words_time_spent(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs, 1)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)

# tr_wc_roc, ts_wc_roc = word_counts_rate_of_change(train_logs, test_logs)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_word_c_acc, on='id', how='left')
train_feats = train_feats.join(tr_rem_words_time_spent, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_nuni, on='id', how='left')
train_feats = train_feats.join(tr_remove_pause, on='id', how='left')
train_feats = train_feats.join(tr_word_wait, on='id', how='left')
train_feats = train_feats.join(tr_e_counts_roc, on='id', how='left')
# train_feats = train_feats.join(tr_wc_roc, on='id', how='left')


test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_word_c_acc, on='id', how='left')
test_feats = test_feats.join(ts_rem_words_time_spent, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_nuni, on='id', how='left')
test_feats = test_feats.join(ts_remove_pause, on='id', how='left')
test_feats = test_feats.join(ts_word_wait, on='id', how='left')
test_feats = test_feats.join(ts_e_counts_roc, on='id', how='left')
# test_feats = test_feats.join(ts_wc_roc, on='id', how='left')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(word_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')

train_feats           = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< word count acceleration >
< remove_words_time_spent >
< Count vectorize bi-grams >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< removed words pauses basic
< word_wait_shift >
< event_id rate of change >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 185)


In [6]:
import multiprocessing
estimators = [150,200,225,250]
cpu_count_mp = multiprocessing.cpu_count()

for est in estimators:

    lgb_params = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': est,
    'num_threads': cpu_count_mp,
    'verbosity': -1
    }

    xgb_param={
    'reg_alpha': 0.00087,
    'reg_lambda': 2.5428,
    'colsample_bynode': 0.78390,
    'subsample': 0.89942, 
    'eta': 0.04730, 
    'max_depth': 3, 
    'n_estimators': est,
    'num_threads': cpu_count_mp,
    'eval_metric': 'rmse',
    }

    catboost_params = {
    'iterations': est, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'od_wait': 20, 
    'od_type': 'Iter', 
    'verbose': False, 
    'metric_period': 50, 
    'eval_metric': 'RMSE', 
    'thread_count': cpu_count_mp,
    'bagging_temperature': 0.2
}

    print(f'Estimators {est}')
    test_preds_cat, valid_preds_cat, final_rmse_cat, model_cat = catboost_pipeline(train_feats, test_feats, catboost_params)
    print(f'Catboost: {final_rmse_cat:.4f}')
    test_preds_lgbm, valid_preds_lgbm, final_rmse_lgbm, model_lgbm = lgb_pipeline(train_feats, test_feats, lgb_params)
    print(f'LGBM: {final_rmse_lgbm:.4f}')
    test_preds_xgb, valid_preds_xgb, final_rmse_xgb, model_xgb = xgb_pipeline(train_feats, test_feats, xgb_param)
    print(f'XGB: {final_rmse_xgb:.4f}')


Estimators 150
Catboost: 0.6062
LGBM: 0.6182
XGB: 0.6093
Estimators 200
Catboost: 0.6066
LGBM: 0.6098
XGB: 0.6090
Estimators 225
Catboost: 0.6070
LGBM: 0.6077
XGB: 0.6088
Estimators 250
Catboost: 0.6073


KeyboardInterrupt: 

In [None]:
best_feats = []
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
added_feats = []
improved = True
results = pd.DataFrame()
train_feats = train_feats.sample(frac=1).reset_index(drop=True)
list_train_feats = [feat for feat in feat_list if feat.startswith('train_')]

while improved:
    print(f'Starting round {round} of training feats')
    improved = False
    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]

    for tr_feats_cand in list_train_feats:
        ts_feats_cand = tr_feats_cand.replace('train', 'test')

        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):

            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            #print(f'feats empty - setting up train_feats')
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        
        # print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        train_feats = train_feats.sample(frac=1).reset_index(drop=True)
        mean_rmse = []

        for i in range(15):
            train_feats = train_feats.sample(frac=1).reset_index(drop=True)
            test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
            mean_rmse.append(final_rmse)

        print(f'Training... {tr_feats_cand}. Score: {np.mean(mean_rmse):.4f}')
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': np.mean(mean_rmse)}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]

    if top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        print(f'Results improved!: Selected feat {top_feat} - score {top_score:.4f}')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
    else:
        print('Training Over!')

    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
    print(f'list_train_feats: {list_train_feats}')
    print(f'added_feats_list: {added_feats}')
    print(f'best feat: {top_feat}')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")
best_feats.append(added_feats)
added_feats = []

Starting round 0 of training feats
Training... train_categorical_nunique.pkl. Score: 0.9010
Training... train_essay_sentences.pkl. Score: 0.6615
Training... train_r_burst_feats.pkl. Score: 0.9346
Training... train_action_time_by_activity.pkl. Score: 0.8203
Training... train_add_word_pauses.pkl. Score: 0.8735
Training... train_word_wait_1.pkl. Score: 0.6999
Training... train_cursor_pos_acceleration.pkl. Score: 0.8064
Training... train_word_counts_rate_of_change.pkl. Score: 0.7045
Training... train_word_timings.pkl. Score: 0.7293
Training... train_create_pauses.pkl. Score: 0.7280
Training... train_word_wait_10.pkl. Score: 0.7186
Training... train_get_keys_pressed_per_second.pkl. Score: 0.7938
Training... train_punctuations.pkl. Score: 0.9793
Training... train_events_counts_acceleration.pkl. Score: 0.7593
Training... train_events_counts_rate_of_change.pkl. Score: 0.7807
Training... train_input_text_change.pkl. Score: 0.7459
Training... train_p_burst_feats.pkl. Score: 0.7323
Training... tr

In [None]:
best_feats = []

train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
added_feats = []
improved = True
results = pd.DataFrame()
train_feats = train_feats.sample(frac=1).reset_index(drop=True)
list_train_feats = [feat for feat in feat_list if feat.startswith('train_')]

while improved:
    print(f'Starting round {round} of training feats')
    improved = False
    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]

    for tr_feats_cand in list_train_feats:
        ts_feats_cand = tr_feats_cand.replace('train', 'test')

        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):

            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            #print(f'feats empty - setting up train_feats')
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        # print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        train_feats = train_feats.sample(frac=1).reset_index(drop=True)
        mean_rmse = []

        for i in range(15):
            train_feats = train_feats.sample(frac=1).reset_index(drop=True)
            test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
            mean_rmse.append(final_rmse)

        temp_res = {'feat_name': tr_feats_cand, 'RMSE': np.mean(mean_rmse)}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]

    if top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        print(f'Results improved!: Selected feat {top_feat} - score {top_score}')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
    else:
        print('Training Over!')

    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
    print(f'list_train_feats: {list_train_feats}')
    print(f'added_feats_list: {added_feats}')
    print(f'best feat: {top_feat}')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")
best_feats.append(added_feats)
added_feats = []

Starting round 0 of training feats
Training... train_categorical_nunique.pkl
Training... train_essay_sentences.pkl
Training... train_r_burst_feats.pkl
Training... train_action_time_by_activity.pkl
Training... train_word_count_time_based.pkl
Training... train_word_pauses.pkl
Training... train_cursor_pos_acceleration.pkl
Training... train_word_counts_rate_of_change.pkl
Training... train_create_pauses.pkl
Training... train_get_keys_pressed_per_second.pkl
Training... train_input_text_change_feats.pkl
Training... train_events_counts_acceleration.pkl
Training... train_sentence_pauses.pkl
Training... train_events_counts_rate_of_change.pkl
Training... train_p_burst_feats.pkl
Training... train_count_of_activities.pkl
Training... train_events_counts_baseline.pkl
Training... train_down_events_counts.pkl
Training... train_paragraph_pauses.pkl
Training... train_action_time_baseline_stats.pkl
Training... train_cursor_pos_time_based.pkl
Training... train_essay_words.pkl
Training... train_word_count_a

In [None]:
best_feats = []

for i in range(20):
    train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
    best_rmse = float('inf')
    round = 0
    added_feats = []
    improved = True
    results = pd.DataFrame()
    train_feats = train_feats.sample(frac=1).reset_index(drop=True)
    list_train_feats = [feat for feat in feat_list if feat.startswith('train_')]

    while improved:
        print(f'Starting round {round} of training feats')
        improved = False
        list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]

        for tr_feats_cand in list_train_feats:
            ts_feats_cand = tr_feats_cand.replace('train', 'test')

            tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
            ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

            existing_train_columns = set(train_feats.columns)
            existing_test_columns = set(test_feats.columns)

            if not(train_feats.empty & test_feats.empty):

                train_feats = train_feats.merge(tr_feats, on='id', how='left')
                test_feats = test_feats.merge(ts_feats, on='id', how='left')

                if 'score' not in train_feats.columns:
                    train_feats = train_feats.merge(train_scores, on='id', how='left')
                    
                assert train_feats.shape[1] == test_feats.shape[1] + 1
            else:
                #print(f'feats empty - setting up train_feats')
                train_feats = tr_feats
                test_feats = ts_feats
                train_feats = train_feats.merge(train_scores, on='id', how='left')

            print(f'Training... {tr_feats_cand}')
            # print(f'Train feats cols {train_feats.columns}')
            tr_cols = tr_feats.drop(columns=['id']).columns
            ts_cols = ts_feats.drop(columns=['id']).columns
            train_feats = train_feats.sample(frac=1).reset_index(drop=True)
            test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
            temp_res = {'feat_name': tr_feats_cand, 'RMSE': final_rmse}
            results = pd.concat([results, pd.DataFrame([temp_res])])

            train_feats.drop(columns=tr_cols, inplace=True)
            test_feats.drop(columns=ts_cols, inplace=True)

        results = results.sort_values('RMSE', ascending=True)
        top_score = results.head(1).RMSE.values[0]
        top_feat = results.head(1).feat_name.values[0]

        if top_score < best_rmse:
            best_rmse = top_score
            best_feat = top_feat
            improved = True
            print(f'Results improved!: Selected feat {top_feat} - score {top_score}')

            ts_top_feat = top_feat.replace('train', 'test')
            tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
            ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

            if round > 0:
                train_feats = train_feats.merge(tr_feats, on='id', how='left')
                test_feats = test_feats.merge(ts_feats, on='id', how='left')
            else:
                train_feats = tr_feats
                test_feats = ts_feats

            added_feats.append(top_feat)
            round += 1
        else:
            print('Training Over!')

        list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
        print(f'list_train_feats: {list_train_feats}')
        print(f'added_feats_list: {added_feats}')
        print(f'best feat: {top_feat}')

    print(f"Best RMSE: {best_rmse:.4f}")
    print(f"Best Feature Set: {added_feats}")
    best_feats.append(added_feats)
    added_feats = []

Starting round 0 of training feats
Training... train_categorical_nunique.pkl
Final RMSE over 50: 0.901243. Std 0.5134
RMSE by fold 0.901190. Std 0.0101
Training... train_essay_sentences.pkl
Final RMSE over 50: 0.660559. Std 0.7952
RMSE by fold 0.660309. Std 0.0176
Training... train_r_burst_feats.pkl
Final RMSE over 50: 0.935659. Std 0.5080
RMSE by fold 0.935426. Std 0.0209
Training... train_action_time_by_activity.pkl
Final RMSE over 50: 0.819622. Std 0.6524
RMSE by fold 0.819552. Std 0.0108
Training... train_word_count_time_based.pkl
Final RMSE over 50: 0.732145. Std 0.7475
RMSE by fold 0.731923. Std 0.0176
Training... train_cursor_pos_acceleration.pkl
Final RMSE over 50: 0.806624. Std 0.6582
RMSE by fold 0.806527. Std 0.0122
Training... train_word_counts_rate_of_change.pkl
Final RMSE over 50: 0.706646. Std 0.7648
RMSE by fold 0.706523. Std 0.0130
Training... train_create_pauses.pkl
Final RMSE over 50: 0.729678. Std 0.7360
RMSE by fold 0.729480. Std 0.0170
Training... train_IKI_stats.

In [15]:
newlist = [item for items in best_feats for item in items]
newlist = newlist + old_list
counts = {}
for item in newlist:
    counts[item] = counts.get(item, 0) + 1


In [14]:
old_list = ['train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_cursor_pos_acceleration.pkl',
 'train_count_of_activities.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_events_counts_acceleration.pkl',
 'train_essay_sentences.pkl',
 'train_word_counts_rate_of_change.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_cursor_pos_acceleration.pkl',
 'train_essay_sentences.pkl',
 'train_categorical_nunique.pkl',
 'train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_r_burst_feats.pkl']

In [16]:
sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
sorted_counts

[('train_down_events_counts.pkl', 26),
 ('train_vector_one_gram.pkl', 26),
 ('train_create_pauses.pkl', 26),
 ('train_essay_paragraphs.pkl', 26),
 ('train_cursor_pos_acceleration.pkl', 11),
 ('train_word_count_acceleration.pkl', 6),
 ('train_p_burst_feats.pkl', 5),
 ('train_r_burst_feats.pkl', 3),
 ('train_events_counts_acceleration.pkl', 3),
 ('train_essay_sentences.pkl', 3),
 ('train_categorical_nunique.pkl', 3),
 ('train_vector_two_gram.pkl', 2),
 ('train_cursor_pos_rate_of_change.pkl', 2),
 ('train_word_counts_rate_of_change.pkl', 2),
 ('train_count_of_activities.pkl', 2),
 ('train_action_time_by_activity.pkl', 1),
 ('train_product_to_keys.pkl', 1),
 ('train_IKI_based_fractals.pkl', 1),
 ('train_events_counts_baseline.pkl', 1)]

In [5]:
added_feats

[]

In [5]:
list_train_feats

['train_categorical_nunique.pkl',
 'train_essay_sentences.pkl',
 'train_r_burst_feats.pkl',
 'train_action_time_by_activity.pkl',
 'train_word_count_time_based.pkl',
 'train_word_counts_rate_of_change.pkl',
 'train_get_keys_pressed_per_second.pkl',
 'train_input_text_change_feats.pkl',
 'train_events_counts_acceleration.pkl',
 'train_events_counts_rate_of_change.pkl',
 'train_p_burst_feats.pkl',
 'train_count_of_activities.pkl',
 'train_events_counts_baseline.pkl',
 'train_action_time_baseline_stats.pkl',
 'train_cursor_pos_time_based.pkl',
 'train_essay_words.pkl',
 'train_cursor_pos_rate_of_change.pkl',
 'train_events_counts_time_based.pkl',
 'train_product_to_keys.pkl']

In [6]:
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()
best_rmse = 0.8
round = 0

for k in range(len(list_train_feats)):
    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]
        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        if not(train_feats.empty & test_feats.empty):
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
            assert train_feats.shape[1]==test_feats.shape[1]+1

        else:
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        tr_cols = train_feats.columns
        ts_cols = test_feats.columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name':tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)
        #train_feats, test_feats = pd.DataFrame(), pd.DataFrame()

    results = results.sort_values('RMSE', ascending=True)
    best_feat = results.loc[round].feat_name
    best_rmse = results.loc[round].RMSE
    list_train_feats = list(results.feat_name)
    round += 1

Training... train_word_counts_rate_of_change.pkl


In [5]:
list_train_feats

['train_word_counts_rate_of_change.pkl',
 'train_count_of_activities.pkl',
 'train_time_based_cursor_pos_stats.pkl']

In [None]:
file_name = 'input_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts'
tr, ts = events_counts(train_logs, test_logs,)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = rate_of_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_stats'
tr, ts = words_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'event_id_stats'
tr, ts = events_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_stats'
tr, ts = action_time_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_stats'
tr, ts = cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'time_based_cursor_pos_stats'
tr, ts = time_based_cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = rate_of_change_events(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
# file_name = 'word_count_acceleration'
# tr, ts = wc_acceleration_feats(train_logs, test_logs)
# tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_essays           = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

file_name = 'product_to_keys'
tr, ts = product_to_keys([train_logs.collect().to_pandas(),
                                      test_logs.collect().to_pandas()], 
                                      [train_essays,test_essays])
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

file_name = 'get_keys_pressed_per_second'
tr, ts = get_keys_pressed_per_second(train_logs.collect().to_pandas(), test_logs.collect().to_pandas())
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_essays = get_essay_df(train_logs)
test_essays = get_essay_df(test_logs)
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise_bigrams'
tr, ts = countvectorize_two_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >
< Action time by activities >
< Events counts features >
< Word counts rate of change features >
< Categorical # unique values features >
< word changes stats >
< Count of events feats >
< Cursor changes features >
< Cursor changes based on time >
< P-burst features >
< R-burst features >
< event_id rate of change >
< Idle time features >


100%|██████████| 2471/2471 [00:04<00:00, 610.13it/s]
100%|██████████| 3/3 [00:00<00:00, 3084.80it/s]
100%|██████████| 2471/2471 [00:03<00:00, 640.53it/s]
100%|██████████| 3/3 [00:00<00:00, 3256.45it/s]


< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >


In [None]:
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Action time by activities >
