In [1]:
import pandas as pd
import numpy as np
from m4_feats_polars import *
from m5_sb_models import lgb_pipeline
import polars as pl
import os, warnings
warnings.filterwarnings('ignore')

In [2]:
FEAT_STORE = 'feature_store'
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')
train_scores = train_scores.collect().to_pandas()

param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'force_col_wise': True,
         'verbosity': 0,}

feat_list = os.listdir(FEAT_STORE)
list_train_feats = [feat for feat in feat_list if feat.startswith('train')]
#test_feats = [feat for feat in feat_list if feat.startswith('test')]        

In [3]:
# Initialize empty DataFrames
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
used_features = set()
added_feats = []
improved = True
results = pd.DataFrame()

while improved:
    print(f'Starting round {round} of training feats')
    improved = False

    # Filter out features that have already been added
    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]

    for tr_feats_cand in list_train_feats:
        ts_feats_cand = tr_feats_cand.replace('train', 'test')

        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        # Keep track of columns before adding new features
        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):
            print(f'feats not empty - merging with existing')
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            print(f'feats empty - setting up train_feats')
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        # Remove recently added features
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

    # After testing all features in the round, sort and determine the best feature
    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]

    if top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        print(f'Results improved!: Selected feat {top_feat} - score {top_score}')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
    else:
        print('Training Over!')

    # Prepare for the next round
    list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
    print(f'list_train_feats: {list_train_feats}')
    print(f'added_feats_list: {added_feats}')
    print(f'best feat: {top_feat}')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")


Starting round 0 of training feats
feats empty - setting up train_feats
Training... train_categorical_nunique.pkl
Train feats cols Index(['id', 'activity_nunique', 'down_event_nunique', 'text_change_nunique',
       'score'],
      dtype='object')
Final RMSE over 50: 0.908480. Std 0.5426
RMSE by fold 0.908392. Std 0.0123
feats not empty - merging with existing
Training... train_essay_sentences.pkl
Train feats cols Index(['id', 'score', 'sent_count', 'sent_len_mean', 'sent_len_min',
       'sent_len_max', 'sent_len_first', 'sent_len_last', 'sent_len_q1',
       'sent_len_median', 'sent_len_q3', 'sent_len_sum',
       'sent_word_count_mean', 'sent_word_count_min', 'sent_word_count_max',
       'sent_word_count_first', 'sent_word_count_last', 'sent_word_count_q1',
       'sent_word_count_median', 'sent_word_count_q3', 'sent_word_count_sum'],
      dtype='object')
Final RMSE over 50: 0.665064. Std 0.8015
RMSE by fold 0.664968. Std 0.0112
feats not empty - merging with existing
Training... 

In [4]:
added_feats

['train_down_events_counts.pkl',
 'train_vector_one_gram.pkl',
 'train_create_pauses.pkl',
 'train_essay_paragraphs.pkl',
 'train_essay_sentences.pkl',
 'train_cursor_pos_acceleration.pkl',
 'train_word_count_acceleration.pkl',
 'train_categorical_nunique.pkl',
 'train_cursor_pos_rate_of_change.pkl',
 'train_get_keys_pressed_per_second.pkl']

In [3]:
# Initialize empty DataFrames
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
best_rmse = float('inf')
round = 0
used_features = set()
added_feats = []
improved = True
results = pd.DataFrame()

while improved:
    print(f'Starting round {round} of training feats')
    improved = False
    

    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]

        # Skip if this feature set has already been used
        if tr_feats_cand in used_features:
            continue

        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        # Keep track of columns before adding new features
        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):
            print(f'feats not empty - merging with existing')
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

            if 'score' not in train_feats.columns:
                train_feats = train_feats.merge(train_scores, on='id', how='left')
                
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            print(f'feats empty - setting up train_feats')
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        print(f'Train feats cols {train_feats.columns}')
        tr_cols = tr_feats.drop(columns=['id']).columns
        ts_cols = ts_feats.drop(columns=['id']).columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        # Remove recently added features
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

        # Add the current feature set to the used features
        used_features.add(tr_feats_cand)
        
    #print(results)
    results = results.sort_values('RMSE', ascending=True)
    top_score = results.head(1).RMSE.values[0]
    top_feat = results.head(1).feat_name.values[0]
    #print(f'Results: {results}')

    if  top_score < best_rmse:
        best_rmse = top_score
        best_feat = top_feat
        improved = True
        used_features = set()
        print(f'Results improved!')

        ts_top_feat = top_feat.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{top_feat}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_top_feat}')

        if round > 0:
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')

        else:
            train_feats = tr_feats
            test_feats = ts_feats

        added_feats.append(top_feat)
        round += 1
        list_train_feats = [feat for feat in list_train_feats if feat not in added_feats]
        print(f'list_train_feats: {list_train_feats}')
        print(f'added_feats_list: {added_feats}')
        print(f'best feat: {top_feat}')

    else:
        print('Training Over!')

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best Feature Set: {added_feats}")

Starting round 0 of training feats
feats empty - setting up train_feats
Training... train_categorical_nunique.pkl
Train feats cols Index(['id', 'activity_nunique', 'down_event_nunique', 'text_change_nunique',
       'score'],
      dtype='object')
Final RMSE over 50: 0.908480. Std 0.5426
RMSE by fold 0.908392. Std 0.0123
feats not empty - merging with existing
Training... train_essay_sentences.pkl
Train feats cols Index(['id', 'score', 'sent_count', 'sent_len_mean', 'sent_len_min',
       'sent_len_max', 'sent_len_first', 'sent_len_last', 'sent_len_q1',
       'sent_len_median', 'sent_len_q3', 'sent_len_sum',
       'sent_word_count_mean', 'sent_word_count_min', 'sent_word_count_max',
       'sent_word_count_first', 'sent_word_count_last', 'sent_word_count_q1',
       'sent_word_count_median', 'sent_word_count_q3', 'sent_word_count_sum'],
      dtype='object')
Final RMSE over 50: 0.665064. Std 0.8015
RMSE by fold 0.664968. Std 0.0112
feats not empty - merging with existing
Training... 

In [5]:
list_train_feats

['train_categorical_nunique.pkl',
 'train_essay_sentences.pkl',
 'train_r_burst_feats.pkl',
 'train_action_time_by_activity.pkl',
 'train_word_count_time_based.pkl',
 'train_word_counts_rate_of_change.pkl',
 'train_get_keys_pressed_per_second.pkl',
 'train_input_text_change_feats.pkl',
 'train_events_counts_acceleration.pkl',
 'train_events_counts_rate_of_change.pkl',
 'train_p_burst_feats.pkl',
 'train_count_of_activities.pkl',
 'train_events_counts_baseline.pkl',
 'train_action_time_baseline_stats.pkl',
 'train_cursor_pos_time_based.pkl',
 'train_essay_words.pkl',
 'train_cursor_pos_rate_of_change.pkl',
 'train_events_counts_time_based.pkl',
 'train_product_to_keys.pkl']

In [3]:
file_name = 'input_text_change_feats'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_baseline_stats'
tr, ts = action_time_baseline_stats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'down_events_counts'
tr, ts = down_events_counts(train_logs, test_logs, n_events=20)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_time_based'
tr, ts = word_count_time_based(train_logs, test_logs, time_agg=12)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = word_counts_rate_of_change(train_logs, test_logs, time_agg=5)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_acceleration'
tr, ts = word_count_acceleration(train_logs, test_logs, time_agg=8)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_baseline'
tr, ts = events_counts_baseline(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_time_based'
tr, ts = events_counts_time_based(train_logs, test_logs, time_agg=5)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = events_counts_rate_of_change(train_logs, test_logs, time_agg=10)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_acceleration'
tr, ts = events_counts_acceleration(train_logs, test_logs, time_agg=4)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_time_based'
tr, ts = cursor_pos_time_based(train_logs, test_logs, time_agg=30)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_rate_of_change'
tr, ts = cursor_pos_rate_of_change(train_logs, test_logs, time_agg=10)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_acceleration'
tr, ts = cursor_pos_acceleration(train_logs, test_logs, time_agg=6)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst_feats'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst_feats'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'create_pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'get_keys_pressed_per_second'
tr, ts = get_keys_pressed_per_second(train_logs.collect().to_pandas(), test_logs.collect().to_pandas())
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_essays = get_essay_df(train_logs.collect().to_pandas())
test_essays = get_essay_df(test_logs.collect().to_pandas())

file_name = 'vector_one_gram'
tr, ts = countvectorize_one_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'vector_two_gram'
tr, ts = countvectorize_two_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

file_name = 'product_to_keys'
tr, ts = product_to_keys([train_logs.collect().to_pandas(), test_logs.collect().to_pandas()],
                          [train_essays, test_essays])
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >
< Action time baseline stats >
< Action time by activities >
< Events counts features >
< Categorical # unique values features >
< word changes stats time based>
< Word counts rate of change features >
< word count acceleration >
< Count of events baseline feats >
< Count of events time based feats >
< event_id rate of change >
< events counts acceleration >
< Cursor changes based on time >
< event_id rate of change >
< cursor position acceleration >
< P-burst features >
< R-burst features >
< Idle time features >
< Count vectorize one-grams >
< Count vectorize bi-grams >
< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >


In [6]:
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()
best_rmse = 0.8
round = 0

for k in range(len(list_train_feats)):
    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]
        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        if not(train_feats.empty & test_feats.empty):
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
            assert train_feats.shape[1]==test_feats.shape[1]+1

        else:
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        tr_cols = train_feats.columns
        ts_cols = test_feats.columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name':tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)
        #train_feats, test_feats = pd.DataFrame(), pd.DataFrame()

    results = results.sort_values('RMSE', ascending=True)
    best_feat = results.loc[round].feat_name
    best_rmse = results.loc[round].RMSE
    list_train_feats = list(results.feat_name)
    round += 1

Training... train_word_counts_rate_of_change.pkl


In [5]:
list_train_feats

['train_word_counts_rate_of_change.pkl',
 'train_count_of_activities.pkl',
 'train_time_based_cursor_pos_stats.pkl']

In [None]:
file_name = 'input_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts'
tr, ts = events_counts(train_logs, test_logs,)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = rate_of_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_stats'
tr, ts = words_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'event_id_stats'
tr, ts = events_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_stats'
tr, ts = action_time_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_stats'
tr, ts = cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'time_based_cursor_pos_stats'
tr, ts = time_based_cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = rate_of_change_events(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
# file_name = 'word_count_acceleration'
# tr, ts = wc_acceleration_feats(train_logs, test_logs)
# tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_essays           = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

file_name = 'product_to_keys'
tr, ts = product_to_keys([train_logs.collect().to_pandas(),
                                      test_logs.collect().to_pandas()], 
                                      [train_essays,test_essays])
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

file_name = 'get_keys_pressed_per_second'
tr, ts = get_keys_pressed_per_second(train_logs.collect().to_pandas(), test_logs.collect().to_pandas())
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_essays = get_essay_df(train_logs)
test_essays = get_essay_df(test_logs)
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise_bigrams'
tr, ts = countvectorize_two_one(train_essays, test_essays)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >
< Action time by activities >
< Events counts features >
< Word counts rate of change features >
< Categorical # unique values features >
< word changes stats >
< Count of events feats >
< Cursor changes features >
< Cursor changes based on time >
< P-burst features >
< R-burst features >
< event_id rate of change >
< Idle time features >


100%|██████████| 2471/2471 [00:04<00:00, 610.13it/s]
100%|██████████| 3/3 [00:00<00:00, 3084.80it/s]
100%|██████████| 2471/2471 [00:03<00:00, 640.53it/s]
100%|██████████| 3/3 [00:00<00:00, 3256.45it/s]


< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >


In [None]:
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Action time by activities >
