In [1]:
import pandas as pd
import numpy as np
from m4_feats_polars import *
from m5_sb_models import lgb_pipeline
import polars as pl
import os, warnings
warnings.filterwarnings('ignore')

In [2]:
FEAT_STORE = 'feature_store'
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')
train_scores = train_scores.collect().to_pandas()

param = {'n_estimators': 1024,
         'learning_rate': 0.005,
         'metric': 'rmse',
         'force_col_wise': True,
         'verbosity': 0,}

feat_list = os.listdir(FEAT_STORE)
list_train_feats = [feat for feat in feat_list if feat.startswith('train')]
#test_feats = [feat for feat in feat_list if feat.startswith('test')]        

In [3]:
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()
best_rmse = 0.8
round = 0

for k in range(len(list_train_feats)):
    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]
        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        if not(train_feats.empty & test_feats.empty):
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
            assert train_feats.shape[1]==test_feats.shape[1]+1

        else:
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        tr_cols = train_feats.columns
        ts_cols = test_feats.columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name':tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)
        #train_feats, test_feats = pd.DataFrame(), pd.DataFrame()

    results = results.sort_values('RMSE', ascending=True)
    best_feat = results.loc[round].feat_name
    best_rmse = results.loc[round].RMSE
    list_train_feats = list(results.feat_name)
    round += 1

Training... train_event_id_stats.pkl
Final RMSE over 50: 0.767516. Std 0.7233
RMSE by fold 0.767447. Std 0.0103
Training... train_categorical_nunique.pkl
Final RMSE over 50: 0.907111. Std 0.5422
RMSE by fold 0.906990. Std 0.0153
Training... train_essay_sentences.pkl
Final RMSE over 50: 0.666417. Std 0.8003
RMSE by fold 0.666253. Std 0.0147
Training... train_count_vectorise_bigrams.pkl
Final RMSE over 50: 0.726059. Std 0.7497
RMSE by fold 0.725895. Std 0.0156
Training... train_action_time_by_activity.pkl
Final RMSE over 50: 0.828619. Std 0.6658
RMSE by fold 0.828407. Std 0.0184
Training... train_word_counts_rate_of_change.pkl
Final RMSE over 50: 0.711108. Std 0.7715
RMSE by fold 0.710849. Std 0.0192
Training... train_word_count_stats.pkl
Final RMSE over 50: 0.734109. Std 0.7505
RMSE by fold 0.734021. Std 0.0114
Training... train_pauses.pkl
Final RMSE over 50: 0.735167. Std 0.7415
RMSE by fold 0.735035. Std 0.0136
Training... train_events_counts.pkl
Final RMSE over 50: 0.655458. Std 0.79

KeyboardInterrupt: 

In [3]:
# Initialize empty DataFrames
train_feats, test_feats = pd.DataFrame(), pd.DataFrame()
results = pd.DataFrame()
best_rmse = float('inf')
round = 0
used_features = set()
improved = True

while improved:
    improved = False
    for i in range(round, len(list_train_feats)):
        tr_feats_cand = list_train_feats[i]

        # Skip if this feature set has already been used
        if tr_feats_cand in used_features:
            continue

        ts_feats_cand = tr_feats_cand.replace('train', 'test')
        tr_feats = pd.read_pickle(f'{FEAT_STORE}/{tr_feats_cand}')
        ts_feats = pd.read_pickle(f'{FEAT_STORE}/{ts_feats_cand}')

        # Keep track of columns before adding new features
        existing_train_columns = set(train_feats.columns)
        existing_test_columns = set(test_feats.columns)

        if not(train_feats.empty & test_feats.empty):
            train_feats = train_feats.merge(tr_feats, on='id', how='left')
            test_feats = test_feats.merge(ts_feats, on='id', how='left')
            assert train_feats.shape[1] == test_feats.shape[1] + 1
        else:
            train_feats = tr_feats
            test_feats = ts_feats
            train_feats = train_feats.merge(train_scores, on='id', how='left')

        print(f'Training... {tr_feats_cand}')
        tr_cols = train_feats.columns
        ts_cols = test_feats.columns
        test_preds, valid_preds, final_rmse, cv_rm = lgb_pipeline(train_feats, test_feats, param)
        temp_res = {'feat_name': tr_feats_cand, 'RMSE': final_rmse}
        results = pd.concat([results, pd.DataFrame([temp_res])])

        # Remove recently added features
        train_feats.drop(columns=tr_cols, inplace=True)
        test_feats.drop(columns=ts_cols, inplace=True)

        # Add the current feature set to the used features
        used_features.add(tr_feats_cand)

    results = results.sort_values('RMSE', ascending=True)
    if results.loc[round].RMSE < best_rmse:
        best_rmse = results.loc[round].RMSE
        best_feat = results.loc[round].feat_name
        improved = True

    round += 1
    # Update list_train_feats to only include unused features
    list_train_feats = [feat for feat in list_train_feats if feat not in used_features]

# Final results
print(f"Best RMSE: {best_rmse}")
print(f"Best Feature Set: {best_feat}")

Training... train_event_id_stats.pkl
Final RMSE over 50: 0.767516. Std 0.7233
RMSE by fold 0.767447. Std 0.0103
Training... train_categorical_nunique.pkl
Final RMSE over 50: 0.907111. Std 0.5422
RMSE by fold 0.906990. Std 0.0153
Training... train_essay_sentences.pkl
Final RMSE over 50: 0.666417. Std 0.8003
RMSE by fold 0.666253. Std 0.0147
Training... train_count_vectorise_bigrams.pkl


In [3]:
file_name = 'input_change'
tr, ts = input_text_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_of_activities'
tr, ts = count_of_activities(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_by_activity'
tr, ts = action_time_by_activity(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts'
tr, ts = events_counts(train_logs, test_logs,)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_counts_rate_of_change'
tr, ts = rate_of_change_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'categorical_nunique'
tr, ts = categorical_nunique(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'word_count_stats'
tr, ts = words_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'event_id_stats'
tr, ts = events_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'action_time_stats'
tr, ts = action_time_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'cursor_pos_stats'
tr, ts = cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'time_based_cursor_pos_stats'
tr, ts = time_based_cursor_stats_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'events_counts_rate_of_change'
tr, ts = rate_of_change_events(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
# file_name = 'word_count_acceleration'
# tr, ts = wc_acceleration_feats(train_logs, test_logs)
# tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
# ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'pauses'
tr, ts = create_pauses(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
file_name = 'count_vectorise'
tr, ts = countvectorize_one_one(train_logs, test_logs)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'count_vectorise_bigrams'
tr, ts = countvectorize_one_one(train_logs, test_logs)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_words'
tr = word_feats(train_essays)
ts = word_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_sentences'
tr = sent_feats(train_essays)
ts = sent_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'essay_paragraphs'
tr = parag_feats(train_essays)
ts = parag_feats(test_essays)
tr.to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Input text change features >
< Action time by activities >
< Events counts features >
< Word counts rate of change features >
< Categorical # unique values features >
< word changes stats >
< Count of events feats >
< Cursor changes features >
< Cursor changes based on time >
< P-burst features >
< R-burst features >
< event_id rate of change >
< Idle time features >


100%|██████████| 2471/2471 [00:04<00:00, 610.13it/s]
100%|██████████| 3/3 [00:00<00:00, 3084.80it/s]
100%|██████████| 2471/2471 [00:03<00:00, 640.53it/s]
100%|██████████| 3/3 [00:00<00:00, 3256.45it/s]


< Essays word feats >
< Essays word feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays paragraphs feats >
< Essays paragraphs feats >


In [None]:
file_name = 'p_burst'
tr, ts = p_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')
file_name = 'r_burst'
tr, ts = r_burst_feats(train_logs, test_logs)
tr.collect().to_pandas().to_pickle(f'{FEAT_STORE}/train_{file_name}.pkl')
ts.collect().to_pandas().to_pickle(f'{FEAT_STORE}/test_{file_name}.pkl')

< Action time by activities >
