In [1]:
import pandas as pd
import numpy as np

from m4_feats_polars import *
from m5_sb_models import *

In [2]:
lgb_params_1 = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 400,
    'verbosity': -1
    }

import polars as pl
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

In [3]:
Best_Feature_Set = ['train_down_events_counts.pkl', 'train_vector_one_gram.pkl', 'train_create_pauses.pkl', 
                   'train_essay_paragraphs.pkl', 'train_cursor_pos_acceleration.pkl', 
                   'train_word_count_acceleration.pkl', 'train_vector_two_gram.pkl']

# PANDAS FEATS
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

# train_down_events_counts
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
# train_vector_one_gram
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
# train_create_pauses
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
# train_cursor_pos_acceleration
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
# train_word_count_acceleration
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
# train_vector_two_gram
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)

# nunique
tr_count_of_act, ts_count_of_act = r_burst_feats(train_logs, test_logs)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_word_count_acc, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
train_feats = train_feats.join(tr_count_of_act, on='id', how='left')

test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_word_count_acc, on='id', how='left')
test_feats = test_feats.join(ts_count_of_act, on='id', how='left')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
< word count acceleration >
< Count vectorize bi-grams >
< R-burst features >
< Essays paragraphs feats >
< Essays paragraphs feats >
train feats shape (2471, 114)


In [4]:
missing_cols = set(train_feats.columns) - set(test_feats.columns)
missing_cols.remove('score')

for col in missing_cols:
    test_feats[col] = np.nan

test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, lgb_params_1)



Final RMSE over 50: 0.603069. Std 0.8272
RMSE by fold 0.602983. Std 0.0100


Final RMSE over 50: 0.604760. Std 0.8296
RMSE by fold 0.604651. Std 0.0112
base
Final RMSE over 50: 0.605383. Std 0.8278
RMSE by fold 0.605283. Std 0.0107

In [5]:
# train_feats.to_pickle(f'feature_selection/train_feats.pkl')
# test_feats.to_pickle(f'feature_selection/test_feats.pkl')

In [6]:
test_ids = test_feats.id
y_pred = np.mean(test_preds, axis=0)

sub = pd.DataFrame({'id': test_ids, 'score': y_pred})
sub.to_csv('submission.csv', index=False)

In [7]:
sub

Unnamed: 0,id,score
0,0000aaaa,1.402126
1,4444cccc,1.353844
2,2222bbbb,1.336026
