In [1]:
import pandas as pd
import numpy as np
import polars as pl

from m4_feats_polars import *
from m5_sb_models import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Hyper-parameter set that a later cell passes to lgb_pipeline.
# Spelled with dict() keywords; keys, values and insertion order match the
# original literal exactly.
lgb_params_1 = dict(
    boosting_type='gbdt',
    metric='rmse',
    reg_alpha=0.003188447814669599,
    reg_lambda=0.0010228604507564066,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.75,
    learning_rate=0.01716485155812008,
    num_leaves=19,
    min_child_samples=46,
    n_estimators=2000,
    force_col_wise=True,
    verbosity=-1,
)

# Folder holding the competition CSVs; the path is relative to the working directory.
# NOTE(review): the value already ends with '/', and the f-strings below insert
# another one, so every resulting path contains '//'.
data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
# The three inputs are opened with pl.scan_csv; later cells call .collect() on
# them before converting to pandas.
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs     = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores  = pl.scan_csv(f'{data_path}/train_scores.csv')

In [3]:
# [('train_down_events_counts.pkl', 26),
#  ('train_vector_one_gram.pkl', 26),
#  ('train_create_pauses.pkl', 26),
#  ('train_essay_paragraphs.pkl', 26),
#  ('train_cursor_pos_acceleration.pkl', 11),
#  ('train_word_count_acceleration.pkl', 6),
#  ('train_p_burst_feats.pkl', 5),
#  ('train_r_burst_feats.pkl', 3),
#  ('train_events_counts_acceleration.pkl', 3),
#  ('train_essay_sentences.pkl', 3),
#  ('train_categorical_nunique.pkl', 3),
#  ('train_vector_two_gram.pkl', 2),
#  ('train_cursor_pos_rate_of_change.pkl', 2),
#  ('train_word_counts_rate_of_change.pkl', 2),
#  ('train_count_of_activities.pkl', 2),
#  ('train_action_time_by_activity.pkl', 1),
#  ('train_product_to_keys.pkl', 1),
#  ('train_IKI_based_fractals.pkl', 1),
#  ('train_events_counts_baseline.pkl', 1)]

# best_feature_set_1 - PARTIAL
# Materialise each log file as a pandas frame once and reuse it for every helper
# that needs pandas input, instead of re-running .collect().to_pandas() (a full
# re-read of the CSV) for each call.
train_logs_pd = train_logs.collect().to_pandas()
test_logs_pd = test_logs.collect().to_pandas()

# get_essay_df receives copies so the frames handed to
# get_keys_pressed_per_second below are untouched even if get_essay_df modifies
# its argument (its implementation is not visible from this notebook).
train_essays          = get_essay_df(train_logs_pd.copy())
test_essays           = get_essay_df(test_logs_pd.copy())

# Each builder takes the train and test inputs together and returns a
# (train, test) pair of feature frames.
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nunique, ts_nunique = categorical_nunique(train_logs, test_logs)
tr_get_keys, ts_get_keys = get_keys_pressed_per_second(train_logs_pd, test_logs_pd)

# The pandas copies were only needed by the helpers above; drop the names so the
# memory can be reclaimed.
del train_logs_pd, test_logs_pd

# tr_p_burst, ts_p_burst = p_burst_feats(train_logs, test_logs, 2)
# tr_event_acc, ts_event_acc = events_counts_acceleration(train_logs, test_logs)
# tr_time_by_act, ts_time_by_act = action_time_by_activity(train_logs, test_logs)
# tr_cursor_pos_roc, ts_cursor_pos_roc = cursor_pos_rate_of_change(train_logs, test_logs)
# 
# tr_act_count, ts_act_count = count_of_activities(train_logs, test_logs)
# 
# tr_input_change, ts_input_change = input_text_change_feats(train_logs, test_logs)
# tr_wc_roc, ts_wc_roc =  word_counts_rate_of_change(train_logs, test_logs)


# Assemble the training feature table: start from the down-event counts and
# left-join every other enabled feature block on the 'id' column, in the same
# order as before.
train_feats = (
    tr_down_events_counts
    .join(tr_vect_one, on='id', how='left')
    .join(tr_pauses, on='id', how='left')
    .join(tr_cursor_pos_acc, on='id', how='left')
    .join(tr_word_count_acc, on='id', how='left')
    .join(tr_vect_two, on='id', how='left')
    .join(tr_r_burst, on='id', how='left')
    .join(tr_nunique, on='id', how='left')
    .join(tr_get_keys, on='id', how='left')
)
# train_feats = train_feats.join(tr_p_burst, on='id', how='left')
# train_feats = train_feats.join(tr_event_acc, on='id', how='left')
# train_feats = train_feats.join(tr_wc_roc, on='id', how='left')
# train_feats = train_feats.join(tr_act_count, on='id', how='left')
# train_feats = train_feats.join(tr_cursor_pos_roc, on='id', how='left')
# train_feats = train_feats.join(tr_input_change, on='id', how='left')
# train_feats = train_feats.join(tr_time_by_act, on='id', how='left')

# Assemble the test feature table with the same blocks, in the same order, as
# the training table: left joins on the 'id' column.
test_feats = (
    ts_down_events_counts
    .join(ts_vect_one, on='id', how='left')
    .join(ts_pauses, on='id', how='left')
    .join(ts_cursor_pos_acc, on='id', how='left')
    .join(ts_word_count_acc, on='id', how='left')
    .join(ts_vect_two, on='id', how='left')
    .join(ts_r_burst, on='id', how='left')
    .join(ts_nunique, on='id', how='left')
    .join(ts_get_keys, on='id', how='left')
)
# test_feats = test_feats.join(ts_p_burst, on='id', how='left')
# test_feats = test_feats.join(ts_event_acc, on='id', how='left')
# test_feats = test_feats.join(ts_wc_roc, on='id', how='left')
# test_feats = test_feats.join(ts_act_count, on='id', how='left')
# test_feats = test_feats.join(ts_cursor_pos_roc, on='id', how='left')
# test_feats = test_feats.join(ts_input_change, on='id', how='left')
# test_feats = test_feats.join(ts_time_by_act, on='id', how='left')


# Materialise the lazy inputs as pandas frames. The three names are rebound to
# the pandas results, so they no longer refer to the scan_csv objects afterwards.
train_logs, test_logs, train_scores = (
    lazy_frame.collect().to_pandas()
    for lazy_frame in (train_logs, test_logs, train_scores)
)

# Only the training features are sorted by 'id' before being collected.
train_feats = train_feats.sort('id').collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

# Add the paragraph features to both tables, then attach the scores to the
# training table. train_scores is merged last, so its columns end up at the
# right-hand edge of train_feats.
train_feats = (
    train_feats
    .merge(parag_feats(train_essays), on='id', how='left')
    # .merge(sent_feats(train_essays), on='id', how='left')
    .merge(train_scores, on='id', how='left')
)
test_feats = (
    test_feats
    .merge(parag_feats(test_essays), on='id', how='left')
    # .merge(sent_feats(test_essays), on='id', how='left')
)
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
< word count acceleration >
< Count vectorize bi-grams >
< R-burst features >
< Categorical # unique values features >
< Essays paragraphs feats >
< Essays paragraphs feats >
train feats shape (2471, 118)


In [4]:
# Run lgb_pipeline (imported helper) on the assembled features with lgb_params_1.
# Its four return values are bound to test_preds, oof_preds, rmse and model;
# later cells read oof_preds (with 'id', 'score', 'preds' columns) and test_preds.
test_preds, oof_preds, rmse, model = lgb_pipeline(train_feats, test_feats, lgb_params_1)
print(rmse)
# NOTE(review): the two alternative runs below are disabled, yet later cells
# read oof_preds_w and test_preds_w, which only the first of them would assign.
# test_preds_w, oof_preds_w, rmse_w, model_w = lgb_w_pipeline(train_feats, test_feats, lgb_params_1)
# test_preds2, oof_preds, rmse, model = xgb_pipeline(train_feats, test_feats, lgb_params_1)

0.6183842119315204


In [6]:
from sklearn.preprocessing import StandardScaler

def preprocess_feats(feats, scaler=None, fit=True, fill_value=-1e10):
    """Clean and scale every column of ``feats`` except ``'id'``, in place.

    Infinite values in the feature columns are turned into NaN, every NaN is
    replaced by ``fill_value``, and the result is passed through ``scaler``.
    The ``'id'`` column is left untouched.

    Parameters
    ----------
    feats : pandas.DataFrame
        Frame to process. It is modified in place and also returned.
    scaler : object, optional
        Any object exposing ``fit_transform`` / ``transform``. When omitted a
        new ``StandardScaler`` is created for this call; the default is no
        longer a single instance built at definition time and shared by every
        call.
    fit : bool, default True
        ``True`` calls ``scaler.fit_transform`` (the previous behaviour).
        ``False`` calls ``scaler.transform`` so a scaler already fitted on
        another frame can be reused.
    fill_value : float, default -1e10
        Replacement for NaN (and former +/-inf) feature values.

    Returns
    -------
    pandas.DataFrame
        The same ``feats`` object. A frame without feature columns or without
        rows is returned unchanged.
    """
    if scaler is None:
        # Imported here so a fresh scaler is created per call without needing
        # StandardScaler to exist when the function is defined.
        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()

    is_feature = feats.columns != 'id'
    if not is_feature.any() or len(feats) == 0:
        return feats

    cleaned = (
        feats.loc[:, is_feature]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(fill_value)
    )
    scaled = scaler.fit_transform(cleaned) if fit else scaler.transform(cleaned)

    # Replace the columns outright instead of writing through .loc: integer
    # feature columns then simply become float columns, without relying on
    # implicit upcasting during item assignment.
    feature_cols = list(feats.columns[is_feature])
    feats[feature_cols] = np.asarray(scaled)
    return feats

# Scale the features and overwrite the existing frames: every column of
# train_feats except the last one, and the whole of test_feats.
# NOTE(review): iloc[:, :-1] assumes the last column is the one to leave
# unscaled (train_scores is the last frame merged into train_feats).
# NOTE(review): each call gets a fresh StandardScaler and preprocess_feats fits
# it on the frame it receives, so test_feats is standardised with its own
# statistics rather than the training ones - confirm this is intended.
# NOTE(review): train_feats / test_feats are overwritten, so re-running the
# lgb_pipeline cell after this one would feed it the scaled frames.
train_feats.iloc[:,:-1] = preprocess_feats(train_feats.iloc[:,:-1], StandardScaler())
test_feats = preprocess_feats(test_feats, StandardScaler())

alpha = 100
ridge_params = {'alpha': alpha}  # Create a dictionary with alpha
# Rebinds test_preds, rmse and model from the lgb_pipeline cell; only the
# out-of-fold frame gets its own name (oof_preds_rid).
test_preds, oof_preds_rid, rmse, model = ridge_pipeline(train_feats, test_feats, ridge_params)

Final RMSE over 50: 0.663325. Std 0.7942
RMSE by fold 0.663288. Std 0.0068


In [7]:
# Blend the two sets of out-of-fold predictions: stack both frames and average
# 'preds' within each ('id', 'score') pair.
comb_oof_preds = (
    pd.concat([oof_preds, oof_preds_rid], axis=0)
    .groupby(['id', 'score'])['preds']
    .mean()
    .reset_index()
)
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# the argument was removed in 1.6; taking the square root explicitly gives the
# same RMSE on every version.
oof_rmse = np.sqrt(mean_squared_error(comb_oof_preds['score'], comb_oof_preds['preds']))
print(f'Combined RMSE: {oof_rmse}')

Combined RMSE: 0.6152482871990806


In [None]:
# OOF summary
# NOTE(review): oof_preds_w is not assigned by any active line in the visible
# cells (only a commented-out lgb_w_pipeline call would create it), so this cell
# raises NameError on a fresh kernel until that run is restored.
# Keep only the oof_preds_w rows whose true score is below 1.5 or above 5.5 ...
oof_w_preds_filt = oof_preds_w[(oof_preds_w['score']<1.5) | (oof_preds_w['score']>5.5)]
# ... and average them with all of oof_preds per ('id', 'score') pair.
comb_oof_preds = (
    pd.concat([oof_preds, oof_w_preds_filt], axis=0)
    .groupby(['id', 'score'])['preds']
    .mean()
    .reset_index()
)
# mean_squared_error(..., squared=False) was deprecated in scikit-learn 1.4 and
# the argument was removed in 1.6; taking the square root explicitly gives the
# same RMSE on every version.
oof_rmse = np.sqrt(mean_squared_error(comb_oof_preds['score'], comb_oof_preds['preds']))
print(f'Combined RMSE: {oof_rmse}')

In [None]:
# Predictions
# Build one frame per model by averaging its test predictions over axis 0.
# NOTE(review): test_preds is whatever the most recently run pipeline returned;
# in the visible cells ridge_pipeline rebinds it after lgb_pipeline.
# NOTE(review): test_preds_w is only assigned in a commented-out line, so this
# cell raises NameError on a fresh kernel.
n_preds = pd.DataFrame(data={'id': test_feats.id, 'score': np.mean(test_preds, axis=0)})
w_preds = pd.DataFrame(data={'id': test_feats.id, 'score': np.mean(test_preds_w, axis=0)})
# Keep the n_preds rows whose predicted score is below 1.5 or above 5.5 and
# average them with all of w_preds per 'id'.
# NOTE(review): the OOF summary cell does the opposite (filters the *_w frame and
# keeps every row of the other one) - confirm which combination is intended.
preds_filt = n_preds[(n_preds['score']<1.5) | (n_preds['score']>5.5)]
comb_preds = pd.concat([w_preds, preds_filt], axis=0).groupby(['id'])['score'].mean().reset_index()

In [None]:
# Write the blended predictions (columns 'id' and 'score') to submission.csv in
# the working directory, without the index column.
comb_preds.to_csv('submission.csv', index=False)

In [None]:
# test_preds, model  = lgb_full_train_set(train_feats, test_feats, lgb_params_1)

In [None]:
# # scores = np.mean(np.vstack([test_preds, test_preds2]), axis=0)

# test_ids = test_feats.id
# y_pred = np.mean(test_preds, axis=0)

# sub = pd.DataFrame({'id': test_ids, 'score': y_pred})
# sub.to_csv('submission.csv', index=False)