In [1]:
import pandas as pd
import polars as pl
import numpy as np
import re
from joblib import Parallel, delayed

from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import skew, kurtosis
from m4_feats_polars import *
from m5_sb_models import *

In [2]:
lgb_params = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 350,
    'verbosity': -1
    }

xgb_params = {
    'alpha': 1,
    'colsample_bytree': 0.8,
    'gamma': 1.5,
    'learning_rate': 0.05,
    'max_depth': 4,
    'min_child_weight': 10,
    'subsample': 0.8,
    'device': 'cuda',
    'n_estimators': 350 
    }

catboost_params = {
    'iterations': 1000, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'od_wait': 20, 
    'od_type': 'Iter', 
    'verbose': False, 
    'metric_period': 50, 
    'eval_metric': 'RMSE', 
    'bagging_temperature': 0.2
}

svr_params = {
    'C': 1.0, 
    'cache_size': 200, 
    'coef0': 0.0, 
    'degree': 3, 
    'epsilon': 0.1, 
    'gamma': 'scale', 
    'kernel': 'rbf', 
    'max_iter': -1, 
    'shrinking': True, 
    'tol': 0.001, 
    'verbose': False}

ridge_params = {'alpha':110}

data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

In [3]:
# PANDAS FEATS
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)
tr_wc_roc, ts_wc_roc = word_counts_rate_of_change(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_nuni, on='id', how='left')
train_feats = train_feats.join(tr_e_counts_roc, on='id', how='left')
train_feats = train_feats.join(tr_wc_roc, on='id', how='left')
train_feats = train_feats.join(tr_remove_pause, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
train_feats = train_feats.join(tr_word_wait, on='id', how='left')

test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_nuni, on='id', how='left')
test_feats = test_feats.join(ts_e_counts_roc, on='id', how='left')
test_feats = test_feats.join(ts_wc_roc, on='id', how='left')
test_feats = test_feats.join(ts_remove_pause, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
test_feats = test_feats.join(ts_word_wait, on='id', how='left')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(word_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')

train_feats           = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< event_id rate of change >
< Word counts rate of change features >
< removed words pauses basic
< Count vectorize bi-grams >
< word_wait_shift >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 177)


In [4]:
test_preds_lgbm, valid_preds_lgbm, final_rmse_lgbm, model_lgbm = lgb_pipeline(train_feats, test_feats, lgb_params)
print(f'LGBM completed')
test_preds_xgb, valid_preds_xgb, final_rmse_xgb, model_xgb = xgb_pipeline(train_feats, test_feats, xgb_params)
print(f'XGB completed')
test_preds_cat, valid_preds_cat, final_rmse_cat, model_cat = catboost_pipeline(train_feats, test_feats, catboost_params)
print(f'Catboost completed')
test_preds_svr, valid_preds_svr, final_rmse_svr, model_svr = svr_pipeline(train_feats, test_feats)
# print(f'SVR completed')
# test_preds_ridge, valid_preds_ridge, final_rmse_ridge, model_ridge = ridge_pipeline(train_feats, test_feats, ridge_params)
# print(f'Ridge completed')
# test_preds, oof_preds, final_rmse = automl_pipeline(train_feats, test_feats) 
print(f'NN Dense light completed')



In [None]:
import numpy as np
import itertools
from sklearn.metrics import mean_squared_error

models = {
    'xgboost': average_model_predictions(valid_preds_xgb),
    'lgbm': average_model_predictions(valid_preds_lgbm),
    'catboost': average_model_predictions(valid_preds_cat),
   # 'ridge': average_model_predictions(valid_preds_ridge),
   # 'svr': average_model_predictions(valid_preds_svr),
    'automl': average_model_predictions(valid_preds_svr)
}

true_values = train_scores.score.values

simple_avg = np.mean(list(models.values()), axis=0)
baseline_rmse = mean_squared_error(true_values, simple_avg, squared=False)
print(f"Baseline RMSE with simple average: {baseline_rmse}")

for L in range(1, len(models) + 1):
    for subset in itertools.combinations(models, L):
        model_subset = {model: models[model] for model in subset}

        for weights in itertools.product(np.linspace(0.5, 1.0, 4.0), repeat=len(subset)):
            weighted_avg = calculate_weighted_avg(weights, model_subset)
            rmse = mean_squared_error(true_values, weighted_avg, squared=False)
            if rmse < best_rmse:
                best_rmse = rmse
                best_combination = subset
                best_weights = weights

print(f"Best RMSE: {best_rmse}")
print(f"Best Model Combination: {best_combination}")
print(f"Best Weights: {best_weights}")

# # Averaging test predictions for each model
# avg_test_preds_lgbm = average_test_predictions(test_preds_lgbm)
# avg_test_preds_xgb = average_test_predictions(test_preds_xgb)
# avg_test_preds_cat = average_test_predictions(test_preds_cat)
# avg_test_preds_svr = average_test_predictions(test_preds_svr)
# avg_test_preds_ridge = average_test_predictions(test_preds_ridge)
# avg_test_preds_ridge = average_test_predictions(test_preds_ridge)

# # Dictionary of averaged test predictions
# test_predictions = {
#     'xgboost': avg_test_preds_xgb,
#     'lgbm': avg_test_preds_lgbm,
#     'catboost': avg_test_preds_cat,
#     'svr': avg_test_preds_svr,
#     'ridge': avg_test_preds_ridge
#     'automl': avg_test_preds_ridge
# }

blended_test_predictions = calculate_weighted_avg_for_test(best_weights, test_predictions)

Baseline RMSE with simple average: 0.602079556990036
Best RMSE: inf
Best Model Combination: None
Best Weights: None


In [None]:
test_ids = test_feats.id
preds_simple_avg = np.mean(list(test_predictions.values()), axis=0)
sub = pd.DataFrame({'id': test_ids, 'score': preds_simple_avg})
sub.to_csv('submission.csv', index=False)