In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

from m4_feats_polars import *
from m5_sb_models import *

In [2]:
# Hyper-parameter sets, one per model family. Each dict is handed as-is to the
# matching *_pipeline helper in the training cell.

# LightGBM regressor settings.
lgb_params = dict(
    boosting_type='gbdt',
    metric='rmse',
    reg_alpha=0.0031,
    reg_lambda=0.001,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.75,
    learning_rate=0.017,
    num_leaves=19,
    min_child_samples=46,
    n_estimators=425,
    verbosity=-1,
)

# XGBoost regressor settings.
# NOTE(review): device='cuda' presumably requires a GPU runtime - confirm
# before running on a CPU-only machine.
xgb_params = dict(
    alpha=1,
    colsample_bytree=0.8,
    gamma=1.5,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=10,
    subsample=0.8,
    device='cuda',
    n_estimators=400,
)

# CatBoost regressor settings.
catboost_params = dict(
    iterations=250,
    learning_rate=0.1,
    depth=6,
    loss_function='RMSE',
    od_wait=20,
    od_type='Iter',
    verbose=False,
    metric_period=50,
    eval_metric='RMSE',
    bagging_temperature=0.2,
)

# Ridge regression settings.
ridge_params = dict(alpha=325)

In [3]:
# Build lazy frames over the competition CSVs; they are collected further down.
data_path = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

# Essay frames are derived from the fully collected logs, as pandas frames.
train_essays = get_essay_df(train_logs.collect().to_pandas())
test_essays = get_essay_df(test_logs.collect().to_pandas())

# Every helper below returns a (train, test) pair of per-'id' feature tables.
# The call order is kept as-is because each helper prints a progress banner.
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_word_c_acc, ts_word_c_acc = word_count_acceleration(train_logs, test_logs)
tr_rem_words_time_spent, ts_rem_words_time_spent = remove_words_time_spent(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs, 1)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)

# Left-join every feature table onto the event-count table by 'id'.
# The sequence below is the join order (r_burst before cursor_pos_acc), which
# fixes the column order of the final frames - do not reorder it.
feature_pairs = [
    (tr_vect_one, ts_vect_one),
    (tr_pauses, ts_pauses),
    (tr_word_c_acc, ts_word_c_acc),
    (tr_rem_words_time_spent, ts_rem_words_time_spent),
    (tr_vect_two, ts_vect_two),
    (tr_r_burst, ts_r_burst),
    (tr_cursor_pos_acc, ts_cursor_pos_acc),
    (tr_nuni, ts_nuni),
    (tr_remove_pause, ts_remove_pause),
    (tr_word_wait, ts_word_wait),
    (tr_e_counts_roc, ts_e_counts_roc),
]
train_feats, test_feats = tr_down_events_counts, ts_down_events_counts
for tr_part, ts_part in feature_pairs:
    train_feats = train_feats.join(tr_part, on='id', how='left')
    test_feats = test_feats.join(ts_part, on='id', how='left')

# Materialise everything and switch to pandas for the remaining steps.
train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

# Essay-level features: paragraph, sentence, then word level, train before test.
for essay_feature_fn in (parag_feats, sent_feats, word_feats):
    train_feats = train_feats.merge(essay_feature_fn(train_essays), on='id', how='left')
    test_feats = test_feats.merge(essay_feature_fn(test_essays), on='id', how='left')

# Features computed from the sentence-split essays.
tr_sent_df = split_essays_into_sentences(train_essays)
ts_sent_df = split_essays_into_sentences(test_essays)
train_feats = train_feats.merge(sent_long_word_count(tr_sent_df), on='id', how='left')
test_feats = test_feats.merge(sent_long_word_count(ts_sent_df), on='id', how='left')

# Attach the score table to the training features only.
train_feats = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< word count acceleration >
< remove_words_time_spent >
< Count vectorize bi-grams >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< removed words pauses basic
< word_wait_shift >
< event_id rate of change >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 189)


In [4]:
import warnings

# Silence every warning raised from here on.
warnings.filterwarnings("ignore")

# Run the four pipelines in turn. Each returns four values; the first three
# (test predictions, validation predictions, final RMSE) are kept and the
# fourth is discarded.
lgbm_outputs = lgb_pipeline(train_feats, test_feats, lgb_params)
test_preds_lgbm, valid_preds_lgbm, final_rmse_lgbm, _ = lgbm_outputs
print(f'LGBM completed: {final_rmse_lgbm:.4f}')

xgb_outputs = xgb_pipeline(train_feats, test_feats, xgb_params)
test_preds_xgb, valid_preds_xgb, final_rmse_xgb, _ = xgb_outputs
print(f'XGB completed: {final_rmse_xgb:.4f}')

catboost_outputs = catboost_pipeline(train_feats, test_feats, catboost_params)
test_preds_cat, valid_preds_cat, final_rmse_cat, _ = catboost_outputs
print(f'Catboost completed: {final_rmse_cat:.4f}')

ridge_outputs = ridge_pipeline(train_feats, test_feats, ridge_params)
test_preds_ridge, valid_preds_ridge, final_rmse_ridge, _ = ridge_outputs
print(f'Ridge completed: {final_rmse_ridge:.4f}')

LGBM completed: 0.6034
XGB completed: 0.6048
Catboost completed: 0.6106
Ridge completed: 0.6795


In [None]:
import itertools

# Validation predictions per model. Each frame must provide the 'id', 'score'
# and 'preds' columns used by the groupby below.
valid_preds = {
    'xgboost': valid_preds_xgb,
    'lgbm': valid_preds_lgbm,
    'catboost': valid_preds_cat,
    'ridge': valid_preds_ridge,
}

# Test predictions per model, consumed by the submission cell.
test_preds = {
    'xgboost': test_preds_xgb,
    'lgbm': test_preds_lgbm,
    'catboost': test_preds_cat,
    'ridge': test_preds_ridge,
}


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error as a plain float.

    Taking the square root of the MSE explicitly avoids the `squared=False`
    keyword, which newer scikit-learn releases deprecate and then remove.
    """
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# Baseline: unweighted mean of all models' validation predictions per essay.
simple_avg_df = pd.concat(valid_preds).groupby(['id','score'])['preds'].mean().reset_index()
target = simple_avg_df['score']

baseline_rmse = _rmse(target, simple_avg_df['preds'])
best_rmse = baseline_rmse
# Start from the baseline ensemble (all models, equal weights) so these names
# are always defined, even when no grid point improves on the simple average.
best_combination = tuple(valid_preds)
best_weights = (1.0,) * len(valid_preds)
print(f"Baseline RMSE with simple average: {baseline_rmse}")

# Candidate weight values, computed once instead of once per subset.
weight_grid = np.linspace(0.1, 1.0, 10)

# Exhaustive search over every non-empty model subset and every weight tuple.
for subset_size in range(1, len(valid_preds) + 1):
    for subset in itertools.combinations(valid_preds, subset_size):
        model_subset = {model: valid_preds[model] for model in subset}

        for weights in itertools.product(weight_grid, repeat=len(subset)):
            # NOTE(review): assumes calculate_weighted_avg returns rows in the
            # same order as simple_avg_df - confirm against its definition.
            weighted_avg = calculate_weighted_avg(weights, model_subset)
            rmse = _rmse(target, weighted_avg)
            if rmse < best_rmse:
                best_rmse = rmse
                best_combination = subset
                best_weights = weights

print(f"Best RMSE: {best_rmse}")
print(f"Best Model Combination: {best_combination}")
print(f"Best Weights: {best_weights}")

Baseline RMSE with simple average: 0.6040003146261572
Best RMSE: 0.6006615399775729
Best Model Combination: ('xgboost', 'lgbm', 'catboost', 'ridge')
Best Weights: (0.4, 0.7000000000000001, 0.2, 0.1)


In [16]:
def apply_weights_to_test(test_preds, models_used, weights):
    """
    Blend per-model test predictions into one weighted-average score per row.

    Parameters
    ----------
    test_preds : dict
        Maps model name to a DataFrame with at least 'id' and 'score' columns.
        Frames are combined by index, so they must share the same row order.
    models_used : iterable of str
        Names of the models to blend; each must be a key of ``test_preds``.
    weights : iterable of float
        One weight per entry of ``models_used``. Weights are normalised by
        their sum, so they do not need to add up to 1.

    Returns
    -------
    pandas.DataFrame
        Columns 'id' (taken from the first model's frame) and 'weighted_score'.

    Raises
    ------
    ValueError
        If no model is given, if the number of weights differs from the number
        of models, or if the weights sum to zero.
    """
    models_used = list(models_used)
    weights = list(weights)

    if not models_used:
        raise ValueError("models_used must contain at least one model")
    # zip() would silently drop the surplus entries while the normalisation
    # below still divides by the sum of *all* weights, so reject mismatches.
    if len(models_used) != len(weights):
        raise ValueError(
            f"got {len(models_used)} models but {len(weights)} weights"
        )
    total_weight = float(np.sum(weights))
    if total_weight == 0:
        raise ValueError("weights must sum to a non-zero value")

    weighted_predictions_df = test_preds[models_used[0]][['id']].copy()
    # Float accumulator, independent of the dtype of the 'score' column.
    weighted_predictions_df['weighted_score'] = 0.0

    for model, weight in zip(models_used, weights):
        print(f"Applying weight {weight} to model {model}")
        weighted_predictions_df['weighted_score'] += test_preds[model]['score'] * weight

    weighted_predictions_df['weighted_score'] /= total_weight
    return weighted_predictions_df

# Blend the test predictions with the selected models and weights, relabel the
# two resulting columns as 'id' / 'score', write the submission file and echo it.
weighted_test_predictions_df = (
    apply_weights_to_test(test_preds, best_combination, best_weights)
    .set_axis(['id', 'score'], axis=1)
)
weighted_test_predictions_df.to_csv('submission.csv', index=False)
print(weighted_test_predictions_df)

Applying weight 0.4 to model xgboost
Applying weight 0.7000000000000001 to model lgbm
Applying weight 0.2 to model catboost
Applying weight 0.1 to model ridge
         id     score
0  0000aaaa  1.931795
1  2222bbbb  1.481453
2  4444cccc  1.479283


In [17]:
# Display the final blended predictions frame as the cell output.
weighted_test_predictions_df

Unnamed: 0,id,score
0,0000aaaa,1.931795
1,2222bbbb,1.481453
2,4444cccc,1.479283
