In [1]:
import pandas as pd
import numpy as np
import polars as pl

from m4_feats_polars import *
from m5_sb_models import *

# LightGBM hyper-parameters for the gbdt model; handed to cv_pipeline further down.
lgb_params_1 = dict(
    boosting_type='gbdt',
    metric='rmse',
    reg_alpha=0.0031,
    reg_lambda=0.001,
    colsample_bytree=0.8,
    subsample_freq=1,
    subsample=0.75,
    learning_rate=0.017,
    num_leaves=19,
    min_child_samples=46,
    n_estimators=400,
    verbosity=-1,
    force_col_wise=True,
)

# Build lazy scans of the competition CSVs (they are `.collect()`-ed later on).
# `data_path` already ends with '/', so file names are appended directly to
# avoid a doubled '//' separator in the resulting paths.
data_path    = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs   = pl.scan_csv(f'{data_path}train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}train_scores.csv')

In [2]:
# Feature-set file names, one per line. Kept for reference: nothing in the
# cells shown here reads this list.
Best_Feature_Set = [
    'train_down_events_counts.pkl',
    'train_vector_one_gram.pkl',
    'train_create_pauses.pkl',
    'train_essay_paragraphs.pkl',
    'train_cursor_pos_acceleration.pkl',
    'train_word_count_acceleration.pkl',
    'train_vector_two_gram.pkl',
]

# Essay-level pandas frames, built from the fully materialised logs.
# (presumably one reconstructed essay per id — TODO confirm against get_essay_df,
# which comes from one of the star imports above.)
# NOTE(review): the logs are collected to pandas here, again for
# get_keys_pressed_per_second below, and once more at the end of this cell.
# A single materialisation could probably be shared — confirm the helpers do
# not mutate their input before changing this.
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

# Every helper below returns a (train, test) pair of per-id feature tables that
# are later left-joined on 'id'. The call order also fixes the order of the
# progress banners the helpers print, so keep it stable.

# Event count features, computed from the lazy logs.
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
# Count-vectorised one-grams, computed from the essay frames.
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
# Pause / idle-time features.
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
# Cursor-position acceleration features.
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
# Word-count acceleration features.
tr_word_count_acc, ts_word_count_acc = word_count_acceleration(train_logs, test_logs)
# Count-vectorised bi-grams, computed from the essay frames.
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
# R-burst features.
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
# Number-of-unique-values features for categorical columns.
tr_nunique, ts_nunique = categorical_nunique(train_logs, test_logs)
# Keys-pressed-per-second features. This helper is fed pandas frames, but its
# outputs are joined with the polars frames below, so it presumably returns
# polars objects — TODO confirm.
tr_get_keys, ts_get_keys = get_keys_pressed_per_second(train_logs.collect().to_pandas(), 
                                                       test_logs.collect().to_pandas())

# Left-join the per-id feature tables onto the event-count table. Train and
# test use the same join order, so the two lists must stay in lock-step.
_train_parts = [tr_vect_one, tr_pauses, tr_cursor_pos_acc, tr_word_count_acc,
                tr_vect_two, tr_r_burst, tr_nunique, tr_get_keys]
_test_parts = [ts_vect_one, ts_pauses, ts_cursor_pos_acc, ts_word_count_acc,
               ts_vect_two, ts_r_burst, ts_nunique, ts_get_keys]

train_feats = tr_down_events_counts
for _part in _train_parts:
    train_feats = train_feats.join(_part, on='id', how='left')

test_feats = ts_down_events_counts
for _part in _test_parts:
    test_feats = test_feats.join(_part, on='id', how='left')

# Materialise every lazy frame as pandas, rebinding the same names. The frames
# are collected in this order: logs, scores, then the joined feature tables.
train_logs, test_logs, train_scores, train_feats, test_feats = (
    _lazy.collect().to_pandas()
    for _lazy in (train_logs, test_logs, train_scores, train_feats, test_feats)
)

# Append the pandas-side paragraph and sentence features to both frames.
# For each helper, train is processed before test.
train_feats = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats  = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats  = test_feats.merge(sent_feats(test_essays), on='id', how='left')
print(f'train feats shape {train_feats.shape}')

# Give test every feature column that exists only in train, filled with NaN.
# Walking train's columns in order (instead of iterating a set difference)
# keeps the order of the added columns deterministic from run to run.
missing_cols = [col for col in train_feats.columns if col not in test_feats.columns]
for col in missing_cols:
    test_feats[col] = np.nan

# The target is attached only after the padding step, so 'score' is never
# added to test as a missing column.
train_feats = train_feats.merge(train_scores, on=['id'], how='left')


< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
< word count acceleration >
< Count vectorize bi-grams >
< R-burst features >
< Categorical # unique values features >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
train feats shape (2471, 136)


In [None]:
def lgb_full_train_set(train, test, param, iterations=50):
    """Fit one LightGBM regressor per seed on the full training frame and predict test.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain 'id' and 'score'; every other column is used as a feature.
    test : pandas.DataFrame
        Frame to predict on. It is aligned to the training feature columns by
        name: extra columns (including 'id') are ignored and training features
        absent from ``test`` are filled with NaN.
    param : dict
        Keyword arguments forwarded to ``LGBMRegressor``.
    iterations : int, default 50
        Number of models to fit; model ``k`` uses ``random_state = 42 + k``.

    Returns
    -------
    test_preds : list
        One prediction array per fitted model, in fitting order.
    model : LGBMRegressor or None
        The last fitted model, or ``None`` when ``iterations`` is not positive.
    """
    x = train.drop(columns=['id', 'score'])
    y = train['score'].values
    # Align by column name so the feature order seen at predict time always
    # matches the order used for fitting, whatever order test's columns are in.
    test_x = test.reindex(columns=x.columns)

    test_preds = []
    model = None  # stays None when the loop body never runs
    for run in range(iterations):
        # `LGBMRegressor` is expected to be in the notebook namespace
        # (presumably via one of the star imports — TODO confirm).
        model = LGBMRegressor(**param, random_state=42 + run)
        model.fit(x, y)
        test_preds.append(model.predict(test_x))

    return test_preds, model

In [6]:
import lightgbm as lgb
def run_lgb_cv(train_feats, test_feats, train_cols, target_col, lgb_params, boosting_type, seed, n_repeats, n_splits):
    """Run repeated K-fold cross-validation with LightGBM and predict the test set.

    Parameters
    ----------
    train_feats : pandas.DataFrame
        Training frame; must contain 'id', 'score', ``train_cols`` and ``target_col``.
    test_feats : pandas.DataFrame
        Test frame; must contain every column in ``train_cols``.
    train_cols : list
        Feature column names.
    target_col : str or list
        Target column selector applied to ``train_feats``.
    lgb_params : dict
        Keyword arguments for ``lgb.LGBMRegressor``. ``verbose`` and
        ``random_state`` are passed separately and must not be keys here.
    boosting_type : str
        Only compared against ``'dart'`` to decide whether early stopping is
        used; the model's own boosting type comes from ``lgb_params``.
    seed : int
        Repeat ``i`` shuffles folds with ``seed + i``; every model uses ``seed``.
    n_repeats, n_splits : int
        Number of K-fold repetitions and folds per repetition.

    Returns
    -------
    test_preds : list
        One test prediction array per fitted fold model.
    oof_results : pandas.DataFrame
        Columns 'id', 'iteration', 'score', 'prediction'; one row per
        validation sample per repetition.
    rmse : float
        RMSE of the out-of-fold predictions averaged over repetitions per id.
    model : lgb.LGBMRegressor
        The last fitted fold model.

    Raises
    ------
    ValueError
        If ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")

    X = train_feats[train_cols]
    y = train_feats[target_col]
    X_test = test_feats[train_cols]
    test_preds = []
    fold_frames = []  # per-fold OOF frames, concatenated once after the loops

    for i in range(n_repeats):
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=seed + i)

        # KFold yields positional indices, so rows are selected with .iloc;
        # .loc would be wrong whenever the index is not 0..n-1.
        for train_idx, valid_idx in kfold.split(X):
            X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
            X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

            model = lgb.LGBMRegressor(**lgb_params, verbose=-1, random_state=seed)
            if boosting_type != 'dart':
                # NOTE(review): the validation fold drives early stopping and is
                # also the fold being scored, so OOF scores may be optimistic.
                model.fit(X_train, y_train,
                          eval_set=[(X_valid, y_valid)],
                          callbacks=[lgb.early_stopping(250, first_metric_only=True, verbose=False)])
            else:
                model.fit(X_train, y_train)  # No early stopping for DART

            valid_predictions = model.predict(X_valid, num_iteration=model.best_iteration_)
            test_predictions = model.predict(X_test, num_iteration=model.best_iteration_)
            test_preds.append(test_predictions)

            # Copy before adding columns so the assignment never targets a
            # view of train_feats.
            fold_oof = train_feats.iloc[valid_idx][['id', 'score']].copy()
            fold_oof['prediction'] = valid_predictions
            fold_oof['iteration'] = i + 1
            fold_frames.append(fold_oof)

    oof_results = pd.concat(fold_frames)[['id', 'iteration', 'score', 'prediction']]

    avg_preds = oof_results.groupby(['id', 'score'])['prediction'].mean().reset_index()
    # Take the root explicitly: the `squared=False` keyword of
    # mean_squared_error is deprecated and absent from newer scikit-learn.
    rmse = np.sqrt(mean_squared_error(avg_preds['score'], avg_preds['prediction']))
    print(f"LGBM Average RMSE over {n_repeats * n_splits} folds: {rmse:.6f}")
    return test_preds, oof_results, rmse, model

def cv_pipeline(train_feats, test_feats, lgb_params, boosting_type, seed=42, n_repeats=5, n_splits=10):
    """Cross-validate LightGBM on all feature columns and report RMSE.

    Every column of ``train_feats`` other than 'id' and 'score' is treated as a
    feature. Features missing from ``test_feats`` are added to a copy of it as
    all-NaN columns before ``run_lgb_cv`` is called. After the CV run, the mean
    of the per-iteration RMSE values is printed.

    Returns the ``(test_preds, oof_preds, rmse, model)`` tuple produced by
    ``run_lgb_cv`` unchanged.
    """
    target_col = ['score']
    non_feature_cols = target_col + ['id']
    train_cols = [name for name in train_feats.columns if name not in non_feature_cols]

    # Pad test with NaN columns for the training features it lacks.
    absent = [name for name in train_cols if name not in test_feats.columns]
    nan_block = pd.DataFrame(dict.fromkeys(absent, np.nan), index=test_feats.index)
    test_feats = pd.concat([test_feats, nan_block], axis=1)

    cv_outputs = run_lgb_cv(
        train_feats=train_feats,
        test_feats=test_feats,
        train_cols=train_cols,
        target_col=target_col,
        lgb_params=lgb_params,
        boosting_type=boosting_type,
        seed=seed,
        n_repeats=n_repeats,
        n_splits=n_splits,
    )
    test_preds, oof_preds, rmse, model = cv_outputs

    # One RMSE per CV repetition, then their mean.
    per_repeat_rmse = oof_preds.groupby('iteration').apply(calculate_rmse)
    print(f'Mean RMSE of all iterations: {np.mean(per_repeat_rmse):.6f}')

    return test_preds, oof_preds, rmse, model

def calculate_rmse(df):
    """Return the RMSE between ``df['score']`` and ``df['prediction']``.

    The square root is taken explicitly because the ``squared=False`` keyword
    of ``mean_squared_error`` is deprecated and absent from newer scikit-learn.
    """
    return np.sqrt(mean_squared_error(df['score'], df['prediction']))

In [7]:
# Cross-validate the gbdt parameter set; seed, n_repeats and n_splits keep
# cv_pipeline's defaults.
test_preds, oof_preds, rmse, model = cv_pipeline(
    train_feats=train_feats,
    test_feats=test_feats,
    lgb_params=lgb_params_1,
    boosting_type='gbdt',
)

LGBM Average RMSE over 50 folds: 0.599216
Mean RMSE of all iterations: 0.602228


LGBM Average RMSE over 50 folds: 0.599971
Mean RMSE of all iterations: 0.602992

In [None]:
# Average the per-fold test predictions element-wise and write one score per
# test id to the submission file.
test_ids = test_feats['id']
y_pred = np.mean(test_preds, axis=0)

sub = pd.DataFrame(dict(id=test_ids, score=y_pred))
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as this cell's output.
sub

Unnamed: 0,id,score
0,2222bbbb,1.2405
1,0000aaaa,1.2405
2,4444cccc,1.244537
