In [1]:
import pandas as pd
import polars as pl
import numpy as np
import re
from joblib import Parallel, delayed

from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import skew, kurtosis
from m4_feats_polars import *
from m5_sb_models import *
from m5_nn_models import *

In [2]:
lgb_params = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 425,
    'verbosity': -1
    }

xgb_params = {
    'alpha': 1,
    'colsample_bytree': 0.8,
    'gamma': 1.5,
    'learning_rate': 0.05,
    'max_depth': 4,
    'min_child_weight': 10,
    'subsample': 0.8,
    'device': 'cuda',
    'n_estimators': 400 
    }

catboost_params = {
    'iterations': 250, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'od_wait': 20, 
    'od_type': 'Iter', 
    'verbose': False, 
    'metric_period': 50, 
    'eval_metric': 'RMSE', 
    'bagging_temperature': 0.2
}

svr_params = {
    'C': 1.0, 
    'cache_size': 200, 
    'coef0': 0.0, 
    'degree': 3, 
    'epsilon': 0.1, 
    'gamma': 'scale', 
    'kernel': 'rbf', 
    'max_iter': -1, 
    'shrinking': True, 
    'tol': 0.001, 
    'verbose': False}

ridge_params = {'alpha':325}

data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

In [3]:
# PANDAS FEATS
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_word_c_acc, ts_word_c_acc = word_count_acceleration(train_logs, test_logs)
tr_rem_words_time_spent, ts_rem_words_time_spent = remove_words_time_spent(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs, 1)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_word_c_acc, on='id', how='left')
train_feats = train_feats.join(tr_rem_words_time_spent, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_nuni, on='id', how='left')
train_feats = train_feats.join(tr_remove_pause, on='id', how='left')
train_feats = train_feats.join(tr_word_wait, on='id', how='left')
train_feats = train_feats.join(tr_e_counts_roc, on='id', how='left')

test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_word_c_acc, on='id', how='left')
test_feats = test_feats.join(ts_rem_words_time_spent, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_nuni, on='id', how='left')
test_feats = test_feats.join(ts_remove_pause, on='id', how='left')
test_feats = test_feats.join(ts_word_wait, on='id', how='left')
test_feats = test_feats.join(ts_e_counts_roc, on='id', how='left')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(word_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')

tr_sent_df = split_essays_into_sentences(train_essays)
ts_sent_df = split_essays_into_sentences(test_essays)

train_feats           = train_feats.merge(sent_long_word_count(tr_sent_df), on='id', how='left')
test_feats            = test_feats.merge(sent_long_word_count(ts_sent_df), on='id', how='left')

train_feats           = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< word count acceleration >
< remove_words_time_spent >
< Count vectorize bi-grams >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< removed words pauses basic
< word_wait_shift >
< event_id rate of change >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 185)


In [4]:
import warnings
warnings.filterwarnings("ignore")

test_preds_lgbm, valid_preds_lgbm, final_rmse_lgbm, _ = lgb_pipeline(train_feats, test_feats, lgb_params)
print(f'LGBM completed: {final_rmse_lgbm:.4f}')
test_preds_lgbm_w, valid_preds_lgbm_w, final_rmse_lgbm_w, _ = lgb_w_pipeline(train_feats, test_feats, lgb_params)
print(f'LGBM weights completed: {final_rmse_lgbm_w:.4f}')
test_preds_xgb, valid_preds_xgb, final_rmse_xgb, _ = xgb_pipeline(train_feats, test_feats, xgb_params)
print(f'XGB completed: {final_rmse_xgb:.4f}')
test_preds_cat, valid_preds_cat, final_rmse_cat, _ = catboost_pipeline(train_feats, test_feats, catboost_params)
print(f'Catboost completed: {final_rmse_cat:.4f}')
test_preds_ridge, valid_preds_ridge, final_rmse_ridge, _ = ridge_pipeline(train_feats, test_feats, ridge_params)
print(f'Ridge completed: {final_rmse_ridge:.4f}')
valid_preds_automl, test_preds_automl, final_rmse_automl = automl_pipeline(train_feats, test_feats) 
print(f'NN Dense light completed: {final_rmse_automl:.4f}')

LGBM completed: 0.6046
LGBM weights completed: 0.6049
XGB completed: 0.6036
Catboost completed: 0.6101
Ridge completed: 0.6844
NN Dense light completed: 0.6045


In [5]:
# model_dir = '/kaggle/input/lw-automl-models'
# model_files = [f for f in os.listdir(model_dir) if f.endswith('.joblib')]
# models = []

# for model_file in model_files:
#     model_path = os.path.join(model_dir, model_file)
#     model = joblib.load(model_path)
#     models.append(model)

# automl_predictions = []

# for model in models:
#     pred = model.predict(test_feats)
#     automl_predictions.append(pred)
    
# test_preds_stack = np.stack([p.data[:, 0] for p in automl_predictions])
# test_preds_mean = np.mean(test_preds_stack, axis=0)

In [None]:
valid_preds = {
    'xgboost': valid_preds_xgb,
    'lgbm': valid_preds_lgbm,
    'catboost': valid_preds_cat,
    'lgbm_w': valid_preds_lgbm_w,
    'ridge': valid_preds_ridge,
    'automl': valid_preds_automl,
}


test_preds= {

    'xgboost': test_preds_xgb,
    'lgbm': test_preds_lgbm,
    'catboost': test_preds_cat,
    'lgbm_w': test_preds_lgbm_w,
    'ridge': test_preds_ridge,
    'automl': test_preds_automl,

}

# import pickle

# with open('valid_preds.pkl', 'wb') as file:
#     pickle.dump(valid_preds, file)

# with open('test_preds.pkl', 'wb') as file:
#     pickle.dump(test_preds, file)

In [3]:
# import pickle
# with open('valid_preds.pkl', 'rb') as file:
#     valid_preds = pickle.load(file)

# with open('test_preds.pkl', 'rb') as file:
#     test_preds = pickle.load(file)

In [58]:
import itertools

simple_avg_df = pd.concat(valid_preds).groupby(['id','score'])['preds'].mean().reset_index()
mean_squared_error(simple_avg_df['score'], simple_avg_df['preds'], squared=False)

baseline_rmse = mean_squared_error(simple_avg_df['score'], simple_avg_df['preds'], squared=False)
best_rmse = baseline_rmse
print(f"Baseline RMSE with simple average: {baseline_rmse}")

for L in range(1, len(valid_preds) + 1):
    for subset in itertools.combinations(valid_preds, L):
        model_subset = {model: valid_preds[model] for model in subset}

        for weights in itertools.product(np.linspace(0.1, 1.0, 10), repeat=len(subset)):
            weighted_avg = calculate_weighted_avg(weights, model_subset)
            rmse = mean_squared_error(simple_avg_df['score'], weighted_avg, squared=False)
            if rmse < best_rmse:
                best_rmse = rmse
                best_combination = subset
                best_weights = weights

print(f"Best RMSE: {best_rmse}")
print(f"Best Model Combination: {best_combination}")
print(f"Best Weights: {best_weights}")

# Baseline RMSE with simple average: 0.5989250470999172
# Best RMSE: 0.5931609278035441
# Best Model Combination: ('xgboost', 'lgbm_w', 'automl')
# Best Weights: (0.2, 0.1, 0.4)

Baseline RMSE with simple average: 0.5989250470999172
Best RMSE: 0.5931609278035441
Best Model Combination: ('xgboost', 'lgbm_w', 'automl')
Best Weights: (0.2, 0.1, 0.4)


In [17]:
def segment_predictions_by_score(valid_preds, lower_bound, upper_bound):
    # Determine the segmentation indices from the 'preds' of the first model
    first_model_key = next(iter(valid_preds))
    mask = (valid_preds[first_model_key]['preds'] >= lower_bound) & (valid_preds[first_model_key]['preds'] <= upper_bound)
    segmented_models = {}
    for k, v in valid_preds.items():
        # Apply the same mask to all models
        segmented_models[k] = v[mask]
    return segmented_models


def calculate_weighted_avg(weights, model_subset):
    weighted_preds = sum(model['preds'] * weight for model, weight in zip(model_subset.values(), weights))
    return weighted_preds / sum(weights)

def find_best_weights_for_segment(segmented_models):
    best_rmse = float('inf')
    best_combination = None
    best_weights = None

    # Extract true values from the first model's DataFrame in the segmented models
    true_values_segment = next(iter(segmented_models.values()))['score']

    for L in range(1, len(segmented_models) + 1):
        for subset in itertools.combinations(segmented_models, L):
            model_subset = {model: segmented_models[model] for model in subset}

            for weights in itertools.product(np.linspace(0.1, 1.0, 10), repeat=len(subset)):
                weighted_avg = np.average([model_subset[model]['preds'] for model in subset], weights=weights, axis=0)
                rmse = mean_squared_error(true_values_segment, weighted_avg, squared=False)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_combination = subset
                    best_weights = weights
    
    return best_rmse, best_combination, best_weights

# Segment the datasets
segment1_models = segment_predictions_by_score(valid_preds, 0, 2.5)
segment2_models = segment_predictions_by_score(valid_preds, 5, 6.0)
segment3_models = segment_predictions_by_score(valid_preds, 2.5, 5)

# Find the best weights for each segment
best_weights_segment1 = find_best_weights_for_segment(segment1_models)
best_weights_segment2 = find_best_weights_for_segment(segment2_models)
best_weights_segment3 = find_best_weights_for_segment(segment3_models)

# Print best weights for each segment
print("Best Weights for Segment 1:", best_weights_segment1)
print("Best Weights for Segment 2:", best_weights_segment2)
print("Best Weights for Segment 3:", best_weights_segment3)

# Best Weights for Segment 1: (0.6409496887359202, ('lgbm_w', 'automl'), (0.7000000000000001, 0.5))
# Best Weights for Segment 2: (0.6896785907049993, ('xgboost', 'lgbm', 'automl'), (1.0, 0.2, 0.5))
# Best Weights for Segment 3: (0.58313018442317, ('xgboost', 'lgbm_w', 'ridge', 'automl'), (0.5, 0.1, 0.1, 1.0))

Best Weights for Segment 1: (0.6409496887359202, ('lgbm_w', 'automl'), (0.7000000000000001, 0.5))
Best Weights for Segment 2: (0.6896785907049993, ('xgboost', 'lgbm', 'automl'), (1.0, 0.2, 0.5))
Best Weights for Segment 3: (0.58313018442317, ('xgboost', 'lgbm_w', 'ridge', 'automl'), (0.5, 0.1, 0.1, 1.0))


In [24]:
def apply_weights_to_segment(segment_models, models_used, weights):
    # Use the DataFrame structure to maintain indices
    segment_predictions_df = pd.DataFrame(index=segment_models[next(iter(models_used))].index)
    segment_predictions_df['weighted_preds'] = np.zeros_like(segment_models[next(iter(models_used))]['preds'])

    for model, weight in zip(models_used, weights):
        segment_predictions_df['weighted_preds'] += segment_models[model]['preds'] * weight
    segment_predictions_df['weighted_preds'] /= sum(weights)
    return segment_predictions_df


# Apply weights to each validation segment
segment1_valid_df = apply_weights_to_segment(segment1_models, best_weights_segment1[1], best_weights_segment1[2])
segment2_valid_df = apply_weights_to_segment(segment2_models, best_weights_segment2[1], best_weights_segment2[2])
segment3_valid_df = apply_weights_to_segment(segment3_models, best_weights_segment3[1], best_weights_segment3[2])

# Combine the DataFrames
combined_valid_df = pd.concat([segment1_valid_df, segment2_valid_df, segment3_valid_df])
combined_valid_df_sorted = combined_valid_df.sort_index()

final_cv_rmse = mean_squared_error(train_scores.collect().to_pandas()['score'], combined_valid_df_sorted['weighted_preds'], squared=False)
print(f"Final CV RMSE: {final_cv_rmse}")

Final CV RMSE: 0.5924647034989626


In [43]:
def segment_test_predictions(models, lower_bound, upper_bound):
    """
    Segments the test predictions based on predicted score ranges.
    """
    segmented_models = {}
    for model_name, preds_df in models.items():
        segmented_models[model_name] = preds_df[(preds_df['score'] >= lower_bound) & (preds_df['score'] <= upper_bound)]
    return segmented_models

def apply_test_segment_weights(segment_models, models_used, weights):
    """
    Apply weights to the models' predictions in a test segment and return a DataFrame.
    """
    # Ensure weights sum is not zero to avoid division by zero
    total_weight = sum(weights)
    if total_weight == 0:
        raise ValueError("Sum of weights cannot be zero.")

    weighted_predictions_df = segment_models[next(iter(models_used))][['id']].copy()
    weighted_predictions_df['weighted_score'] = np.zeros_like(segment_models[next(iter(models_used))]['score'])

    for model, weight in zip(models_used, weights):
        # Check if the model is in the segment and has valid scores
        if model in segment_models and not segment_models[model]['score'].isnull().all():
            weighted_predictions_df['weighted_score'] += segment_models[model]['score'] * weight
        else:
            # If a model is missing or has invalid scores, handle accordingly
            print(f"Model {model} missing or has invalid scores in the segment.")

    weighted_predictions_df['weighted_score'] /= total_weight
    return weighted_predictions_df

test_segment1 = segment_test_predictions(test_preds, -10, 2.5)
test_segment2 = segment_test_predictions(test_preds, 5, 10)
test_segment3 = segment_test_predictions(test_preds, 2.5, 5)

test_segment1_df = apply_test_segment_weights(test_segment1, best_weights_segment1[1], best_weights_segment1[2])
test_segment2_df = apply_test_segment_weights(test_segment2, best_weights_segment2[1], best_weights_segment2[2])
test_segment3_df = apply_test_segment_weights(test_segment3, best_weights_segment3[1], best_weights_segment3[2])

Model xgboost missing or has invalid scores in the segment.
Model lgbm missing or has invalid scores in the segment.
Model automl missing or has invalid scores in the segment.
Model lgbm_w missing or has invalid scores in the segment.
Model automl missing or has invalid scores in the segment.


In [51]:
test_preds = pd.concat([test_segment1_df, test_segment2_df, test_segment3_df], axis=0)
test_preds = test_preds.groupby('id')['weighted_score'].mean().reset_index()
test_preds.columns = ['id', 'score']
test_preds.to_csv('submission.csv', Index=False)