In [1]:
import pandas as pd
import polars as pl
import numpy as np
import re
from joblib import Parallel, delayed

from sklearn.feature_extraction.text import CountVectorizer
from scipy.stats import skew, kurtosis
from m4_feats_polars import *
from m5_sb_models import *
from m5_nn_models import *

In [2]:
lgb_params = {
    'boosting_type': 'gbdt', 
    'metric': 'rmse',
    'reg_alpha': 0.0031, 
    'reg_lambda': 0.001, 
    'colsample_bytree': 0.8,  
    'subsample_freq': 1,  
    'subsample': 0.75,  
    'learning_rate': 0.017, 
    'num_leaves': 19, 
    'min_child_samples': 46,
    'n_estimators': 350,
    'verbosity': -1
    }

xgb_params = {
    'alpha': 1,
    'colsample_bytree': 0.8,
    'gamma': 1.5,
    'learning_rate': 0.05,
    'max_depth': 4,
    'min_child_weight': 10,
    'subsample': 0.8,
    'device': 'cuda',
    'n_estimators': 225 
    }

catboost_params = {
    'iterations': 275, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'loss_function': 'RMSE', 
    'od_wait': 20, 
    'od_type': 'Iter', 
    'verbose': False, 
    'metric_period': 50, 
    'eval_metric': 'RMSE', 
    'bagging_temperature': 0.2
}

svr_params = {
    'C': 1.0, 
    'cache_size': 200, 
    'coef0': 0.0, 
    'degree': 3, 
    'epsilon': 0.1, 
    'gamma': 'scale', 
    'kernel': 'rbf', 
    'max_iter': -1, 
    'shrinking': True, 
    'tol': 0.001, 
    'verbose': False}

ridge_params = {'alpha':110}

data_path     = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs    = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs    = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

In [3]:
# PANDAS FEATS
train_essays          = get_essay_df(train_logs.collect().to_pandas())
test_essays           = get_essay_df(test_logs.collect().to_pandas())

tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_word_c_acc, ts_word_c_acc = word_count_acceleration(train_logs, test_logs)
tr_rem_words_time_spent, ts_rem_words_time_spent = remove_words_time_spent(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs, 1)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)

train_feats = tr_down_events_counts.join(tr_vect_one, on='id', how='left')
train_feats = train_feats.join(tr_pauses, on='id', how='left')
train_feats = train_feats.join(tr_word_c_acc, on='id', how='left')
train_feats = train_feats.join(tr_rem_words_time_spent, on='id', how='left')
train_feats = train_feats.join(tr_vect_two, on='id', how='left')
train_feats = train_feats.join(tr_r_burst, on='id', how='left')
train_feats = train_feats.join(tr_cursor_pos_acc, on='id', how='left')
train_feats = train_feats.join(tr_nuni, on='id', how='left')
train_feats = train_feats.join(tr_remove_pause, on='id', how='left')
train_feats = train_feats.join(tr_word_wait, on='id', how='left')
train_feats = train_feats.join(tr_e_counts_roc, on='id', how='left')

test_feats = ts_down_events_counts.join(ts_vect_one, on='id', how='left')
test_feats = test_feats.join(ts_pauses, on='id', how='left')
test_feats = test_feats.join(ts_word_c_acc, on='id', how='left')
test_feats = test_feats.join(ts_rem_words_time_spent, on='id', how='left')
test_feats = test_feats.join(ts_vect_two, on='id', how='left')
test_feats = test_feats.join(ts_r_burst, on='id', how='left')
test_feats = test_feats.join(ts_cursor_pos_acc, on='id', how='left')
test_feats = test_feats.join(ts_nuni, on='id', how='left')
test_feats = test_feats.join(ts_remove_pause, on='id', how='left')
test_feats = test_feats.join(ts_word_wait, on='id', how='left')
test_feats = test_feats.join(ts_e_counts_roc, on='id', how='left')

train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

train_feats           = train_feats.merge(parag_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(parag_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(sent_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(sent_feats(test_essays), on='id', how='left')
train_feats           = train_feats.merge(word_feats(train_essays), on='id', how='left')
test_feats            = test_feats.merge(word_feats(test_essays), on='id', how='left')

train_feats           = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< word count acceleration >
< remove_words_time_spent >
< Count vectorize bi-grams >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< removed words pauses basic
< word_wait_shift >
< event_id rate of change >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 193)


In [4]:
import warnings
warnings.filterwarnings("ignore")

test_preds_lgbm, valid_preds_lgbm, final_rmse_lgbm, _ = lgb_pipeline(train_feats, test_feats, lgb_params)
print(f'LGBM completed: {final_rmse_lgbm:.4f}')
test_preds_lgbm_w, valid_preds_lgbm_w, final_rmse_lgbm_w, _ = lgb_w_pipeline(train_feats, test_feats, lgb_params)
print(f'LGBM weights completed: {final_rmse_lgbm_w:.4f}')
test_preds_xgb, valid_preds_xgb, final_rmse_xgb, _ = xgb_pipeline(train_feats, test_feats, xgb_params)
print(f'XGB completed: {final_rmse_xgb:.4f}')
test_preds_cat, valid_preds_cat, final_rmse_cat, _ = catboost_pipeline(train_feats, test_feats, catboost_params)
print(f'Catboost completed: {final_rmse_cat:.4f}')
test_preds_ridge, valid_preds_ridge, final_rmse_ridge, _ = ridge_pipeline(train_feats, test_feats, ridge_params)
print(f'Ridge completed: {final_rmse_ridge:.4f}')
oof_preds_automl, test_preds_automl, final_rmse_automl = automl_pipeline(train_feats, test_feats) 
print(f'NN Dense light completed: {final_rmse_automl:.4f}')


LGBM completed: 0.6059
LGBM weights completed: 0.6066
XGB completed: 0.6061
Catboost completed: 0.6102
Final RMSE over 50: 0.662327. Std 0.7668


NameError: name 'final_rmse_ridge' is not defined

In [None]:
# model_dir = '/kaggle/input/lw-automl-models'
# model_files = [f for f in os.listdir(model_dir) if f.endswith('.joblib')]
# models = []

# for model_file in model_files:
#     model_path = os.path.join(model_dir, model_file)
#     model = joblib.load(model_path)
#     models.append(model)

# automl_predictions = []

# for model in models:
#     pred = model.predict(test_feats)
#     automl_predictions.append(pred)
    
# test_preds_stack = np.stack([p.data[:, 0] for p in automl_predictions])
# test_preds_mean = np.mean(test_preds_stack, axis=0)

In [95]:
import numpy as np
import itertools
from sklearn.metrics import mean_squared_error

models = {
    'xgboost': valid_preds_xgb,
    'lgbm': valid_preds_lgbm,
    'catboost': valid_preds_cat,
    'lgbm_w': valid_preds_lgbm_w,
    'ridge': valid_preds_ridge,
    'automl': oof_preds_automl,
}

simple_avg_df = pd.concat(models).groupby(['id','score'])['preds'].mean().reset_index()
mean_squared_error(simple_avg_df['score'], simple_avg_df['preds'], squared=False)

baseline_rmse = mean_squared_error(simple_avg_df['score'], simple_avg_df['preds'], squared=False)
best_rmse = baseline_rmse
print(f"Baseline RMSE with simple average: {baseline_rmse}")

for L in range(1, len(models) + 1):
    for subset in itertools.combinations(models, L):
        model_subset = {model: models[model] for model in subset}

        for weights in itertools.product(np.linspace(0.1, 1.0, 10), repeat=len(subset)):
            weighted_avg = calculate_weighted_avg(weights, model_subset)
            rmse = mean_squared_error(simple_avg_df['score'], weighted_avg, squared=False)
            if rmse < best_rmse:
                best_rmse = rmse
                best_combination = subset
                best_weights = weights

print(f"Best RMSE: {best_rmse}")
print(f"Best Model Combination: {best_combination}")
print(f"Best Weights: {best_weights}")

Baseline RMSE with simple average: 0.6002777923179674
Best RMSE: 0.5957565255661937
Best Model Combination: ('xgboost', 'catboost', 'lgbm_w', 'automl')
Best Weights: (0.30000000000000004, 0.1, 0.2, 0.8)


In [72]:
### TARGET BLENDS ###

import numpy as np
import itertools
from sklearn.metrics import mean_squared_error

def segment_predictions_by_score(models, lower_bound, upper_bound):

    segmented_models = {}
    for k, v in models.items():
        mask = (v['preds'] >= lower_bound) & (v['preds'] <= upper_bound)
        x = v[mask]
        segmented_models[k] = x
    return segmented_models

def calculate_weighted_avg(weights, model_subset):

    weighted_preds = sum(model['preds'] * weight for model, weight in zip(model_subset.values(), weights))
    return weighted_preds / sum(weights)

def find_best_weights_for_segment(models):
    best_rmse = float('inf')
    best_combination = None
    best_weights = None

    # Extract true values from any model's DataFrame
    true_values_segment = next(iter(models.values()))['score']

    for L in range(1, len(models) + 1):
        for subset in itertools.combinations(models, L):
            model_subset = {model: models[model]['preds'] for model in subset}

            for weights in itertools.product(np.linspace(0.1, 1.0, 10), repeat=len(subset)):
                weighted_avg = np.average([model_subset[model] for model in subset], 
                                          weights=weights, axis=0)
                rmse = mean_squared_error(true_values_segment, weighted_avg, squared=False)
                if rmse < best_rmse:
                    best_rmse = rmse
                    best_combination = subset
                    best_weights = weights
    
    return best_rmse, best_combination, best_weights

# Segment the datasets
segment1_models = segment_predictions_by_score(models, lower_bound=0, upper_bound=2.5)
segment2_models = segment_predictions_by_score(models, lower_bound=5, upper_bound=6.0)
segment3_models = segment_predictions_by_score(models, lower_bound=3, upper_bound=4.5)

# Find the best weights for each segment
best_weights_segment1 = find_best_weights_for_segment(segment1_models)
best_weights_segment2 = find_best_weights_for_segment(segment2_models)
best_weights_segment3 = find_best_weights_for_segment(segment3_models)

# Print best weights for each segment
print("Best Weights for Segment 1:", best_weights_segment1)
print("Best Weights for Segment 2:", best_weights_segment2)
print("Best Weights for Segment 3:", best_weights_segment3)

# Optional: Apply these weights to the corresponding segments of your validation or test dataset

Best Weights for Segment 1: (1.154713207676848, ('lgbm_w',), (0.1,))
Best Weights for Segment 2: (0.8705831743229661, ('lgbm_w', 'automl'), (0.6, 0.8))
Best Weights for Segment 3: (0.5171814898166595, ('xgboost', 'catboost', 'ridge', 'automl'), (0.9, 0.4, 0.30000000000000004, 0.7000000000000001))


In [81]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Function to apply weights to a segment
def apply_segment_weights(segment_models, models_used, weights):
    """
    Apply weights to the models' predictions in a segment.
    """
    # Initialize an array of zeros for the segment predictions
    segment_predictions = np.zeros_like(next(iter(segment_models.values()))['score'])

    for model, weight in zip(models_used, weights):
        segment_predictions += segment_models[model]['preds'] * weight

    return segment_predictions / sum(weights)

# Extract model names and weights for each segment
_, models_used_segment1, weights_segment1 = best_weights_segment1
_, models_used_segment2, weights_segment2 = best_weights_segment2
_, models_used_segment3, weights_segment3 = best_weights_segment3

# Apply weights to each segment
segment1_predictions = apply_segment_weights(segment1_models, models_used_segment1, weights_segment1)
segment2_predictions = apply_segment_weights(segment2_models, models_used_segment2, weights_segment2)
segment3_predictions = apply_segment_weights(segment3_models, models_used_segment3, weights_segment3)

# Prepare an array for combined predictions
combined_predictions = np.zeros_like(train_scores['score'].values)

# Fill the combined predictions with segment predictions
# Ensure that indices in train_scores match those in each segment
combined_predictions[segment1_models[next(iter(segment1_models))].index] = segment1_predictions
combined_predictions[segment2_models[next(iter(segment2_models))].index] = segment2_predictions
combined_predictions[segment3_models[next(iter(segment3_models))].index] = segment3_predictions

# Calculate the final CV score
final_cv_score = mean_squared_error(train_scores['score'].values, combined_predictions, squared=False)
print(f"Final CV Score with segmented weights: {final_cv_score}")

Final CV Score with segmented weights: 0.5892737382828971


In [101]:
def segment_test_predictions(models, lower_bound, upper_bound):
    """
    Segments the test predictions based on predicted score ranges.
    """
    segmented_models = {}
    for model_name, preds_df in models.items():
        segmented_models[model_name] = preds_df[(preds_df['score'] >= lower_bound) & (preds_df['score'] <= upper_bound)]
    return segmented_models

def apply_test_segment_weights(segment_models, models_used, weights):
    """
    Apply weights to the models' predictions in a test segment and return a DataFrame.
    """
    # Initialize a DataFrame to store the weighted predictions
    weighted_predictions_df = segment_models[next(iter(models_used))][['id']].copy()
    weighted_predictions_df['weighted_score'] = np.zeros_like(segment_models[next(iter(models_used))]['score'])

    for model, weight in zip(models_used, weights):
        weighted_predictions_df['weighted_score'] += segment_models[model]['score'] * weight

    weighted_predictions_df['weighted_score'] /= sum(weights)
    return weighted_predictions_df



test_predictions = {

    'xgboost': test_preds_xgb,
    'lgbm': test_preds_lgbm,
    'catboost': test_preds_cat,
    'lgbm_w': test_preds_lgbm_w,
    'ridge': test_preds_ridge,
    'automl': test_preds_automl,

}

# Segment test data
test_segment1 = segment_test_predictions(test_predictions, lower_bound=0.0, upper_bound=2.5)
test_segment2 = segment_test_predictions(test_predictions, lower_bound=5.0, upper_bound=6.0)
test_segment3 = segment_test_predictions(test_predictions, lower_bound=3.0, upper_bound=4.5)

# Apply weights to each test segment
test_segment1_df = apply_test_segment_weights(test_segment1, models_used_segment1, weights_segment1)
test_segment2_df = apply_test_segment_weights(test_segment2, models_used_segment2, weights_segment2)
test_segment3_df = apply_test_segment_weights(test_segment3, models_used_segment3, weights_segment3)

# Concatenate the DataFrames
combined_test_predictions_df = pd.concat([test_segment1_df, test_segment2_df, test_segment3_df])
combined_test_predictions_df = combined_test_predictions_df.sort_values(by='id')

combined_test_predictions_df.to_csv('submission.csv', index=False)
combined_test_predictions_df

In [None]:
# # Averaging test predictions for each model
# avg_test_preds_lgbm = average_test_predictions(test_preds_lgbm)
# avg_test_preds_xgb = average_test_predictions(test_preds_xgb)
# avg_test_preds_cat = average_test_predictions(test_preds_cat)
# avg_test_preds_svr = average_test_predictions(test_preds_svr)
# avg_test_preds_ridge = average_test_predictions(test_preds_ridge)
# avg_test_preds_automl = average_test_predictions(test_preds_automl)

# # Dictionary of averaged test predictions
# test_predictions = {
#     'xgboost': avg_test_preds_xgb,
#     'lgbm': avg_test_preds_lgbm,
#     'catboost': avg_test_preds_cat,
#     'svr': avg_test_preds_svr,
#     'ridge': avg_test_preds_ridge
#     'automl': avg_test_preds_automl
# }

# Baseline RMSE with simple average: 0.602766801612399
# Best RMSE: 0.5971908162938702
# Best Model Combination: ('xgboost', 'lgbm', 'catboost', 'ridge', 'automl')
# Best Weights: (0.7000000000000001, 1.0, 0.7000000000000001, 0.1, 1.0)


# Baseline RMSE with simple average: 0.6002777923179674
# Best RMSE: 0.5957565286697363
# Best Model Combination: ('xgboost', 'catboost', 'lgbm_w', 'automl')
# Best Weights: (0.30000000000000004, 0.1, 0.2, 0.8)