In [1]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import joblib
from m4_feats_polars import *
from sklearn.metrics import mean_squared_error
from m5_sb_models import *
import polars as pl
import torch
import numpy as np

In [2]:
# utils

def map_class(x, task, reader):
    """Translate a label through ``reader`` for multiclass tasks.

    Args:
        x: The value to translate.
        task: Object exposing a ``name`` attribute.
        reader: Mapping indexed with ``x``; only consulted when
            ``task.name`` is ``'multiclass'``.

    Returns:
        ``reader[x]`` for a multiclass task, otherwise ``x`` unchanged.
    """
    return reader[x] if task.name == 'multiclass' else x


# Element-wise wrapper so ``map_class`` can be applied to whole arrays.
mapped = np.vectorize(map_class)

def score(task, y_true, y_pred):
    """Compute the evaluation metric that corresponds to the task type.

    Args:
        task: Object exposing a ``name`` attribute; one of ``'binary'``,
            ``'multiclass'``, ``'reg'`` or ``'multi:reg'``.
        y_true: Ground-truth targets.
        y_pred: Model predictions, passed straight to the metric.

    Returns:
        ROC AUC for ``'binary'``, log loss for ``'multiclass'`` and RMSE for
        ``'reg'`` / ``'multi:reg'``.

    Raises:
        ValueError: If ``task.name`` is not one of the supported names.
    """
    if task.name == 'binary':
        # Imported here because the notebook's import cell does not bring
        # this metric into scope explicitly.
        from sklearn.metrics import roc_auc_score
        return roc_auc_score(y_true, y_pred)
    if task.name == 'multiclass':
        from sklearn.metrics import log_loss
        return log_loss(y_true, y_pred)
    if task.name in ('reg', 'multi:reg'):
        # `squared=False` is deprecated in recent scikit-learn releases, so
        # the root is taken explicitly: RMSE per output column, then averaged.
        per_output_mse = mean_squared_error(y_true, y_pred, multioutput='raw_values')
        return np.sqrt(per_output_mse).mean()
    # Raising a plain string is itself a TypeError in Python 3; raise a real
    # exception so the caller sees the actual problem.
    raise ValueError(f'Task is not correct: {task.name!r}.')
        
def take_pred_from_task(pred, task):
    """Select the part of a prediction array that is relevant for the task.

    Args:
        pred: Two-dimensional prediction array (indexed as ``pred[:, 0]``).
        task: Object exposing a ``name`` attribute.

    Returns:
        The first column of ``pred`` for ``'binary'`` and ``'reg'`` tasks;
        ``pred`` unchanged for ``'multiclass'`` and ``'multi:reg'`` tasks.

    Raises:
        ValueError: If ``task.name`` is not one of the supported names.
    """
    if task.name in ('binary', 'reg'):
        return pred[:, 0]
    if task.name in ('multiclass', 'multi:reg'):
        return pred
    # Raising a plain string is itself a TypeError in Python 3; raise a real
    # exception so the caller sees the actual problem.
    raise ValueError(f'Task is not correct: {task.name!r}.')
        
def use_plr(USE_PLR):
    """Return ``"plr"`` when ``USE_PLR`` is truthy, otherwise ``"cont"``."""
    return "plr" if USE_PLR else "cont"

In [3]:
# ---------------------------------------------------------------------------
# Load the raw competition files as lazy polars scans.
# ---------------------------------------------------------------------------
data_path = 'kaggle/input/linking-writing-processes-to-writing-quality/'
train_logs = pl.scan_csv(f'{data_path}/train_logs.csv')
test_logs = pl.scan_csv(f'{data_path}/test_logs.csv')
train_scores = pl.scan_csv(f'{data_path}/train_scores.csv')

# Essay frames are derived from the logs after collecting them to pandas.
train_essays = get_essay_df(train_logs.collect().to_pandas())
test_essays = get_essay_df(test_logs.collect().to_pandas())

# ---------------------------------------------------------------------------
# Feature families. Each helper returns a (train, test) pair; the call order
# is kept fixed because the helpers print progress as they run.
# ---------------------------------------------------------------------------
tr_down_events_counts, ts_down_events_counts = down_events_counts(train_logs, test_logs)
tr_vect_one, ts_vect_one = countvectorize_one_one(train_essays, test_essays)
tr_pauses, ts_pauses = create_pauses(train_logs, test_logs)
tr_cursor_pos_acc, ts_cursor_pos_acc = cursor_pos_acceleration(train_logs, test_logs)
tr_r_burst, ts_r_burst = r_burst_feats(train_logs, test_logs)
tr_nuni, ts_nuni = categorical_nunique(train_logs, test_logs)
tr_e_counts_roc, ts_e_counts_roc = events_counts_rate_of_change(train_logs, test_logs, time_agg=3)
tr_wc_roc, ts_wc_roc = word_counts_rate_of_change(train_logs, test_logs)
tr_remove_pause, ts_remove_pause = remove_word_pauses(train_logs, test_logs)
tr_vect_two, ts_vect_two = countvectorize_two_one(train_essays, test_essays)
tr_word_wait, ts_word_wait = word_wait_shift(train_logs, test_logs, 1)
tr_rem_words_time_spent, ts_rem_words_time_spent = remove_words_time_spent(train_logs, test_logs)

# ---------------------------------------------------------------------------
# Left-join every feature table onto the event-count table, keyed by 'id'.
# The tuples list the tables in the order they are joined.
# ---------------------------------------------------------------------------
train_feats = tr_down_events_counts
for _tr_part in (
    tr_vect_one,
    tr_pauses,
    tr_cursor_pos_acc,
    tr_r_burst,
    tr_nuni,
    tr_e_counts_roc,
    tr_wc_roc,
    tr_remove_pause,
    tr_vect_two,
    tr_word_wait,
    tr_rem_words_time_spent,
):
    train_feats = train_feats.join(_tr_part, on='id', how='left')

test_feats = ts_down_events_counts
for _ts_part in (
    ts_vect_one,
    ts_pauses,
    ts_cursor_pos_acc,
    ts_r_burst,
    ts_nuni,
    ts_e_counts_roc,
    ts_wc_roc,
    ts_remove_pause,
    ts_vect_two,
    ts_word_wait,
    ts_rem_words_time_spent,
):
    test_feats = test_feats.join(_ts_part, on='id', how='left')

# Collect everything and switch to pandas for the remaining steps.
train_logs = train_logs.collect().to_pandas()
test_logs = test_logs.collect().to_pandas()
train_scores = train_scores.collect().to_pandas()
train_feats = train_feats.collect().to_pandas()
test_feats = test_feats.collect().to_pandas()

# Essay-level features, merged train-then-test for each helper in turn.
for _essay_feature_fn in (parag_feats, sent_feats, word_feats):
    train_feats = train_feats.merge(_essay_feature_fn(train_essays), on='id', how='left')
    test_feats = test_feats.merge(_essay_feature_fn(test_essays), on='id', how='left')

# Disabled joint preprocessing of train and test features, kept for reference:
# tr_ids = train_feats.id
# ts_ids = test_feats.id

# feats = pd.concat([train_feats,test_feats], axis=0)
# feats = preprocess_feats(feats)
# train_feats = feats[feats['id'].isin(tr_ids)]
# test_feats = feats[feats['id'].isin(ts_ids)]

# Attach the target scores to the training features.
train_feats = train_feats.merge(train_scores, on=['id'], how='left')
print(f'train feats shape {train_feats.shape}')

< Events counts features >
< Count vectorize one-grams >
< Idle time features >
< cursor position acceleration >
< R-burst features >
< Categorical # unique values features >
< event_id rate of change >
< Word counts rate of change features >
< removed words pauses basic
< Count vectorize bi-grams >
< word_wait_shift >
< remove_words_time_spent >
< Essays paragraphs feats >
< Essays paragraphs feats >
< Essays sentences feats >
< Essays sentences feats >
< Essays word feats >
< Essays word feats >
train feats shape (2471, 186)


In [45]:
# NOTE(review): in the visible notebook `oof_preds` is first assigned in a
# later cell, and this cell's execution count is higher than that cell's.
# Presumably it was run out of order; on a fresh top-to-bottom run there is
# nothing to display here yet. Consider moving it below the training cell.
oof_preds

array([4.5921516, 2.4715698, 3.1458085, ..., 3.7550766, 3.0411086,
       4.064627 ], dtype=float32)

In [42]:
# Train one LightAutoML model per batch size and collect the out-of-fold and
# test predictions of every run.
oof_preds = []
test_preds = []
ITERATIONS = 1
TRAIN_BS = [128,192,256,316,512]  # list(np.arange(64,64*6,64)) #[156,192,256,316,512] 

# NOTE(review): `snap_params` is defined but not passed to anything in this
# cell — confirm whether it is still needed.
snap_params = {'early_stopping': True, 'patience': 20, 'swa': True}
RANDOM_STATE = 42

# Run configuration. These values do not change between runs, so they are
# assigned once instead of on every loop iteration.
N_THREADS = 2
N_FOLDS = 10
TEST_SIZE = 0.15
VAL_SIZE = 0.15
TIMEOUT = 10000
ADVANCED_ROLES = False
USE_QNT = True
TASK = 'reg'
USE_PLR = True
USE_FS = True
TARGET_NAME = 'score'
FEATURE_RATIO = 0.8
algo = 'denselight'

for i in range(ITERATIONS):
    for b in TRAIN_BS:
        # The numpy seed varies with the batch size only, not with `i`.
        np.random.seed(RANDOM_STATE+b)
        torch.set_num_threads(N_THREADS)

        task = Task(TASK)

        # Column roles for the regression task: the target column and the
        # identifier column to drop.
        roles = {
            'target': TARGET_NAME,
            'drop': ['id']
        }
        automl = TabularAutoML(
            task = task, 
            timeout = TIMEOUT,
            cpu_limit = N_THREADS,
            general_params = {"use_algos": [[algo]]}, # ['nn', 'mlp', 'dense', 'denselight', 'resnet', 'snn', 'node', 'autoint', 'fttransformer'] or custom torch model
            nn_params = {
                "n_epochs": 2000, 
                "bs": b, 
                "num_workers": 0, 
                "path_to_save": None, 
                "freeze_defaults": True,
            },
            nn_pipeline_params = {"use_qnt": USE_QNT, "use_te": False},
            reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE, 'advanced_roles': ADVANCED_ROLES},
        )

        oof_pred = automl.fit_predict(train_feats, roles = roles, verbose = 3)
        test_pred = automl.predict(test_feats)    
        # Persist the fitted model; the file name encodes batch size and iteration.
        joblib.dump(automl, f'automl_model_{b}_{i}.joblib')            

        oof_preds.append(oof_pred)
        test_preds.append(test_pred)
        oof = score(task, mapped(train_feats[TARGET_NAME].values, task, automl.reader.class_mapping), take_pred_from_task(oof_pred.data, task))
        # NOTE(review): this list is rebuilt on every iteration, so only the
        # last run survives the loop — confirm whether accumulation was intended.
        denselight_list = [(oof, oof_pred.data[:, 0], test_pred.data[:, 0])]
        print(f'RMSE: {oof}')

# Blend: average the out-of-fold predictions of all runs and score the blend.
stacked_preds = np.stack([p.data[:, 0] for p in oof_preds])
avg_preds = np.mean(stacked_preds, axis=0)
y = train_feats[TARGET_NAME].values
# `squared=False` is deprecated in recent scikit-learn releases; take the
# root of the MSE explicitly (same value for 1-D targets).
final_oof_rmse = np.sqrt(mean_squared_error(y, avg_preds))
print(f'Final RMSE: {final_oof_rmse}')

TypeError: TabularAutoML.__init__() got an unexpected keyword argument 'snap_params'

In [32]:
# Average the first prediction column of every run and write the submission.
test_preds_stack = np.stack([run_pred.data[:, 0] for run_pred in test_preds])
test_preds_mean = test_preds_stack.mean(axis=0)
ts_ids = test_feats['id']

sub = pd.DataFrame({'id': ts_ids, 'score': test_preds_mean})
sub.to_csv('submission.csv', index=False)