In [1]:

import os

import pandas as pd
from datasets import Dataset
from sklearn.metrics import cohen_kappa_score
from transformers import AutoModelForSequenceClassification, AutoConfig
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer

from kaggle_scripts.comp_config import CFG, PATHS
from kaggle_scripts.evaluation.classification_metrics import kaggle_qwk
from kaggle_scripts.evaluation.folding_strategies import create_folds
from kaggle_scripts.preprocessing.transformers import get_tokenizer

2024-06-12 13:10:13.936669: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-12 13:10:13.938865: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-12 13:10:13.960744: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_valid = pd.read_csv(PATHS.train_valid_path)
train_valid['label'] = train_valid['score'].apply(lambda x: x - 1)
train_valid["label"] = train_valid["label"].astype('int32')

In [3]:
train_valid = create_folds(train_valid, CFG.folding_strategy, CFG.n_splits, CFG.seed)

In [4]:
training_args = TrainingArguments(
    output_dir=f'output_v{CFG.VER}',
    learning_rate=CFG.lr,
    per_device_train_batch_size=CFG.train_batch_size,
    per_device_eval_batch_size=CFG.eval_batch_size,
    num_train_epochs=CFG.train_epochs,
    weight_decay=CFG.weight_decay,
    evaluation_strategy='epoch',
    metric_for_best_model='qwk',
    save_strategy='epoch',
    save_total_limit=1,
    load_best_model_at_end=True,
    report_to='none',
    warmup_ratio=CFG.warmup_ratio,
    lr_scheduler_type='linear',
    optim='adamw_torch',
    logging_first_step=True,
)

In [5]:
tokenizer = get_tokenizer(tokenization_strategy=CFG.tokenization_strategy,
                          model_path=PATHS.model_path,
                          max_length=CFG.max_length,
                          truncation=True,
                          batched=True,
                          tokenization_column=CFG.tokenization_column)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer.tokenizer)

In [6]:
train_valid_ds = Dataset.from_pandas(train_valid)
tokenized_train_valid_ds = tokenizer.tokenize_dataset(train_valid_ds)

In [7]:
PREDICTION_COLS = [f'p{x}' for x in range(CFG.num_labels)]
for fold in range(CFG.n_splits):
    train = tokenized_train_valid_ds.filter(lambda example: example['fold'] != fold)
    valid = tokenized_train_valid_ds.filter(lambda example: example['fold'] == fold)
    config = AutoConfig.from_pretrained(PATHS.model_path)
    config.num_labels = CFG.num_labels
    model = AutoModelForSequenceClassification.from_pretrained(PATHS.model_path, config=config)
    model.resize_token_embeddings(len(tokenizer.tokenizer))
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train,
        eval_dataset=valid,
        data_collator=data_collator,
        tokenizer=tokenizer.tokenizer,
        compute_metrics=kaggle_qwk
    )
    trainer.train()
    predictions0 = trainer.predict(valid).predictions
    trainer.save_model(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')
    tokenizer.tokenizer.save_pretrained(f'deberta-v3-small_AES2_fold_{fold}_v{CFG.VER}')
    valid[PREDICTION_COLS] = predictions0
    valid.to_csv(f'valid_df_fold_{fold}_v{CFG.VER}.csv', index=False)

Epoch,Training Loss,Validation Loss,Qwk
1,0.8971,0.937312,0.742822


TypeError: 'Dataset' object does not support item assignment

In [None]:
dfs = []
for k in range(CFG.n_splits):
    dfs.append(pd.read_csv(f'valid_df_fold_{k}_v{CFG.VER}.csv'))
    os.system(f'rm valid_df_fold_{k}_v{CFG.VER}.csv')
dfs = pd.concat(dfs)
dfs.to_csv(f'valid_df_v{CFG.VER}.csv', index=False)
print('Valid OOF shape:', dfs.shape)
display(dfs.head())

In [None]:
m = cohen_kappa_score(dfs.score.values, dfs.iloc[:, -6:].values.argmax(axis=1) + 1, weights='quadratic')
print('Overall QWK CV =', m)