In [1]:
!pip install '/kaggle/input/lal-aes2-wheels/iterative_stratification-0.1.7-py3-none-any.whl'

Processing /kaggle/input/lal-aes2-wheels/iterative_stratification-0.1.7-py3-none-any.whl
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7


In [2]:
import pandas as pd
import polars as pl
import numpy as np
from pathlib import Path
import random
import yaml
import time
import datetime
import os
import sys
import gc
import warnings
from tqdm.notebook import trange, tqdm
from types import SimpleNamespace

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.cuda.amp import autocast, GradScaler

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
from tokenizers import AddedToken

from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, GroupKFold
from sklearn.metrics import cohen_kappa_score

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Record the interpreter and library versions used for this run.
print(f'python version: {sys.version}') 
print(f'torch version: {torch.__version__}')
print(f'transfromers version: {transformers.__version__}')

# Disable tokenizers' internal parallelism via its environment switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silence all Python warnings and keep transformers logging at error level only.
warnings.simplefilter('ignore')
transformers.utils.logging.set_verbosity_error() 

2024-06-30 04:46:20.460487: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-30 04:46:20.460604: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-30 04:46:20.588761: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


python version: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
torch version: 2.1.2
transfromers version: 4.38.2


# Seeding

In [3]:
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), exports ``PYTHONHASHSEED`` and puts cuDNN in
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # All of these accept the seed as their single positional argument.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Split

In [4]:
def preprocess_data(df_):
    """Return a cleaned copy of the essay frame; the input is left untouched.

    Steps:
      * drop the two hard-coded essay ids and reset the index,
      * add ``label`` as ``score - 1``,
      * in ``tokened_text``: replace non-breaking spaces with plain spaces,
        replace blank-line paragraph breaks with ``[PARAGRAPH]`` and strip
        surrounding whitespace.

    Args:
        df_: DataFrame with ``essay_id``, ``score`` and ``tokened_text`` columns.

    Returns:
        A new DataFrame with the extra ``label`` column.
    """
    excluded_ids = ['e9be80d', '6017fea']

    cleaned = df_.copy()
    keep_rows = ~cleaned['essay_id'].isin(excluded_ids)
    cleaned = cleaned[keep_rows].reset_index(drop=True)

    cleaned['label'] = cleaned['score'].copy() - 1

    text = cleaned['tokened_text']
    for old, new in (('\xa0', ' '), ('\n\n', '[PARAGRAPH]')):
        text = text.str.replace(old, new)
    cleaned['tokened_text'] = text.str.strip()
    return cleaned

In [5]:
def split_fold(conf, df_):
    """Assign a cross-validation fold id to every row of a copy of ``df_``.

    The strategy is selected by ``conf.cv_strat``:

    * ``'stratified_kfold'``: stratify on ``label``.
    * ``'stratified_group_kfold'``: stratify on ``label``, group by ``prompt_name``.
    * ``'group_kfold'``: group by ``prompt_name``.
    * ``'multilabel_stratified_kfold'``: stratify jointly on one-hot encoded
      ``prompt_name`` and ``score``. In this mode the returned frame's
      ``prompt_name`` column is replaced by its integer code.

    Args:
        conf: Object exposing ``cv_strat``, ``num_folds`` and ``seed``.
        df_: DataFrame with ``tokened_text`` and ``label`` columns (plus
            ``prompt_name`` / ``score`` for the group and multilabel strategies).

    Returns:
        A deep copy of ``df_`` with an integer ``fold`` column.

    Raises:
        ValueError: If ``conf.cv_strat`` is unknown, or the strategy needs a
            ``prompt_name`` column that is missing. Previously such cases
            silently left every row at fold ``-1``.
    """
    df = df_.copy(deep=True)

    df["fold"] = -1
    X = df['tokened_text']
    y = df['label']

    strategy = conf.cv_strat
    prompt_strategies = ('stratified_group_kfold', 'group_kfold', 'multilabel_stratified_kfold')
    if strategy in prompt_strategies and 'prompt_name' not in df.columns:
        raise ValueError(f"cv_strat '{strategy}' requires a 'prompt_name' column")

    if strategy == 'stratified_kfold':
        splitter = StratifiedKFold(n_splits=conf.num_folds, shuffle=True, random_state=conf.seed)
        splits = splitter.split(X, y)

    elif strategy == 'stratified_group_kfold':
        splitter = StratifiedGroupKFold(n_splits=conf.num_folds, shuffle=True, random_state=conf.seed)
        splits = splitter.split(X, y, df['prompt_name'])

    elif strategy == 'group_kfold':
        splitter = GroupKFold(n_splits=conf.num_folds)
        splits = splitter.split(X, y, df['prompt_name'])

    elif strategy == 'multilabel_stratified_kfold':
        target_cols = ['prompt_name', 'score']
        labels_map = {
            0: 'Car-free cities',
            1: '"A Cowboy Who Rode the Waves"',
            2: 'Exploring Venus',
            3: 'Facial action coding system',
            4: 'The Face on Mars',
            5: 'Driverless cars',
            6: 'Does the electoral college work?'
        }
        rev_labels_map = {v: k for k, v in labels_map.items()}
        df['prompt_name'] = df['prompt_name'].replace(rev_labels_map).astype(int)
        y = pd.get_dummies(data=df[target_cols], columns=target_cols, dtype=int)

        splitter = MultilabelStratifiedKFold(n_splits=conf.num_folds, shuffle=True, random_state=conf.seed)
        splits = splitter.split(X, y)

    else:
        raise ValueError(f"Unknown cv_strat '{strategy}'")

    # The splitters yield positional indices, so write with iloc: label-based
    # .loc would hit the wrong rows whenever df_ does not have a 0..n-1 index.
    fold_col = df.columns.get_loc('fold')
    for fold, (_, valid_idx) in enumerate(splits):
        df.iloc[valid_idx, fold_col] = fold

    df['fold'] = df['fold'].astype(int)
    return df

# Data

In [6]:
class AE2Dataset(Dataset):
    """Dataset that tokenizes one essay per item, on access.

    With ``output_tokens_only=True`` an item is just the token dict. Otherwise
    an item is ``{'tokens': ..., 'labels': ..., 'ids': ...}`` and the label
    dtype follows ``conf.num_labels`` (float for a single output, long
    otherwise). For the ``'bce'`` criterion with a single output, labels are
    divided by 5.0.
    """

    def __init__(self, conf, df, tokenizer, output_tokens_only=False):
        self.conf = conf
        self.tokenizer = tokenizer
        self.output_tokens_only = output_tokens_only
        self.full_texts = self._column_values(df, self.conf.train_col)

        if self.output_tokens_only:
            # Inference-style usage: no ids or labels are stored.
            return

        self.essay_ids = self._column_values(df, 'essay_id')
        self.labels = self._column_values(df, self.conf.target_col)

        single_output = self.conf.num_labels == 1  # regression vs. classification
        self.label_dtype = torch.float if single_output else torch.long
        if single_output and self.conf.criterion == 'bce':
            self.labels = self.labels / 5.0

    @staticmethod
    def _column_values(df, column):
        """Return the column's values as an array, independent of the frame index."""
        return df[column].reset_index(drop=True).values

    def __len__(self):
        return len(self.full_texts)

    def __getitem__(self, idx):
        tokens = self._get_token(idx)
        if self.output_tokens_only:
            return tokens
        return {
            'tokens': tokens,
            'labels': self._get_label(idx),
            'ids': self.essay_ids[idx],
        }

    def _get_token(self, idx):
        """Tokenize text ``idx`` padded/truncated to ``conf.max_len``; values become long tensors."""
        tokenizer_kwargs = dict(
            add_special_tokens=True,
            max_length=self.conf.max_len,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )
        encoded = self.tokenizer(self.full_texts[idx], **tokenizer_kwargs)
        return {name: torch.tensor(values, dtype=torch.long) for name, values in encoded.items()}

    def _get_label(self, idx):
        """Return label ``idx`` as a tensor of ``self.label_dtype``."""
        return torch.tensor(self.labels[idx], dtype=self.label_dtype)

# Model

In [7]:
class MeanPooling(nn.Module):
    """Average the last hidden state over the tokens selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, backbone_outputs, inputs):
        """Return the masked mean of ``last_hidden_state`` along dimension 1.

        Args:
            backbone_outputs: Mapping with a ``'last_hidden_state'`` tensor.
            inputs: Mapping with an ``'attention_mask'`` tensor.
        """
        hidden = backbone_outputs['last_hidden_state']
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()

        masked_total = (hidden * mask).sum(1)
        # The clamp guards against division by zero for fully masked rows.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / token_count
    
class ConcatPooling(nn.Module):
    """Concatenate the first-token vectors of the last ``pooling_last`` hidden states."""

    def __init__(self, pooling_last=4):
        super().__init__()
        self.pooling_last = pooling_last

    def forward(self, backbone_outputs, inputs):
        """Return the first-position vectors of the final layers, last layer first.

        Args:
            backbone_outputs: Mapping with a ``'hidden_states'`` sequence of tensors.
            inputs: Unused; kept for a uniform pooling interface.
        """
        layers = torch.stack(backbone_outputs['hidden_states'])
        # Take position 0 of each selected layer and join along the feature axis.
        first_tokens = [layers[-offset][:, 0] for offset in range(1, self.pooling_last + 1)]
        return torch.cat(first_tokens, -1)
    
# https://www.kaggle.com/competitions/google-quest-challenge/discussion/129840    
class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden states from ``layer_start`` onward, taken at position 0.

    When ``layer_weights`` is not supplied, one learnable weight per used layer
    is created and initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # hidden_states has num_hidden_layers + 1 entries; the first layer_start are skipped.
            n_used = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_used, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, backbone_outputs, inputs):
        """Return the weight-normalised layer average at the first sequence position.

        Args:
            backbone_outputs: Mapping with a ``'hidden_states'`` sequence of tensors.
            inputs: Unused; kept for a uniform pooling interface.
        """
        used_layers = torch.stack(backbone_outputs['hidden_states'])[self.layer_start:, :, :, :]
        weights = self.layer_weights[..., None, None, None].expand(used_layers.size())
        averaged = (weights * used_layers).sum(dim=0) / self.layer_weights.sum()
        return averaged[:, 0]
    
# https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
class LSTMPooling(nn.Module):
    """Run an LSTM over the per-layer first-token vectors and keep its last output.

    Layers ``1 .. num_layers`` of ``hidden_states`` are read; the vector at
    sequence position 0 of each layer forms one step of the LSTM input.

    NOTE: the earlier implementation stacked the layers on the last axis and
    then reinterpreted the buffer with ``.view(-1, layers, hidden)``, which
    mixed feature values across layers. The sequence is now stacked directly
    on dimension 1. Checkpoints trained with the old layout will produce
    different outputs through this pooler.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, backbone_outputs, inputs):
        """Return the dropout-regularised final LSTM output, one row per batch item.

        Args:
            backbone_outputs: Mapping with a ``'hidden_states'`` sequence of
                tensors (assumed ``(batch, seq, hidden)`` each).
            inputs: Unused; kept for a uniform pooling interface.
        """
        all_hidden_states = backbone_outputs['hidden_states']
        # No squeeze(): slicing [:, 0] already drops the sequence axis and a
        # blanket squeeze would also drop the batch axis when batch size is 1.
        first_tokens = [
            all_hidden_states[layer_i][:, 0]
            for layer_i in range(1, self.num_hidden_layers + 1)
        ]
        # One LSTM step per layer: (batch, layers, hidden) for batch_first=True.
        layer_sequence = torch.stack(first_tokens, dim=1)
        out, _ = self.lstm(layer_sequence, None)
        out = self.dropout(out[:, -1, :])
        return out
    
class GeMPooling(nn.Module):
    """Generalised-mean pooling of the last hidden state with a learnable exponent ``p``."""

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, backbone_outputs, inputs):
        """Return ``(mean over dim of (clamp(h, eps) * mask) ** p) ** (1 / p)``.

        Args:
            backbone_outputs: Mapping with a ``'last_hidden_state'`` tensor.
            inputs: Mapping with an ``'attention_mask'`` tensor.
        """
        hidden = backbone_outputs['last_hidden_state']
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden.shape)

        powered_total = (hidden.clamp(min=self.eps) * mask).pow(self.p).sum(self.dim)
        token_count = mask.sum(self.dim).clip(min=self.eps)
        return (powered_total / token_count).pow(1 / self.p)
    
class AttentionPooling(nn.Module):
    """Pool the last hidden state with learned, mask-aware softmax weights."""

    def __init__(self, hiddendim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hiddendim, hiddendim),
            nn.LayerNorm(hiddendim),
            nn.Tanh(),
            nn.Linear(hiddendim, 1),
        )

    def forward(self, backbone_outputs, inputs):
        """Return the attention-weighted sum of token vectors along dimension 1.

        Args:
            backbone_outputs: Mapping with a ``'last_hidden_state'`` tensor.
            inputs: Mapping with an ``'attention_mask'`` tensor; positions where
                it equals 0 receive zero weight.
        """
        hidden = backbone_outputs['last_hidden_state']
        is_padding = inputs['attention_mask'].unsqueeze(-1) == 0

        scores = self.attention(hidden).float()
        # -inf before the softmax gives masked positions exactly zero weight.
        scores = scores.masked_fill(is_padding, float('-inf'))
        weights = torch.softmax(scores, 1)
        return torch.sum(weights * hidden, dim=1)

In [8]:
class CustomModel(nn.Module):
    """Transformer backbone + pooling layer + linear head.

    Args:
        conf: Configuration object. Reads ``model_name``, ``multi_dropout``,
            ``gradient_checkpointing``, ``freeze_embeddings``,
            ``freeze_n_layers``, ``pooling_layer``, ``num_labels`` and the
            pooling-specific options (``ccp_pooling_last``, ``wlp_layer_start``,
            ``lstm_hidden``).
        conf_path: Optional path to a backbone config saved with ``torch.save``.
            When given, the backbone is built from that config without loading
            pretrained weights; otherwise it is loaded from ``conf.model_name``.
    """

    def __init__(self, conf, conf_path=None):
        super().__init__()
        self.conf_path = conf_path
        self.multi_dropout = conf.multi_dropout
        if not self.conf_path:
            self.model_conf = AutoConfig.from_pretrained(conf.model_name, output_hidden_states=True)
            self.model_conf = self._set_dropout(self.model_conf)
            self.backbone = AutoModel.from_pretrained(conf.model_name, config=self.model_conf)
        else:
            # NOTE(review): torch.load unpickles arbitrary Python objects; only
            # point conf_path at config files produced by this project.
            self.model_conf = torch.load(self.conf_path)
            self.backbone = AutoModel.from_config(self.model_conf)

        if conf.gradient_checkpointing:
            self.backbone.gradient_checkpointing_enable()

        if conf.freeze_embeddings:
            self._freeze(self.backbone.embeddings)

        if conf.freeze_n_layers > 0:
            self._freeze(self.backbone.encoder.layer[: conf.freeze_n_layers])

        self.pooler, hidden_size = self.get_pooling_layer(conf)
        self.fc = nn.Linear(hidden_size, conf.num_labels)
        self._init_weights(self.fc)

        # Multi-sample dropout is only built for multi-output heads; forward()
        # falls back to the plain head when these modules do not exist.
        if self.multi_dropout and conf.num_labels > 1:
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])

    def _set_dropout(self, model_conf, ratio=0.):
        """Set every dropout ratio on the backbone config to ``ratio`` and return it."""
        model_conf.attention_dropout = ratio
        model_conf.attention_probs_dropout_prob = ratio
        model_conf.hidden_dropout = ratio
        model_conf.hidden_dropout_prob = ratio

        return model_conf

    def _freeze(self, module):
        """Exclude all parameters of ``module`` from gradient computation."""
        for parameter in module.parameters():
            # The attribute is `requires_grad`; the former `require_grad`
            # spelling only created an unused attribute and froze nothing.
            parameter.requires_grad = False

    def get_pooling_layer(self, conf):
        """Build the pooler named by ``conf.pooling_layer``.

        Returns:
            ``(pooler, feature_size)`` where ``feature_size`` is the width of
            the pooled vector fed to the linear head.

        Raises:
            Exception: If the pooling layer name is not recognised.
        """
        hidden_size = self.model_conf.hidden_size
        name = conf.pooling_layer

        if name == 'mean_pooling':
            return MeanPooling(), hidden_size
        if name == 'concat_pooling':
            return ConcatPooling(conf.ccp_pooling_last), hidden_size * conf.ccp_pooling_last
        if name == 'weighted_layer_pooling':
            return WeightedLayerPooling(self.model_conf.num_hidden_layers, conf.wlp_layer_start), hidden_size
        if name == 'lstm_pooling':
            # The pooled vector is the LSTM output, so the head must be sized
            # by conf.lstm_hidden rather than by the backbone hidden size.
            pooler = LSTMPooling(self.model_conf.num_hidden_layers, hidden_size, conf.lstm_hidden)
            return pooler, conf.lstm_hidden
        if name == 'gem_pooling':
            return GeMPooling(), hidden_size
        if name == 'attention_pooling':
            return AttentionPooling(hidden_size), hidden_size
        raise Exception('Invalid pooling layer name')

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run backbone, pooler and head on a dict of tokenizer tensors.

        With multi-sample dropout available, the head is applied to several
        independently dropped-out copies of the pooled vector and the results
        are averaged.
        """
        backbone_outputs = self.backbone(**inputs)
        pooler_outputs = self.pooler(backbone_outputs, inputs)

        dropouts = getattr(self, 'dropouts', None) if self.multi_dropout else None
        if dropouts is None:
            return self.fc(pooler_outputs)

        summed = sum(self.fc(dropout(pooler_outputs)) for dropout in dropouts)
        return summed / len(dropouts)

# Utils

In [9]:
# https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train/notebook
class Averager:
    """Track the latest value and the running weighted mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all accumulated statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def get_average(self):
        """Return the running weighted mean."""
        return self.avg

    def get_value(self):
        """Return the most recently recorded value."""
        return self.val

In [10]:
class TimerError(Exception):
    """Raised when a Timer method is called while the timer is in the wrong state."""


class Timer:
    """Stopwatch based on ``time.perf_counter`` that can also store split times."""

    _NOT_RUNNING = "Timer is not running. Use .start() to start it"
    _ALREADY_RUNNING = "Timer is running. Use .stop() to stop it"

    def __init__(self):
        self._start_time = None
        self.split_time = []

    def _elapsed(self):
        """Return seconds since start(); raise TimerError if the timer is idle."""
        if self._start_time is None:
            raise TimerError(self._NOT_RUNNING)
        return time.perf_counter() - self._start_time

    def start(self):
        """Begin timing; raises TimerError if the timer is already running."""
        if self._start_time is not None:
            raise TimerError(self._ALREADY_RUNNING)
        self._start_time = time.perf_counter()

    def stop(self):
        """Stop timing; raises TimerError if the timer is not running."""
        if self._start_time is None:
            raise TimerError(self._NOT_RUNNING)
        self._start_time = None

    def get_time(self):
        """Return the seconds elapsed since start()."""
        return self._elapsed()

    def split(self):
        """Append the seconds elapsed since start() to ``split_time``."""
        self.split_time.append(self._elapsed())

    def get_split_time(self, idx):
        """Return the stored split at position ``idx``."""
        return self.split_time[idx]

    @staticmethod
    def formatting(second):
        """Format a duration in seconds as ``H:MM:SS`` after rounding to whole seconds."""
        whole_seconds = round(second)
        return str(datetime.timedelta(seconds=whole_seconds))

In [11]:
def extract_config(class_obj):
    """Dump the public attributes of a config class to ``config.yaml``.

    Every entry of ``class_obj.__dict__`` whose name does not start with an
    underscore is written to ``<class_obj.save_path>/config.yaml``.

    Args:
        class_obj: Config class (or object) exposing ``save_path``.
    """
    config_dict = {
        key: value
        for key, value in class_obj.__dict__.items()
        if not key.startswith('_')
    }

    # Join with Path (as the rest of the notebook does) so the file lands
    # inside save_path even when that string has no trailing slash.
    with open(Path(class_obj.save_path, 'config.yaml'), 'w') as file:
        yaml.dump(config_dict, file)

    print('Extracted config')

def get_tokenizer(conf):
    """Load the tokenizer for ``conf.model_name`` and register the extra special tokens.

    ``[PARAGRAPH]`` and the discourse-element markers are added as additional
    special tokens, and the tokenizer is saved under
    ``<conf.save_path>/tokenizers/`` for later inference.

    Returns:
        The configured tokenizer.
    """
    # The discourse markers are no longer used, but they are still registered.
    discourse_markers = [
        '[UNANNOTATED]',
        '[LEAD]',
        '[POSITION]',
        '[CLAIM]',
        '[EVIDENCE]',
        '[CONCLUDE]',
        '[COUNTER]',
        '[REBUTTAL]',
    ]
    special_tokens = ['[PARAGRAPH]'] + discourse_markers

    tokenizer = AutoTokenizer.from_pretrained(conf.model_name)
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

    # NOTE(review): the target is passed to save_pretrained as a save location,
    # so this is_file() guard presumably never skips the save — confirm intent.
    save_target = Path(conf.save_path, 'tokenizers/')
    if save_target.is_file():
        return tokenizer

    tokenizer.save_pretrained(save_target)
    return tokenizer

# optimize padding size
# https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train/notebook
def collator(inputs):
    """Trim every tensor in ``inputs`` to the longest attended sequence in the batch.

    The dict is modified in place and also returned.

    Args:
        inputs: Dict of 2-D tensors that includes ``'attention_mask'``.
    """
    longest = int(inputs['attention_mask'].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :longest]
    return inputs

def get_dataloader(conf, df, tokenizer, fold_num):
    """Build the train and validation DataLoaders for one fold.

    Rows whose ``fold`` equals ``fold_num`` form the validation set; all other
    rows form the training set. The training loader shuffles and drops the
    last incomplete batch; the validation loader does neither.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    train_rows = df[df['fold'] != fold_num]
    valid_rows = df[df['fold'] == fold_num]

    # Settings shared by both loaders.
    common = dict(batch_size=conf.batch_size, num_workers=4, pin_memory=True)

    train_dataloader = DataLoader(
        AE2Dataset(conf, train_rows, tokenizer, output_tokens_only=False),
        shuffle=True,
        drop_last=True,
        **common,
    )
    valid_dataloader = DataLoader(
        AE2Dataset(conf, valid_rows, tokenizer, output_tokens_only=False),
        shuffle=False,
        drop_last=False,
        **common,
    )
    return train_dataloader, valid_dataloader

def get_model(conf):
    """Create a fresh ``CustomModel`` and make sure its backbone config is saved.

    The config is written to ``<conf.save_path>/<model name>_config.pt`` unless
    that file already exists.
    """
    model = CustomModel(conf)

    config_name = Path(conf.model_name).name + '_config.pt'
    config_target = Path(conf.save_path, config_name)
    if config_target.is_file():
        return model

    torch.save(model.model_conf, config_target)
    return model

def load_model(conf, fold_num, device, model_path):
    """Rebuild a ``CustomModel`` from a saved config and load one fold's best weights.

    Args:
        conf: Configuration object (reads ``save_path`` and ``model_name``).
        fold_num: Fold whose ``best_score_fold{fold_num}.pt`` checkpoint is loaded.
        device: ``map_location`` used when reading the checkpoint.
        model_path: Directory holding a ``*_config.pt`` file and the checkpoints.

    Returns:
        The model with the checkpoint's ``model_state_dict`` loaded.
    """
    source_dir = Path(model_path)
    # The first file matching *_config.pt is taken as the backbone config.
    config_candidates = list(source_dir.glob('*_config.pt'))
    model_config_path = config_candidates[0].as_posix()
    checkpoint_path = f'{model_path}/best_score_fold{fold_num}.pt'

    model = CustomModel(conf, conf_path=model_config_path)

    # Keep a copy of the backbone config next to this run's outputs.
    config_copy = Path(conf.save_path, Path(conf.model_name).name + '_config.pt')
    if not config_copy.is_file():
        torch.save(model.model_conf, config_copy)

    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f'Pretrained Model Fold {fold_num} Loaded')
    return model


def get_optimizer(conf):
    """Return the optimizer class registered under ``conf.optimizer``.

    Raises:
        KeyError: If the name is not registered.
    """
    registry = {'adamw': optim.AdamW}
    optimizer_cls = registry[conf.optimizer]
    return optimizer_cls

def get_optimizer_grouped_params(conf, model):
    """Build optimizer parameter groups with layer-wise learning-rate decay.

    Group order:
      1. parameters outside the backbone (``conf.decoder_lr``, no weight decay),
      2. backbone parameters outside ``embeddings`` / ``encoder.layer``, split
         into decayed and non-decayed groups (``conf.encoder_lr``),
      3. for each encoder layer from last to first and then the embeddings, a
         decayed and a non-decayed group whose learning rate is multiplied by
         ``conf.layerwise_lr_decay`` after every layer.

    Bias and LayerNorm parameters never receive weight decay.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def skips_decay(param_name):
        return any(marker in param_name for marker in no_decay)

    def make_group(params, lr, weight_decay):
        return {"params": params, "weight_decay": weight_decay, "lr": lr}

    decay_factor = conf.layerwise_lr_decay

    # Task-specific head: everything that is not part of the backbone.
    head_params = [p for n, p in model.named_parameters() if "backbone" not in n]
    groups = [make_group(head_params, conf.decoder_lr, 0.0)]

    # Backbone parameters that belong to neither the embeddings nor a layer.
    extra_params = [
        (n, p)
        for n, p in model.named_parameters()
        if "backbone" in n
        and "backbone.embeddings" not in n
        and "backbone.encoder.layer" not in n
    ]
    groups.append(make_group(
        [p for n, p in extra_params if not skips_decay(n)], conf.encoder_lr, conf.weight_decay))
    groups.append(make_group(
        [p for n, p in extra_params if skips_decay(n)], conf.encoder_lr, 0.0))

    # NOTE(review): the per-layer schedule starts from conf.decoder_lr rather
    # than conf.encoder_lr — confirm this is intended.
    lr = conf.decoder_lr
    top_down_layers = [model.backbone.embeddings] + list(model.backbone.encoder.layer)
    for layer in reversed(top_down_layers):
        groups.append(make_group(
            [p for n, p in layer.named_parameters() if not skips_decay(n)], lr, conf.weight_decay))
        groups.append(make_group(
            [p for n, p in layer.named_parameters() if skips_decay(n)], lr, 0.0))
        lr *= decay_factor
    return groups

def get_optimizer_params(conf, model, weight_decay=0.0):
    """Build three optimizer parameter groups without layer-wise decay.

    Groups, in order: decayed backbone parameters, non-decayed backbone
    parameters (bias / LayerNorm), and everything outside the backbone.

    Note:
        The ``weight_decay`` argument is not read; decay comes from
        ``conf.weight_decay``.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def skips_decay(param_name):
        return any(marker in param_name for marker in no_decay)

    backbone_decayed = [p for n, p in model.backbone.named_parameters() if not skips_decay(n)]
    backbone_plain = [p for n, p in model.backbone.named_parameters() if skips_decay(n)]
    head = [p for n, p in model.named_parameters() if "backbone" not in n]

    return [
        {'params': backbone_decayed, 'lr': conf.encoder_lr, 'weight_decay': conf.weight_decay},
        {'params': backbone_plain, 'lr': conf.encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': conf.decoder_lr, 'weight_decay': 0.0},
    ]


def get_scheduler(conf, samples_per_epoch):
    """Return the scheduler factory and its keyword arguments for ``conf.scheduler``.

    Args:
        conf: Reads ``scheduler``, ``num_epochs``, ``warmup_ratios`` and
            ``poly_power``.
        samples_per_epoch: Multiplied by ``conf.num_epochs`` to obtain
            ``num_training_steps``.

    Returns:
        ``(factory, hparams)``; call ``factory(optimizer, **hparams)``.

    Raises:
        KeyError: If ``conf.scheduler`` is not one of the known names.
    """
    total_steps = samples_per_epoch * conf.num_epochs
    warmup_steps = int(samples_per_epoch * conf.num_epochs * conf.warmup_ratios)

    def with_warmup(**extra):
        return {'num_warmup_steps': warmup_steps, 'num_training_steps': total_steps, **extra}

    catalogue = {
        'cosine_warmup': (transformers.get_cosine_schedule_with_warmup, with_warmup()),
        'linear_warmup': (transformers.get_linear_schedule_with_warmup, with_warmup()),
        'poly_warmup': (
            transformers.get_polynomial_decay_schedule_with_warmup,
            with_warmup(power=conf.poly_power),
        ),
        # The constant schedule takes no total step count.
        'constant_warmup': (
            transformers.get_constant_schedule_with_warmup,
            {'num_warmup_steps': warmup_steps},
        ),
    }
    factory, hparams = catalogue[conf.scheduler]
    return factory, hparams
    
def get_criterion(conf):
    """Return a new loss module for ``conf.criterion``.

    Supported names: ``'ce'`` (classification), ``'bce'`` (regression treated
    as classification), ``'mse'`` (regression) and ``'huber'``.

    Raises:
        KeyError: If the name is not supported.
    """
    loss_classes = {
        'ce': nn.CrossEntropyLoss,
        'bce': nn.BCEWithLogitsLoss,
        'mse': nn.MSELoss,
        'huber': nn.HuberLoss,
    }
    return loss_classes[conf.criterion]()

def process_outputs(y_trues, y_preds, loss_fn):
    """Convert raw model outputs and targets into NumPy label arrays.

    * ``'mse'`` / ``'huber'``: predictions are flattened and rounded to the
      nearest integer; targets are returned unchanged.
    * ``'ce'``: predictions are the argmax over the last axis.
    * ``'bce'``: predictions go through a sigmoid; predictions and targets are
      multiplied by 5.0 and rounded.

    Returns:
        ``(y_trues, y_preds)`` as NumPy arrays.

    Raises:
        Exception: If ``loss_fn`` is not one of the names above.
    """
    if loss_fn not in ('mse', 'huber', 'ce', 'bce'):
        raise Exception('loss_fn is invalid')

    def to_numpy(tensor):
        return tensor.detach().cpu().numpy()

    if loss_fn == 'bce':
        probabilities = to_numpy(y_preds.sigmoid())
        scaled_trues = to_numpy(y_trues)
        # Scale both back from [0, 1] to the 0..5 label range.
        y_preds = np.rint(probabilities * 5.0).astype(int)
        y_trues = np.rint(scaled_trues * 5.0).astype(int)
        return y_trues, y_preds

    raw_preds = to_numpy(y_preds)
    if loss_fn == 'ce':
        y_preds = raw_preds.argmax(-1)
    else:
        y_preds = np.rint(raw_preds.reshape(-1)).astype(int)
    return to_numpy(y_trues), y_preds

def calculate_qwk_score(y_trues, y_preds):
    """Return Cohen's kappa with quadratic weights between targets and predictions."""
    return cohen_kappa_score(y_trues, y_preds, weights='quadratic')

# Trainer

In [12]:
class Trainer:
    """Train and validate one cross-validation fold, keeping the best-QWK checkpoint.

    Args:
        debug_run: When true, every loop stops after its first batch and
            nothing is written to ``train.log``.
        fold: Index of the fold being trained (used in logs and file names).
        conf: Configuration object; see the attributes read in ``__init__``.
        device: Device the model and batches are moved to.
        model: Model called as ``model(inputs_dict)``.
        optimizer: Optimizer class, instantiated here with ``optim_params``.
        optim_params: Parameter groups passed to the optimizer.
        scheduler: Scheduler factory called as ``scheduler(optimizer, **hparams)``.
        scheduler_hparams: Keyword arguments for the scheduler factory.
        criterion: Loss module called as ``criterion(outputs, labels)``.
    """

    def __init__(self, debug_run, fold, conf, device, model, optimizer, optim_params, scheduler, scheduler_hparams, criterion):
        self.debug_run = debug_run
        self.current_fold = fold
        self.device = device
        self.model = model
        self.optimizer = optimizer(optim_params, lr=conf.encoder_lr, eps=conf.optim_eps, betas=(conf.optim_betas1, conf.optim_betas2))
        self.scheduler = scheduler(self.optimizer, **scheduler_hparams)
        self.criterion = criterion

        self.save_path = conf.save_path
        self.num_epochs = conf.num_epochs
        self.verbose_step = conf.verbose_step
        self.apex = conf.apex
        self.clip_grad_norm = conf.clip_grad_norm
        self.max_grad_norm = conf.max_grad_norm
        self.criterion_name = conf.criterion
        self.num_labels = conf.num_labels
        self.exp_num = conf.exp

        # conf.apex toggles mixed precision (autocast + gradient scaling).
        self.scaler = GradScaler(enabled=self.apex)

        self.best_train_loss = torch.tensor(10000)
        self.best_valid_loss = torch.tensor(10000)
        self.best_qwk_score = -np.inf

        self.best_fold_y_preds = np.array([])
        self.best_fold_y_preds_raw = torch.tensor([])

        self.record_df_columns = ['fold', 'epoch', 'train_loss', 'valid_loss', 'qwk_score']
        self.record_df = pd.DataFrame(columns=self.record_df_columns)

    def fit(self, train_loader, valid_loader):
        """Run all epochs and return the per-epoch record and best-epoch predictions.

        Returns:
            ``(record_df, fold_preds)``. ``fold_preds`` has one row per
            validation essay with the predictions of the best-QWK epoch; the
            stored labels are shifted back by +1.
        """
        self.model.to(self.device)

        self.log(f'exp: {self.exp_num}')
        self.log(f'--- FOLD {self.current_fold} ---')

        for epoch in range(self.num_epochs):
            self.current_epoch = epoch

            train_loss = self._train_fn(train_loader)
            valid_loss, ids, y_trues, raw_y_preds = self._valid_fn(valid_loader)

            y_trues, y_preds = process_outputs(y_trues, raw_y_preds, self.criterion_name)

            qwk_score = calculate_qwk_score(y_trues, y_preds)

            self._compare_and_save(qwk_score, train_loss, valid_loss, y_preds, raw_y_preds.detach().cpu())
            self._record([self.current_fold, self.current_epoch, train_loss, valid_loss, qwk_score])

            self.log(f'-- [Fold: {self.current_fold}, Epoch: {self.current_epoch + 1}] DONE --\n')

        # `ids` comes from the last validation pass.
        folds_preds_dict = {}
        folds_preds_dict['essay_id'] = ids
        if self.criterion_name == 'ce':
            softmaxed = self.best_fold_y_preds_raw.softmax(1).numpy()
            for c in range(self.num_labels):
                folds_preds_dict[f'pred_p_{c}'] = softmaxed[:, c]
            for c in range(self.num_labels):
                folds_preds_dict[f'raw_pred_p_{c}'] = self.best_fold_y_preds_raw[:, c].numpy()
            folds_preds_dict['pred_c'] = self.best_fold_y_preds + 1
        else:
            folds_preds_dict['score'] = self.best_fold_y_preds + 1
            folds_preds_dict['raw_score'] = self.best_fold_y_preds_raw.numpy() + 1

        fold_preds = pd.DataFrame(folds_preds_dict)

        return self.record_df, fold_preds

    def _train_fn(self, train_loader):
        """Train for one epoch and return the sample-weighted mean loss."""
        self.log('TRAINL_LOOP')
        self.model.train()
        total_loss = Averager()
        # get_last_lr() is the supported accessor outside scheduler.step().
        current_lr = self.scheduler.get_last_lr()[0]
        timer = Timer()
        timer.start()

        for step, batch in enumerate(train_loader):

            inputs = batch['tokens']
            labels = batch['labels']

            inputs = collator(inputs)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            labels = labels.to(self.device)
            batchsize = len(labels)

            with autocast(enabled=self.apex):
                outputs = self.model(inputs)
                # Drop only the trailing label axis so a batch of one keeps
                # its batch dimension and still matches `labels` in shape.
                outputs = outputs.squeeze(-1) if self.num_labels == 1 else outputs
                loss = self.criterion(outputs, labels)

            total_loss.update(loss.item(), batchsize)

            current_lr = self.scheduler.get_last_lr()[0]

            self.scaler.scale(loss).backward()

            if self.clip_grad_norm:
                # Gradients must be unscaled before their norm is clipped.
                self.scaler.unscale_(self.optimizer)
                nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm)
            self.scaler.step(self.optimizer)
            self.scaler.update()

            self.scheduler.step()
            self.optimizer.zero_grad()

            if step % self.verbose_step == 0 or step == (len(train_loader) - 1):
                self.log(
                    f'[TRAIN_F{self.current_fold}], '
                    f'E: {self.current_epoch + 1}/{self.num_epochs}, '
                    f'S: {str(step).zfill(len(str(len(train_loader))))}/{len(train_loader)}, '
                    f'L: {total_loss.get_average():.5f}, '
                    f'LR: {current_lr:.8f}, '
                    f'T: {Timer.formatting(timer.get_time())}'
                )

            # end of the train loop
            if self.debug_run: break

        timer.stop()

        torch.cuda.empty_cache()

        return total_loss.get_average()

    def _valid_fn(self, valid_loader):
        """Evaluate on the validation loader without gradient tracking.

        Returns:
            ``(mean_loss, ids, labels, outputs)`` where ``ids`` is a list and
            ``labels`` / ``outputs`` are tensors concatenated over all batches.
        """
        self.log("\nVALID_LOOP")
        self.model.eval()

        total_loss = Averager()
        timer = Timer()
        timer.start()

        ids_list = []
        outputs_list = []
        labels_list = []

        for step, batch in enumerate(valid_loader):

            ids = batch['ids']
            inputs = batch['tokens']
            labels = batch['labels']

            inputs = collator(inputs)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            labels = labels.to(self.device)
            batchsize = len(labels)

            with torch.no_grad():
                outputs = self.model(inputs)
                # squeeze(-1), not squeeze(): a final batch of one must stay
                # 1-D, otherwise size-checking losses reject it.
                outputs = outputs.squeeze(-1) if self.num_labels == 1 else outputs
                loss = self.criterion(outputs, labels)

            total_loss.update(loss.item(), batchsize)
            ids_list.extend(ids)
            labels_list.append(labels.view(-1) if labels.size() == torch.Size([]) else labels)
            outputs_list.append(outputs.view(-1) if outputs.size() == torch.Size([]) else outputs)

            if step % self.verbose_step == 0 or step == (len(valid_loader) - 1):
                self.log(
                    f'[VALID_F{self.current_fold}], '
                    f'E: {self.current_epoch + 1}/{self.num_epochs}, '
                    f'S: {str(step).zfill(len(str(len(valid_loader))))}/{len(valid_loader)}, '
                    f'L: {total_loss.get_average():.5f}, '
                    f'T: {Timer.formatting(timer.get_time())}'
                )
            if self.debug_run: break
            # end of the valid loop

        labels_list = torch.cat(labels_list)
        outputs_list = torch.cat(outputs_list)

        timer.stop()

        torch.cuda.empty_cache()

        return total_loss.get_average(), ids_list, labels_list, outputs_list

    def _compare_and_save(self, qwk_score, train_loss, valid_loss, y_preds, raw_y_preds):
        """Save a checkpoint and remember the predictions when ``qwk_score`` is a new best."""
        if qwk_score > self.best_qwk_score:
            self.best_qwk_score = qwk_score
            self.best_train_loss = train_loss
            self.best_valid_loss = valid_loss
            self.best_fold_y_preds = y_preds
            self.best_fold_y_preds_raw = raw_y_preds

            file_name = f'best_score_fold{self.current_fold}.pt'

            self.model.eval()
            torch.save({
                'model_state_dict': self.model.state_dict(),
                'best_train_loss': self.best_train_loss,
                'best_valid_loss': self.best_valid_loss,
                'best_qwk_score': self.best_qwk_score,
            }, Path(self.save_path, file_name))

            self.log(f'\n-> [SAVED] Fold: {self.current_fold}, Epoch: {self.current_epoch + 1}, QWK: {self.best_qwk_score}\n')

    def _record(self, new_record_values):
        """Append one row (ordered like ``record_df_columns``) to ``record_df``."""
        new_record_dict = {k: [v] for k, v in zip(self.record_df_columns, new_record_values)}
        new_record = pd.DataFrame.from_dict(new_record_dict)
        self.record_df = pd.concat([self.record_df, new_record], axis=0)

    def log(self, msg):
        """Print ``msg`` and, outside debug runs, append it to ``train.log`` in ``save_path``."""
        print(msg)
        if not self.debug_run:
            with open(Path(self.save_path, 'train.log'), mode='a+', encoding='utf-8') as log:
                log.write(f'{msg}\n')

# Config

In [13]:
class CONF:
    # Central experiment configuration. The "Run Training" cell passes this class
    # itself (not an instance) to run_training as `conf`.

    # Experiment identifier; the recorded training output echoes it as "exp: 200".
    exp = '200'
    
    # --- Paths ---
    # Competition data directory (in the cells shown here it is only used by a
    # commented-out read of train.csv).
    data_path = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/"
    # NOTE(review): not referenced in the visible cells; confirm where it is consumed.
    extra_data_path = '/kaggle/input/lal-aes2-create-prompt-data/cleaned_persaude.csv'
    # Directory that run_training writes cv_record.csv, best_epoch_record.csv and
    # oof_df.csv into.
    save_path = '/kaggle/working/'
    
    # --- Data columns / target ---
    # Presumably the text and label column names read by the dataset code; TODO confirm.
    train_col = 'full_text'
    target_col = 'label'
    # run_training checks `num_labels == 1` to decide which columns hold the
    # true scores and predictions when computing the overall QWK.
    num_labels = 1  # 1 for mse and bce, 6 for ce

    # --- Reproducibility / cross-validation ---
    seed = 42  # passed to seed_everything by run_training
    num_folds = 4
    train_fold_list = [0, 1, 2, 3]  # folds run_training iterates over
    cv_strat = 'multilabel_stratified_kfold' # ['stratified_kfold', 'stratified_group_kfold', 'group_kfold', 'multilabel_stratified_kfold']

    # --- Model ---
    model_name = 'microsoft/deberta-v3-base'
    pooling_layer = 'mean_pooling'
    # ['mean_pooling', 'concat_pooling', 'weighted_layer_pooling', 'lstm_pooling', 'gem_pooling', 'attention_pooling']

    # Pooling-specific settings. NOTE(review): the prefixes suggest they belong to
    # weighted_layer_pooling (wlp), concat_pooling (ccp) and lstm_pooling
    # respectively; confirm in the pooling code.
    wlp_layer_start = 4
    ccp_pooling_last = 4
    lstm_hidden = 768

    # --- Optimisation / training loop ---
    gradient_checkpointing = True
    freeze_embeddings = False
    freeze_n_layers = 0
    optimizer = 'adamw'
    optim_eps = 1e-6
    optim_betas1 = 0.9
    optim_betas2 = 0.999
    scheduler = 'linear_warmup' # ['cosine_warmup', 'linear_warmup', 'poly_warmup', 'constant_warmup']
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    layerwise_lr_decay = 0.9
    weight_decay = 0.01
    multi_dropout = False # DO NOT USE THIS WITH REGRESSION
    criterion = 'mse' # ['mse', 'ce', 'bce', 'huber']
    # NOTE(review): presumably toggles mixed-precision training (autocast and
    # GradScaler are imported at the top of the notebook); confirm in Trainer.
    apex = True
    max_len = 1024
    # run_training overrides this to 2 when debug_run is True.
    batch_size = 8
    num_epochs = 3
    warmup_ratios = 0.0
    poly_power = 0.05  # presumably only relevant to the 'poly_warmup' scheduler; TODO confirm
    verbose_step = 200  # matches the 200-step spacing of progress lines in the recorded output
    clip_grad_norm = True
    max_grad_norm = 1.0

# Load data

In [14]:
# train_df = pd.read_csv(CONF.data_path + "train.csv")
# train_df_with_prompt = pd.read_csv('/kaggle/input/lal-aes2-create-prompt-data/train_df_with_prompt.csv')

# Discourse tags known to the tokened text, and the subset that is kept in it.
all_token = ['Lead', 'Position', 'Claim', 'Evidence', 'Concluding', 'Counterclaim', 'Rebuttal']
included_token = ['Position', 'Evidence', 'Concluding', 'Rebuttal']

# Regex alternation over every tag to strip: 'Unannotated' plus each tag that is
# not in `included_token`, i.e. '<(Unannotated|Lead|Claim|Counterclaim)>'.
token_filter_pat = '<(' + '|'.join(
    ['Unannotated', *(tag for tag in all_token if tag not in included_token)]
) + ')>'

# Read with polars, delete the filtered tags from `tokened_text`, then hand a
# pandas frame to the rest of the notebook.
train_df_with_prompt = (
    pl.read_csv('/kaggle/input/lal-aes2-create-tokened-text/train_combined_tokened.csv')
    .with_columns(pl.col('tokened_text').str.replace_all(token_filter_pat, ''))
    .to_pandas()
)

# remove_topics = ['Car-free cities', 'Does the electoral college work?']
# train_df_with_prompt =  train_df_with_prompt[~train_df_with_prompt['prompt_name'].isin(remove_topics)].reset_index(drop=True)

In [15]:
## debugging
# cfg = CONF()
# tokenizer = get_tokenizer(cfg)
# df = preprocess_data(train_df_with_prompt)
# df = split_fold(cfg, df)
## train_dataset = AE2Dataset(cfg, df[df['fold'] != 0], tokenizer, output_tokens_only=True)
# train_loader, valid_loader = get_dataloader(cfg, df, tokenizer, 0)
# model = get_model(CONF)
# for _ , (inputs, labels) in enumerate(train_loader):
#     inputs = collator(inputs)
#     outputs = model(inputs)
#     break

# Train

In [16]:
def run_training(conf, df, debug_run=True, pretrained_path=None):
    """Train every fold in ``conf.train_fold_list`` and write the CV summary.

    The frame is run through ``preprocess_data`` and ``split_fold``, then each
    listed fold is trained with a fresh tokenizer, model, optimizer setup,
    scheduler and ``Trainer``. The per-fold out-of-fold predictions are merged
    back onto the data on ``essay_id`` to compute an overall quadratic weighted
    kappa. Three CSV files are written under ``conf.save_path``:
    ``cv_record.csv``, ``best_epoch_record.csv`` and ``oof_df.csv``.

    Args:
        conf: configuration object (the notebook passes the ``CONF`` class).
            Read here: ``seed``, ``train_fold_list``, ``num_labels``,
            ``save_path`` and ``batch_size``; it is also forwarded to the
            helper functions.
        df: raw training frame; after preprocessing it must carry ``essay_id``,
            ``score`` and ``fold`` columns.
        debug_run: when true, ``conf.batch_size`` is set to 2 for the duration
            of the fold loop and the flag is forwarded to ``Trainer``.
        pretrained_path: when truthy, each fold's model comes from
            ``load_model`` instead of ``get_model``.

    Raises:
        ValueError: if ``conf.train_fold_list`` is empty.
    """
    seed_everything(conf.seed)

    extract_config(conf)
    df = preprocess_data(df)
    df = split_fold(conf, df)
    print(df.groupby('fold').size())

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Fail with a clear message instead of an opaque KeyError further down.
    if not conf.train_fold_list:
        raise ValueError('conf.train_fold_list is empty; there is nothing to train')

    # `conf` is normally the shared CONF class, so the debug override must not
    # leak into later runs in the same kernel: remember the value, restore it below.
    original_batch_size = conf.batch_size
    if debug_run:
        print("DEBUG_RUN")
        conf.batch_size = 2

    # Collect per-fold frames and concatenate once after the loop.
    fold_records = []
    fold_preds = []
    try:
        for f in conf.train_fold_list:
            seed_everything(conf.seed)
            tokenizer = get_tokenizer(conf)
            train_loader, valid_loader = get_dataloader(conf, df, tokenizer, f)
            if not pretrained_path:
                model = get_model(conf)
            else:
                model = load_model(conf, f, device, pretrained_path)
            model.backbone.resize_token_embeddings(len(tokenizer))
            optimizer = get_optimizer(conf)
            optim_params = get_optimizer_grouped_params(conf, model) # LLRD
            # optim_params = get_optimizer_params(conf, model) # Diff LR
            scheduler, scheduler_hparams = get_scheduler(conf, len(train_loader))
            criterion = get_criterion(conf)

            trainer = Trainer(debug_run, f, conf, device, model, optimizer, optim_params, scheduler, scheduler_hparams, criterion)
            record_df, fold_preds_df = trainer.fit(train_loader, valid_loader)

            fold_preds.append(fold_preds_df)
            fold_records.append(record_df)

            # Drop this fold's model, trainer and loaders before the next fold
            # builds its own, so two models are never alive at the same time.
            del trainer, model, optim_params, train_loader, valid_loader
            gc.collect()
            torch.cuda.empty_cache()

            # NOTE(review): with a single fold the loop ends here anyway, so this
            # break has no effect; kept unchanged pending confirmation of intent.
            if debug_run and len(conf.train_fold_list) == 1: break
    finally:
        conf.batch_size = original_batch_size

    oof_df = pd.concat(fold_preds, axis=0)
    cv_record_df = pd.concat(fold_records, axis=0)

    oof_df = oof_df.sort_values(by='essay_id').reset_index(drop=True)
    # merge returns a new frame, so `df` is not modified and needs no copy.
    score_df = df.merge(oof_df, on='essay_id', how='inner')
    if conf.num_labels == 1:
        # Both frames carry a `score` column, so the merge suffixes them:
        # `_x` comes from `df` (truth), `_y` from the out-of-fold frame (prediction).
        y_trues = score_df['score_x'].values
        y_preds = score_df['score_y'].values
    else:
        y_trues = score_df['score'].values
        y_preds = score_df['pred_c'].values
    qwk = cohen_kappa_score(y_trues, y_preds, weights='quadratic')
    overall_cv = pd.DataFrame({'fold': [np.nan], 'epoch': [np.nan], 'train_loss': [np.nan], 'valid_loss': [np.nan], 'qwk_score': [qwk]})
    cv_record_df = pd.concat([cv_record_df, overall_cv], axis=0).reset_index(drop=True)
    # Row label of the best-QWK epoch of each trained fold (index is unique after the reset).
    best_epoch_idx = [cv_record_df[cv_record_df['fold'] == i]['qwk_score'].idxmax() for i in conf.train_fold_list]
    best_epoch_record = pd.concat([cv_record_df[cv_record_df.index.isin(best_epoch_idx)], overall_cv], axis=0).reset_index(drop=True)

    display(cv_record_df)
    display(best_epoch_record)

    # Join with Path so a save_path without a trailing slash still lands inside
    # the directory.
    save_dir = Path(conf.save_path)
    cv_record_df.to_csv(save_dir / 'cv_record.csv', index=False)
    best_epoch_record.to_csv(save_dir / 'best_epoch_record.csv', index=False)
    oof_df.to_csv(save_dir / 'oof_df.csv', index=False)

# Run Training

In [17]:
# Full (non-debug) run over every fold in CONF.train_fold_list. pretrained_path=None
# means each fold's model is built with get_model rather than loaded via load_model.
run_training(CONF, train_df_with_prompt, debug_run=False, pretrained_path=None)

Extracted config
fold
0    4326
1    4326
2    4327
3    4326
dtype: int64


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

exp: 200
--- FOLD 0 ---
TRAINL_LOOP
[TRAIN_F0], E: 1/3, S: 0000/1622, L: 5.89558, LR: 0.00001000, T: 0:00:02
[TRAIN_F0], E: 1/3, S: 0200/1622, L: 0.73856, LR: 0.00000959, T: 0:04:31
[TRAIN_F0], E: 1/3, S: 0400/1622, L: 0.58617, LR: 0.00000918, T: 0:08:54
[TRAIN_F0], E: 1/3, S: 0600/1622, L: 0.51282, LR: 0.00000877, T: 0:13:17
[TRAIN_F0], E: 1/3, S: 0800/1622, L: 0.47523, LR: 0.00000836, T: 0:17:42
[TRAIN_F0], E: 1/3, S: 1000/1622, L: 0.45235, LR: 0.00000794, T: 0:22:06
[TRAIN_F0], E: 1/3, S: 1200/1622, L: 0.43199, LR: 0.00000753, T: 0:26:38
[TRAIN_F0], E: 1/3, S: 1400/1622, L: 0.42038, LR: 0.00000712, T: 0:31:16
[TRAIN_F0], E: 1/3, S: 1600/1622, L: 0.40932, LR: 0.00000671, T: 0:35:48
[TRAIN_F0], E: 1/3, S: 1621/1622, L: 0.40807, LR: 0.00000667, T: 0:36:15

VALID_LOOP
[VALID_F0], E: 1/3, S: 000/541, L: 0.41967, T: 0:00:01
[VALID_F0], E: 1/3, S: 200/541, L: 0.32224, T: 0:01:09
[VALID_F0], E: 1/3, S: 400/541, L: 0.32552, T: 0:02:21
[VALID_F0], E: 1/3, S: 540/541, L: 0.32632, T: 0:03:11

-

Unnamed: 0,fold,epoch,train_loss,valid_loss,qwk_score
0,0.0,0.0,0.408075,0.326321,0.769493
1,0.0,1.0,0.282846,0.303899,0.80413
2,0.0,2.0,0.234511,0.285956,0.821575
3,1.0,0.0,0.413916,0.326767,0.80665
4,1.0,1.0,0.282998,0.303143,0.795432
5,1.0,2.0,0.236754,0.2896,0.819221
6,2.0,0.0,0.40947,0.338669,0.761817
7,2.0,1.0,0.283495,0.291195,0.834602
8,2.0,2.0,0.23532,0.281618,0.828678
9,3.0,0.0,0.409942,0.336999,0.799193


Unnamed: 0,fold,epoch,train_loss,valid_loss,qwk_score
0,0.0,2.0,0.234511,0.285956,0.821575
1,1.0,2.0,0.236754,0.2896,0.819221
2,2.0,1.0,0.283495,0.291195,0.834602
3,3.0,2.0,0.237034,0.2888,0.821621
4,,,,,0.824406
