# 1. Libraries

In [34]:
# Libraries
import os
import gc
import time
import math
import pickle
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


# 2. Configuration

In [35]:
## Configuration
class CFG:
    """Static configuration shared by the datasets, the model and the training loop.

    Used as a plain namespace: attributes are read directly from the class
    (e.g. ``CFG.seed``); ``tokenizer`` starts as ``None`` and is filled in
    after the class is defined.
    """
    # Data
    input_path = '../input'
    output_path = './'
    # os.path.join avoids the doubled separator ('.//config.pth') that plain
    # string concatenation produced with output_path = './'.
    config_path = os.path.join(output_path, 'config.pth')
    var_col = 'full_text'
    target_col = 'engagement_count'
    # Width of the model's output layer. TWModel reads CFG.num_classes, so it
    # has to be defined; one output matches the single target column above.
    # NOTE(review): the head then emits shape (batch, 1) -- make sure the loss
    # reshapes predictions/targets so they line up.
    num_classes = 1

    # General
    seed = 2023
    nfolds = 5

    # Model
    model_path = 'D:/Kaggle/Hackathon/20_deberta/input/deberta-base/deberta-base'
    model_name = 'deberta-base'
    tokenizer = None
    max_len = 512
    batch_size = 8
    num_workers = 4

    # Optimizer params
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    weight_decay = 0.01  # regularization: penalizes large weights so the model generalizes better
    eps = 1e-6
    betas = (0.9, 0.999)
    max_norm = 1000

    # Scheduler
    scheduler = 'cosine'  # one of ['linear', 'cosine']
    num_cycles = 0.5
    num_warmup_steps = 0
    
# Load the tokenizer that ships with the checkpoint, attach it to the shared
# config, and keep a copy next to the other outputs.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)
tokenizer = CFG.tokenizer
tokenizer_dir = os.path.join(CFG.output_path, 'tokenizer')
tokenizer.save_pretrained(tokenizer_dir)

# 3. Utils

In [36]:
### Utils
def seed_everything(seed):
    """Seed every random number generator used in this file.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects Python processes started after this point (e.g. workers),
    # not the hashing of the already-running interpreter.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
    
def RMSE(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed with NumPy so it does not depend on the ``squared=False``
    argument of ``sklearn.metrics.mean_squared_error``, which newer
    scikit-learn releases no longer accept.

    Args:
        y_true: Array-like of ground-truth values, 1-D or (n_samples, n_outputs).
        y_pred: Array-like of predictions with a matching shape.

    Returns:
        The RMSE as a float. For 2-D input the per-column RMSEs are averaged.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # Treat 1-D input as a single output column so (n,) and (n, 1) are
    # interchangeable.
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: {y_true.shape} vs {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("RMSE is undefined for empty input")
    per_output = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return float(np.mean(per_output))

# def rmse(predictions, targets):
#     return np.sqrt(((predictions - targets) ** 2).mean())

def MCRMSE(y_trues, y_preds):
    """Return the mean column-wise root mean squared error.

    The RMSE is computed separately for every target column and the
    per-column scores are then averaged. Implemented with NumPy so it does
    not depend on the ``squared=False`` argument of
    ``sklearn.metrics.mean_squared_error``, which newer scikit-learn
    releases no longer accept.

    Args:
        y_trues: Array-like of shape (n_samples, n_targets); a 1-D input is
            treated as a single target column.
        y_preds: Array-like of predictions with a matching shape.

    Returns:
        The average of the per-column RMSE values as a float.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    # A single target may arrive as a flat vector; view it as one column
    # rather than failing on shape[1].
    if y_trues.ndim == 1:
        y_trues = y_trues.reshape(-1, 1)
    if y_preds.ndim == 1:
        y_preds = y_preds.reshape(-1, 1)
    if y_trues.shape != y_preds.shape:
        raise ValueError(
            f"y_trues and y_preds have inconsistent shapes: {y_trues.shape} vs {y_preds.shape}"
        )
    if y_trues.size == 0:
        raise ValueError("MCRMSE is undefined for empty input")
    # Reduce over the sample axis only, leaving one RMSE per target column.
    column_rmse = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0))
    return float(np.mean(column_rmse))

class LossAggregator:
    """Running weighted average of a value such as a per-batch loss.

    Attributes are public: ``count`` is the total weight seen so far,
    ``sum`` the weighted total and ``avg`` their ratio.
    """

    def __init__(self):
        self.count, self.sum, self.avg = 0, 0, 0

    def update(self, count, value):
        """Fold in ``value`` observed with weight ``count`` and refresh ``avg``."""
        self.count += count
        self.sum += value * count
        self.avg = self.sum / self.count

In [37]:
### Dataset and Model
class TrainDataset(Dataset):
    """Dataset yielding ``(tokenized inputs, target)`` pairs for training/validation.

    Args:
        df: DataFrame holding the text column ``cfg.var_col`` and the target
            column ``cfg.target_col``.
        cfg: Configuration object providing ``var_col``, ``target_col``,
            ``tokenizer`` and ``max_len``.
    """

    def __init__(self, df, cfg):
        self.df = df
        self.cfg = cfg
        # Read the column names from the cfg that was passed in rather than
        # from the global CFG, so the dataset honours its own argument.
        self.texts = df[cfg.var_col].values
        self.targets = df[cfg.target_col].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize one text and return it together with its float target."""
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            # Replaces the deprecated pad_to_max_length=True; every sample is
            # padded to max_len.
            padding='max_length',
        )
        # Convert in place so the tokenizer's own container type is kept.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)

        targets = torch.tensor(self.targets[index], dtype=torch.float)
        return inputs, targets

class TestDataset(Dataset):
    """Dataset yielding tokenized inputs only, for inference on unlabeled text.

    Args:
        df: DataFrame holding the text column ``cfg.var_col``.
        cfg: Configuration object providing ``var_col``, ``tokenizer`` and
            ``max_len``.
    """

    def __init__(self, df, cfg):
        self.df = df
        self.cfg = cfg
        # Read the column name from the cfg that was passed in rather than
        # from the global CFG, so the dataset honours its own argument.
        self.texts = df[cfg.var_col].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        """Tokenize one text and return its encoded fields as long tensors."""
        inputs = self.cfg.tokenizer.encode_plus(
            self.texts[index],
            truncation=True,
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            # Replaces the deprecated pad_to_max_length=True; every sample is
            # padded to max_len.
            padding='max_length',
        )
        # Convert in place so the tokenizer's own container type is kept.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)

        return inputs
    
def Collate(inputs):
    """Trim every field of a padded batch to its longest real sequence.

    The longest sequence is taken from the row sums of
    ``inputs["attention_mask"]``; every entry of ``inputs`` is then cut to
    that many columns. The mapping is modified in place and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs
    
class MeanPooling(nn.Module):
    """Average token embeddings along dimension 1, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state``.

        ``attention_mask`` gets a trailing axis and is expanded to the size
        of ``last_hidden_state`` so it can weight every hidden feature.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        token_counts = weights.sum(1).clamp(min=1e-9)
        return masked_total / token_counts

class TWModel(nn.Module):
    """Transformer encoder with mean pooling and a linear output head.

    Args:
        model_path: Path or hub name of the pretrained checkpoint; used for
            both the config and (optionally) the weights.
        config_path: Optional path to a config object previously stored with
            ``torch.save``. When ``None`` the config is loaded from
            ``model_path`` and all dropout probabilities are set to zero.
        model_from_pretrained: If true, load pretrained weights; otherwise
            build the architecture from the config with fresh weights.
    """

    def __init__(self, model_path, config_path=None, model_from_pretrained=True):
        super().__init__()
        if config_path is None:
            # Honour the model_path argument instead of the global
            # CFG.model_path so the class works for any checkpoint.
            self.config = AutoConfig.from_pretrained(model_path, output_hidden_states=True)
            # Disable every dropout setting on the encoder config.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # NOTE(security): torch.load unpickles arbitrary objects; only
            # point config_path at files produced by this project.
            self.config = torch.load(config_path)

        if model_from_pretrained:
            self.model = AutoModel.from_pretrained(model_path, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        self.pool = MeanPooling()
        # CFG.num_classes may be missing from the config class; fall back to
        # a single output so construction does not fail with AttributeError.
        num_outputs = getattr(CFG, 'num_classes', 1)
        self.fc = nn.Linear(self.config.hidden_size, num_outputs)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialize ``module`` using the encoder's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the encoder and mean-pool its first output over the attention mask."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        return self.pool(last_hidden_states, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the linear head's output for a batch of tokenized inputs."""
        return self.fc(self.feature(inputs))

def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build optimizer parameter groups for the encoder and the head.

    Args:
        model: Module whose encoder is stored in the ``model`` attribute;
            every other parameter is treated as part of the head.
        encoder_lr: Learning rate for the encoder parameters.
        decoder_lr: Learning rate for the head parameters.
        weight_decay: Weight decay for encoder parameters that are not
            biases or LayerNorm parameters.

    Returns:
        A list of three dicts usable as optimizer ``params``: decayed encoder
        weights, non-decayed encoder parameters, and head parameters.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def skips_decay(name):
        return any(tag in name for tag in no_decay)

    encoder_named = list(model.model.named_parameters())
    # Encoder parameters are registered under the "model." prefix. Matching
    # the prefix (rather than the substring "model" anywhere in the name)
    # keeps head parameters whose names merely contain "model".
    head_params = [p for n, p in model.named_parameters() if not n.startswith("model.")]

    return [
        {'params': [p for n, p in encoder_named if not skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in encoder_named if skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params,
         'lr': decoder_lr, 'weight_decay': 0.0},
    ]

def get_scheduler(cfg, optimizer, num_train_steps):
    """Create the learning-rate scheduler selected by ``cfg.scheduler``.

    Args:
        cfg: Configuration providing ``scheduler`` ('linear' or 'cosine'),
            ``num_warmup_steps`` and, for 'cosine', ``num_cycles``.
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of optimizer steps in the run.

    Returns:
        The scheduler built by the matching ``transformers`` helper.

    Raises:
        ValueError: If ``cfg.scheduler`` is not a supported name.
    """
    if cfg.scheduler == 'linear':
        # Read warmup steps from the cfg argument; the previous reference to
        # an undefined name `config` raised NameError on this branch.
        return get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
        )
    if cfg.scheduler == 'cosine':
        return get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=cfg.num_warmup_steps,
            num_training_steps=num_train_steps,
            num_cycles=cfg.num_cycles,
        )
    raise ValueError(
        f"Unsupported scheduler {cfg.scheduler!r}; expected 'linear' or 'cosine'"
    )

In [38]:
# Seed all random number generators with the configured seed before the fold
# split and model construction that follow.
seed_everything(CFG.seed)

In [39]:
##### Train 
# Read data
# Load the training table from <input_path>/expert_training.csv.
print('>>> Reading train dataset...\n')
train = pd.read_csv(os.path.join(CFG.input_path, 'expert_training.csv'))
    
# kfold
# Assign every row a validation fold id in [0, nfolds). The split is
# stratified on the target column itself.
# NOTE(review): StratifiedKFold expects discrete class labels; this presumably
# works because the target holds integer counts -- confirm, or bin the target
# first if it has many distinct values.
print('>>> Preparing Kfold train dataset...\n')
kfold = StratifiedKFold(n_splits=CFG.nfolds, shuffle=True, random_state=CFG.seed)
for i, (tr_index, vl_index) in enumerate(kfold.split(train, train[CFG.target_col])):
    # Only the validation indices are needed: each row is in exactly one validation fold.
    train.loc[vl_index, 'fold'] = int(i)
# The column is created by partial .loc assignment, so cast it to int once every row is filled.
train['fold'] = train['fold'].astype(int)

>>> Reading train dataset...

>>> Preparing Kfold train dataset...



In [40]:
# Run for Fold
# Run for Fold
# Per-fold driver: builds the data loaders and the model for each fold.
# NOTE(review): as captured here the loop body stops after saving the config;
# no optimization or validation step is visible in this cell.
print('>>> Started training...\n')
df_oof = pd.DataFrame()  # created empty; not filled anywhere in the visible part of this cell

for fold in range(CFG.nfolds):
    print(f'<<Fold {fold+1} of {CFG.nfolds}>>')

    # Fold data, dataloader: rows tagged with the current fold id are held out
    # for validation, all other rows are used for training.
    df_train = train[train['fold'] != fold].reset_index(drop=True)
    df_valid = train[train['fold'] == fold].reset_index(drop=True)

    # Both splits use TrainDataset because validation also needs the targets.
    ds_train = TrainDataset(df_train, CFG)
    ds_valid = TrainDataset(df_valid, CFG)
    
    # Only the training loader shuffles; validation keeps the DataFrame row order.
    train_loader = DataLoader(ds_train, batch_size = CFG.batch_size, shuffle=True, num_workers=CFG.num_workers, pin_memory=True)
    valid_loader = DataLoader(ds_valid, batch_size = CFG.batch_size, shuffle=False, num_workers=CFG.num_workers, pin_memory=True)

    # Model: a fresh pretrained model per fold; its config is written to
    # <output_path>/config.pth (rewritten on every fold).
    model = TWModel(CFG.model_path, config_path=None, model_from_pretrained=True).to(device)
    torch.save(model.config, os.path.join(CFG.output_path, 'config.pth'))

>>> Started training...

<<Fold 1 of 5>>


RuntimeError: Failed to import transformers.models.deberta.modeling_deberta because of the following error (look up to see its traceback):
No module named 'six.moves.collections_abc'