# About this notebook
- This notebook is a modified version of the PyTorch pipeline from Y.Nakama's starter NLP notebook from Feedback Prize 3 competition [here](https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train). Don't forget to upvote his work!
- Inference notebook is [here](https://www.kaggle.com/mohammad2012191/debertav3-pytorch-baseline-inference-cv-0-467)

In [None]:
!nvidia-smi

Wed Sep 27 01:26:27 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    47W / 400W |      0MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import runtime



# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
# Experiment configuration. Attributes are read directly off the class
# (CFG.xxx); the class is never instantiated in the visible code.
class CFG:
    # Experiment tag; becomes the last component of OUTPUT_DIR.
    exp='exp092'
    is_exp=False
    # When True, the Weights & Biases setup cell logs in and starts a run.
    wandb=False
    competition='FB3'
    _wandb_kernel='nakama'
    # When True: fewer epochs, a single fold, and a 3000-row subsample.
    debug=False
    # Passed as `enabled=` to torch.cuda.amp autocast / GradScaler.
    apex=True
    # Training progress is printed every `print_freq` steps.
    print_freq=20
    num_workers=4
    # Hugging Face checkpoint used for both the tokenizer and the backbone.
    model="microsoft/deberta-v3-large"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    # When True the scheduler is stepped after every optimizer step.
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    epochs=3
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.98)
    batch_size=8
    # NOTE: overwritten with 1024 in the "Define max_len" cell further down.
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    # Regression targets; TrainDataset reads these columns as labels.
    target_cols=['content', 'wording']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    # Adversarial Weight Perturbation: active when `awp` is True and the
    # 1-based epoch number is >= nth_awp_start_epoch.
    awp=False
    nth_awp_start_epoch= 3
    adv_lr = 1e-4
    adv_eps = 1e-2
    eval_steps =70
    save_strategy='epoch'
    # Pooling head selected in CustomModel; n_layers is the number of last
    # hidden layers concatenated by ConcatPooling.
    pooling='ConcatPooling'
    n_layers=10
    # When `freeze` is True, CustomModel freezes encoder.layer[:freeze_top_num_layer],
    # i.e. the FIRST `freeze_top_num_layer` encoder layers (indices 0..N-1).
    freeze=True
    freeze_top_num_layer=14
    lr_weight_decay=0.95
    reinit=False


# Debug mode: shorten the run to 2 epochs on fold 0 only.
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# All artifacts of this experiment (logs, tokenizer, checkpoints) go here.
OUTPUT_DIR = f'/content/drive/MyDrive/Kaggle/outputs/{CFG.exp}/'

# exist_ok=True makes the cell idempotent and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:

    import wandb

    # Try to log in with the token stored in Kaggle secrets; outside Kaggle
    # (or without the secret) fall back to an anonymous W&B run.
    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return the non-dunder attributes of class/object `f` as a dict."""
        return {name: getattr(f, name) for name in dir(f) if not name.startswith('__')}

    run = wandb.init(project='FB3-Public',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install -q transformers')
!pip install transformers==4.31.0
os.system('pip install -q tokenizers')
!pip install tokenizers==0.13.3
!pip install sentencepiece


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers==4.31.0)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m 

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: array-like of the same shape with predictions.

    Returns:
        (mcrmse_score, scores): the mean of the per-column RMSEs and the list
        of per-column RMSEs, in column order.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # Computed with NumPy so the result does not depend on scikit-learn's
    # deprecated `squared=False` argument of mean_squared_error.
    scores = list(np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Competition metric: returns (MCRMSE, per-column RMSE list) from MCRMSE."""
    return MCRMSE(y_trues, y_preds)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Create the notebook logger writing to the console and to `<filename>.log`.

    Args:
        filename: log file path without the `.log` suffix.

    Returns:
        The `logging.Logger` named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running the cell
    # would otherwise stack handlers and print every message several times.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()
    # Keep records away from the root logger, which otherwise echoes each
    # message a second time (e.g. "INFO:__main__:...").
    logger.propagate = False
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Folder on the mounted Google Drive holding the competition CSV files.
input_path = '/content/drive/MyDrive/Kaggle/inputs/'
train = pd.read_csv(input_path+'summaries_train.csv')
test = pd.read_csv(input_path+'summaries_test.csv')
submission = pd.read_csv(input_path+'sample_submission.csv')
prompt_train = pd.read_csv(input_path+'prompts_train.csv')
prompt_test = pd.read_csv(input_path+'prompts_test.csv')
# Attach the prompt columns to every summary row; a left join keeps all
# summaries even if a prompt_id has no match in the prompts table.
train = pd.merge(train,prompt_train,how='left',on='prompt_id')
test = pd.merge(test,prompt_test,how='left',on='prompt_id')
# Quick look at the shapes and first rows of each frame.
print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (7165, 8)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


test.shape: (4, 6)


Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...
1,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...
2,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...


submission.shape: (4, 3)


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [None]:
# oof_df=pd.read_pickle(input_path+'oof_df.pkl')

In [None]:
def _format_model_input(df):
    """Build the model input string for every row of `df`.

    Layout: ' question: <prompt_question> [SEP] summary: <text> [SEP] source: <prompt_text>'
    with the prompt text cut to its first 6000 characters. Train and test go
    through this one function so both use exactly the same template.
    """
    return (' question: ' + df['prompt_question']
            + ' [SEP] summary: ' + df['text']
            + ' [SEP] source: ' + df['prompt_text'].str[:6000])


# NOTE: this overwrites the raw summary in 'text'; re-running the cell would
# wrap the already formatted text a second time.
train['text'] = _format_model_input(train)
test['text'] = _format_model_input(test)

#################################################
# Variant that also includes prompt_title (disabled)
#################################################

# # Compute the length of the "text" column and store it in a new "length" column
# train['length'] = train['text'].apply(len)
# # Prepend the "length" value to the "text" column
# train['text'] = train['length'].astype(str) + '[SEP]' + train['prompt_question'] + '[SEP]' +train['prompt_title'] + 'summary(' + train['text'] +') [SEP] source of summary('+train['prompt_text']+')'

# # Compute the length of the "text" column and store it in a new "length" column
# test['length'] = test['text'].apply(len)
# # Prepend the "length" value to the "text" column
# test['text'] = test['length'].astype(str) + '[SEP]' + test['prompt_question'] + '[SEP]' +test['prompt_title'] + 'summary(' + test['text'] +') [SEP] source of summary('+test['prompt_text']+')'


# CV split

In [None]:
# ====================================================
# CV split
# ====================================================
# Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
#     train.loc[val_index, 'fold'] = int(n)
# One fold per prompt: every validation fold holds a prompt that the model
# never saw during training on the other folds.
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

fold_ids = train["prompt_id"].map(id2fold)
# A prompt_id missing from id2fold maps to NaN; fail with a readable message
# instead of the cast error the int conversion would raise.
unmapped_ids = sorted(train.loc[fold_ids.isna(), "prompt_id"].astype(str).unique())
if unmapped_ids:
    raise ValueError(f"prompt_id values missing from id2fold: {unmapped_ids}")
train["fold"] = fold_ids.astype(int)
display(train.groupby('fold').size())

fold
0    1103
1    2057
2    2009
3    1996
dtype: int64

In [None]:
# Debug mode: keep a fixed random subset of 3000 rows and show the fold
# sizes before and after subsampling.
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=3000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [None]:
CFG.model

'microsoft/deberta-v3-large'

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches the backbone checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# Save a copy next to the other experiment outputs.
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# Stored on CFG so prepare_input() can reach it through its `cfg` argument.
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
train['text'].iloc[2]

' question: In complete sentences, summarize the structure of the ancient Egyptian system of government. How were different social classes involved in this government? Cite evidence from the text. [SEP] summary: In Egypt, there were many occupations and social classes involved in day-to-day living. In many instances if you were at the bottom of the social ladder you could climb up, you didn\'t have to stay a peasant you could work to bring your status up. Everyone worshipped the gods Ra, Osiris, and Isis, but also they would worship their pharaohs like gods as well. Under the pharaohs were the priests, they had the responsibility to entertain or please the said god. The Chain of Command was placed to keep everyone in check, not one person could handle all the civilians and treasures without any aid. Like the tax collector, called a vizier like stated they were in charge of collecting the peoples\' tax. They were also one of the rare instances who were able to read and write, that\'s ho

In [None]:
# Encode a sample text
text = 'unnko'
encoded = tokenizer(text, return_tensors='pt')

# Map the ids back to tokens to inspect how the text was split

decoded_tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
# Tokens are joined with spaces for display; this is not tokenizer.decode().
decoded_text = " ".join(decoded_tokens)

print(f"Original text: {text}")
print(f"Encoded: {encoded}")
print(f"Decoded text: {decoded_text}")

Original text: unnko
Encoded: {'input_ids': tensor([[   1, 1655,  673, 4712,    2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Decoded text: [CLS] ▁un n ko [SEP]


# Dataset

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count (without special tokens) of every training text.
lengths = []
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
# NOTE: `text` is a plain module-level name and is rebound by this loop.
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls & sep
# The measured maximum is immediately replaced by a fixed cap of 1024 tokens,
# so longer inputs are truncated by prepare_input().
CFG.max_len=1024
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 1024
INFO:__main__:max_len: 1024


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length LongTensors.

    Args:
        cfg: config object providing `tokenizer` and `max_len`.
        text: the input string.

    Returns:
        The tokenizer output (input_ids, attention_mask, ...) with every value
        converted to a 1-D `torch.long` tensor, padded or truncated to
        `cfg.max_len` tokens.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        # Read the limit from the `cfg` argument, not from the global CFG, so
        # the function honours whatever config the caller passes in.
        max_length=cfg.max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Map-style dataset yielding (tokenized inputs, float label tensor) pairs.

    Texts come from the 'text' column of `df`; labels from `cfg.target_cols`.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Tokenization happens lazily, one sample per access.
        encoded = prepare_input(self.cfg, self.texts[item])
        target = torch.tensor(self.labels[item], dtype=torch.float)
        return encoded, target


def collate(inputs):
    """Trim a padded batch to its longest real sequence.

    The length is the largest row sum of `attention_mask`; every tensor in
    `inputs` is sliced to that many columns. The dict is modified in place
    and returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

# Model

In [None]:
#ref:https://github.com/shu421/kagglib/blob/main/nlp/model.py
# ====================================================
# Model
# ====================================================

def freeze(module):
    """
    Turn off gradient tracking for every parameter of `module`.
    """
    for weight in module.parameters():
        weight.requires_grad = False
# =====================================================
# Pooling
# =====================================================
class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the sequence axis (dim 1)."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so padded positions
        # contribute nothing to the sum.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = torch.sum(last_hidden_state * mask, 1)
        # Clamp guards against division by zero for fully masked rows.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        return summed / token_counts



class AttentionPooling(nn.Module):
    """Learned attention pooling over the sequence axis.

    A small MLP scores every token; padded tokens are excluded and the hidden
    states are summed with the softmax-normalised scores.

    Usage:
        self.pool = AttentionPooling(self.config.hidden_size)

    Args:
        in_dim: size of the hidden dimension of the incoming states.
        initializer_range: std of the normal distribution used to initialise
            the linear weights (0.02 by default; pass the backbone's
            `config.initializer_range` to match it).
    """
    def __init__(self, in_dim, initializer_range=0.02):
        super().__init__()
        # Stored on the module itself: this class has no `config` attribute.
        self.initializer_range = initializer_range
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

        # `apply` visits every submodule; calling `_init_weights` on the
        # Sequential container itself would match no branch and do nothing.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one submodule (Linear / Embedding / LayerNorm)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        """Pool `last_hidden_state` over dim 1 using `attention_mask` (0 = padding)."""
        scores = self.attention(last_hidden_state).float()
        # -inf scores get zero weight after the softmax.
        scores = scores.masked_fill(attention_mask.unsqueeze(-1) == 0, float("-inf"))
        weights = torch.softmax(scores, 1)
        attention_embeddings = torch.sum(weights * last_hidden_state, dim=1)
        return attention_embeddings



class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden layers `layer_start..`, returning position 0.

    Unless `layer_weights` is supplied, one learnable weight per used layer is
    created and initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_used_layers = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(
                torch.tensor([1] * n_used_layers, dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # all_hidden_states is indexed with four axes; the first is the layer.
        selected = all_hidden_states[self.layer_start:, :, :, :]
        # Append three singleton axes so the weights broadcast per layer.
        per_layer = self.layer_weights[..., None, None, None].expand(selected.size())
        averaged = (per_layer * selected).sum(dim=0) / self.layer_weights.sum()
        return averaged[:, 0]

class LSTMPooling(nn.Module):
    """Run an LSTM/GRU over the first-token state of every encoder layer.

    The layers form the sequence axis: for each sample the recurrent network
    reads the position-0 vectors of layers 1..num_layers in order, and the
    output at the last layer is returned (after dropout).

    Args:
        num_layers: number of encoder layers to read (layers 1..num_layers).
        hidden_size: hidden size of the encoder states.
        hiddendim_lstm: hidden size of the recurrent network.
        dropout_rate: dropout applied to the pooled output.
        is_lstm: True for a unidirectional LSTM (output size hiddendim_lstm),
            False for a bidirectional GRU (output size 2 * hiddendim_lstm).
    """
    def __init__(self, num_layers, hidden_size, hiddendim_lstm, dropout_rate, is_lstm=True):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm

        if is_lstm:
            self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        else:
            self.lstm = nn.GRU(self.hidden_size, self.hiddendim_lstm, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, all_hidden_states):
        """Pool `all_hidden_states`, indexed as [layer][batch, position, hidden]."""
        # Stack along dim=1 to get (batch, num_layers, hidden) directly.
        # Stacking on the last axis and then calling .view() would reinterpret
        # memory and mix hidden units across layers; .squeeze() would also
        # drop the batch axis for a batch of one.
        hidden_states = torch.stack(
            [all_hidden_states[layer_i][:, 0]
             for layer_i in range(1, self.num_hidden_layers + 1)],
            dim=1,
        )
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out

class ConcatPooling(nn.Module):
    """Concatenate the last `n_layers` hidden states and keep position 0."""

    def __init__(self, n_layers=4):
        super().__init__()
        self.n_layers = n_layers

    def forward(self, all_hidden_states):
        # Last layer first, then the one before it, and so on.
        picked = [all_hidden_states[-depth] for depth in range(1, self.n_layers + 1)]
        stacked_features = torch.cat(picked, -1)
        return stacked_features[:, 0]

# GeM
class GeMText(nn.Module):
    """Generalized-mean pooling along `dim` with a learnable exponent `p`.

    Values are clamped to at least `eps`, multiplied by the attention mask,
    raised to `p`, averaged over the masked length, and the 1/p root is taken.
    `cfg` is accepted but not used.
    """

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, x, attention_mask):
        # `x` is pooled along self.dim; the mask is broadcast to x's shape.
        mask = attention_mask.unsqueeze(-1).expand(x.shape)
        powered_sum = (x.clamp(min=self.eps) * mask).pow(self.p).sum(self.dim)
        token_count = mask.sum(self.dim).clip(min=self.eps)
        return (powered_sum / token_count).pow(1 / self.p)

# ===========================================
# custom Model
# ===========================================

class CustomModel(nn.Module):
    """Transformer backbone + pooling head + linear regressor (2 outputs).

    Args:
        cfg: config object; reads `model`, `pooling`, `gradient_checkpointing`,
            `freeze`, `freeze_top_num_layer`, plus `n_layers` (ConcatPooling)
            and optionally `hidden_size` (LSTM/GRU pooling).
        config_path: path to a saved backbone config; when None the config is
            downloaded for `cfg.model` and all dropout probabilities are zeroed.
        pretrained: load pretrained weights if True, otherwise build the
            architecture from the config with fresh weights.

    Raises:
        ValueError: if `cfg.pooling` names an unsupported pooling head.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE: torch.load unpickles the file; only load configs you trust.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture without loading pretrained weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        hidden_size = self.config.hidden_size
        if cfg.pooling in ('LSTMPooling', 'GRUPooling'):
            is_lstm = cfg.pooling == 'LSTMPooling'
            # Fall back to the backbone width when cfg defines no hidden_size.
            rnn_hidden = getattr(cfg, 'hidden_size', hidden_size)
            self.pool = LSTMPooling(self.config.num_hidden_layers,
                                    hidden_size,
                                    rnn_hidden,
                                    0.1,
                                    is_lstm=is_lstm)
            # The GRU variant is bidirectional, so its output is twice as wide.
            self.fc = nn.Linear(rnn_hidden if is_lstm else 2 * rnn_hidden, 2)
        elif cfg.pooling == 'MeanPooling':
            self.pool = MeanPooling()
            self.fc = nn.Linear(hidden_size, 2)
        elif cfg.pooling == "GeM":
            self.pool = GeMText()
            self.fc = nn.Linear(hidden_size, 2)
        elif cfg.pooling == 'ConcatPooling':
            self.pool = ConcatPooling(n_layers=cfg.n_layers)
            self.fc = nn.Linear(cfg.n_layers * hidden_size, 2)
        elif cfg.pooling == 'WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
            self.fc = nn.Linear(hidden_size, 2)
        else:
            # Fail at construction time instead of with a missing `self.fc`.
            raise ValueError(f"unsupported pooling: {cfg.pooling!r}")

        self._init_weights(self.fc)

        # Freeze the first `freeze_top_num_layer` encoder layers.
        if self.cfg.freeze:
            freeze(self.model.encoder.layer[:self.cfg.freeze_top_num_layer])

    def _init_weights(self, module):
        """Initialise one module (Linear / Embedding / LayerNorm) like the backbone."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on `inputs` and return the pooled feature tensor."""
        outputs = self.model(**inputs)
        pooling = self.cfg.pooling
        if pooling in ('MeanPooling', 'GeM'):
            # These heads pool the last hidden state with the attention mask.
            feature = self.pool(outputs[0], inputs['attention_mask'])
        elif pooling in ('GRUPooling', 'LSTMPooling', 'ConcatPooling', 'WeightedLayerPooling'):
            # These heads consume the per-layer hidden states (outputs[1]).
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        else:
            raise ValueError(f"unsupported pooling: {pooling!r}")
        return feature

    def forward(self, inputs):
        """Return the regression outputs for a batch of tokenized `inputs`."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output


# initialize layer
def reinit_bert(model):
    """Re-initialise the last encoder layer of `model.model` in place.

    Linear and Embedding weights are redrawn from N(0, initializer_range),
    biases (and an Embedding's padding row) are zeroed, and LayerNorm is
    reset to weight 1 / bias 0.

    Args:
        model: wrapper exposing `model.encoder.layer` and `config.initializer_range`.

    Returns:
        The same `model` object.

    Usage:
        model = reinit_bert(model)
    """
    last_layers = model.model.encoder.layer[-1:]
    for block in last_layers:
        for sub in block.modules():
            if isinstance(sub, nn.LayerNorm):
                sub.bias.data.zero_()
                sub.weight.data.fill_(1.0)
            elif isinstance(sub, nn.Embedding):
                sub.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if sub.padding_idx is not None:
                    sub.weight.data[sub.padding_idx].zero_()
            elif isinstance(sub, nn.Linear):
                sub.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if sub.bias is not None:
                    sub.bias.data.zero_()
    return model

# Loss

In [None]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    """Element-wise sqrt(MSE + eps), reduced by 'mean', 'sum' or not at all.

    Any `reduction` value other than 'mean' or 'sum' returns the unreduced
    element-wise tensor.
    """

    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        # eps keeps the sqrt differentiable where the error is exactly zero.
        elementwise = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'sum':
            return elementwise.sum()
        if self.reduction == 'mean':
            return elementwise.mean()
        return elementwise



class WeightedSmoothL1Loss(nn.Module):
    """Smooth L1 loss with fixed multiplicative weights, averaged over all elements.

    Args:
        weights: tensor (or sequence) broadcast against the element-wise loss;
            with (batch, n_targets) inputs a length-n_targets vector weights
            each target column. Defaults to [0.5, 1.2].
    """
    def __init__(self, weights=None):
        super(WeightedSmoothL1Loss, self).__init__()
        # The default is built here rather than in the signature: a tensor
        # default is evaluated once at definition time and needs the global
        # `device` to exist (and the GPU to be available) at that moment.
        if weights is None:
            weights = torch.tensor([0.5, 1.2])
        self.weights = torch.as_tensor(weights)

    def forward(self, inputs, targets):
        """
        inputs: network outputs (predictions)
        targets: ground-truth labels
        Returns the mean of the weighted element-wise Smooth L1 loss.
        """
        # Element-wise Smooth L1 loss
        loss = nn.SmoothL1Loss(reduction='none')(inputs, targets)

        # Apply the weights on the same device as the loss, then average
        weighted_loss = torch.mean(loss * self.weights.to(loss.device))

        return weighted_loss


class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE: RMSE over dim 0 per column, then the mean of those."""

    def __init__(self):
        super().__init__()

    def forward(self, y_true, y_pred):
        squared_error = torch.square(y_true - y_pred)
        per_column_rmse = torch.sqrt(torch.mean(squared_error, dim=0))
        return torch.mean(per_column_rmse, dim=0)

# AWP

In [None]:
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    """Adversarial Weight Perturbation helper.

    Temporarily perturbs selected model parameters along their current
    gradients, computes the loss of the perturbed model, and can restore the
    original weights afterwards.

    Only parameters that require grad, currently hold a gradient, and whose
    name contains `adv_param` are touched.
    """
    def __init__(
        self,
        model: Module,
        criterion: _Loss,
        optimizer: Optimizer,
        apex: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        """
        Args:
            model: model whose parameters are perturbed.
            criterion: loss applied to the perturbed model's predictions.
            optimizer: optimizer whose gradients are cleared in attack_backward.
            apex: passed as `enabled` to torch.cuda.amp.autocast.
            adv_param: substring a parameter name must contain to be perturbed.
            adv_lr: step size of the perturbation.
            adv_eps: relative bound; each weight stays within
                backup +/- adv_eps * |weight|.
        """
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.apex = apex
        # name -> original weights, and name -> (lower, upper) clamp bounds.
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs: dict, label: Tensor) -> Tensor:
        """Perturb the weights and return the loss of the perturbed model.

        The caller is expected to call backward() on the returned loss and
        then `_restore()`; this method does neither. Gradients held by the
        optimizer are zeroed before returning.
        """
        with torch.cuda.amp.autocast(enabled=self.apex):
            self._save()
            self._attack_step() # move the weights to a nearby, worse (adversarial) point
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            # Entries whose label equals -1 are excluded from the mean.
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        """Shift each selected weight along its gradient and clamp it to its bounds."""
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # A loss backward pass must have run just before this so that
                    # the parameter gradients are available.
                    # Step = adv_lr * normalised gradient * weight norm.
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    # Clamp to [backup - eps, backup + eps] recorded by _save().
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        """Back up the selected weights and record their clamp bounds (once per name)."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        """Put the backed-up weights back and clear both backup tables."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

# Helper functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the latest value, average, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed over `n` samples and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration in seconds as '<minutes>m <seconds>s' (truncated to ints)."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return 'elapsed (remain remaining)' for a job started at `since`.

    `percent` is the completed fraction of the job; the total duration is
    extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch and return the average training loss.

    Args:
        fold: fold index, used only to label wandb metrics.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        model: network called as ``model(inputs)``.
        criterion: loss called as ``criterion(y_preds, labels)``.
        optimizer: optimizer stepped through the AMP ``GradScaler``.
        epoch: zero-based epoch index (used for logging and the AWP start check).
        scheduler: LR scheduler, stepped per optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: device the batch tensors are moved to.

    Returns:
        The batch-size-weighted mean of the per-batch losses.
    """
    use_awp = CFG.awp and CFG.nth_awp_start_epoch <= epoch + 1
    if use_awp:
        LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    awp = AWP(
            model,
            criterion,
            optimizer,
            CFG.apex,
            adv_lr=CFG.adv_lr,
            adv_eps=CFG.adv_eps
        )
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    # Only known once the first optimizer step has clipped the gradients.
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        # Report the true batch loss; only the value used for backward is
        # divided by the accumulation factor.
        losses.update(loss.item(), batch_size)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()

        if use_awp:
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale here. They must be
            # unscaled exactly once per optimizer step, after every backward of
            # the accumulation window, before their norm is measured/clipped.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            if CFG.batch_scheduler:
                scheduler.step()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_last_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_last_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Args:
        valid_loader: iterable yielding ``(inputs, labels)`` batches.
        model: network called as ``model(inputs)``; switched to eval mode here.
        criterion: loss called as ``criterion(y_preds, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, predictions)``: the batch-size-weighted mean loss and the
        per-batch model outputs concatenated into one NumPy array.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    start = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        # Nothing is accumulated during evaluation, so the loss is recorded as
        # is; dividing by CFG.gradient_accumulation_steps would under-report it.
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions





# def train_fn_by_step(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, now_step):

#     # if CFG.awp and epoch+1 >= CFG.nth_awp_start_epoch:
#     #     LOGGER.info(f'AWP training with epoch {epoch+1}')
#     model.train()
#     # awp = AWP(
#     #         model,
#     #         criterion,
#     #         optimizer,
#     #         CFG.apex,
#     #         adv_lr=CFG.adv_lr,
#     #         adv_eps=CFG.adv_eps
#     #     )
#     if now_step==0:
#       scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
#       losses = AverageMeter()
#       start = end = time.time()
#       global_step = 0
#     for step, (inputs, labels) in enumerate(train_loader):
#         if now_step>step:
#           continue
#         inputs = collate(inputs)
#         for k, v in inputs.items():
#             inputs[k] = v.to(device)
#         labels = labels.to(device)
#         batch_size = labels.size(0)
#         with torch.cuda.amp.autocast(enabled=CFG.apex):
#             y_preds = model(inputs)
#             loss = criterion(y_preds, labels)
#         if CFG.gradient_accumulation_steps > 1:
#             loss = loss / CFG.gradient_accumulation_steps
#         losses.update(loss.item(), batch_size)
#         scaler.scale(loss).backward()
#         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

#         # if CFG.awp and CFG.nth_awp_start_epoch <= epoch+1:
#         #     loss = awp.attack_backward(inputs, labels)
#         #     scaler.scale(loss).backward()
#         #     awp._restore()

#         if (step + 1) % CFG.gradient_accumulation_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#             global_step += 1
#             if CFG.batch_scheduler:
#                 scheduler.step()
#         end = time.time()
#         if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
#             print('Epoch: [{0}][{1}/{2}] '
#                   'Elapsed {remain:s} '
#                   'Loss: {loss.val:.4f}({loss.avg:.4f}) '
#                   'Grad: {grad_norm:.4f}  '
#                   'LR: {lr:.8f}  '
#                   .format(epoch+1, step, len(train_loader),
#                           remain=timeSince(start, float(step+1)/len(train_loader)),
#                           loss=losses,
#                           grad_norm=grad_norm,
#                           lr=scheduler.get_lr()[0]))

#         if CFG.wandb:
#             wandb.log({f"[fold{fold}] loss": losses.val,
#                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
#         if step%CFG.eval_steps==0:
#           return losses.avg, step+1 ,epoch

#     return losses.avg, step+1 ,epoch+1




# train loop

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train one cross-validation fold, evaluating once per epoch.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    split; all other rows form the training split. After each epoch the model
    is scored with ``get_score`` (lower is better) and, on improvement, its
    weights and validation predictions are saved to
    ``OUTPUT_DIR + "<model>_fold<fold>_best.pth"``.

    Args:
        folds: DataFrame with a ``fold`` column and the ``CFG.target_cols``.
        fold: index of the fold held out for validation.

    Returns:
        The validation rows with added ``pred_<target>`` columns taken from the
        best saved checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: decayed backbone weights, undecayed
        backbone bias/LayerNorm parameters, and everything outside ``model.model``."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def get_optimizer_grouped_parameters(cfg, model):
        """Layerwise Learning Rate Decay

        Alternative to ``get_optimizer_params`` (currently not used): walks the
        backbone from the last encoder layer down to the embeddings, multiplying
        the learning rate by ``cfg.lr_weight_decay`` after each layer.
        """
        model_type = "model"
        no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if model_type not in n],
                "lr": cfg.decoder_lr,
                "weight_decay": 0.0,
            },
        ]
        layers = [getattr(model, model_type).embeddings] + list(
            getattr(model, model_type).encoder.layer
        )
        layers.reverse()
        lr = cfg.encoder_lr
        for layer in layers:
            optimizer_grouped_parameters += [
                {
                    "params": [
                        p
                        for n, p in layer.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": cfg.weight_decay,
                    "lr": lr,
                },
                {
                    "params": [
                        p
                        for n, p in layer.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                    "lr": lr,
                },
            ]

            lr *= cfg.lr_weight_decay
        return optimizer_grouped_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # optimizer_parameters = get_optimizer_grouped_parameters(CFG,model)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the warmup scheduler selected by ``cfg.scheduler``.

        Raises:
            ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
        """
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # The scheduler advances once per optimizer step, i.e. once every
    # CFG.gradient_accumulation_steps batches, so the schedule length has to be
    # divided by the accumulation factor (unchanged when the factor is 1).
    num_train_steps = int(len(train_folds) / CFG.batch_size / CFG.gradient_accumulation_steps * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        # Lower score is better: keep only the best checkpoint of the fold.
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    # Out-of-fold predictions come from the best checkpoint, not the last epoch.
    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds












# ====================================================
# train loop by steps
# ====================================================
def train_loop_steps(folds, fold):
    """Train one cross-validation fold, evaluating every ``CFG.eval_steps`` steps.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    split; all other rows form the training split. Validation runs whenever
    ``step % CFG.eval_steps == 0`` (except step 0) and on the last step of each
    epoch. When the score from ``get_score`` improves (lower is better), the
    weights and validation predictions are saved to
    ``OUTPUT_DIR + "<model>_fold<fold>_best.pth"``.

    Args:
        folds: DataFrame with a ``fold`` column and the ``CFG.target_cols``.
        fold: index of the fold held out for validation.

    Returns:
        The validation rows with added ``pred_<target>`` columns taken from the
        best saved checkpoint.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    if CFG.reinit:
        model = reinit_bert(model)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: decayed backbone weights, undecayed
        backbone bias/LayerNorm parameters, and everything outside ``model.model``."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Create the warmup scheduler selected by ``cfg.scheduler``.

        Raises:
            ValueError: if ``cfg.scheduler`` is neither 'linear' nor 'cosine'.
        """
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            raise ValueError(f"Unsupported scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # The scheduler advances once per optimizer step, i.e. once every
    # CFG.gradient_accumulation_steps batches, so the schedule length has to be
    # divided by the accumulation factor (unchanged when the factor is 1).
    num_train_steps = int(len(train_folds) / CFG.batch_size / CFG.gradient_accumulation_steps * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf

    for epoch in range(CFG.epochs):

        model.train()
        # NOTE: AWP is not applied in this step-based loop.
        scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
        losses = AverageMeter()
        start = time.time()
        # Only known once the first optimizer step has clipped the gradients.
        grad_norm = float('nan')
        for step, (inputs, labels) in enumerate(tqdm(train_loader)):

            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            batch_size = labels.size(0)
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                y_preds = model(inputs)
                loss = criterion(y_preds, labels)
            # Report the true batch loss; only the value used for backward is
            # divided by the accumulation factor.
            losses.update(loss.item(), batch_size)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            scaler.scale(loss).backward()
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                # Gradients are still multiplied by the loss scale here; unscale
                # them once per optimizer step before measuring/clipping the norm.
                scaler.unscale_(optimizer)
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                if CFG.batch_scheduler:
                    scheduler.step()
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.8f}  '
                      .format(epoch+1, step, len(train_loader),
                              remain=timeSince(start, float(step+1)/len(train_loader)),
                              loss=losses,
                              grad_norm=grad_norm,
                              lr=scheduler.get_last_lr()[0]))
            if CFG.wandb:
                wandb.log({f"[fold{fold}] loss": losses.val,
                          f"[fold{fold}] lr": scheduler.get_last_lr()[0]})

            if (step % CFG.eval_steps==0 and step!=0) or step == (len(train_loader)-1):

                # valid
                losses_val = AverageMeter()
                model.eval()
                preds = []

                # Separate names keep the validation batch from overwriting the
                # training batch variables of the enclosing loop.
                for val_step, (val_inputs, val_labels) in enumerate(valid_loader):

                    val_inputs = collate(val_inputs)
                    for k, v in val_inputs.items():
                        val_inputs[k] = v.to(device)
                    val_labels = val_labels.to(device)
                    val_batch_size = val_labels.size(0)
                    with torch.no_grad():
                        val_preds = model(val_inputs)
                        val_loss = criterion(val_preds, val_labels)
                    # Nothing is accumulated during evaluation, so the loss is
                    # recorded without the accumulation divisor.
                    losses_val.update(val_loss.item(), val_batch_size)
                    preds.append(val_preds.to('cpu').numpy())

                    if val_step % CFG.print_freq == 0 or val_step == (len(valid_loader)-1):
                        print('EVAL: [{0}/{1}] '
                              'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                              .format(val_step, len(valid_loader),
                                      loss=losses_val))
                predictions = np.concatenate(preds)

                # scoring
                score, scores = get_score(valid_labels, predictions)

                # Lower score is better: keep only the best checkpoint of the fold.
                if best_score > score:
                    best_score = score
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                    torch.save({'model': model.state_dict(),
                                'predictions': predictions},
                                OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {losses.avg:.4f}  avg_val_loss: {losses_val.avg:.4f}')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
                # Resume training mode after the mid-epoch evaluation.
                model.train()

    # Out-of-fold predictions come from the best checkpoint, not the last step.
    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds






# ====================================================
# load saved out-of-fold predictions (no training)
# ====================================================
def prediction(folds, fold):
    """Attach the saved best-checkpoint predictions to a fold's validation rows.

    No training or inference happens here: the predictions are read from the
    ``'predictions'`` entry of ``OUTPUT_DIR + "<model>_fold<fold>_best.pth"``,
    so that file must already exist.

    Args:
        folds: DataFrame with a ``fold`` column.
        fold: index of the fold whose validation rows are returned.

    Returns:
        The validation rows with added ``pred_<target>`` columns.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # Only the validation split is needed; the training split, datasets and
    # data loaders were never read in this function and are no longer built.
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)

    # ====================================================
    # model
    # ====================================================
    # The model is not used for inference below; it is still built so that
    # config.pth keeps being written exactly as before.
    model = CustomModel(CFG, config_path=None, pretrained=True)
    if CFG.reinit:
        model = reinit_bert(model)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    model.eval()
    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds

In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Score the ``pred_<target>`` columns of ``oof_df`` against its targets and log it."""
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        # Each supported CFG.save_strategy maps to the routine that produces the
        # out-of-fold frame for one fold.
        fold_runners = {
            'epoch': train_loop,
            'step': train_loop_steps,
            'prediction': prediction,
        }
        # Fail fast on an unknown strategy: previously `_oof_df` would have been
        # undefined (or a stale frame left over from an earlier run of the cell).
        if CFG.save_strategy not in fold_runners:
            raise ValueError(
                f"Unsupported CFG.save_strategy: {CFG.save_strategy!r} "
                f"(expected one of {sorted(fold_runners)})"
            )
        run_fold = fold_runners[CFG.save_strategy]

        oof_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = run_fold(train, fold)
                oof_frames.append(_oof_df)

                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        # Concatenate once at the end rather than growing a frame inside the loop.
        oof_df = pd.concat(oof_frames) if oof_frames else pd.DataFrame()
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info("========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

    if CFG.wandb:
        wandb.finish()
    # Releases the Colab runtime once the run is finished.
    runtime.unassign()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch: [1][0/757] Elapsed 0m 5s (remain 65m 57s) Loss: 0.9861(0.9861) Grad: inf  LR: 0.00000009  
Epoch: [1][20/757] Elapsed 0m 24s (remain 14m 9s) Loss: 0.9393(1.2215) Grad: 99944.4531  LR: 0.00000185  
Epoch: [1][40/757] Elapsed 0m 43s (remain 12m 37s) Loss: 0.8005(1.0863) Grad: 85489.2031  LR: 0.00000361  
Epoch: [1][60/757] Elapsed 1m 2s (remain 11m 49s) Loss: 0.7167(1.0467) Grad: 388117.3438  LR: 0.00000537  
Epoch: [1][80/757] Elapsed 1m 21s (remain 11m 16s) Loss: 1.0258(0.9948) Grad: 359737.8750  LR: 0.00000714  
Epoch: [1][100/757] Elapsed 1m 40s (remain 10m 50s) Loss: 0.6835(0.9402) Grad: 571556.2500  LR: 0.00000890  
Epoch: [1][120/757] Elapsed 1m 59s (remain 10m 27s) Loss: 0.6780(0.9006) Grad: 236600.3281  LR: 0.00001066  
Epoch: [1][140/757] Elapsed 2m 18s (remain 10m 4s) Loss: 0.8513(0.8619) Grad: 249007.3281  LR: 0.00001242  
Epoch: [1][160/757] Elapsed 2m 37s (remain 9m 43s) Loss: 0.6394(0.8433) Grad: 150115.9688  LR: 0.00001419  
Epoch: [1][180/757] Elapsed 2m 56s (rema

Epoch 1 - avg_train_loss: 0.5833  avg_val_loss: 0.6624  time: 799s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5833  avg_val_loss: 0.6624  time: 799s
Epoch 1 - Score: 0.6785  Scores: [0.5540214195433778, 0.8028901100415703]
INFO:__main__:Epoch 1 - Score: 0.6785  Scores: [0.5540214195433778, 0.8028901100415703]
Epoch 1 - Save Best Score: 0.6785 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6785 Model


EVAL: [68/69] Elapsed 1m 13s (remain 0m 0s) Loss: 0.9612(0.6624) 
Epoch: [2][0/757] Elapsed 0m 1s (remain 14m 51s) Loss: 0.3503(0.3503) Grad: inf  LR: 0.00001686  
Epoch: [2][20/757] Elapsed 0m 19s (remain 11m 33s) Loss: 0.2778(0.4786) Grad: 102246.5234  LR: 0.00001663  
Epoch: [2][40/757] Elapsed 0m 38s (remain 11m 14s) Loss: 0.5601(0.4544) Grad: 113027.0469  LR: 0.00001640  
Epoch: [2][60/757] Elapsed 0m 57s (remain 10m 59s) Loss: 0.4905(0.4400) Grad: 133465.8438  LR: 0.00001616  
Epoch: [2][80/757] Elapsed 1m 16s (remain 10m 42s) Loss: 0.3294(0.4493) Grad: 67458.6719  LR: 0.00001591  
Epoch: [2][100/757] Elapsed 1m 35s (remain 10m 22s) Loss: 0.5408(0.4472) Grad: 55273.1641  LR: 0.00001566  
Epoch: [2][120/757] Elapsed 1m 55s (remain 10m 4s) Loss: 0.2114(0.4563) Grad: 71772.2578  LR: 0.00001541  
Epoch: [2][140/757] Elapsed 2m 14s (remain 9m 46s) Loss: 0.3068(0.4578) Grad: 30938.0020  LR: 0.00001515  
Epoch: [2][160/757] Elapsed 2m 33s (remain 9m 27s) Loss: 0.3654(0.4561) Grad: 11731

Epoch 2 - avg_train_loss: 0.4387  avg_val_loss: 0.5519  time: 796s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4387  avg_val_loss: 0.5519  time: 796s
Epoch 2 - Score: 0.5650  Scores: [0.5103575770084716, 0.6196013618203523]
INFO:__main__:Epoch 2 - Score: 0.5650  Scores: [0.5103575770084716, 0.6196013618203523]
Epoch 2 - Save Best Score: 0.5650 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5650 Model


EVAL: [68/69] Elapsed 1m 13s (remain 0m 0s) Loss: 0.7755(0.5519) 
Epoch: [3][0/757] Elapsed 0m 1s (remain 14m 59s) Loss: 0.5771(0.5771) Grad: inf  LR: 0.00000604  
Epoch: [3][20/757] Elapsed 0m 20s (remain 11m 44s) Loss: 0.2492(0.4114) Grad: 124716.1172  LR: 0.00000576  
Epoch: [3][40/757] Elapsed 0m 39s (remain 11m 25s) Loss: 0.3673(0.4124) Grad: 74738.1562  LR: 0.00000549  
Epoch: [3][60/757] Elapsed 0m 58s (remain 11m 4s) Loss: 0.3310(0.4011) Grad: 27653.2090  LR: 0.00000521  
Epoch: [3][80/757] Elapsed 1m 17s (remain 10m 45s) Loss: 0.3234(0.4011) Grad: 83773.8672  LR: 0.00000495  
Epoch: [3][100/757] Elapsed 1m 36s (remain 10m 26s) Loss: 0.4681(0.4001) Grad: 113445.2188  LR: 0.00000468  
Epoch: [3][120/757] Elapsed 1m 55s (remain 10m 7s) Loss: 0.4618(0.4016) Grad: 101065.9922  LR: 0.00000443  
Epoch: [3][140/757] Elapsed 2m 14s (remain 9m 47s) Loss: 0.2858(0.3962) Grad: 87506.5156  LR: 0.00000417  
Epoch: [3][160/757] Elapsed 2m 33s (remain 9m 26s) Loss: 0.4805(0.3932) Grad: 58111.

Epoch 3 - avg_train_loss: 0.3675  avg_val_loss: 0.6139  time: 794s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3675  avg_val_loss: 0.6139  time: 794s
Epoch 3 - Score: 0.6286  Scores: [0.5723919164536118, 0.6848512246914695]
INFO:__main__:Epoch 3 - Score: 0.6286  Scores: [0.5723919164536118, 0.6848512246914695]


EVAL: [68/69] Elapsed 1m 13s (remain 0m 0s) Loss: 0.8597(0.6139) 


Score: 0.5650  Scores: [0.5103575770084716, 0.6196013618203523]
INFO:__main__:Score: 0.5650  Scores: [0.5103575770084716, 0.6196013618203523]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size

Epoch: [1][0/638] Elapsed 0m 1s (remain 13m 7s) Loss: 1.7260(1.7260) Grad: inf  LR: 0.00000010  
Epoch: [1][20/638] Elapsed 0m 20s (remain 9m 56s) Loss: 1.2687(1.6267) Grad: 1251775.7500  LR: 0.00000220  
Epoch: [1][40/638] Elapsed 0m 39s (remain 9m 34s) Loss: 1.5753(1.3919) Grad: 919441.8125  LR: 0.00000429  
Epoch: [1][60/638] Elapsed 0m 58s (remain 9m 14s) Loss: 1.1463(1.2483) Grad: 666866.8750  LR: 0.00000639  
Epoch: [1][80/638] Elapsed 1m 17s (remain 8m 54s) Loss: 0.6129(1.1664) Grad: 2771121.5000  LR: 0.00000848  
Epoch: [1][100/638] Elapsed 1m 36s (remain 8m 34s) Loss: 0.6529(1.0927) Grad: 2677764.5000  LR: 0.00001058  
Epoch: [1][120/638] Elapsed 1m 55s (remain 8m 15s) Loss: 0.7784(1.0472) Grad: 782994.6250  LR: 0.00001267  
Epoch: [1][140/638] Elapsed 2m 15s (remain 7m 56s) Loss: 0.7543(1.0145) Grad: 525673.6875  LR: 0.00001476  
Epoch: [1][160/638] Elapsed 2m 34s (remain 7m 36s) Loss: 0.7329(0.9814) Grad: 1131583.6250  LR: 0.00001686  
Epoch: [1][180/638] Elapsed 2m 53s (rem

Epoch 1 - avg_train_loss: 0.7173  avg_val_loss: 0.5817  time: 753s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7173  avg_val_loss: 0.5817  time: 753s
Epoch 1 - Score: 0.5913  Scores: [0.6102958961758471, 0.5723110165897597]
INFO:__main__:Epoch 1 - Score: 0.5913  Scores: [0.6102958961758471, 0.5723110165897597]
Epoch 1 - Save Best Score: 0.5913 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5913 Model


EVAL: [128/129] Elapsed 2m 23s (remain 0m 0s) Loss: 0.5025(0.5817) 
Epoch: [2][0/638] Elapsed 0m 1s (remain 12m 50s) Loss: 0.4959(0.4959) Grad: inf  LR: 0.00001685  
Epoch: [2][20/638] Elapsed 0m 20s (remain 9m 52s) Loss: 0.5726(0.4936) Grad: 74735.0000  LR: 0.00001658  
Epoch: [2][40/638] Elapsed 0m 39s (remain 9m 32s) Loss: 0.5596(0.4977) Grad: 57000.2852  LR: 0.00001630  
Epoch: [2][60/638] Elapsed 0m 58s (remain 9m 12s) Loss: 0.5799(0.4946) Grad: 123864.4453  LR: 0.00001601  
Epoch: [2][80/638] Elapsed 1m 17s (remain 8m 53s) Loss: 0.4717(0.4849) Grad: 149995.6562  LR: 0.00001572  
Epoch: [2][100/638] Elapsed 1m 36s (remain 8m 34s) Loss: 0.4505(0.5038) Grad: 52000.4375  LR: 0.00001541  
Epoch: [2][120/638] Elapsed 1m 55s (remain 8m 14s) Loss: 0.3638(0.4951) Grad: 101025.1797  LR: 0.00001510  
Epoch: [2][140/638] Elapsed 2m 14s (remain 7m 55s) Loss: 0.5099(0.4978) Grad: 55850.9414  LR: 0.00001479  
Epoch: [2][160/638] Elapsed 2m 33s (remain 7m 35s) Loss: 0.3869(0.5009) Grad: 122367.3

Epoch 2 - avg_train_loss: 0.4776  avg_val_loss: 0.4920  time: 753s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4776  avg_val_loss: 0.4920  time: 753s
Epoch 2 - Score: 0.5001  Scores: [0.4563724140893895, 0.5437784964515058]
INFO:__main__:Epoch 2 - Score: 0.5001  Scores: [0.4563724140893895, 0.5437784964515058]
Epoch 2 - Save Best Score: 0.5001 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5001 Model


EVAL: [128/129] Elapsed 2m 23s (remain 0m 0s) Loss: 0.4570(0.4920) 
Epoch: [3][0/638] Elapsed 0m 1s (remain 12m 53s) Loss: 0.3140(0.3140) Grad: inf  LR: 0.00000603  
Epoch: [3][20/638] Elapsed 0m 20s (remain 9m 55s) Loss: 0.6198(0.4035) Grad: 74967.1172  LR: 0.00000570  
Epoch: [3][40/638] Elapsed 0m 39s (remain 9m 33s) Loss: 0.3309(0.4129) Grad: 86514.4453  LR: 0.00000537  
Epoch: [3][60/638] Elapsed 0m 58s (remain 9m 13s) Loss: 0.3009(0.4068) Grad: 80261.1797  LR: 0.00000505  
Epoch: [3][80/638] Elapsed 1m 17s (remain 8m 53s) Loss: 0.4690(0.4109) Grad: 235836.3594  LR: 0.00000474  
Epoch: [3][100/638] Elapsed 1m 36s (remain 8m 32s) Loss: 0.5462(0.4178) Grad: 67710.5859  LR: 0.00000443  
Epoch: [3][120/638] Elapsed 1m 55s (remain 8m 13s) Loss: 0.3145(0.4207) Grad: 71801.2109  LR: 0.00000413  
Epoch: [3][140/638] Elapsed 2m 14s (remain 7m 54s) Loss: 0.5648(0.4189) Grad: 63995.2461  LR: 0.00000384  
Epoch: [3][160/638] Elapsed 2m 33s (remain 7m 35s) Loss: 0.5275(0.4172) Grad: 88775.7422

Epoch 3 - avg_train_loss: 0.4044  avg_val_loss: 0.4546  time: 753s
INFO:__main__:Epoch 3 - avg_train_loss: 0.4044  avg_val_loss: 0.4546  time: 753s
Epoch 3 - Score: 0.4636  Scores: [0.4142017583793012, 0.5130377705589038]
INFO:__main__:Epoch 3 - Score: 0.4636  Scores: [0.4142017583793012, 0.5130377705589038]
Epoch 3 - Save Best Score: 0.4636 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4636 Model


EVAL: [128/129] Elapsed 2m 23s (remain 0m 0s) Loss: 0.3908(0.4546) 


Score: 0.4636  Scores: [0.4142017583793012, 0.5130377705589038]
INFO:__main__:Score: 0.4636  Scores: [0.4142017583793012, 0.5130377705589038]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size

Epoch: [1][0/644] Elapsed 0m 1s (remain 13m 8s) Loss: 1.1447(1.1447) Grad: inf  LR: 0.00000010  
Epoch: [1][20/644] Elapsed 0m 20s (remain 10m 2s) Loss: 0.5419(1.0952) Grad: 128770.7266  LR: 0.00000218  
Epoch: [1][40/644] Elapsed 0m 39s (remain 9m 38s) Loss: 0.6212(1.0517) Grad: 102766.7734  LR: 0.00000425  
Epoch: [1][60/644] Elapsed 0m 58s (remain 9m 18s) Loss: 1.0342(1.0164) Grad: 146498.9531  LR: 0.00000632  
Epoch: [1][80/644] Elapsed 1m 17s (remain 8m 59s) Loss: 1.1921(0.9803) Grad: 1295099.1250  LR: 0.00000839  
Epoch: [1][100/644] Elapsed 1m 36s (remain 8m 40s) Loss: 0.5622(0.9566) Grad: 1848592.8750  LR: 0.00001047  
Epoch: [1][120/644] Elapsed 1m 55s (remain 8m 20s) Loss: 0.8650(0.9088) Grad: 1390417.2500  LR: 0.00001254  
Epoch: [1][140/644] Elapsed 2m 14s (remain 8m 0s) Loss: 0.8542(0.8802) Grad: 1002350.6250  LR: 0.00001461  
Epoch: [1][160/644] Elapsed 2m 33s (remain 7m 41s) Loss: 0.4301(0.8537) Grad: 611889.9375  LR: 0.00001668  
Epoch: [1][180/644] Elapsed 2m 53s (rema

Epoch 1 - avg_train_loss: 0.6276  avg_val_loss: 0.7696  time: 759s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6276  avg_val_loss: 0.7696  time: 759s
Epoch 1 - Score: 0.7819  Scores: [0.620771975481113, 0.9429453064209558]
INFO:__main__:Epoch 1 - Score: 0.7819  Scores: [0.620771975481113, 0.9429453064209558]
Epoch 1 - Save Best Score: 0.7819 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7819 Model


EVAL: [125/126] Elapsed 2m 23s (remain 0m 0s) Loss: 0.6346(0.7696) 
Epoch: [2][0/644] Elapsed 0m 1s (remain 13m 11s) Loss: 0.6878(0.6878) Grad: inf  LR: 0.00001685  
Epoch: [2][20/644] Elapsed 0m 20s (remain 10m 1s) Loss: 0.5908(0.5286) Grad: 95538.5078  LR: 0.00001658  
Epoch: [2][40/644] Elapsed 0m 39s (remain 9m 39s) Loss: 0.5683(0.5007) Grad: 86240.6719  LR: 0.00001631  
Epoch: [2][60/644] Elapsed 0m 58s (remain 9m 19s) Loss: 0.3387(0.4909) Grad: 129774.2656  LR: 0.00001602  
Epoch: [2][80/644] Elapsed 1m 17s (remain 8m 59s) Loss: 0.3878(0.4747) Grad: 26776.8301  LR: 0.00001573  
Epoch: [2][100/644] Elapsed 1m 36s (remain 8m 40s) Loss: 0.4897(0.4778) Grad: 130081.5391  LR: 0.00001543  
Epoch: [2][120/644] Elapsed 1m 55s (remain 8m 21s) Loss: 0.6060(0.4848) Grad: 155081.1250  LR: 0.00001512  
Epoch: [2][140/644] Elapsed 2m 15s (remain 8m 1s) Loss: 0.4785(0.4759) Grad: 67172.3281  LR: 0.00001481  
Epoch: [2][160/644] Elapsed 2m 33s (remain 7m 41s) Loss: 0.5274(0.4712) Grad: 139200.87

Epoch 2 - avg_train_loss: 0.4522  avg_val_loss: 0.5562  time: 758s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4522  avg_val_loss: 0.5562  time: 758s
Epoch 2 - Score: 0.5668  Scores: [0.5120275815362262, 0.6216015226680706]
INFO:__main__:Epoch 2 - Score: 0.5668  Scores: [0.5120275815362262, 0.6216015226680706]
Epoch 2 - Save Best Score: 0.5668 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5668 Model


EVAL: [125/126] Elapsed 2m 23s (remain 0m 0s) Loss: 0.4063(0.5562) 
Epoch: [3][0/644] Elapsed 0m 1s (remain 13m 3s) Loss: 0.3388(0.3388) Grad: inf  LR: 0.00000603  
Epoch: [3][20/644] Elapsed 0m 20s (remain 10m 2s) Loss: 0.2857(0.3904) Grad: 93235.1172  LR: 0.00000570  
Epoch: [3][40/644] Elapsed 0m 39s (remain 9m 40s) Loss: 0.3440(0.3921) Grad: 71455.8281  LR: 0.00000538  
Epoch: [3][60/644] Elapsed 0m 58s (remain 9m 20s) Loss: 0.3419(0.3874) Grad: 77528.2812  LR: 0.00000506  
Epoch: [3][80/644] Elapsed 1m 17s (remain 9m 0s) Loss: 0.3266(0.3816) Grad: 110419.2734  LR: 0.00000475  
Epoch: [3][100/644] Elapsed 1m 36s (remain 8m 40s) Loss: 0.3652(0.3812) Grad: 97408.5078  LR: 0.00000445  
Epoch: [3][120/644] Elapsed 1m 55s (remain 8m 20s) Loss: 0.2333(0.3815) Grad: 38735.8594  LR: 0.00000415  
Epoch: [3][140/644] Elapsed 2m 15s (remain 8m 1s) Loss: 0.3388(0.3810) Grad: 102466.4922  LR: 0.00000386  
Epoch: [3][160/644] Elapsed 2m 34s (remain 7m 42s) Loss: 0.3803(0.3807) Grad: 100417.9375 

Epoch 3 - avg_train_loss: 0.3714  avg_val_loss: 0.4992  time: 758s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3714  avg_val_loss: 0.4992  time: 758s
Epoch 3 - Score: 0.5098  Scores: [0.4368407311072702, 0.5827776890856115]
INFO:__main__:Epoch 3 - Score: 0.5098  Scores: [0.4368407311072702, 0.5827776890856115]
Epoch 3 - Save Best Score: 0.5098 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5098 Model


EVAL: [125/126] Elapsed 2m 23s (remain 0m 0s) Loss: 0.2821(0.4992) 


Score: 0.5098  Scores: [0.4368407311072702, 0.5827776890856115]
INFO:__main__:Score: 0.5098  Scores: [0.4368407311072702, 0.5827776890856115]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size

Epoch: [1][0/646] Elapsed 0m 0s (remain 10m 31s) Loss: 1.2212(1.2212) Grad: inf  LR: 0.00000010  
Epoch: [1][20/646] Elapsed 0m 17s (remain 8m 39s) Loss: 1.0362(1.1059) Grad: 134395.8281  LR: 0.00000218  
Epoch: [1][40/646] Elapsed 0m 33s (remain 8m 20s) Loss: 0.8986(1.0938) Grad: 145497.6250  LR: 0.00000425  
Epoch: [1][60/646] Elapsed 0m 50s (remain 8m 6s) Loss: 1.0158(1.0847) Grad: 148990.1562  LR: 0.00000632  
Epoch: [1][80/646] Elapsed 1m 7s (remain 7m 48s) Loss: 0.8733(1.0438) Grad: 195638.2656  LR: 0.00000839  
Epoch: [1][100/646] Elapsed 1m 23s (remain 7m 32s) Loss: 0.5360(0.9877) Grad: 499724.0625  LR: 0.00001047  
Epoch: [1][120/646] Elapsed 1m 39s (remain 7m 13s) Loss: 0.6082(0.9261) Grad: 256313.6562  LR: 0.00001254  
Epoch: [1][140/646] Elapsed 1m 56s (remain 6m 56s) Loss: 0.7217(0.8870) Grad: 650705.0625  LR: 0.00001461  
Epoch: [1][160/646] Elapsed 2m 12s (remain 6m 39s) Loss: 1.0437(0.8688) Grad: 415284.2188  LR: 0.00001668  
Epoch: [1][180/646] Elapsed 2m 29s (remain 6

Epoch 1 - avg_train_loss: 0.6291  avg_val_loss: 0.5762  time: 682s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6291  avg_val_loss: 0.5762  time: 682s
Epoch 1 - Score: 0.5840  Scores: [0.6345501725076598, 0.533399471895208]
INFO:__main__:Epoch 1 - Score: 0.5840  Scores: [0.6345501725076598, 0.533399471895208]
Epoch 1 - Save Best Score: 0.5840 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5840 Model


EVAL: [124/125] Elapsed 2m 36s (remain 0m 0s) Loss: 0.4932(0.5762) 
Epoch: [2][0/646] Elapsed 0m 1s (remain 11m 46s) Loss: 0.6355(0.6355) Grad: inf  LR: 0.00001684  
Epoch: [2][20/646] Elapsed 0m 17s (remain 8m 28s) Loss: 0.4103(0.5345) Grad: 110244.0000  LR: 0.00001657  
Epoch: [2][40/646] Elapsed 0m 32s (remain 8m 4s) Loss: 0.5430(0.5176) Grad: 187120.1250  LR: 0.00001630  
Epoch: [2][60/646] Elapsed 0m 49s (remain 7m 51s) Loss: 0.3712(0.5138) Grad: 75158.3281  LR: 0.00001602  
Epoch: [2][80/646] Elapsed 1m 5s (remain 7m 36s) Loss: 0.7367(0.5136) Grad: 182314.3906  LR: 0.00001572  
Epoch: [2][100/646] Elapsed 1m 21s (remain 7m 20s) Loss: 0.3655(0.5089) Grad: 70281.9609  LR: 0.00001542  
Epoch: [2][120/646] Elapsed 1m 38s (remain 7m 5s) Loss: 0.3049(0.4981) Grad: 84260.1562  LR: 0.00001512  
Epoch: [2][140/646] Elapsed 1m 54s (remain 6m 49s) Loss: 0.4008(0.4918) Grad: 51424.3125  LR: 0.00001481  
Epoch: [2][160/646] Elapsed 2m 10s (remain 6m 32s) Loss: 0.3671(0.4913) Grad: 121193.2812

Epoch 2 - avg_train_loss: 0.4626  avg_val_loss: 0.4991  time: 682s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4626  avg_val_loss: 0.4991  time: 682s
Epoch 2 - Score: 0.5079  Scores: [0.43044144007941115, 0.585284801772327]
INFO:__main__:Epoch 2 - Score: 0.5079  Scores: [0.43044144007941115, 0.585284801772327]
Epoch 2 - Save Best Score: 0.5079 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5079 Model


EVAL: [124/125] Elapsed 2m 36s (remain 0m 0s) Loss: 0.4856(0.4991) 
Epoch: [3][0/646] Elapsed 0m 1s (remain 11m 50s) Loss: 0.5323(0.5323) Grad: inf  LR: 0.00000602  
Epoch: [3][20/646] Elapsed 0m 17s (remain 8m 34s) Loss: 0.3489(0.4040) Grad: 87065.6719  LR: 0.00000569  
Epoch: [3][40/646] Elapsed 0m 33s (remain 8m 14s) Loss: 0.3604(0.3855) Grad: 107629.1719  LR: 0.00000537  
Epoch: [3][60/646] Elapsed 0m 50s (remain 8m 0s) Loss: 0.3021(0.3870) Grad: 104349.3359  LR: 0.00000505  
Epoch: [3][80/646] Elapsed 1m 6s (remain 7m 42s) Loss: 0.2740(0.3844) Grad: 44464.6133  LR: 0.00000474  
Epoch: [3][100/646] Elapsed 1m 22s (remain 7m 26s) Loss: 0.4380(0.3869) Grad: 117302.6875  LR: 0.00000444  
Epoch: [3][120/646] Elapsed 1m 38s (remain 7m 8s) Loss: 0.2677(0.3875) Grad: 116527.5469  LR: 0.00000414  
Epoch: [3][140/646] Elapsed 1m 54s (remain 6m 50s) Loss: 0.4630(0.3873) Grad: 73726.7422  LR: 0.00000386  
Epoch: [3][160/646] Elapsed 2m 10s (remain 6m 32s) Loss: 0.4583(0.3863) Grad: 104477.679

Epoch 3 - avg_train_loss: 0.3829  avg_val_loss: 0.4455  time: 683s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3829  avg_val_loss: 0.4455  time: 683s
Epoch 3 - Score: 0.4537  Scores: [0.3951309021594487, 0.5123606191049076]
INFO:__main__:Epoch 3 - Score: 0.4537  Scores: [0.3951309021594487, 0.5123606191049076]
Epoch 3 - Save Best Score: 0.4537 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4537 Model


EVAL: [124/125] Elapsed 2m 36s (remain 0m 0s) Loss: 0.4131(0.4455) 
