# About this notebook
- This notebook is a modified version of the PyTorch pipeline from Y.Nakama's starter NLP notebook from Feedback Prize 3 competition [here](https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train). Don't forget to upvote his work!
- Inference notebook is [here](https://www.kaggle.com/mohammad2012191/debertav3-pytorch-baseline-inference-cv-0-467)

In [1]:
!nvidia-smi

Wed Aug 30 17:17:45 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from google.colab import runtime



# CFG

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    exp='exp049'
    is_exp=False
    wandb=False
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="microsoft/deberta-v3-large"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.98)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['content', 'wording']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    awp=False
    nth_awp_start_epoch= 3
    adv_lr = 1e-4
    adv_eps = 1e-2
    eval_steps = 150
    save_strategy='step'
    pooling='WeightedLayerPooling'
    freeze=True
    freeze_top_num_layer=10

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

# Directory settings

In [5]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = f'/content/drive/MyDrive/Kaggle/outputs/{CFG.exp}/'

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


In [6]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:

    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [7]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install -q transformers')
!pip install transformers==4.31.0
os.system('pip install -q tokenizers')
!pip install tokenizers==0.13.3
!pip install sentencepiece


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers==4.31.0)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [

# Utils

In [8]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(seed=42)

# Data Loading

In [9]:
# ====================================================
# Data Loading
# ====================================================
input_path = '/content/drive/MyDrive/Kaggle/inputs/'
train = pd.read_csv(input_path+'summaries_train.csv')
test = pd.read_csv(input_path+'summaries_test.csv')
submission = pd.read_csv(input_path+'sample_submission.csv')
prompt_train = pd.read_csv(input_path+'prompts_train.csv')
prompt_test = pd.read_csv(input_path+'prompts_test.csv')
train = pd.merge(train,prompt_train,how='left',on='prompt_id')
test = pd.merge(test,prompt_test,how='left',on='prompt_id')
print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (7165, 8)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


test.shape: (4, 6)


Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...
1,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...
2,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...


submission.shape: (4, 3)


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [10]:
# oof_df=pd.read_pickle(input_path+'oof_df.pkl')

In [11]:
train['text'] =  train['prompt_question'] + ' [SEP] ' + train['text']
test['text'] =  test['prompt_question'] + '[SEP]' + test['text']


#################################################
# prompt_textも
#################################################

# # "text"列の長さを計算して新しい列"length"に追加
# train['length'] = train['text'].apply(len)
# # "text"列の先頭に"length"列の値を結合
# train['text'] = train['length'].astype(str) + '[SEP]' + train['prompt_question'] + '[SEP]' +train['prompt_title'] + 'summary(' + train['text'] +') [SEP] source of summary('+train['prompt_text']+')'

# # "text"列の長さを計算して新しい列"length"に追加
# test['length'] = test['text'].apply(len)
# # "text"列の先頭に"length"列の値を結合
# test['text'] = test['length'].astype(str) + '[SEP]' + test['prompt_question'] + '[SEP]' +test['prompt_title'] + 'summary(' + test['text'] +') [SEP] source of summary('+test['prompt_text']+')'


# CV split

In [12]:
# ====================================================
# CV split
# ====================================================
# Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
#     train.loc[val_index, 'fold'] = int(n)
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

train["fold"] = train["prompt_id"].map(id2fold)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    1103
1    2057
2    2009
3    1996
dtype: int64

In [13]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=3000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [14]:
CFG.model

'microsoft/deberta-v3-large'

In [15]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
train['text'].iloc[8]

'Summarize at least 3 elements of an ideal tragedy, as described by Aristotle. [SEP] 1 element of an ideal tragedy is that it should be arranged on a complex plan.  Another element of an ideal tragedy is that it should only have one main issue. The last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.'

In [17]:
# テキストをエンコード
text = 'unnko'
encoded = tokenizer(text, return_tensors='pt')

# デコードして元のテキストを取得

decoded_tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
decoded_text = " ".join(decoded_tokens)

print(f"Original text: {text}")
print(f"Encoded: {encoded}")
print(f"Decoded text: {decoded_text}")

Original text: unnko
Encoded: {'input_ids': tensor([[   1, 1655,  673, 4712,    2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Decoded text: [CLS] ▁un n ko [SEP]


# Dataset

In [18]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 2 # cls & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 854
INFO:__main__:max_len: 854


In [19]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label


def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [20]:
#ref:https://github.com/shu421/kagglib/blob/main/nlp/model.py
# ====================================================
# Model
# ====================================================

def freeze(module):
    """
    Freezes module's parameters.
    """

    for parameter in module.parameters():
        parameter.requires_grad = False
# =====================================================
# Pooling
# =====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings



class AttentionPooling(nn.Module):
    """
    Usage:
        self.pool = AttentionPooling(self.config.hidden_size)
    """
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

        self._init_weights(self.attention)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask == 0] = float("-inf")
        w = torch.softmax(w, 1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings



class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
            torch.tensor([1] * (num_hidden_layers + 1 - layer_start), dtype=torch.float)
        )

    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor * all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average[:, 0]

class LSTMPooling(nn.Module):
    def __init__(self, num_layers, hidden_size, hiddendim_lstm, dropout_rate, is_lstm=True):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm

        if is_lstm:
            self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        else:
            self.lstm = nn.GRU(self.hidden_size, self.hiddendim_lstm, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, all_hidden_states):
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers + 1)], dim=-1)
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out, _ = self.lstm(hidden_states, None)
        out = self.dropout(out[:, -1, :])
        return out

class ConcatPooling(nn.Module):
    def __init__(self, n_layers=4):
        super(ConcatPooling, self, ).__init__()

        self.n_layers = n_layers

    def forward(self, all_hidden_states):
        concatenate_pooling = torch.cat([all_hidden_states[-(i + 1)] for i in range(self.n_layers)], -1)
        concatenate_pooling = concatenate_pooling[:, 0]
        return concatenate_pooling

# GeM
class GeMText(nn.Module):
    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super(GeMText, self).__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1
        # x seeems last hidden state

    def forward(self, x, attention_mask):
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(x.shape)
        x = (x.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret

# ===========================================
# custom Model
# ===========================================

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        if cfg.pooling =='LSTMPooling':
            self.pool =  LSTMPooling(self.config.num_hidden_layers,
                                       self.config.hidden_size,
                                       self.cfg.hidden_size,
                                       0.1,
                                       is_lstm=True
                           )
            self.fc = nn.Linear(self.cfg.hidden_size, 2)
        elif cfg.pooling == "GeM":
            self.pool = GeMText()
            self.fc = nn.Linear(self.config.hidden_size, 2)
        elif cfg.pooling=='ConcatPooling':
            self.pool = ConcatPooling(n_layers=cfg.n_layers)
            self.fc = nn.Linear(cfg.n_layers*self.config.hidden_size, 2)
        elif cfg.pooling=='WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
            self.fc = nn.Linear(self.config.hidden_size, 2)


        self._init_weights(self.fc)


        # Freeze
        if self.cfg.freeze:
            freeze(self.model.encoder.layer[:self.cfg.freeze_top_num_layer])

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        if self.cfg.pooling=='MeanPooling':
          feature = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.cfg.pooling in ['GRUPooling', 'LSTMPooling']:
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.cfg.pooling=='GeM':
          feature = self.pool(last_hidden_states, inputs['attention_mask'])
        elif self.cfg.pooling == 'ConcatPooling':
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        elif self.cfg.pooling == 'WeightedLayerPooling':
            all_hidden_states = torch.stack(outputs[1])
            feature = self.pool(all_hidden_states)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output


# initialize layer
def reinit_bert(model):
    """_summary_

    Args:
        model (AutoModel): _description_

    Returns:
        model (AutoModel): _description_

    Usage:
        model = reinit_bert(model)
    """
    for layer in model.model.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

# Loss

In [21]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss



class WeightedSmoothL1Loss(nn.Module):
    def __init__(self,weights = torch.tensor([0.5, 1.2], device = device )):
        super(WeightedSmoothL1Loss, self).__init__()
        self.weights=weights

    def forward(self, inputs, targets):
        """
        inputs: ネットワークの出力 (予測値)
        targets: 正解ラベル
        weights: 各サンプルに対する重み
        """
        # Smooth L1 損失を計算
        loss = nn.SmoothL1Loss(reduction='none')(inputs, targets)

        # 重みを適用して損失を計算
        weighted_loss = torch.mean(loss * self.weights)

        return weighted_loss


class MCRMSELoss(nn.Module):
    def __init__(self):
        super(MCRMSELoss, self).__init__()

    def forward(self, y_true, y_pred):
        colwise_mse = torch.mean(torch.square(y_true - y_pred), dim=0)
        return torch.mean(torch.sqrt(colwise_mse), dim=0)

# AWP

In [22]:
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    def __init__(
        self,
        model: Module,
        criterion: _Loss,
        optimizer: Optimizer,
        apex: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.apex = apex
        self.backup = {}
        self.backup_eps = {}

    def attack_backward(self, inputs: dict, label: Tensor) -> Tensor:
        with torch.cuda.amp.autocast(enabled=self.apex):
            self._save()
            self._attack_step() # モデルを近傍の悪い方へ改変
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # 直前に損失関数に通してパラメータの勾配を取得できるようにしておく必要あり
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(
                            param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self) -> None:
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self) -> None:
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

# Helpler functions

In [23]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):

    if CFG.awp and epoch+1 >= CFG.nth_awp_start_epoch:
        LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    awp = AWP(
            model,
            criterion,
            optimizer,
            CFG.apex,
            adv_lr=CFG.adv_lr,
            adv_eps=CFG.adv_eps
        )
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if CFG.awp and CFG.nth_awp_start_epoch <= epoch+1:
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions





# def train_fn_by_step(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, now_step):

#     # if CFG.awp and epoch+1 >= CFG.nth_awp_start_epoch:
#     #     LOGGER.info(f'AWP training with epoch {epoch+1}')
#     model.train()
#     # awp = AWP(
#     #         model,
#     #         criterion,
#     #         optimizer,
#     #         CFG.apex,
#     #         adv_lr=CFG.adv_lr,
#     #         adv_eps=CFG.adv_eps
#     #     )
#     if now_step==0:
#       scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
#       losses = AverageMeter()
#       start = end = time.time()
#       global_step = 0
#     for step, (inputs, labels) in enumerate(train_loader):
#         if now_step>step:
#           continue
#         inputs = collate(inputs)
#         for k, v in inputs.items():
#             inputs[k] = v.to(device)
#         labels = labels.to(device)
#         batch_size = labels.size(0)
#         with torch.cuda.amp.autocast(enabled=CFG.apex):
#             y_preds = model(inputs)
#             loss = criterion(y_preds, labels)
#         if CFG.gradient_accumulation_steps > 1:
#             loss = loss / CFG.gradient_accumulation_steps
#         losses.update(loss.item(), batch_size)
#         scaler.scale(loss).backward()
#         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

#         # if CFG.awp and CFG.nth_awp_start_epoch <= epoch+1:
#         #     loss = awp.attack_backward(inputs, labels)
#         #     scaler.scale(loss).backward()
#         #     awp._restore()

#         if (step + 1) % CFG.gradient_accumulation_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#             global_step += 1
#             if CFG.batch_scheduler:
#                 scheduler.step()
#         end = time.time()
#         if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
#             print('Epoch: [{0}][{1}/{2}] '
#                   'Elapsed {remain:s} '
#                   'Loss: {loss.val:.4f}({loss.avg:.4f}) '
#                   'Grad: {grad_norm:.4f}  '
#                   'LR: {lr:.8f}  '
#                   .format(epoch+1, step, len(train_loader),
#                           remain=timeSince(start, float(step+1)/len(train_loader)),
#                           loss=losses,
#                           grad_norm=grad_norm,
#                           lr=scheduler.get_lr()[0]))

#         if CFG.wandb:
#             wandb.log({f"[fold{fold}] loss": losses.val,
#                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
#         if step%CFG.eval_steps==0:
#           return losses.avg, step+1 ,epoch

#     return losses.avg, step+1 ,epoch+1




# train loop

In [24]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds












# ====================================================
# train loop by steps
# ====================================================
def train_loop_steps(folds, fold):

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(cfg.num_warmup_steps_rate*num_train_steps), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        model.train()
        scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
        losses = AverageMeter()
        start = end = time.time()
        global_step = 0
        for step, (inputs, labels) in enumerate(train_loader):

            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            batch_size = labels.size(0)
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                y_preds = model(inputs)
                loss = criterion(y_preds, labels)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            scaler.scale(loss).backward()
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                global_step += 1
                if CFG.batch_scheduler:
                    scheduler.step()
            end = time.time()
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.8f}  '
                      .format(epoch+1, step, len(train_loader),
                              remain=timeSince(start, float(step+1)/len(train_loader)),
                              loss=losses,
                              grad_norm=grad_norm,
                              lr=scheduler.get_lr()[0]))
            if CFG.wandb:
                wandb.log({f"[fold{fold}] loss": losses.val,
                          f"[fold{fold}] lr": scheduler.get_lr()[0]})

            if (step % CFG.eval_steps==0 and step!=0) or step == (len(train_loader)-1):

                  # valid
                  losses_val = AverageMeter()
                  model.eval()
                  preds = []

                  for val_step, (inputs, labels) in enumerate(valid_loader):

                      inputs = collate(inputs)
                      for k, v in inputs.items():
                          inputs[k] = v.to(device)
                      labels = labels.to(device)
                      batch_size = labels.size(0)
                      with torch.no_grad():
                          y_preds = model(inputs)
                          loss = criterion(y_preds, labels)
                      if CFG.gradient_accumulation_steps > 1:
                          loss = loss / CFG.gradient_accumulation_steps
                      losses_val.update(loss.item(), batch_size)
                      preds.append(y_preds.to('cpu').numpy())

                      if val_step % CFG.print_freq == 0 or val_step == (len(valid_loader)-1):
                          print('EVAL: [{0}/{1}] '
                                'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                                .format(val_step, len(valid_loader),
                                        loss=losses_val))
                  predictions = np.concatenate(preds)

                  # scoring
                  score, scores = get_score(valid_labels, predictions)

                  elapsed = time.time() - start_time



                  if best_score > score:
                      best_score = score
                      LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                      torch.save({'model': model.state_dict(),
                                  'predictions': predictions},
                                  OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")


                  LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {losses.avg:.4f}  avg_val_loss: {losses_val.avg:.4f}')
                  LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
                  # if CFG.wandb:
                  #     wandb.log({f"[fold{fold}] epoch": epoch+1,
                  #                f"[fold{fold}] avg_train_loss": avg_loss,
                  #                f"[fold{fold}] avg_val_loss": avg_val_loss,
                  #                f"[fold{fold}] score": score})
                  model.train()

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds


In [25]:
if __name__ == '__main__':

    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                if CFG.save_strategy=='epoch':
                  _oof_df = train_loop(train, fold)
                elif CFG.save_strategy=='step':
                  _oof_df = train_loop_steps(train,fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

    if CFG.wandb:
        wandb.finish()
    runtime.unassign()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch: [1][0/757] Elapsed 0m 4s (remain 53m 37s) Loss: 1.1463(1.1463) Grad: 63597.6680  LR: 0.00000007  
Epoch: [1][20/757] Elapsed 0m 9s (remain 5m 20s) Loss: 0.8650(0.8809) Grad: 27126.3691  LR: 0.00000139  
Epoch: [1][40/757] Elapsed 0m 13s (remain 4m 4s) Loss: 0.9485(0.9176) Grad: 32774.6211  LR: 0.00000271  
Epoch: [1][60/757] Elapsed 0m 19s (remain 3m 47s) Loss: 0.8111(0.9158) Grad: 30405.8652  LR: 0.00000403  
Epoch: [1][80/757] Elapsed 0m 25s (remain 3m 32s) Loss: 1.0701(0.9456) Grad: 112035.7812  LR: 0.00000535  
Epoch: [1][100/757] Elapsed 0m 30s (remain 3m 15s) Loss: 0.6550(0.9227) Grad: 83293.9453  LR: 0.00000667  
Epoch: [1][120/757] Elapsed 0m 36s (remain 3m 10s) Loss: 0.7023(0.9001) Grad: 203740.5469  LR: 0.00000799  
Epoch: [1][140/757] Elapsed 0m 42s (remain 3m 4s) Loss: 0.6583(0.8723) Grad: 323125.2812  LR: 0.00000931  
EVAL: [0/69] Loss: 0.9941(0.9941) 
EVAL: [20/69] Loss: 0.7428(0.8712) 
EVAL: [40/69] Loss: 0.7343(0.8366) 
EVAL: [60/69] Loss: 0.7729(0.8378) 
EVAL: [

Epoch 1 - Save Best Score: 0.8690 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8690 Model
Epoch 1 - avg_train_loss: 0.8613  avg_val_loss: 0.8533
INFO:__main__:Epoch 1 - avg_train_loss: 0.8613  avg_val_loss: 0.8533
Epoch 1 - Score: 0.8690  Scores: [0.7164752783966943, 1.021437404239216]
INFO:__main__:Epoch 1 - Score: 0.8690  Scores: [0.7164752783966943, 1.021437404239216]


Epoch: [1][160/757] Elapsed 1m 22s (remain 5m 5s) Loss: 0.5790(0.8443) Grad: 367940.5312  LR: 0.00001063  
Epoch: [1][180/757] Elapsed 1m 28s (remain 4m 42s) Loss: 0.7998(0.8251) Grad: 182253.2812  LR: 0.00001195  
Epoch: [1][200/757] Elapsed 1m 34s (remain 4m 22s) Loss: 0.4624(0.7951) Grad: 198299.7500  LR: 0.00001327  
Epoch: [1][220/757] Elapsed 1m 45s (remain 4m 15s) Loss: 0.6800(0.7743) Grad: 223041.1250  LR: 0.00001459  
Epoch: [1][240/757] Elapsed 1m 51s (remain 3m 59s) Loss: 0.4729(0.7620) Grad: 127046.5391  LR: 0.00001591  
Epoch: [1][260/757] Elapsed 1m 59s (remain 3m 47s) Loss: 0.4593(0.7460) Grad: 448137.5000  LR: 0.00001723  
Epoch: [1][280/757] Elapsed 2m 4s (remain 3m 31s) Loss: 0.8430(0.7287) Grad: 238456.4844  LR: 0.00001855  
Epoch: [1][300/757] Elapsed 2m 11s (remain 3m 19s) Loss: 0.6211(0.7158) Grad: 387751.3750  LR: 0.00001987  
EVAL: [0/69] Loss: 0.8970(0.8970) 
EVAL: [20/69] Loss: 0.6133(0.7030) 
EVAL: [40/69] Loss: 0.5894(0.6819) 
EVAL: [60/69] Loss: 0.5718(0.67

Epoch 1 - Save Best Score: 0.7048 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7048 Model
Epoch 1 - avg_train_loss: 0.7158  avg_val_loss: 0.6891
INFO:__main__:Epoch 1 - avg_train_loss: 0.7158  avg_val_loss: 0.6891
Epoch 1 - Score: 0.7048  Scores: [0.5332546108952564, 0.8763199725511567]
INFO:__main__:Epoch 1 - Score: 0.7048  Scores: [0.5332546108952564, 0.8763199725511567]


Epoch: [1][320/757] Elapsed 2m 43s (remain 3m 42s) Loss: 0.4216(0.7030) Grad: 397181.9062  LR: 0.00002000  
Epoch: [1][340/757] Elapsed 2m 51s (remain 3m 29s) Loss: 1.0218(0.6956) Grad: 411840.2188  LR: 0.00001999  
Epoch: [1][360/757] Elapsed 2m 58s (remain 3m 15s) Loss: 0.4491(0.6886) Grad: 186960.3281  LR: 0.00001998  
Epoch: [1][380/757] Elapsed 3m 5s (remain 3m 2s) Loss: 0.5659(0.6791) Grad: 234192.1250  LR: 0.00001996  
Epoch: [1][400/757] Elapsed 3m 11s (remain 2m 50s) Loss: 0.4269(0.6712) Grad: 48446.5938  LR: 0.00001994  
Epoch: [1][420/757] Elapsed 3m 18s (remain 2m 38s) Loss: 0.5390(0.6647) Grad: 152273.2031  LR: 0.00001991  
Epoch: [1][440/757] Elapsed 3m 24s (remain 2m 26s) Loss: 0.5173(0.6584) Grad: 124573.8828  LR: 0.00001987  
EVAL: [0/69] Loss: 0.9203(0.9203) 
EVAL: [20/69] Loss: 0.6490(0.7187) 
EVAL: [40/69] Loss: 0.6612(0.6980) 
EVAL: [60/69] Loss: 0.5700(0.6927) 


Epoch 1 - avg_train_loss: 0.6559  avg_val_loss: 0.6995
INFO:__main__:Epoch 1 - avg_train_loss: 0.6559  avg_val_loss: 0.6995
Epoch 1 - Score: 0.7150  Scores: [0.557217551190583, 0.8727893331819139]
INFO:__main__:Epoch 1 - Score: 0.7150  Scores: [0.557217551190583, 0.8727893331819139]


EVAL: [68/69] Loss: 0.9315(0.6995) 
Epoch: [1][460/757] Elapsed 3m 49s (remain 2m 27s) Loss: 0.4435(0.6528) Grad: 95535.1875  LR: 0.00001983  
Epoch: [1][480/757] Elapsed 3m 56s (remain 2m 15s) Loss: 0.5903(0.6494) Grad: 67998.7266  LR: 0.00001979  
Epoch: [1][500/757] Elapsed 4m 1s (remain 2m 3s) Loss: 0.4139(0.6419) Grad: 137358.8438  LR: 0.00001974  
Epoch: [1][520/757] Elapsed 4m 6s (remain 1m 51s) Loss: 0.3883(0.6375) Grad: 45082.9336  LR: 0.00001969  
Epoch: [1][540/757] Elapsed 4m 13s (remain 1m 41s) Loss: 0.4675(0.6333) Grad: 64361.7695  LR: 0.00001963  
Epoch: [1][560/757] Elapsed 4m 18s (remain 1m 30s) Loss: 0.5416(0.6297) Grad: 84999.8203  LR: 0.00001956  
Epoch: [1][580/757] Elapsed 4m 25s (remain 1m 20s) Loss: 0.3925(0.6233) Grad: 72841.7969  LR: 0.00001949  
Epoch: [1][600/757] Elapsed 4m 30s (remain 1m 10s) Loss: 0.4730(0.6173) Grad: 42709.4922  LR: 0.00001942  
EVAL: [0/69] Loss: 0.8978(0.8978) 
EVAL: [20/69] Loss: 0.6306(0.7256) 
EVAL: [40/69] Loss: 0.6586(0.6986) 
EVA

Epoch 1 - avg_train_loss: 0.6173  avg_val_loss: 0.7053
INFO:__main__:Epoch 1 - avg_train_loss: 0.6173  avg_val_loss: 0.7053
Epoch 1 - Score: 0.7223  Scores: [0.657672807866929, 0.786906371699982]
INFO:__main__:Epoch 1 - Score: 0.7223  Scores: [0.657672807866929, 0.786906371699982]


EVAL: [68/69] Loss: 0.9778(0.7053) 
Epoch: [1][620/757] Elapsed 4m 55s (remain 1m 4s) Loss: 0.6993(0.6131) Grad: 112384.2734  LR: 0.00001934  
Epoch: [1][640/757] Elapsed 5m 1s (remain 0m 54s) Loss: 0.4373(0.6112) Grad: 66444.7656  LR: 0.00001925  
Epoch: [1][660/757] Elapsed 5m 6s (remain 0m 44s) Loss: 0.4395(0.6074) Grad: 64340.5312  LR: 0.00001916  
Epoch: [1][680/757] Elapsed 5m 12s (remain 0m 34s) Loss: 0.4791(0.6034) Grad: 71523.1406  LR: 0.00001907  
Epoch: [1][700/757] Elapsed 5m 18s (remain 0m 25s) Loss: 0.4242(0.6002) Grad: 57264.7500  LR: 0.00001897  
Epoch: [1][720/757] Elapsed 5m 23s (remain 0m 16s) Loss: 0.2573(0.5973) Grad: 46940.9297  LR: 0.00001886  
Epoch: [1][740/757] Elapsed 5m 29s (remain 0m 7s) Loss: 0.5277(0.5950) Grad: 153766.0625  LR: 0.00001875  
EVAL: [0/69] Loss: 0.8735(0.8735) 
EVAL: [20/69] Loss: 0.5162(0.6261) 
EVAL: [40/69] Loss: 0.5701(0.6049) 
EVAL: [60/69] Loss: 0.5013(0.6058) 


Epoch 1 - Save Best Score: 0.6315 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6315 Model


EVAL: [68/69] Loss: 0.8068(0.6156) 


Epoch 1 - avg_train_loss: 0.5936  avg_val_loss: 0.6156
INFO:__main__:Epoch 1 - avg_train_loss: 0.5936  avg_val_loss: 0.6156
Epoch 1 - Score: 0.6315  Scores: [0.5526865118427391, 0.7103367321414578]
INFO:__main__:Epoch 1 - Score: 0.6315  Scores: [0.5526865118427391, 0.7103367321414578]


Epoch: [1][756/757] Elapsed 6m 0s (remain 0m 0s) Loss: 0.5058(0.5930) Grad: 126659.8203  LR: 0.00001866  
EVAL: [0/69] Loss: 0.8573(0.8573) 
EVAL: [20/69] Loss: 0.4841(0.5885) 
EVAL: [40/69] Loss: 0.5443(0.5690) 
EVAL: [60/69] Loss: 0.4686(0.5698) 
EVAL: [68/69] Loss: 0.7782(0.5788) 


Epoch 1 - Save Best Score: 0.5969 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5969 Model
Epoch 1 - avg_train_loss: 0.5930  avg_val_loss: 0.5788
INFO:__main__:Epoch 1 - avg_train_loss: 0.5930  avg_val_loss: 0.5788
Epoch 1 - Score: 0.5969  Scores: [0.49988321645878453, 0.6938940400432257]
INFO:__main__:Epoch 1 - Score: 0.5969  Scores: [0.49988321645878453, 0.6938940400432257]


Epoch: [2][0/757] Elapsed 0m 0s (remain 6m 27s) Loss: 0.5168(0.5168) Grad: 252402.4062  LR: 0.00001866  
Epoch: [2][20/757] Elapsed 0m 6s (remain 3m 33s) Loss: 0.7902(0.4990) Grad: 218653.1250  LR: 0.00001854  
Epoch: [2][40/757] Elapsed 0m 12s (remain 3m 38s) Loss: 0.2702(0.4841) Grad: 109572.5312  LR: 0.00001842  
Epoch: [2][60/757] Elapsed 0m 18s (remain 3m 31s) Loss: 0.6805(0.4807) Grad: 145643.5469  LR: 0.00001829  
Epoch: [2][80/757] Elapsed 0m 27s (remain 3m 49s) Loss: 0.4192(0.4792) Grad: 156166.3906  LR: 0.00001816  
Epoch: [2][100/757] Elapsed 0m 34s (remain 3m 43s) Loss: 0.5348(0.4720) Grad: 107061.0859  LR: 0.00001803  
Epoch: [2][120/757] Elapsed 0m 41s (remain 3m 39s) Loss: 0.3716(0.4717) Grad: 138717.3281  LR: 0.00001789  
Epoch: [2][140/757] Elapsed 0m 47s (remain 3m 27s) Loss: 0.5357(0.4679) Grad: 246114.0000  LR: 0.00001774  
EVAL: [0/69] Loss: 0.8977(0.8977) 
EVAL: [20/69] Loss: 0.6353(0.7435) 
EVAL: [40/69] Loss: 0.7180(0.7139) 
EVAL: [60/69] Loss: 0.6100(0.7160) 


Epoch 2 - avg_train_loss: 0.4693  avg_val_loss: 0.7276
INFO:__main__:Epoch 2 - avg_train_loss: 0.4693  avg_val_loss: 0.7276
Epoch 2 - Score: 0.7447  Scores: [0.6509311623050532, 0.8384382097800415]
INFO:__main__:Epoch 2 - Score: 0.7447  Scores: [0.6509311623050532, 0.8384382097800415]


EVAL: [68/69] Loss: 1.0272(0.7276) 
Epoch: [2][160/757] Elapsed 1m 12s (remain 4m 26s) Loss: 0.4273(0.4681) Grad: 185303.0312  LR: 0.00001760  
Epoch: [2][180/757] Elapsed 1m 17s (remain 4m 7s) Loss: 0.3382(0.4642) Grad: 84453.6953  LR: 0.00001744  
Epoch: [2][200/757] Elapsed 1m 22s (remain 3m 49s) Loss: 0.4029(0.4606) Grad: 156454.4062  LR: 0.00001729  
Epoch: [2][220/757] Elapsed 1m 28s (remain 3m 35s) Loss: 0.4611(0.4585) Grad: 127112.4766  LR: 0.00001713  
Epoch: [2][240/757] Elapsed 1m 35s (remain 3m 24s) Loss: 0.5075(0.4595) Grad: 159133.7188  LR: 0.00001696  
Epoch: [2][260/757] Elapsed 1m 40s (remain 3m 11s) Loss: 0.4472(0.4582) Grad: 177394.1719  LR: 0.00001680  
Epoch: [2][280/757] Elapsed 1m 47s (remain 3m 1s) Loss: 0.5129(0.4577) Grad: 200536.4219  LR: 0.00001663  
Epoch: [2][300/757] Elapsed 1m 52s (remain 2m 50s) Loss: 0.4704(0.4554) Grad: 121457.0547  LR: 0.00001645  
EVAL: [0/69] Loss: 0.8437(0.8437) 
EVAL: [20/69] Loss: 0.5372(0.6563) 
EVAL: [40/69] Loss: 0.6367(0.627

Epoch 2 - avg_train_loss: 0.4554  avg_val_loss: 0.6400
INFO:__main__:Epoch 2 - avg_train_loss: 0.4554  avg_val_loss: 0.6400
Epoch 2 - Score: 0.6566  Scores: [0.6007101621551143, 0.7125052335895963]
INFO:__main__:Epoch 2 - Score: 0.6566  Scores: [0.6007101621551143, 0.7125052335895963]


EVAL: [68/69] Loss: 0.8967(0.6400) 
Epoch: [2][320/757] Elapsed 2m 18s (remain 3m 8s) Loss: 0.4078(0.4571) Grad: 127105.8203  LR: 0.00001627  
Epoch: [2][340/757] Elapsed 2m 24s (remain 2m 56s) Loss: 0.3483(0.4540) Grad: 100574.4375  LR: 0.00001609  
Epoch: [2][360/757] Elapsed 2m 30s (remain 2m 44s) Loss: 0.3291(0.4520) Grad: 148442.5469  LR: 0.00001591  
Epoch: [2][380/757] Elapsed 2m 36s (remain 2m 34s) Loss: 0.4409(0.4532) Grad: 182757.1406  LR: 0.00001572  
Epoch: [2][400/757] Elapsed 2m 41s (remain 2m 23s) Loss: 0.3983(0.4515) Grad: 100152.1250  LR: 0.00001553  
Epoch: [2][420/757] Elapsed 2m 47s (remain 2m 13s) Loss: 0.5747(0.4514) Grad: 238634.9062  LR: 0.00001534  
Epoch: [2][440/757] Elapsed 2m 53s (remain 2m 4s) Loss: 0.5018(0.4508) Grad: 153275.0156  LR: 0.00001514  
EVAL: [0/69] Loss: 0.8902(0.8902) 
EVAL: [20/69] Loss: 0.5203(0.6302) 
EVAL: [40/69] Loss: 0.5615(0.6007) 
EVAL: [60/69] Loss: 0.4478(0.5996) 


Epoch 2 - avg_train_loss: 0.4511  avg_val_loss: 0.6067
INFO:__main__:Epoch 2 - avg_train_loss: 0.4511  avg_val_loss: 0.6067
Epoch 2 - Score: 0.6260  Scores: [0.5321503569386071, 0.7197927748846576]
INFO:__main__:Epoch 2 - Score: 0.6260  Scores: [0.5321503569386071, 0.7197927748846576]


EVAL: [68/69] Loss: 0.8251(0.6067) 
Epoch: [2][460/757] Elapsed 3m 18s (remain 2m 7s) Loss: 0.6044(0.4515) Grad: 165401.6094  LR: 0.00001494  
Epoch: [2][480/757] Elapsed 3m 24s (remain 1m 57s) Loss: 0.6539(0.4521) Grad: 234470.4688  LR: 0.00001474  
Epoch: [2][500/757] Elapsed 3m 29s (remain 1m 47s) Loss: 0.3905(0.4520) Grad: 137631.3906  LR: 0.00001454  
Epoch: [2][520/757] Elapsed 3m 34s (remain 1m 37s) Loss: 0.3380(0.4496) Grad: 153667.1250  LR: 0.00001433  
Epoch: [2][540/757] Elapsed 3m 40s (remain 1m 28s) Loss: 0.5073(0.4501) Grad: 201833.6406  LR: 0.00001412  
Epoch: [2][560/757] Elapsed 3m 46s (remain 1m 19s) Loss: 0.5699(0.4492) Grad: 143436.4688  LR: 0.00001391  
Epoch: [2][580/757] Elapsed 3m 51s (remain 1m 10s) Loss: 0.2996(0.4488) Grad: 92488.0078  LR: 0.00001370  
Epoch: [2][600/757] Elapsed 3m 57s (remain 1m 1s) Loss: 0.5210(0.4478) Grad: 124086.3281  LR: 0.00001348  
EVAL: [0/69] Loss: 0.8667(0.8667) 
EVAL: [20/69] Loss: 0.5532(0.6348) 
EVAL: [40/69] Loss: 0.6033(0.610

Epoch 2 - avg_train_loss: 0.4478  avg_val_loss: 0.6155
INFO:__main__:Epoch 2 - avg_train_loss: 0.4478  avg_val_loss: 0.6155
Epoch 2 - Score: 0.6331  Scores: [0.52738674741329, 0.7387341434565343]
INFO:__main__:Epoch 2 - Score: 0.6331  Scores: [0.52738674741329, 0.7387341434565343]


EVAL: [68/69] Loss: 0.8427(0.6155) 
Epoch: [2][620/757] Elapsed 4m 24s (remain 0m 57s) Loss: 0.3002(0.4472) Grad: 97043.0703  LR: 0.00001327  
Epoch: [2][640/757] Elapsed 4m 30s (remain 0m 48s) Loss: 0.3871(0.4472) Grad: 114645.1172  LR: 0.00001305  
Epoch: [2][660/757] Elapsed 4m 35s (remain 0m 40s) Loss: 0.6687(0.4480) Grad: 148150.3438  LR: 0.00001283  
Epoch: [2][680/757] Elapsed 4m 41s (remain 0m 31s) Loss: 0.4918(0.4475) Grad: 126665.8125  LR: 0.00001261  
Epoch: [2][700/757] Elapsed 4m 47s (remain 0m 22s) Loss: 0.4502(0.4467) Grad: 87920.9531  LR: 0.00001238  
Epoch: [2][720/757] Elapsed 4m 52s (remain 0m 14s) Loss: 0.3317(0.4458) Grad: 87959.8750  LR: 0.00001216  
Epoch: [2][740/757] Elapsed 4m 57s (remain 0m 6s) Loss: 0.4132(0.4452) Grad: 293203.0938  LR: 0.00001193  
EVAL: [0/69] Loss: 0.8524(0.8524) 
EVAL: [20/69] Loss: 0.4303(0.5670) 
EVAL: [40/69] Loss: 0.5525(0.5452) 
EVAL: [60/69] Loss: 0.4466(0.5476) 


Epoch 2 - Save Best Score: 0.5735 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5735 Model


EVAL: [68/69] Loss: 0.7247(0.5543) 


Epoch 2 - avg_train_loss: 0.4454  avg_val_loss: 0.5543
INFO:__main__:Epoch 2 - avg_train_loss: 0.4454  avg_val_loss: 0.5543
Epoch 2 - Score: 0.5735  Scores: [0.48149582321810896, 0.665427150938156]
INFO:__main__:Epoch 2 - Score: 0.5735  Scores: [0.48149582321810896, 0.665427150938156]


Epoch: [2][756/757] Elapsed 5m 30s (remain 0m 0s) Loss: 0.4215(0.4456) Grad: 113171.6719  LR: 0.00001175  
EVAL: [0/69] Loss: 0.8515(0.8515) 
EVAL: [20/69] Loss: 0.4767(0.6024) 
EVAL: [40/69] Loss: 0.5963(0.5787) 
EVAL: [60/69] Loss: 0.4679(0.5801) 
EVAL: [68/69] Loss: 0.8038(0.5880) 


Epoch 2 - avg_train_loss: 0.4456  avg_val_loss: 0.5880
INFO:__main__:Epoch 2 - avg_train_loss: 0.4456  avg_val_loss: 0.5880
Epoch 2 - Score: 0.6065  Scores: [0.5433626891665906, 0.6697079869767975]
INFO:__main__:Epoch 2 - Score: 0.6065  Scores: [0.5433626891665906, 0.6697079869767975]


Epoch: [3][0/757] Elapsed 0m 0s (remain 10m 58s) Loss: 0.5786(0.5786) Grad: 339134.4375  LR: 0.00001174  
Epoch: [3][20/757] Elapsed 0m 6s (remain 3m 52s) Loss: 0.2466(0.3940) Grad: 172336.0000  LR: 0.00001151  
Epoch: [3][40/757] Elapsed 0m 12s (remain 3m 44s) Loss: 0.3134(0.4042) Grad: 109751.8672  LR: 0.00001129  
Epoch: [3][60/757] Elapsed 0m 19s (remain 3m 41s) Loss: 0.2735(0.4073) Grad: 79562.0938  LR: 0.00001106  
Epoch: [3][80/757] Elapsed 0m 25s (remain 3m 29s) Loss: 0.4766(0.3992) Grad: 150773.6875  LR: 0.00001083  
Epoch: [3][100/757] Elapsed 0m 31s (remain 3m 22s) Loss: 0.4796(0.4014) Grad: 151170.9531  LR: 0.00001060  
Epoch: [3][120/757] Elapsed 0m 37s (remain 3m 16s) Loss: 0.3967(0.4000) Grad: 135905.3438  LR: 0.00001037  
Epoch: [3][140/757] Elapsed 0m 44s (remain 3m 15s) Loss: 0.6618(0.3993) Grad: 412600.9375  LR: 0.00001014  
EVAL: [0/69] Loss: 0.8352(0.8352) 
EVAL: [20/69] Loss: 0.5002(0.6103) 
EVAL: [40/69] Loss: 0.5807(0.5909) 
EVAL: [60/69] Loss: 0.4572(0.5908) 


Epoch 3 - avg_train_loss: 0.3973  avg_val_loss: 0.5995
INFO:__main__:Epoch 3 - avg_train_loss: 0.3973  avg_val_loss: 0.5995
Epoch 3 - Score: 0.6177  Scores: [0.5486011501720685, 0.6867976726337676]
INFO:__main__:Epoch 3 - Score: 0.6177  Scores: [0.5486011501720685, 0.6867976726337676]


EVAL: [68/69] Loss: 0.8161(0.5995) 
Epoch: [3][160/757] Elapsed 1m 9s (remain 4m 18s) Loss: 0.5755(0.3980) Grad: 129712.7188  LR: 0.00000991  
Epoch: [3][180/757] Elapsed 1m 15s (remain 3m 59s) Loss: 0.4653(0.3967) Grad: 474109.5938  LR: 0.00000968  
Epoch: [3][200/757] Elapsed 1m 21s (remain 3m 45s) Loss: 0.3906(0.3941) Grad: 173653.3594  LR: 0.00000945  
Epoch: [3][220/757] Elapsed 1m 26s (remain 3m 30s) Loss: 0.3747(0.3920) Grad: 149967.1562  LR: 0.00000922  
Epoch: [3][240/757] Elapsed 1m 32s (remain 3m 17s) Loss: 0.2871(0.3898) Grad: 194153.4062  LR: 0.00000899  
Epoch: [3][260/757] Elapsed 1m 38s (remain 3m 7s) Loss: 0.4090(0.3879) Grad: 184306.6719  LR: 0.00000876  
Epoch: [3][280/757] Elapsed 1m 44s (remain 2m 57s) Loss: 0.4058(0.3858) Grad: 241695.6094  LR: 0.00000853  
Epoch: [3][300/757] Elapsed 1m 51s (remain 2m 48s) Loss: 0.4964(0.3876) Grad: 176862.4688  LR: 0.00000830  
EVAL: [0/69] Loss: 0.8736(0.8736) 
EVAL: [20/69] Loss: 0.5573(0.6509) 
EVAL: [40/69] Loss: 0.6280(0.62

Epoch 3 - avg_train_loss: 0.3876  avg_val_loss: 0.6353
INFO:__main__:Epoch 3 - avg_train_loss: 0.3876  avg_val_loss: 0.6353
Epoch 3 - Score: 0.6534  Scores: [0.5925759252795179, 0.7142130073101776]
INFO:__main__:Epoch 3 - Score: 0.6534  Scores: [0.5925759252795179, 0.7142130073101776]


EVAL: [68/69] Loss: 0.8800(0.6353) 
Epoch: [3][320/757] Elapsed 2m 15s (remain 3m 3s) Loss: 0.2408(0.3878) Grad: 113969.5547  LR: 0.00000808  
Epoch: [3][340/757] Elapsed 2m 20s (remain 2m 51s) Loss: 0.2583(0.3851) Grad: 170670.9375  LR: 0.00000785  
Epoch: [3][360/757] Elapsed 2m 26s (remain 2m 41s) Loss: 0.4507(0.3864) Grad: 117525.0547  LR: 0.00000763  
Epoch: [3][380/757] Elapsed 2m 32s (remain 2m 30s) Loss: 0.5017(0.3882) Grad: 200107.5625  LR: 0.00000740  
Epoch: [3][400/757] Elapsed 2m 37s (remain 2m 20s) Loss: 0.2290(0.3872) Grad: 138535.7812  LR: 0.00000718  
Epoch: [3][420/757] Elapsed 2m 43s (remain 2m 10s) Loss: 0.5199(0.3861) Grad: 123822.4688  LR: 0.00000696  
Epoch: [3][440/757] Elapsed 2m 48s (remain 2m 0s) Loss: 0.3246(0.3853) Grad: 156654.2031  LR: 0.00000674  
EVAL: [0/69] Loss: 0.8646(0.8646) 
EVAL: [20/69] Loss: 0.5065(0.6199) 
EVAL: [40/69] Loss: 0.6078(0.5945) 
EVAL: [60/69] Loss: 0.4763(0.5968) 
EVAL: [68/69] Loss: 0.8149(0.6059) 


Epoch 3 - avg_train_loss: 0.3849  avg_val_loss: 0.6059
INFO:__main__:Epoch 3 - avg_train_loss: 0.3849  avg_val_loss: 0.6059
Epoch 3 - Score: 0.6250  Scores: [0.5817075410654944, 0.6682111679234529]
INFO:__main__:Epoch 3 - Score: 0.6250  Scores: [0.5817075410654944, 0.6682111679234529]


Epoch: [3][460/757] Elapsed 3m 14s (remain 2m 4s) Loss: 0.3956(0.3844) Grad: 105348.7656  LR: 0.00000653  
Epoch: [3][480/757] Elapsed 3m 20s (remain 1m 54s) Loss: 0.4242(0.3837) Grad: 154085.8125  LR: 0.00000631  
Epoch: [3][500/757] Elapsed 3m 25s (remain 1m 45s) Loss: 0.3580(0.3823) Grad: 128526.1172  LR: 0.00000610  
Epoch: [3][520/757] Elapsed 3m 31s (remain 1m 35s) Loss: 0.4069(0.3810) Grad: 157958.9688  LR: 0.00000589  
Epoch: [3][540/757] Elapsed 3m 36s (remain 1m 26s) Loss: 0.2831(0.3804) Grad: 144651.5938  LR: 0.00000568  
Epoch: [3][560/757] Elapsed 3m 42s (remain 1m 17s) Loss: 0.2990(0.3802) Grad: 151354.7031  LR: 0.00000547  
Epoch: [3][580/757] Elapsed 3m 48s (remain 1m 9s) Loss: 0.4997(0.3802) Grad: 162887.8750  LR: 0.00000527  
Epoch: [3][600/757] Elapsed 3m 53s (remain 1m 0s) Loss: 0.4555(0.3808) Grad: 166189.7656  LR: 0.00000507  
EVAL: [0/69] Loss: 0.8854(0.8854) 
EVAL: [20/69] Loss: 0.5386(0.6525) 
EVAL: [40/69] Loss: 0.6387(0.6233) 
EVAL: [60/69] Loss: 0.5105(0.625

Epoch 3 - avg_train_loss: 0.3808  avg_val_loss: 0.6354
INFO:__main__:Epoch 3 - avg_train_loss: 0.3808  avg_val_loss: 0.6354
Epoch 3 - Score: 0.6543  Scores: [0.6230523658492573, 0.6854740534541637]
INFO:__main__:Epoch 3 - Score: 0.6543  Scores: [0.6230523658492573, 0.6854740534541637]


EVAL: [68/69] Loss: 0.8807(0.6354) 
Epoch: [3][620/757] Elapsed 4m 18s (remain 0m 56s) Loss: 0.2325(0.3805) Grad: 183335.2031  LR: 0.00000487  
Epoch: [3][640/757] Elapsed 4m 23s (remain 0m 47s) Loss: 0.4265(0.3792) Grad: 169185.0781  LR: 0.00000467  
Epoch: [3][660/757] Elapsed 4m 30s (remain 0m 39s) Loss: 0.3994(0.3781) Grad: 136020.7344  LR: 0.00000448  
Epoch: [3][680/757] Elapsed 4m 36s (remain 0m 30s) Loss: 0.1988(0.3777) Grad: 189989.2500  LR: 0.00000429  
Epoch: [3][700/757] Elapsed 4m 41s (remain 0m 22s) Loss: 0.4531(0.3769) Grad: 153663.7188  LR: 0.00000410  
Epoch: [3][720/757] Elapsed 4m 47s (remain 0m 14s) Loss: 0.4405(0.3763) Grad: 178934.9844  LR: 0.00000392  
Epoch: [3][740/757] Elapsed 4m 54s (remain 0m 6s) Loss: 0.5770(0.3760) Grad: 145105.5312  LR: 0.00000373  
EVAL: [0/69] Loss: 0.8693(0.8693) 
EVAL: [20/69] Loss: 0.4964(0.6147) 
EVAL: [40/69] Loss: 0.6034(0.5854) 
EVAL: [60/69] Loss: 0.4845(0.5894) 


Epoch 3 - avg_train_loss: 0.3762  avg_val_loss: 0.5990
INFO:__main__:Epoch 3 - avg_train_loss: 0.3762  avg_val_loss: 0.5990
Epoch 3 - Score: 0.6183  Scores: [0.5756407668369824, 0.6609564631348109]
INFO:__main__:Epoch 3 - Score: 0.6183  Scores: [0.5756407668369824, 0.6609564631348109]


EVAL: [68/69] Loss: 0.8173(0.5990) 
Epoch: [3][756/757] Elapsed 5m 17s (remain 0m 0s) Loss: 0.4010(0.3761) Grad: 109989.9297  LR: 0.00000359  
EVAL: [0/69] Loss: 0.8705(0.8705) 
EVAL: [20/69] Loss: 0.4883(0.6068) 
EVAL: [40/69] Loss: 0.5950(0.5777) 
EVAL: [60/69] Loss: 0.4839(0.5825) 
EVAL: [68/69] Loss: 0.7991(0.5916) 


Epoch 3 - avg_train_loss: 0.3761  avg_val_loss: 0.5916
INFO:__main__:Epoch 3 - avg_train_loss: 0.3761  avg_val_loss: 0.5916
Epoch 3 - Score: 0.6111  Scores: [0.5663952180658962, 0.6557326096125788]
INFO:__main__:Epoch 3 - Score: 0.6111  Scores: [0.5663952180658962, 0.6557326096125788]


Epoch: [4][0/757] Elapsed 0m 0s (remain 9m 32s) Loss: 0.2129(0.2129) Grad: 247868.8281  LR: 0.00000358  
Epoch: [4][20/757] Elapsed 0m 6s (remain 3m 50s) Loss: 0.2966(0.3191) Grad: 126021.3906  LR: 0.00000341  
Epoch: [4][40/757] Elapsed 0m 11s (remain 3m 29s) Loss: 0.3091(0.3124) Grad: 167833.4219  LR: 0.00000324  
Epoch: [4][60/757] Elapsed 0m 17s (remain 3m 21s) Loss: 0.2387(0.3115) Grad: 113496.2109  LR: 0.00000307  
Epoch: [4][80/757] Elapsed 0m 23s (remain 3m 20s) Loss: 0.2322(0.3145) Grad: 135509.0625  LR: 0.00000290  
Epoch: [4][100/757] Elapsed 0m 28s (remain 3m 8s) Loss: 0.4040(0.3169) Grad: 127746.9297  LR: 0.00000274  
Epoch: [4][120/757] Elapsed 0m 35s (remain 3m 4s) Loss: 0.3910(0.3196) Grad: 127493.5078  LR: 0.00000259  
Epoch: [4][140/757] Elapsed 0m 40s (remain 2m 57s) Loss: 0.1775(0.3222) Grad: 111982.0156  LR: 0.00000243  
EVAL: [0/69] Loss: 0.8802(0.8802) 
EVAL: [20/69] Loss: 0.5212(0.6383) 
EVAL: [40/69] Loss: 0.6342(0.6078) 
EVAL: [60/69] Loss: 0.5052(0.6121) 
EVA

Epoch 4 - avg_train_loss: 0.3217  avg_val_loss: 0.6223
INFO:__main__:Epoch 4 - avg_train_loss: 0.3217  avg_val_loss: 0.6223
Epoch 4 - Score: 0.6416  Scores: [0.611837781000285, 0.6714403949999157]
INFO:__main__:Epoch 4 - Score: 0.6416  Scores: [0.611837781000285, 0.6714403949999157]


Epoch: [4][160/757] Elapsed 1m 6s (remain 4m 6s) Loss: 0.4825(0.3231) Grad: 656421.0625  LR: 0.00000229  
Epoch: [4][180/757] Elapsed 1m 12s (remain 3m 50s) Loss: 0.4013(0.3226) Grad: 199242.2344  LR: 0.00000214  
Epoch: [4][200/757] Elapsed 1m 18s (remain 3m 36s) Loss: 0.3052(0.3219) Grad: 147404.0781  LR: 0.00000200  
Epoch: [4][220/757] Elapsed 1m 24s (remain 3m 24s) Loss: 0.2941(0.3196) Grad: 125048.2812  LR: 0.00000187  
Epoch: [4][240/757] Elapsed 1m 29s (remain 3m 10s) Loss: 0.3679(0.3174) Grad: 146572.4531  LR: 0.00000173  
Epoch: [4][260/757] Elapsed 1m 35s (remain 3m 0s) Loss: 0.2815(0.3176) Grad: 185021.3281  LR: 0.00000161  
Epoch: [4][280/757] Elapsed 1m 41s (remain 2m 51s) Loss: 0.3473(0.3174) Grad: 122989.6641  LR: 0.00000148  
Epoch: [4][300/757] Elapsed 1m 46s (remain 2m 41s) Loss: 0.3238(0.3153) Grad: 164093.4531  LR: 0.00000136  
EVAL: [0/69] Loss: 0.8608(0.8608) 
EVAL: [20/69] Loss: 0.4965(0.6124) 
EVAL: [40/69] Loss: 0.6034(0.5836) 
EVAL: [60/69] Loss: 0.4676(0.587

Epoch 4 - avg_train_loss: 0.3153  avg_val_loss: 0.5978
INFO:__main__:Epoch 4 - avg_train_loss: 0.3153  avg_val_loss: 0.5978
Epoch 4 - Score: 0.6174  Scores: [0.5681581826828506, 0.6667076324511186]
INFO:__main__:Epoch 4 - Score: 0.6174  Scores: [0.5681581826828506, 0.6667076324511186]


EVAL: [68/69] Loss: 0.8293(0.5978) 
Epoch: [4][320/757] Elapsed 2m 11s (remain 2m 59s) Loss: 0.2555(0.3139) Grad: 149636.2500  LR: 0.00000125  
Epoch: [4][340/757] Elapsed 2m 17s (remain 2m 47s) Loss: 0.2657(0.3130) Grad: 152974.7500  LR: 0.00000114  
Epoch: [4][360/757] Elapsed 2m 22s (remain 2m 36s) Loss: 0.4055(0.3115) Grad: 357484.6875  LR: 0.00000104  
Epoch: [4][380/757] Elapsed 2m 28s (remain 2m 26s) Loss: 0.3296(0.3137) Grad: 174717.3594  LR: 0.00000094  
Epoch: [4][400/757] Elapsed 2m 33s (remain 2m 16s) Loss: 0.5152(0.3138) Grad: 262084.8750  LR: 0.00000084  
Epoch: [4][420/757] Elapsed 2m 39s (remain 2m 7s) Loss: 0.3570(0.3150) Grad: 166546.8750  LR: 0.00000075  
Epoch: [4][440/757] Elapsed 2m 45s (remain 1m 58s) Loss: 0.3233(0.3163) Grad: 142850.5312  LR: 0.00000067  
EVAL: [0/69] Loss: 0.8667(0.8667) 
EVAL: [20/69] Loss: 0.5056(0.6159) 
EVAL: [40/69] Loss: 0.6061(0.5866) 
EVAL: [60/69] Loss: 0.4768(0.5905) 
EVAL: [68/69] Loss: 0.8242(0.6006) 


Epoch 4 - avg_train_loss: 0.3156  avg_val_loss: 0.6006
INFO:__main__:Epoch 4 - avg_train_loss: 0.3156  avg_val_loss: 0.6006
Epoch 4 - Score: 0.6200  Scores: [0.5805037609241446, 0.6595112559119941]
INFO:__main__:Epoch 4 - Score: 0.6200  Scores: [0.5805037609241446, 0.6595112559119941]


Epoch: [4][460/757] Elapsed 3m 12s (remain 2m 3s) Loss: 0.2534(0.3164) Grad: 154021.9375  LR: 0.00000059  
Epoch: [4][480/757] Elapsed 3m 18s (remain 1m 53s) Loss: 0.3260(0.3172) Grad: 168818.4531  LR: 0.00000051  
Epoch: [4][500/757] Elapsed 3m 23s (remain 1m 43s) Loss: 0.4190(0.3180) Grad: 231219.3594  LR: 0.00000044  
Epoch: [4][520/757] Elapsed 3m 28s (remain 1m 34s) Loss: 0.3746(0.3183) Grad: 152433.1406  LR: 0.00000038  
Epoch: [4][540/757] Elapsed 3m 34s (remain 1m 25s) Loss: 0.3192(0.3182) Grad: 123543.2578  LR: 0.00000032  
Epoch: [4][560/757] Elapsed 3m 39s (remain 1m 16s) Loss: 0.3071(0.3184) Grad: 194678.6250  LR: 0.00000026  
Epoch: [4][580/757] Elapsed 3m 45s (remain 1m 8s) Loss: 0.4569(0.3172) Grad: 209729.5312  LR: 0.00000021  
Epoch: [4][600/757] Elapsed 3m 50s (remain 0m 59s) Loss: 0.2223(0.3161) Grad: 167224.8594  LR: 0.00000017  
EVAL: [0/69] Loss: 0.8678(0.8678) 
EVAL: [20/69] Loss: 0.5015(0.6095) 
EVAL: [40/69] Loss: 0.5960(0.5805) 
EVAL: [60/69] Loss: 0.4738(0.58

Epoch 4 - avg_train_loss: 0.3161  avg_val_loss: 0.5946
INFO:__main__:Epoch 4 - avg_train_loss: 0.3161  avg_val_loss: 0.5946
Epoch 4 - Score: 0.6142  Scores: [0.5718562468382759, 0.6565676950234491]
INFO:__main__:Epoch 4 - Score: 0.6142  Scores: [0.5718562468382759, 0.6565676950234491]


EVAL: [68/69] Loss: 0.8109(0.5946) 
Epoch: [4][620/757] Elapsed 4m 16s (remain 0m 56s) Loss: 0.3904(0.3168) Grad: 155934.4531  LR: 0.00000013  
Epoch: [4][640/757] Elapsed 4m 22s (remain 0m 47s) Loss: 0.3869(0.3166) Grad: 145376.3906  LR: 0.00000009  
Epoch: [4][660/757] Elapsed 4m 27s (remain 0m 38s) Loss: 0.2782(0.3174) Grad: 130140.5078  LR: 0.00000006  
Epoch: [4][680/757] Elapsed 4m 33s (remain 0m 30s) Loss: 0.3283(0.3177) Grad: 268085.9062  LR: 0.00000004  
Epoch: [4][700/757] Elapsed 4m 39s (remain 0m 22s) Loss: 0.1847(0.3175) Grad: 101461.4453  LR: 0.00000002  
Epoch: [4][720/757] Elapsed 4m 44s (remain 0m 14s) Loss: 0.3006(0.3178) Grad: 145512.4688  LR: 0.00000001  
Epoch: [4][740/757] Elapsed 4m 50s (remain 0m 6s) Loss: 0.2712(0.3164) Grad: 144805.3281  LR: 0.00000000  
EVAL: [0/69] Loss: 0.8657(0.8657) 
EVAL: [20/69] Loss: 0.4966(0.6061) 
EVAL: [40/69] Loss: 0.5928(0.5772) 
EVAL: [60/69] Loss: 0.4714(0.5817) 


Epoch 4 - avg_train_loss: 0.3161  avg_val_loss: 0.5914
INFO:__main__:Epoch 4 - avg_train_loss: 0.3161  avg_val_loss: 0.5914
Epoch 4 - Score: 0.6111  Scores: [0.5669719201489786, 0.6551370311846765]
INFO:__main__:Epoch 4 - Score: 0.6111  Scores: [0.5669719201489786, 0.6551370311846765]


EVAL: [68/69] Loss: 0.8060(0.5914) 
Epoch: [4][756/757] Elapsed 5m 14s (remain 0m 0s) Loss: 0.3663(0.3159) Grad: 270550.5938  LR: 0.00000000  
EVAL: [0/69] Loss: 0.8657(0.8657) 
EVAL: [20/69] Loss: 0.4966(0.6061) 
EVAL: [40/69] Loss: 0.5928(0.5772) 
EVAL: [60/69] Loss: 0.4714(0.5817) 


Epoch 4 - avg_train_loss: 0.3159  avg_val_loss: 0.5914
INFO:__main__:Epoch 4 - avg_train_loss: 0.3159  avg_val_loss: 0.5914
Epoch 4 - Score: 0.6111  Scores: [0.5669721989815152, 0.6551371839573391]
INFO:__main__:Epoch 4 - Score: 0.6111  Scores: [0.5669721989815152, 0.6551371839573391]


EVAL: [68/69] Loss: 0.8060(0.5914) 


Score: 0.5735  Scores: [0.48149582321810896, 0.665427150938156]
INFO:__main__:Score: 0.5735  Scores: [0.48149582321810896, 0.665427150938156]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size

Epoch: [1][0/638] Elapsed 0m 0s (remain 6m 9s) Loss: 0.6775(0.6775) Grad: 147021.8281  LR: 0.00000008  
Epoch: [1][20/638] Elapsed 0m 6s (remain 3m 18s) Loss: 0.9411(1.0478) Grad: 72253.2422  LR: 0.00000165  
Epoch: [1][40/638] Elapsed 0m 12s (remain 2m 58s) Loss: 1.1893(1.0262) Grad: 19641.7539  LR: 0.00000322  
Epoch: [1][60/638] Elapsed 0m 17s (remain 2m 43s) Loss: 1.2682(1.0201) Grad: 75572.5000  LR: 0.00000478  
Epoch: [1][80/638] Elapsed 0m 23s (remain 2m 38s) Loss: 0.9728(0.9785) Grad: 95628.6562  LR: 0.00000635  
Epoch: [1][100/638] Elapsed 0m 28s (remain 2m 33s) Loss: 0.5084(0.9385) Grad: 94973.8281  LR: 0.00000792  
Epoch: [1][120/638] Elapsed 0m 34s (remain 2m 25s) Loss: 0.7229(0.8997) Grad: 587409.1250  LR: 0.00000949  
Epoch: [1][140/638] Elapsed 0m 40s (remain 2m 21s) Loss: 0.5457(0.8647) Grad: 352663.9375  LR: 0.00001106  
EVAL: [0/129] Loss: 0.6024(0.6024) 
EVAL: [20/129] Loss: 0.6840(0.7461) 
EVAL: [40/129] Loss: 0.5859(0.7225) 
EVAL: [60/129] Loss: 0.7697(0.7231) 
EVA

Epoch 1 - Save Best Score: 0.7371 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7371 Model


EVAL: [128/129] Loss: 0.6908(0.7238) 


Epoch 1 - avg_train_loss: 0.8527  avg_val_loss: 0.7238
INFO:__main__:Epoch 1 - avg_train_loss: 0.8527  avg_val_loss: 0.7238
Epoch 1 - Score: 0.7371  Scores: [0.6864722394306362, 0.7877491185828133]
INFO:__main__:Epoch 1 - Score: 0.7371  Scores: [0.6864722394306362, 0.7877491185828133]


Epoch: [1][160/638] Elapsed 1m 27s (remain 4m 20s) Loss: 0.6406(0.8445) Grad: 242621.5781  LR: 0.00001263  
Epoch: [1][180/638] Elapsed 1m 33s (remain 3m 56s) Loss: 0.7017(0.8176) Grad: 302763.4062  LR: 0.00001420  
Epoch: [1][200/638] Elapsed 1m 40s (remain 3m 37s) Loss: 0.8017(0.7991) Grad: 110477.0781  LR: 0.00001576  
Epoch: [1][220/638] Elapsed 1m 47s (remain 3m 23s) Loss: 0.6986(0.7826) Grad: 233782.7969  LR: 0.00001733  
Epoch: [1][240/638] Elapsed 1m 53s (remain 3m 6s) Loss: 0.4189(0.7691) Grad: 145487.5625  LR: 0.00001890  
Epoch: [1][260/638] Elapsed 1m 59s (remain 2m 53s) Loss: 0.6130(0.7546) Grad: 516895.3125  LR: 0.00002000  
Epoch: [1][280/638] Elapsed 2m 8s (remain 2m 43s) Loss: 0.6912(0.7446) Grad: 247200.9219  LR: 0.00001999  
Epoch: [1][300/638] Elapsed 2m 13s (remain 2m 29s) Loss: 0.9458(0.7329) Grad: 229191.2188  LR: 0.00001998  
EVAL: [0/129] Loss: 0.5332(0.5332) 
EVAL: [20/129] Loss: 0.5322(0.5756) 
EVAL: [40/129] Loss: 0.4938(0.5580) 
EVAL: [60/129] Loss: 0.6248(

Epoch 1 - Save Best Score: 0.5744 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5744 Model


EVAL: [128/129] Loss: 0.4902(0.5596) 


Epoch 1 - avg_train_loss: 0.7329  avg_val_loss: 0.5596
INFO:__main__:Epoch 1 - avg_train_loss: 0.7329  avg_val_loss: 0.5596
Epoch 1 - Score: 0.5744  Scores: [0.4936912689480149, 0.6550251393895062]
INFO:__main__:Epoch 1 - Score: 0.5744  Scores: [0.4936912689480149, 0.6550251393895062]


Epoch: [1][320/638] Elapsed 3m 7s (remain 3m 5s) Loss: 0.7598(0.7241) Grad: 270268.2812  LR: 0.00001996  
Epoch: [1][340/638] Elapsed 3m 14s (remain 2m 49s) Loss: 0.6789(0.7161) Grad: 151466.8594  LR: 0.00001993  
Epoch: [1][360/638] Elapsed 3m 21s (remain 2m 34s) Loss: 0.6825(0.7053) Grad: 209933.7031  LR: 0.00001990  
Epoch: [1][380/638] Elapsed 3m 28s (remain 2m 20s) Loss: 0.4352(0.6978) Grad: 81251.7188  LR: 0.00001985  
Epoch: [1][400/638] Elapsed 3m 35s (remain 2m 7s) Loss: 0.6023(0.6913) Grad: 104986.4297  LR: 0.00001980  
Epoch: [1][420/638] Elapsed 3m 41s (remain 1m 54s) Loss: 0.3102(0.6844) Grad: 106813.9297  LR: 0.00001974  
Epoch: [1][440/638] Elapsed 3m 47s (remain 1m 41s) Loss: 0.5456(0.6786) Grad: 81691.2188  LR: 0.00001968  
EVAL: [0/129] Loss: 0.4139(0.4139) 
EVAL: [20/129] Loss: 0.4502(0.5232) 
EVAL: [40/129] Loss: 0.4906(0.5197) 
EVAL: [60/129] Loss: 0.5828(0.5240) 
EVAL: [80/129] Loss: 0.5378(0.5225) 
EVAL: [100/129] Loss: 0.6880(0.5221) 
EVAL: [120/129] Loss: 0.422

Epoch 1 - Save Best Score: 0.5328 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5328 Model


EVAL: [128/129] Loss: 0.5208(0.5208) 


Epoch 1 - avg_train_loss: 0.6764  avg_val_loss: 0.5208
INFO:__main__:Epoch 1 - avg_train_loss: 0.6764  avg_val_loss: 0.5208
Epoch 1 - Score: 0.5328  Scores: [0.4780260093060706, 0.5875738861273226]
INFO:__main__:Epoch 1 - Score: 0.5328  Scores: [0.4780260093060706, 0.5875738861273226]


Epoch: [1][460/638] Elapsed 4m 37s (remain 1m 46s) Loss: 0.6057(0.6749) Grad: 51261.7383  LR: 0.00001961  
Epoch: [1][480/638] Elapsed 4m 42s (remain 1m 32s) Loss: 0.7168(0.6679) Grad: 92056.6719  LR: 0.00001953  
Epoch: [1][500/638] Elapsed 4m 49s (remain 1m 19s) Loss: 0.5427(0.6649) Grad: 163647.7031  LR: 0.00001944  
Epoch: [1][520/638] Elapsed 4m 56s (remain 1m 6s) Loss: 0.8488(0.6605) Grad: 85479.9453  LR: 0.00001935  
Epoch: [1][540/638] Elapsed 5m 3s (remain 0m 54s) Loss: 0.3713(0.6562) Grad: 74429.5938  LR: 0.00001925  
Epoch: [1][560/638] Elapsed 5m 11s (remain 0m 42s) Loss: 0.4850(0.6508) Grad: 49721.9609  LR: 0.00001914  
Epoch: [1][580/638] Elapsed 5m 17s (remain 0m 31s) Loss: 0.4894(0.6464) Grad: 50928.0117  LR: 0.00001902  
Epoch: [1][600/638] Elapsed 5m 22s (remain 0m 19s) Loss: 0.6669(0.6418) Grad: 95922.6875  LR: 0.00001890  
EVAL: [0/129] Loss: 0.4008(0.4008) 
EVAL: [20/129] Loss: 0.4241(0.5043) 
EVAL: [40/129] Loss: 0.4740(0.4951) 
EVAL: [60/129] Loss: 0.5007(0.5032)

Epoch 1 - Save Best Score: 0.5041 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5041 Model


EVAL: [128/129] Loss: 0.5066(0.4955) 


Epoch 1 - avg_train_loss: 0.6418  avg_val_loss: 0.4955
INFO:__main__:Epoch 1 - avg_train_loss: 0.6418  avg_val_loss: 0.4955
Epoch 1 - Score: 0.5041  Scores: [0.46104722392494013, 0.5471469172099651]
INFO:__main__:Epoch 1 - Score: 0.5041  Scores: [0.46104722392494013, 0.5471469172099651]


Epoch: [1][620/638] Elapsed 6m 13s (remain 0m 10s) Loss: 0.5813(0.6378) Grad: 121123.4688  LR: 0.00001878  
Epoch: [1][637/638] Elapsed 6m 19s (remain 0m 0s) Loss: 0.5091(0.6346) Grad: 92996.1875  LR: 0.00001866  
EVAL: [0/129] Loss: 0.4941(0.4941) 
EVAL: [20/129] Loss: 0.4874(0.5250) 
EVAL: [40/129] Loss: 0.4491(0.5172) 
EVAL: [60/129] Loss: 0.6167(0.5248) 
EVAL: [80/129] Loss: 0.5059(0.5212) 
EVAL: [100/129] Loss: 0.6901(0.5200) 
EVAL: [120/129] Loss: 0.4858(0.5187) 
EVAL: [128/129] Loss: 0.4818(0.5176) 


Epoch 1 - avg_train_loss: 0.6346  avg_val_loss: 0.5176
INFO:__main__:Epoch 1 - avg_train_loss: 0.6346  avg_val_loss: 0.5176
Epoch 1 - Score: 0.5271  Scores: [0.44828072669875557, 0.6059563521759793]
INFO:__main__:Epoch 1 - Score: 0.5271  Scores: [0.44828072669875557, 0.6059563521759793]


Epoch: [2][0/638] Elapsed 0m 0s (remain 8m 22s) Loss: 0.4588(0.4588) Grad: 350737.4375  LR: 0.00001865  
Epoch: [2][20/638] Elapsed 0m 6s (remain 3m 11s) Loss: 0.5544(0.4940) Grad: 220267.4688  LR: 0.00001851  
Epoch: [2][40/638] Elapsed 0m 12s (remain 2m 57s) Loss: 0.4142(0.4840) Grad: 131000.0547  LR: 0.00001837  
Epoch: [2][60/638] Elapsed 0m 18s (remain 2m 51s) Loss: 0.4467(0.4717) Grad: 114609.1641  LR: 0.00001822  
Epoch: [2][80/638] Elapsed 0m 23s (remain 2m 42s) Loss: 0.3320(0.4713) Grad: 135880.6875  LR: 0.00001806  
Epoch: [2][100/638] Elapsed 0m 28s (remain 2m 32s) Loss: 0.3098(0.4637) Grad: 188596.6719  LR: 0.00001789  
Epoch: [2][120/638] Elapsed 0m 34s (remain 2m 28s) Loss: 0.5461(0.4620) Grad: 122437.0156  LR: 0.00001772  
Epoch: [2][140/638] Elapsed 0m 39s (remain 2m 20s) Loss: 0.3404(0.4630) Grad: 123306.0859  LR: 0.00001754  
EVAL: [0/129] Loss: 0.4305(0.4305) 
EVAL: [20/129] Loss: 0.4365(0.4992) 
EVAL: [40/129] Loss: 0.4723(0.4905) 
EVAL: [60/129] Loss: 0.5450(0.4968

Epoch 2 - Save Best Score: 0.4976 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4976 Model


EVAL: [128/129] Loss: 0.5180(0.4890) 


Epoch 2 - avg_train_loss: 0.4672  avg_val_loss: 0.4890
INFO:__main__:Epoch 2 - avg_train_loss: 0.4672  avg_val_loss: 0.4890
Epoch 2 - Score: 0.4976  Scores: [0.42638081006111916, 0.568764498140797]
INFO:__main__:Epoch 2 - Score: 0.4976  Scores: [0.42638081006111916, 0.568764498140797]


Epoch: [2][160/638] Elapsed 1m 26s (remain 4m 16s) Loss: 0.5139(0.4699) Grad: 247886.8750  LR: 0.00001736  
Epoch: [2][180/638] Elapsed 1m 31s (remain 3m 51s) Loss: 0.3947(0.4699) Grad: 193003.0781  LR: 0.00001717  
Epoch: [2][200/638] Elapsed 1m 38s (remain 3m 34s) Loss: 0.5742(0.4699) Grad: 188708.4844  LR: 0.00001698  
Epoch: [2][220/638] Elapsed 1m 44s (remain 3m 17s) Loss: 0.3945(0.4758) Grad: 97916.3750  LR: 0.00001678  
Epoch: [2][240/638] Elapsed 1m 51s (remain 3m 2s) Loss: 0.3921(0.4716) Grad: 144493.7344  LR: 0.00001658  
Epoch: [2][260/638] Elapsed 1m 58s (remain 2m 51s) Loss: 0.3868(0.4710) Grad: 145768.9531  LR: 0.00001637  
Epoch: [2][280/638] Elapsed 2m 4s (remain 2m 38s) Loss: 0.3005(0.4703) Grad: 270416.7188  LR: 0.00001616  
Epoch: [2][300/638] Elapsed 2m 11s (remain 2m 26s) Loss: 0.5052(0.4740) Grad: 85277.2891  LR: 0.00001594  
EVAL: [0/129] Loss: 0.4507(0.4507) 
EVAL: [20/129] Loss: 0.4459(0.4858) 
EVAL: [40/129] Loss: 0.4613(0.4787) 
EVAL: [60/129] Loss: 0.5570(0.

Epoch 2 - Save Best Score: 0.4868 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4868 Model


EVAL: [128/129] Loss: 0.4492(0.4764) 


Epoch 2 - avg_train_loss: 0.4740  avg_val_loss: 0.4764
INFO:__main__:Epoch 2 - avg_train_loss: 0.4740  avg_val_loss: 0.4764
Epoch 2 - Score: 0.4868  Scores: [0.4261700899997922, 0.5474447428497093]
INFO:__main__:Epoch 2 - Score: 0.4868  Scores: [0.4261700899997922, 0.5474447428497093]


Epoch: [2][320/638] Elapsed 3m 6s (remain 3m 4s) Loss: 0.4836(0.4757) Grad: 216677.2031  LR: 0.00001572  
Epoch: [2][340/638] Elapsed 3m 14s (remain 2m 49s) Loss: 0.4070(0.4751) Grad: 234769.1875  LR: 0.00001549  
Epoch: [2][360/638] Elapsed 3m 20s (remain 2m 33s) Loss: 0.7158(0.4729) Grad: 181476.4219  LR: 0.00001526  
Epoch: [2][380/638] Elapsed 3m 27s (remain 2m 19s) Loss: 0.4417(0.4715) Grad: 190014.5000  LR: 0.00001503  
Epoch: [2][400/638] Elapsed 3m 34s (remain 2m 7s) Loss: 0.5246(0.4709) Grad: 230400.3125  LR: 0.00001479  
Epoch: [2][420/638] Elapsed 3m 41s (remain 1m 54s) Loss: 0.5235(0.4683) Grad: 219708.0312  LR: 0.00001455  
Epoch: [2][440/638] Elapsed 3m 47s (remain 1m 41s) Loss: 0.6409(0.4683) Grad: 214898.1875  LR: 0.00001430  
EVAL: [0/129] Loss: 0.4228(0.4228) 
EVAL: [20/129] Loss: 0.4413(0.4785) 
EVAL: [40/129] Loss: 0.4816(0.4720) 
EVAL: [60/129] Loss: 0.4766(0.4779) 
EVAL: [80/129] Loss: 0.5057(0.4719) 
EVAL: [100/129] Loss: 0.5999(0.4693) 
EVAL: [120/129] Loss: 0.3

Epoch 2 - Save Best Score: 0.4764 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4764 Model
Epoch 2 - avg_train_loss: 0.4694  avg_val_loss: 0.4665
INFO:__main__:Epoch 2 - avg_train_loss: 0.4694  avg_val_loss: 0.4665
Epoch 2 - Score: 0.4764  Scores: [0.4071460568919797, 0.5457197065874293]
INFO:__main__:Epoch 2 - Score: 0.4764  Scores: [0.4071460568919797, 0.5457197065874293]


Epoch: [2][460/638] Elapsed 4m 35s (remain 1m 45s) Loss: 0.5208(0.4684) Grad: 152564.4844  LR: 0.00001405  
Epoch: [2][480/638] Elapsed 4m 41s (remain 1m 31s) Loss: 0.4462(0.4693) Grad: 111000.4297  LR: 0.00001380  
Epoch: [2][500/638] Elapsed 4m 49s (remain 1m 19s) Loss: 0.5209(0.4681) Grad: 114451.1016  LR: 0.00001355  
Epoch: [2][520/638] Elapsed 4m 57s (remain 1m 6s) Loss: 0.3151(0.4682) Grad: 169997.2188  LR: 0.00001329  
Epoch: [2][540/638] Elapsed 5m 4s (remain 0m 54s) Loss: 0.3976(0.4680) Grad: 264919.2188  LR: 0.00001303  
Epoch: [2][560/638] Elapsed 5m 11s (remain 0m 42s) Loss: 0.4815(0.4684) Grad: 120189.5625  LR: 0.00001277  
Epoch: [2][580/638] Elapsed 5m 16s (remain 0m 31s) Loss: 0.7410(0.4681) Grad: 160742.2500  LR: 0.00001251  
Epoch: [2][600/638] Elapsed 5m 22s (remain 0m 19s) Loss: 0.4574(0.4675) Grad: 240378.9375  LR: 0.00001224  
EVAL: [0/129] Loss: 0.4446(0.4446) 
EVAL: [20/129] Loss: 0.4295(0.4975) 
EVAL: [40/129] Loss: 0.4322(0.4958) 
EVAL: [60/129] Loss: 0.5641(

Epoch 2 - avg_train_loss: 0.4675  avg_val_loss: 0.4935
INFO:__main__:Epoch 2 - avg_train_loss: 0.4675  avg_val_loss: 0.4935
Epoch 2 - Score: 0.5037  Scores: [0.4788103880116258, 0.5285355255472397]
INFO:__main__:Epoch 2 - Score: 0.5037  Scores: [0.4788103880116258, 0.5285355255472397]


EVAL: [128/129] Loss: 0.4283(0.4935) 
Epoch: [2][620/638] Elapsed 6m 2s (remain 0m 9s) Loss: 0.4159(0.4666) Grad: 152886.3125  LR: 0.00001198  
Epoch: [2][637/638] Elapsed 6m 6s (remain 0m 0s) Loss: 0.4644(0.4655) Grad: 207046.4844  LR: 0.00001175  
EVAL: [0/129] Loss: 0.3897(0.3897) 
EVAL: [20/129] Loss: 0.4038(0.4896) 
EVAL: [40/129] Loss: 0.4890(0.4927) 
EVAL: [60/129] Loss: 0.5306(0.4990) 
EVAL: [80/129] Loss: 0.5300(0.4982) 
EVAL: [100/129] Loss: 0.6298(0.4957) 
EVAL: [120/129] Loss: 0.3437(0.4938) 
EVAL: [128/129] Loss: 0.4599(0.4931) 


Epoch 2 - avg_train_loss: 0.4655  avg_val_loss: 0.4931
INFO:__main__:Epoch 2 - avg_train_loss: 0.4655  avg_val_loss: 0.4931
Epoch 2 - Score: 0.5033  Scores: [0.4575241099856397, 0.5490296930463149]
INFO:__main__:Epoch 2 - Score: 0.5033  Scores: [0.4575241099856397, 0.5490296930463149]


Epoch: [3][0/638] Elapsed 0m 0s (remain 7m 55s) Loss: 0.3602(0.3602) Grad: 267878.1875  LR: 0.00001173  
Epoch: [3][20/638] Elapsed 0m 6s (remain 3m 12s) Loss: 0.3105(0.4023) Grad: 122301.4062  LR: 0.00001146  
Epoch: [3][40/638] Elapsed 0m 11s (remain 2m 47s) Loss: 0.3371(0.3898) Grad: 172562.7188  LR: 0.00001119  
Epoch: [3][60/638] Elapsed 0m 18s (remain 2m 51s) Loss: 0.3591(0.3959) Grad: 151809.5312  LR: 0.00001092  
Epoch: [3][80/638] Elapsed 0m 24s (remain 2m 47s) Loss: 0.2598(0.4003) Grad: 115617.7344  LR: 0.00001065  
Epoch: [3][100/638] Elapsed 0m 30s (remain 2m 44s) Loss: 0.3847(0.4000) Grad: 218284.6406  LR: 0.00001038  
Epoch: [3][120/638] Elapsed 0m 37s (remain 2m 39s) Loss: 0.5374(0.3930) Grad: 243861.5312  LR: 0.00001010  
Epoch: [3][140/638] Elapsed 0m 42s (remain 2m 30s) Loss: 0.4099(0.3893) Grad: 186670.2812  LR: 0.00000983  
EVAL: [0/129] Loss: 0.4027(0.4027) 
EVAL: [20/129] Loss: 0.4199(0.4755) 
EVAL: [40/129] Loss: 0.4528(0.4753) 
EVAL: [60/129] Loss: 0.5420(0.4813

Epoch 3 - avg_train_loss: 0.3862  avg_val_loss: 0.4728
INFO:__main__:Epoch 3 - avg_train_loss: 0.3862  avg_val_loss: 0.4728
Epoch 3 - Score: 0.4829  Scores: [0.44888039155918763, 0.5168598057578119]
INFO:__main__:Epoch 3 - Score: 0.4829  Scores: [0.44888039155918763, 0.5168598057578119]


Epoch: [3][160/638] Elapsed 1m 23s (remain 4m 6s) Loss: 0.3949(0.3855) Grad: 122329.5703  LR: 0.00000956  
Epoch: [3][180/638] Elapsed 1m 28s (remain 3m 43s) Loss: 0.3744(0.3888) Grad: 163266.3594  LR: 0.00000928  
Epoch: [3][200/638] Elapsed 1m 33s (remain 3m 23s) Loss: 0.2304(0.3861) Grad: 136662.1406  LR: 0.00000901  
Epoch: [3][220/638] Elapsed 1m 39s (remain 3m 8s) Loss: 0.4519(0.3874) Grad: 108590.8203  LR: 0.00000874  
Epoch: [3][240/638] Elapsed 1m 45s (remain 2m 54s) Loss: 0.5996(0.3896) Grad: 169678.7031  LR: 0.00000847  
Epoch: [3][260/638] Elapsed 1m 51s (remain 2m 41s) Loss: 0.3151(0.3917) Grad: 235182.8125  LR: 0.00000820  
Epoch: [3][280/638] Elapsed 1m 57s (remain 2m 28s) Loss: 0.2845(0.3890) Grad: 131549.8594  LR: 0.00000793  
Epoch: [3][300/638] Elapsed 2m 2s (remain 2m 16s) Loss: 0.4042(0.3902) Grad: 155123.9531  LR: 0.00000766  
EVAL: [0/129] Loss: 0.3881(0.3881) 
EVAL: [20/129] Loss: 0.4072(0.4638) 
EVAL: [40/129] Loss: 0.4625(0.4631) 
EVAL: [60/129] Loss: 0.4970(0

Epoch 3 - Save Best Score: 0.4709 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4709 Model


EVAL: [128/129] Loss: 0.4338(0.4611) 


Epoch 3 - avg_train_loss: 0.3902  avg_val_loss: 0.4611
INFO:__main__:Epoch 3 - avg_train_loss: 0.3902  avg_val_loss: 0.4611
Epoch 3 - Score: 0.4709  Scores: [0.42018253332841665, 0.5216397471466937]
INFO:__main__:Epoch 3 - Score: 0.4709  Scores: [0.42018253332841665, 0.5216397471466937]


Epoch: [3][320/638] Elapsed 2m 49s (remain 2m 47s) Loss: 0.2759(0.3893) Grad: 144454.8750  LR: 0.00000740  
Epoch: [3][340/638] Elapsed 2m 56s (remain 2m 33s) Loss: 0.3908(0.3885) Grad: 157187.5469  LR: 0.00000714  
Epoch: [3][360/638] Elapsed 3m 3s (remain 2m 20s) Loss: 0.5377(0.3900) Grad: 65795.5234  LR: 0.00000688  
Epoch: [3][380/638] Elapsed 3m 9s (remain 2m 7s) Loss: 0.3948(0.3894) Grad: 92022.6250  LR: 0.00000662  
Epoch: [3][400/638] Elapsed 3m 16s (remain 1m 55s) Loss: 0.3652(0.3906) Grad: 93561.2734  LR: 0.00000636  
Epoch: [3][420/638] Elapsed 3m 23s (remain 1m 44s) Loss: 0.4633(0.3915) Grad: 64261.5352  LR: 0.00000611  
Epoch: [3][440/638] Elapsed 3m 29s (remain 1m 33s) Loss: 0.2903(0.3928) Grad: 56842.6523  LR: 0.00000586  
EVAL: [0/129] Loss: 0.4361(0.4361) 
EVAL: [20/129] Loss: 0.4207(0.4903) 
EVAL: [40/129] Loss: 0.4673(0.4925) 
EVAL: [60/129] Loss: 0.5289(0.4992) 
EVAL: [80/129] Loss: 0.5160(0.4964) 
EVAL: [100/129] Loss: 0.6469(0.4933) 
EVAL: [120/129] Loss: 0.3776(0

Epoch 3 - avg_train_loss: 0.3932  avg_val_loss: 0.4902
INFO:__main__:Epoch 3 - avg_train_loss: 0.3932  avg_val_loss: 0.4902
Epoch 3 - Score: 0.4998  Scores: [0.48075574078855576, 0.5188476333682893]
INFO:__main__:Epoch 3 - Score: 0.4998  Scores: [0.48075574078855576, 0.5188476333682893]


Epoch: [3][460/638] Elapsed 4m 9s (remain 1m 35s) Loss: 0.2967(0.3922) Grad: 86579.5391  LR: 0.00000561  
Epoch: [3][480/638] Elapsed 4m 14s (remain 1m 23s) Loss: 0.4748(0.3923) Grad: 51718.4805  LR: 0.00000537  
Epoch: [3][500/638] Elapsed 4m 19s (remain 1m 11s) Loss: 0.3717(0.3930) Grad: 73468.7500  LR: 0.00000513  
Epoch: [3][520/638] Elapsed 4m 26s (remain 0m 59s) Loss: 0.5216(0.3927) Grad: 59399.9922  LR: 0.00000489  
Epoch: [3][540/638] Elapsed 4m 31s (remain 0m 48s) Loss: 0.4295(0.3917) Grad: 84837.6484  LR: 0.00000466  
Epoch: [3][560/638] Elapsed 4m 37s (remain 0m 38s) Loss: 0.4228(0.3920) Grad: 50379.6016  LR: 0.00000443  
Epoch: [3][580/638] Elapsed 4m 43s (remain 0m 27s) Loss: 0.5287(0.3906) Grad: 106837.5781  LR: 0.00000420  
Epoch: [3][600/638] Elapsed 4m 48s (remain 0m 17s) Loss: 0.3556(0.3919) Grad: 108060.5625  LR: 0.00000398  
EVAL: [0/129] Loss: 0.4220(0.4220) 
EVAL: [20/129] Loss: 0.4022(0.4685) 
EVAL: [40/129] Loss: 0.4571(0.4651) 
EVAL: [60/129] Loss: 0.4839(0.471

Epoch 3 - Save Best Score: 0.4693 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.4693 Model


EVAL: [128/129] Loss: 0.4578(0.4599) 


Epoch 3 - avg_train_loss: 0.3919  avg_val_loss: 0.4599
INFO:__main__:Epoch 3 - avg_train_loss: 0.3919  avg_val_loss: 0.4599
Epoch 3 - Score: 0.4693  Scores: [0.4174328570516981, 0.5211795792755783]
INFO:__main__:Epoch 3 - Score: 0.4693  Scores: [0.4174328570516981, 0.5211795792755783]


Epoch: [3][620/638] Elapsed 5m 39s (remain 0m 9s) Loss: 0.3350(0.3920) Grad: 60241.5781  LR: 0.00000377  
Epoch: [3][637/638] Elapsed 5m 45s (remain 0m 0s) Loss: 0.2069(0.3926) Grad: 97321.3516  LR: 0.00000359  
EVAL: [0/129] Loss: 0.3983(0.3983) 
EVAL: [20/129] Loss: 0.3731(0.4715) 
EVAL: [40/129] Loss: 0.4600(0.4721) 
EVAL: [60/129] Loss: 0.4949(0.4788) 
EVAL: [80/129] Loss: 0.5084(0.4771) 
EVAL: [100/129] Loss: 0.5763(0.4732) 
EVAL: [120/129] Loss: 0.3518(0.4712) 


Epoch 3 - avg_train_loss: 0.3926  avg_val_loss: 0.4692
INFO:__main__:Epoch 3 - avg_train_loss: 0.3926  avg_val_loss: 0.4692
Epoch 3 - Score: 0.4783  Scores: [0.44336048728235733, 0.5132126271838894]
INFO:__main__:Epoch 3 - Score: 0.4783  Scores: [0.44336048728235733, 0.5132126271838894]


EVAL: [128/129] Loss: 0.4626(0.4692) 
Epoch: [4][0/638] Elapsed 0m 0s (remain 5m 17s) Loss: 0.3489(0.3489) Grad: 383873.5938  LR: 0.00000358  
Epoch: [4][20/638] Elapsed 0m 5s (remain 2m 54s) Loss: 0.2311(0.3523) Grad: 118516.6875  LR: 0.00000337  
Epoch: [4][40/638] Elapsed 0m 11s (remain 2m 50s) Loss: 0.3664(0.3381) Grad: 237023.1406  LR: 0.00000317  
Epoch: [4][60/638] Elapsed 0m 17s (remain 2m 46s) Loss: 0.3646(0.3432) Grad: 121455.4766  LR: 0.00000297  
Epoch: [4][80/638] Elapsed 0m 24s (remain 2m 45s) Loss: 0.3015(0.3356) Grad: 156113.0000  LR: 0.00000278  
Epoch: [4][100/638] Elapsed 0m 30s (remain 2m 42s) Loss: 0.3122(0.3326) Grad: 239779.7344  LR: 0.00000259  
Epoch: [4][120/638] Elapsed 0m 37s (remain 2m 38s) Loss: 0.4119(0.3371) Grad: 132595.5625  LR: 0.00000241  
Epoch: [4][140/638] Elapsed 0m 43s (remain 2m 32s) Loss: 0.1903(0.3344) Grad: 111583.1641  LR: 0.00000224  
EVAL: [0/129] Loss: 0.4304(0.4304) 
EVAL: [20/129] Loss: 0.4032(0.4755) 
EVAL: [40/129] Loss: 0.4643(0.474

Epoch 4 - avg_train_loss: 0.3360  avg_val_loss: 0.4705
INFO:__main__:Epoch 4 - avg_train_loss: 0.3360  avg_val_loss: 0.4705
Epoch 4 - Score: 0.4797  Scores: [0.4386307098170215, 0.5206795506833112]
INFO:__main__:Epoch 4 - Score: 0.4797  Scores: [0.4386307098170215, 0.5206795506833112]


EVAL: [128/129] Loss: 0.4631(0.4705) 
Epoch: [4][160/638] Elapsed 1m 22s (remain 4m 5s) Loss: 0.4141(0.3369) Grad: 227161.3281  LR: 0.00000207  
Epoch: [4][180/638] Elapsed 1m 27s (remain 3m 41s) Loss: 0.3388(0.3377) Grad: 216342.0781  LR: 0.00000190  
Epoch: [4][200/638] Elapsed 1m 34s (remain 3m 24s) Loss: 0.2984(0.3370) Grad: 156204.3125  LR: 0.00000175  
Epoch: [4][220/638] Elapsed 1m 39s (remain 3m 8s) Loss: 0.3750(0.3326) Grad: 162478.0469  LR: 0.00000159  
Epoch: [4][240/638] Elapsed 1m 45s (remain 2m 54s) Loss: 0.4181(0.3324) Grad: 209777.9844  LR: 0.00000145  
Epoch: [4][260/638] Elapsed 1m 51s (remain 2m 41s) Loss: 0.3597(0.3288) Grad: 202782.8438  LR: 0.00000131  
Epoch: [4][280/638] Elapsed 1m 57s (remain 2m 28s) Loss: 0.2613(0.3278) Grad: 187794.6094  LR: 0.00000118  
Epoch: [4][300/638] Elapsed 2m 3s (remain 2m 17s) Loss: 0.3441(0.3286) Grad: 151072.1094  LR: 0.00000105  
EVAL: [0/129] Loss: 0.4206(0.4206) 
EVAL: [20/129] Loss: 0.4004(0.4748) 
EVAL: [40/129] Loss: 0.4626(

Epoch 4 - avg_train_loss: 0.3286  avg_val_loss: 0.4730
INFO:__main__:Epoch 4 - avg_train_loss: 0.3286  avg_val_loss: 0.4730
Epoch 4 - Score: 0.4823  Scores: [0.4473229786471676, 0.5171902814341651]
INFO:__main__:Epoch 4 - Score: 0.4823  Scores: [0.4473229786471676, 0.5171902814341651]


Epoch: [4][320/638] Elapsed 2m 42s (remain 2m 40s) Loss: 0.3195(0.3284) Grad: 147510.4688  LR: 0.00000094  
Epoch: [4][340/638] Elapsed 2m 47s (remain 2m 25s) Loss: 0.3087(0.3271) Grad: 144411.7188  LR: 0.00000082  
Epoch: [4][360/638] Elapsed 2m 53s (remain 2m 13s) Loss: 0.2964(0.3280) Grad: 209962.8906  LR: 0.00000072  
Epoch: [4][380/638] Elapsed 2m 58s (remain 2m 0s) Loss: 0.3512(0.3276) Grad: 130204.2500  LR: 0.00000062  
Epoch: [4][400/638] Elapsed 3m 3s (remain 1m 48s) Loss: 0.3546(0.3274) Grad: 334894.5625  LR: 0.00000053  
Epoch: [4][420/638] Elapsed 3m 10s (remain 1m 38s) Loss: 0.4645(0.3282) Grad: 196887.7188  LR: 0.00000044  
Epoch: [4][440/638] Elapsed 3m 15s (remain 1m 27s) Loss: 0.4085(0.3278) Grad: 126945.7812  LR: 0.00000037  
EVAL: [0/129] Loss: 0.4176(0.4176) 
EVAL: [20/129] Loss: 0.3999(0.4726) 
EVAL: [40/129] Loss: 0.4636(0.4727) 
EVAL: [60/129] Loss: 0.5045(0.4799) 
EVAL: [80/129] Loss: 0.4981(0.4767) 
EVAL: [100/129] Loss: 0.5925(0.4732) 
EVAL: [120/129] Loss: 0.

Epoch 4 - avg_train_loss: 0.3276  avg_val_loss: 0.4697
INFO:__main__:Epoch 4 - avg_train_loss: 0.3276  avg_val_loss: 0.4697
Epoch 4 - Score: 0.4790  Scores: [0.44135866863940126, 0.5167364695301802]
INFO:__main__:Epoch 4 - Score: 0.4790  Scores: [0.44135866863940126, 0.5167364695301802]


Epoch: [4][460/638] Elapsed 3m 55s (remain 1m 30s) Loss: 0.3548(0.3278) Grad: 178789.4219  LR: 0.00000030  
Epoch: [4][480/638] Elapsed 4m 0s (remain 1m 18s) Loss: 0.5372(0.3277) Grad: 199267.0312  LR: 0.00000024  
Epoch: [4][500/638] Elapsed 4m 6s (remain 1m 7s) Loss: 0.3780(0.3260) Grad: 177740.8906  LR: 0.00000018  
Epoch: [4][520/638] Elapsed 4m 11s (remain 0m 56s) Loss: 0.2143(0.3262) Grad: 151697.5000  LR: 0.00000013  
Epoch: [4][540/638] Elapsed 4m 17s (remain 0m 46s) Loss: 0.2619(0.3255) Grad: 110638.1875  LR: 0.00000009  
Epoch: [4][560/638] Elapsed 4m 23s (remain 0m 36s) Loss: 0.2344(0.3250) Grad: 157231.4375  LR: 0.00000006  
Epoch: [4][580/638] Elapsed 4m 30s (remain 0m 26s) Loss: 0.2855(0.3254) Grad: 134632.2500  LR: 0.00000003  
Epoch: [4][600/638] Elapsed 4m 35s (remain 0m 16s) Loss: 0.3362(0.3252) Grad: 168653.7656  LR: 0.00000001  
EVAL: [0/129] Loss: 0.4210(0.4210) 
EVAL: [20/129] Loss: 0.4010(0.4747) 
EVAL: [40/129] Loss: 0.4626(0.4750) 
EVAL: [60/129] Loss: 0.5071(0

Epoch 4 - avg_train_loss: 0.3252  avg_val_loss: 0.4718
INFO:__main__:Epoch 4 - avg_train_loss: 0.3252  avg_val_loss: 0.4718
Epoch 4 - Score: 0.4811  Scores: [0.4452618195516229, 0.516856274171098]
INFO:__main__:Epoch 4 - Score: 0.4811  Scores: [0.4452618195516229, 0.516856274171098]


Epoch: [4][620/638] Elapsed 5m 16s (remain 0m 8s) Loss: 0.4295(0.3251) Grad: 205753.0156  LR: 0.00000000  
Epoch: [4][637/638] Elapsed 5m 21s (remain 0m 0s) Loss: 0.2315(0.3244) Grad: 114381.2812  LR: 0.00000000  
EVAL: [0/129] Loss: 0.4209(0.4209) 
EVAL: [20/129] Loss: 0.4010(0.4748) 
EVAL: [40/129] Loss: 0.4626(0.4751) 
EVAL: [60/129] Loss: 0.5072(0.4821) 
EVAL: [80/129] Loss: 0.5004(0.4790) 
EVAL: [100/129] Loss: 0.5962(0.4754) 
EVAL: [120/129] Loss: 0.3663(0.4735) 


Epoch 4 - avg_train_loss: 0.3244  avg_val_loss: 0.4719
INFO:__main__:Epoch 4 - avg_train_loss: 0.3244  avg_val_loss: 0.4719
Epoch 4 - Score: 0.4812  Scores: [0.4454609224997959, 0.5168435075180935]
INFO:__main__:Epoch 4 - Score: 0.4812  Scores: [0.4454609224997959, 0.5168435075180935]


EVAL: [128/129] Loss: 0.4540(0.4719) 


Score: 0.4693  Scores: [0.4174328570516981, 0.5211795792755783]
INFO:__main__:Score: 0.4693  Scores: [0.4174328570516981, 0.5211795792755783]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size

Epoch: [1][0/644] Elapsed 0m 0s (remain 5m 41s) Loss: 1.2544(1.2544) Grad: 70313.4844  LR: 0.00000008  
Epoch: [1][20/644] Elapsed 0m 5s (remain 2m 39s) Loss: 1.3552(1.1102) Grad: 28075.5566  LR: 0.00000163  
Epoch: [1][40/644] Elapsed 0m 10s (remain 2m 39s) Loss: 0.8763(1.0310) Grad: 69931.6016  LR: 0.00000319  
Epoch: [1][60/644] Elapsed 0m 16s (remain 2m 38s) Loss: 1.0714(1.0447) Grad: 47680.0664  LR: 0.00000475  
Epoch: [1][80/644] Elapsed 0m 21s (remain 2m 31s) Loss: 0.7852(1.0269) Grad: 63068.1641  LR: 0.00000630  
Epoch: [1][100/644] Elapsed 0m 27s (remain 2m 27s) Loss: 0.7268(0.9827) Grad: 191341.7656  LR: 0.00000786  
Epoch: [1][120/644] Elapsed 0m 32s (remain 2m 20s) Loss: 0.6965(0.9355) Grad: 304849.7188  LR: 0.00000942  
Epoch: [1][140/644] Elapsed 0m 37s (remain 2m 13s) Loss: 0.4564(0.8894) Grad: 219533.3906  LR: 0.00001097  
EVAL: [0/126] Loss: 1.0141(1.0141) 
EVAL: [20/126] Loss: 0.6749(0.7261) 
EVAL: [40/126] Loss: 0.7948(0.7285) 
EVAL: [60/126] Loss: 0.7528(0.7285) 
EV

Epoch 1 - Save Best Score: 0.7305 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7305 Model
Epoch 1 - avg_train_loss: 0.8705  avg_val_loss: 0.7181
INFO:__main__:Epoch 1 - avg_train_loss: 0.8705  avg_val_loss: 0.7181
Epoch 1 - Score: 0.7305  Scores: [0.699738893649059, 0.7612381857502764]
INFO:__main__:Epoch 1 - Score: 0.7305  Scores: [0.699738893649059, 0.7612381857502764]


Epoch: [1][160/644] Elapsed 1m 44s (remain 5m 13s) Loss: 0.4744(0.8581) Grad: 219825.4688  LR: 0.00001253  
Epoch: [1][180/644] Elapsed 1m 50s (remain 4m 42s) Loss: 0.5436(0.8257) Grad: 199186.6094  LR: 0.00001409  
Epoch: [1][200/644] Elapsed 1m 56s (remain 4m 17s) Loss: 0.7050(0.7954) Grad: 197544.1719  LR: 0.00001564  
Epoch: [1][220/644] Elapsed 2m 5s (remain 4m 0s) Loss: 0.9675(0.7786) Grad: 184928.1250  LR: 0.00001720  
Epoch: [1][240/644] Elapsed 2m 11s (remain 3m 39s) Loss: 0.6461(0.7638) Grad: 268709.1250  LR: 0.00001875  
Epoch: [1][260/644] Elapsed 2m 18s (remain 3m 22s) Loss: 0.6520(0.7506) Grad: 236212.1406  LR: 0.00002000  
Epoch: [1][280/644] Elapsed 2m 24s (remain 3m 6s) Loss: 0.4979(0.7337) Grad: 294450.8750  LR: 0.00001999  
Epoch: [1][300/644] Elapsed 2m 29s (remain 2m 50s) Loss: 0.3393(0.7205) Grad: 93455.2422  LR: 0.00001998  
EVAL: [0/126] Loss: 0.9271(0.9271) 
EVAL: [20/126] Loss: 0.6278(0.6829) 
EVAL: [40/126] Loss: 0.7622(0.7042) 
EVAL: [60/126] Loss: 0.6596(0.

Epoch 1 - Save Best Score: 0.7122 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7122 Model


EVAL: [125/126] Loss: 0.5840(0.6997) 


Epoch 1 - avg_train_loss: 0.7205  avg_val_loss: 0.6997
INFO:__main__:Epoch 1 - avg_train_loss: 0.7205  avg_val_loss: 0.6997
Epoch 1 - Score: 0.7122  Scores: [0.6136114173562237, 0.8107044250630112]
INFO:__main__:Epoch 1 - Score: 0.7122  Scores: [0.6136114173562237, 0.8107044250630112]


Epoch: [1][320/644] Elapsed 3m 37s (remain 3m 38s) Loss: 0.4236(0.7121) Grad: 322271.1875  LR: 0.00001996  
Epoch: [1][340/644] Elapsed 3m 43s (remain 3m 18s) Loss: 0.6326(0.7010) Grad: 162874.2344  LR: 0.00001994  
Epoch: [1][360/644] Elapsed 3m 49s (remain 2m 59s) Loss: 0.5027(0.6917) Grad: 216872.2656  LR: 0.00001990  
Epoch: [1][380/644] Elapsed 3m 56s (remain 2m 43s) Loss: 0.7110(0.6824) Grad: 304576.7812  LR: 0.00001986  
Epoch: [1][400/644] Elapsed 4m 3s (remain 2m 27s) Loss: 0.4124(0.6710) Grad: 170274.1875  LR: 0.00001981  
Epoch: [1][420/644] Elapsed 4m 9s (remain 2m 12s) Loss: 0.4815(0.6639) Grad: 152191.6562  LR: 0.00001975  
Epoch: [1][440/644] Elapsed 4m 15s (remain 1m 57s) Loss: 0.4400(0.6579) Grad: 75137.9062  LR: 0.00001969  
EVAL: [0/126] Loss: 0.9000(0.9000) 
EVAL: [20/126] Loss: 0.6109(0.6499) 
EVAL: [40/126] Loss: 0.6917(0.6489) 
EVAL: [60/126] Loss: 0.6773(0.6513) 
EVAL: [80/126] Loss: 0.6889(0.6585) 
EVAL: [100/126] Loss: 0.6050(0.6633) 
EVAL: [120/126] Loss: 0.7

Epoch 1 - Save Best Score: 0.6746 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6746 Model


EVAL: [125/126] Loss: 0.3988(0.6609) 


Epoch 1 - avg_train_loss: 0.6555  avg_val_loss: 0.6609
INFO:__main__:Epoch 1 - avg_train_loss: 0.6555  avg_val_loss: 0.6609
Epoch 1 - Score: 0.6746  Scores: [0.5916674027175239, 0.7574693092912509]
INFO:__main__:Epoch 1 - Score: 0.6746  Scores: [0.5916674027175239, 0.7574693092912509]


Epoch: [1][460/644] Elapsed 5m 21s (remain 2m 7s) Loss: 0.4497(0.6503) Grad: 200949.3750  LR: 0.00001962  
Epoch: [1][480/644] Elapsed 5m 26s (remain 1m 50s) Loss: 0.3962(0.6453) Grad: 138465.1406  LR: 0.00001954  
Epoch: [1][500/644] Elapsed 5m 33s (remain 1m 35s) Loss: 0.5575(0.6379) Grad: 156530.9844  LR: 0.00001946  
Epoch: [1][520/644] Elapsed 5m 39s (remain 1m 20s) Loss: 0.3910(0.6341) Grad: 206816.9219  LR: 0.00001937  
Epoch: [1][540/644] Elapsed 5m 47s (remain 1m 6s) Loss: 0.4740(0.6285) Grad: 212093.9062  LR: 0.00001927  
Epoch: [1][560/644] Elapsed 5m 54s (remain 0m 52s) Loss: 0.5464(0.6226) Grad: 279676.2500  LR: 0.00001917  
Epoch: [1][580/644] Elapsed 6m 0s (remain 0m 39s) Loss: 0.6767(0.6181) Grad: 128851.4922  LR: 0.00001905  
Epoch: [1][600/644] Elapsed 6m 6s (remain 0m 26s) Loss: 0.2893(0.6126) Grad: 185976.8594  LR: 0.00001894  
EVAL: [0/126] Loss: 0.8751(0.8751) 
EVAL: [20/126] Loss: 0.5932(0.6366) 
EVAL: [40/126] Loss: 0.6965(0.6379) 
EVAL: [60/126] Loss: 0.6524(0.

Epoch 1 - Save Best Score: 0.6635 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6635 Model


EVAL: [125/126] Loss: 0.3336(0.6480) 


Epoch 1 - avg_train_loss: 0.6126  avg_val_loss: 0.6480
INFO:__main__:Epoch 1 - avg_train_loss: 0.6126  avg_val_loss: 0.6480
Epoch 1 - Score: 0.6635  Scores: [0.5379346931412189, 0.7890929141731438]
INFO:__main__:Epoch 1 - Score: 0.6635  Scores: [0.5379346931412189, 0.7890929141731438]


Epoch: [1][620/644] Elapsed 7m 9s (remain 0m 15s) Loss: 0.4717(0.6086) Grad: 130746.8438  LR: 0.00001881  
Epoch: [1][640/644] Elapsed 7m 14s (remain 0m 2s) Loss: 0.9365(0.6065) Grad: 166043.4062  LR: 0.00001868  
Epoch: [1][643/644] Elapsed 7m 15s (remain 0m 0s) Loss: 0.4865(0.6059) Grad: 219392.2344  LR: 0.00001866  
EVAL: [0/126] Loss: 1.0235(1.0235) 
EVAL: [20/126] Loss: 0.7382(0.7809) 
EVAL: [40/126] Loss: 0.8173(0.7751) 
EVAL: [60/126] Loss: 0.7871(0.7787) 
EVAL: [80/126] Loss: 0.8209(0.7844) 
EVAL: [100/126] Loss: 0.7535(0.7906) 
EVAL: [120/126] Loss: 0.7942(0.7865) 


Epoch 1 - avg_train_loss: 0.6059  avg_val_loss: 0.7891
INFO:__main__:Epoch 1 - avg_train_loss: 0.6059  avg_val_loss: 0.7891
Epoch 1 - Score: 0.8034  Scores: [0.7427957561518197, 0.8640835581598195]
INFO:__main__:Epoch 1 - Score: 0.8034  Scores: [0.7427957561518197, 0.8640835581598195]


EVAL: [125/126] Loss: 0.5110(0.7891) 
Epoch: [2][0/644] Elapsed 0m 0s (remain 5m 17s) Loss: 0.6724(0.6724) Grad: 278332.2812  LR: 0.00001865  
Epoch: [2][20/644] Elapsed 0m 5s (remain 2m 56s) Loss: 0.4696(0.4895) Grad: 250184.3438  LR: 0.00001851  
Epoch: [2][40/644] Elapsed 0m 11s (remain 2m 55s) Loss: 0.5207(0.4803) Grad: 149277.0156  LR: 0.00001837  
Epoch: [2][60/644] Elapsed 0m 16s (remain 2m 41s) Loss: 0.4995(0.4737) Grad: 138273.0625  LR: 0.00001822  
Epoch: [2][80/644] Elapsed 0m 22s (remain 2m 36s) Loss: 0.3552(0.4753) Grad: 301491.0938  LR: 0.00001806  
Epoch: [2][100/644] Elapsed 0m 28s (remain 2m 32s) Loss: 0.5497(0.4839) Grad: 176653.7188  LR: 0.00001790  
Epoch: [2][120/644] Elapsed 0m 33s (remain 2m 25s) Loss: 0.3589(0.4854) Grad: 171819.6250  LR: 0.00001773  
Epoch: [2][140/644] Elapsed 0m 39s (remain 2m 20s) Loss: 0.4413(0.4769) Grad: 121422.9062  LR: 0.00001755  
EVAL: [0/126] Loss: 0.9050(0.9050) 
EVAL: [20/126] Loss: 0.5862(0.6427) 
EVAL: [40/126] Loss: 0.6720(0.640

Epoch 2 - avg_train_loss: 0.4713  avg_val_loss: 0.6545
INFO:__main__:Epoch 2 - avg_train_loss: 0.4713  avg_val_loss: 0.6545
Epoch 2 - Score: 0.6706  Scores: [0.5534654463961347, 0.7877261355815686]
INFO:__main__:Epoch 2 - Score: 0.6706  Scores: [0.5534654463961347, 0.7877261355815686]


Epoch: [2][160/644] Elapsed 1m 34s (remain 4m 43s) Loss: 0.4897(0.4704) Grad: 268606.2812  LR: 0.00001737  
Epoch: [2][180/644] Elapsed 1m 39s (remain 4m 15s) Loss: 0.2903(0.4671) Grad: 191797.3906  LR: 0.00001719  
Epoch: [2][200/644] Elapsed 1m 45s (remain 3m 51s) Loss: 0.3851(0.4638) Grad: 146775.7188  LR: 0.00001700  
Epoch: [2][220/644] Elapsed 1m 50s (remain 3m 32s) Loss: 0.5021(0.4671) Grad: 175602.7031  LR: 0.00001680  
Epoch: [2][240/644] Elapsed 1m 55s (remain 3m 13s) Loss: 0.4389(0.4676) Grad: 105796.0781  LR: 0.00001660  
Epoch: [2][260/644] Elapsed 2m 1s (remain 2m 57s) Loss: 0.3610(0.4644) Grad: 153484.9844  LR: 0.00001639  
Epoch: [2][280/644] Elapsed 2m 6s (remain 2m 43s) Loss: 0.5480(0.4596) Grad: 205072.1562  LR: 0.00001618  
Epoch: [2][300/644] Elapsed 2m 11s (remain 2m 29s) Loss: 0.5035(0.4594) Grad: 169828.5000  LR: 0.00001597  
EVAL: [0/126] Loss: 0.8744(0.8744) 
EVAL: [20/126] Loss: 0.5349(0.6025) 
EVAL: [40/126] Loss: 0.6497(0.6113) 
EVAL: [60/126] Loss: 0.6112(

Epoch 2 - Save Best Score: 0.6393 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6393 Model


EVAL: [125/126] Loss: 0.4195(0.6253) 


Epoch 2 - avg_train_loss: 0.4594  avg_val_loss: 0.6253
INFO:__main__:Epoch 2 - avg_train_loss: 0.4594  avg_val_loss: 0.6253
Epoch 2 - Score: 0.6393  Scores: [0.5476821499364519, 0.7308481354734092]
INFO:__main__:Epoch 2 - Score: 0.6393  Scores: [0.5476821499364519, 0.7308481354734092]


Epoch: [2][320/644] Elapsed 3m 15s (remain 3m 16s) Loss: 0.4892(0.4615) Grad: 334017.6562  LR: 0.00001575  
Epoch: [2][340/644] Elapsed 3m 22s (remain 2m 59s) Loss: 0.3187(0.4601) Grad: 171475.8750  LR: 0.00001553  
Epoch: [2][360/644] Elapsed 3m 29s (remain 2m 44s) Loss: 0.2036(0.4600) Grad: 123875.8281  LR: 0.00001530  
Epoch: [2][380/644] Elapsed 3m 36s (remain 2m 29s) Loss: 0.4152(0.4595) Grad: 146060.6562  LR: 0.00001507  
Epoch: [2][400/644] Elapsed 3m 43s (remain 2m 15s) Loss: 0.4215(0.4596) Grad: 233781.6562  LR: 0.00001483  
Epoch: [2][420/644] Elapsed 3m 49s (remain 2m 1s) Loss: 0.3139(0.4573) Grad: 86421.1562  LR: 0.00001459  
Epoch: [2][440/644] Elapsed 3m 55s (remain 1m 48s) Loss: 0.4245(0.4555) Grad: 105242.4219  LR: 0.00001435  
EVAL: [0/126] Loss: 0.8853(0.8853) 
EVAL: [20/126] Loss: 0.5513(0.6167) 
EVAL: [40/126] Loss: 0.6519(0.6205) 
EVAL: [60/126] Loss: 0.6442(0.6255) 
EVAL: [80/126] Loss: 0.6851(0.6371) 
EVAL: [100/126] Loss: 0.5499(0.6439) 
EVAL: [120/126] Loss: 0.

Epoch 2 - avg_train_loss: 0.4552  avg_val_loss: 0.6427
INFO:__main__:Epoch 2 - avg_train_loss: 0.4552  avg_val_loss: 0.6427
Epoch 2 - Score: 0.6606  Scores: [0.5169565232167656, 0.8042010608040524]
INFO:__main__:Epoch 2 - Score: 0.6606  Scores: [0.5169565232167656, 0.8042010608040524]


EVAL: [125/126] Loss: 0.3465(0.6427) 
Epoch: [2][460/644] Elapsed 4m 50s (remain 1m 55s) Loss: 0.7949(0.4548) Grad: 196447.7812  LR: 0.00001410  
Epoch: [2][480/644] Elapsed 4m 56s (remain 1m 40s) Loss: 0.4832(0.4553) Grad: 146946.9844  LR: 0.00001386  
Epoch: [2][500/644] Elapsed 5m 2s (remain 1m 26s) Loss: 0.4351(0.4535) Grad: 144308.2656  LR: 0.00001361  
Epoch: [2][520/644] Elapsed 5m 6s (remain 1m 12s) Loss: 0.3972(0.4507) Grad: 196463.2656  LR: 0.00001335  
Epoch: [2][540/644] Elapsed 5m 11s (remain 0m 59s) Loss: 0.4439(0.4512) Grad: 89986.3281  LR: 0.00001310  
Epoch: [2][560/644] Elapsed 5m 17s (remain 0m 46s) Loss: 0.2664(0.4500) Grad: 148754.2344  LR: 0.00001284  
Epoch: [2][580/644] Elapsed 5m 22s (remain 0m 35s) Loss: 0.4018(0.4497) Grad: 356174.5000  LR: 0.00001258  
Epoch: [2][600/644] Elapsed 5m 28s (remain 0m 23s) Loss: 0.2916(0.4502) Grad: 127771.4141  LR: 0.00001231  
EVAL: [0/126] Loss: 0.8866(0.8866) 
EVAL: [20/126] Loss: 0.5868(0.6489) 
EVAL: [40/126] Loss: 0.6839(

Epoch 2 - avg_train_loss: 0.4502  avg_val_loss: 0.6669
INFO:__main__:Epoch 2 - avg_train_loss: 0.4502  avg_val_loss: 0.6669
Epoch 2 - Score: 0.6834  Scores: [0.5325717604391035, 0.8341766637637938]
INFO:__main__:Epoch 2 - Score: 0.6834  Scores: [0.5325717604391035, 0.8341766637637938]


Epoch: [2][620/644] Elapsed 6m 24s (remain 0m 14s) Loss: 0.4375(0.4504) Grad: 261261.4531  LR: 0.00001205  
Epoch: [2][640/644] Elapsed 6m 29s (remain 0m 1s) Loss: 0.3930(0.4485) Grad: 210251.4062  LR: 0.00001178  
Epoch: [2][643/644] Elapsed 6m 29s (remain 0m 0s) Loss: 0.3606(0.4480) Grad: 205165.3750  LR: 0.00001174  
EVAL: [0/126] Loss: 0.8328(0.8328) 
EVAL: [20/126] Loss: 0.5343(0.5926) 
EVAL: [40/126] Loss: 0.6719(0.5964) 
EVAL: [60/126] Loss: 0.6021(0.5959) 
EVAL: [80/126] Loss: 0.6067(0.6033) 
EVAL: [100/126] Loss: 0.5334(0.6095) 
EVAL: [120/126] Loss: 0.6910(0.6080) 


Epoch 2 - Save Best Score: 0.6240 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6240 Model


EVAL: [125/126] Loss: 0.3636(0.6101) 


Epoch 2 - avg_train_loss: 0.4480  avg_val_loss: 0.6101
INFO:__main__:Epoch 2 - avg_train_loss: 0.4480  avg_val_loss: 0.6101
Epoch 2 - Score: 0.6240  Scores: [0.5202253493055398, 0.727857144998899]
INFO:__main__:Epoch 2 - Score: 0.6240  Scores: [0.5202253493055398, 0.727857144998899]


Epoch: [3][0/644] Elapsed 0m 0s (remain 5m 40s) Loss: 0.2273(0.2273) Grad: 470721.4375  LR: 0.00001173  
Epoch: [3][20/644] Elapsed 0m 5s (remain 2m 57s) Loss: 0.5040(0.3934) Grad: 101481.6797  LR: 0.00001146  
Epoch: [3][40/644] Elapsed 0m 11s (remain 2m 54s) Loss: 0.5037(0.4169) Grad: 154271.9375  LR: 0.00001120  
Epoch: [3][60/644] Elapsed 0m 18s (remain 3m 1s) Loss: 0.2836(0.4040) Grad: 119447.9844  LR: 0.00001093  
Epoch: [3][80/644] Elapsed 0m 24s (remain 2m 49s) Loss: 0.4698(0.4039) Grad: 197460.7500  LR: 0.00001066  
Epoch: [3][100/644] Elapsed 0m 31s (remain 2m 51s) Loss: 0.3586(0.4049) Grad: 124253.1016  LR: 0.00001039  
Epoch: [3][120/644] Elapsed 0m 37s (remain 2m 41s) Loss: 0.5016(0.4074) Grad: 130812.6250  LR: 0.00001012  
Epoch: [3][140/644] Elapsed 0m 42s (remain 2m 31s) Loss: 0.4042(0.4063) Grad: 175054.9219  LR: 0.00000984  
EVAL: [0/126] Loss: 0.8442(0.8442) 
EVAL: [20/126] Loss: 0.5144(0.5873) 
EVAL: [40/126] Loss: 0.6306(0.5886) 
EVAL: [60/126] Loss: 0.5920(0.5918)

Epoch 3 - Save Best Score: 0.6240 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6240 Model
Epoch 3 - avg_train_loss: 0.4057  avg_val_loss: 0.6074
INFO:__main__:Epoch 3 - avg_train_loss: 0.4057  avg_val_loss: 0.6074
Epoch 3 - Score: 0.6240  Scores: [0.500910605875325, 0.7470747147430271]
INFO:__main__:Epoch 3 - Score: 0.6240  Scores: [0.500910605875325, 0.7470747147430271]


Epoch: [3][160/644] Elapsed 1m 45s (remain 5m 17s) Loss: 0.4629(0.4012) Grad: 135013.8438  LR: 0.00000957  
Epoch: [3][180/644] Elapsed 1m 52s (remain 4m 47s) Loss: 0.5559(0.3980) Grad: 194367.8906  LR: 0.00000930  
Epoch: [3][200/644] Elapsed 1m 58s (remain 4m 21s) Loss: 0.4387(0.3950) Grad: 116969.4766  LR: 0.00000903  
Epoch: [3][220/644] Elapsed 2m 5s (remain 4m 0s) Loss: 0.2924(0.3920) Grad: 103425.1484  LR: 0.00000876  
Epoch: [3][240/644] Elapsed 2m 12s (remain 3m 41s) Loss: 0.4652(0.3901) Grad: 223707.1406  LR: 0.00000850  
Epoch: [3][260/644] Elapsed 2m 17s (remain 3m 22s) Loss: 0.4586(0.3875) Grad: 152684.1094  LR: 0.00000823  
Epoch: [3][280/644] Elapsed 2m 25s (remain 3m 7s) Loss: 0.1926(0.3892) Grad: 166573.9844  LR: 0.00000796  
Epoch: [3][300/644] Elapsed 2m 30s (remain 2m 51s) Loss: 0.4968(0.3890) Grad: 134518.4219  LR: 0.00000770  
EVAL: [0/126] Loss: 0.8630(0.8630) 
EVAL: [20/126] Loss: 0.5729(0.6221) 
EVAL: [40/126] Loss: 0.6748(0.6259) 
EVAL: [60/126] Loss: 0.6245(0

Epoch 3 - avg_train_loss: 0.3890  avg_val_loss: 0.6453
INFO:__main__:Epoch 3 - avg_train_loss: 0.3890  avg_val_loss: 0.6453
Epoch 3 - Score: 0.6626  Scores: [0.5088533111542981, 0.8164421095183444]
INFO:__main__:Epoch 3 - Score: 0.6626  Scores: [0.5088533111542981, 0.8164421095183444]


EVAL: [125/126] Loss: 0.3436(0.6453) 
Epoch: [3][320/644] Elapsed 3m 24s (remain 3m 25s) Loss: 0.4303(0.3887) Grad: 189668.6406  LR: 0.00000744  
Epoch: [3][340/644] Elapsed 3m 30s (remain 3m 6s) Loss: 0.3441(0.3883) Grad: 97379.3672  LR: 0.00000718  
Epoch: [3][360/644] Elapsed 3m 35s (remain 2m 49s) Loss: 0.4140(0.3887) Grad: 144713.8594  LR: 0.00000692  
Epoch: [3][380/644] Elapsed 3m 40s (remain 2m 32s) Loss: 0.3366(0.3855) Grad: 183825.1094  LR: 0.00000666  
Epoch: [3][400/644] Elapsed 3m 46s (remain 2m 17s) Loss: 0.3745(0.3847) Grad: 148548.4219  LR: 0.00000641  
Epoch: [3][420/644] Elapsed 3m 52s (remain 2m 2s) Loss: 0.2244(0.3841) Grad: 151607.5000  LR: 0.00000616  
Epoch: [3][440/644] Elapsed 3m 57s (remain 1m 49s) Loss: 0.4928(0.3821) Grad: 216096.1094  LR: 0.00000591  
EVAL: [0/126] Loss: 0.9136(0.9136) 
EVAL: [20/126] Loss: 0.5764(0.6356) 
EVAL: [40/126] Loss: 0.6724(0.6372) 
EVAL: [60/126] Loss: 0.6415(0.6418) 
EVAL: [80/126] Loss: 0.7118(0.6541) 
EVAL: [100/126] Loss: 0.5

Epoch 3 - avg_train_loss: 0.3821  avg_val_loss: 0.6628
INFO:__main__:Epoch 3 - avg_train_loss: 0.3821  avg_val_loss: 0.6628
Epoch 3 - Score: 0.6826  Scores: [0.5001234004737923, 0.8650570745727458]
INFO:__main__:Epoch 3 - Score: 0.6826  Scores: [0.5001234004737923, 0.8650570745727458]


Epoch: [3][460/644] Elapsed 4m 52s (remain 1m 56s) Loss: 0.4532(0.3834) Grad: 233002.2500  LR: 0.00000566  
Epoch: [3][480/644] Elapsed 4m 57s (remain 1m 40s) Loss: 0.4454(0.3825) Grad: 178108.6250  LR: 0.00000542  
Epoch: [3][500/644] Elapsed 5m 2s (remain 1m 26s) Loss: 0.3461(0.3824) Grad: 107764.1172  LR: 0.00000518  
Epoch: [3][520/644] Elapsed 5m 8s (remain 1m 12s) Loss: 0.3272(0.3824) Grad: 191048.6562  LR: 0.00000495  
Epoch: [3][540/644] Elapsed 5m 14s (remain 0m 59s) Loss: 0.5466(0.3825) Grad: 198112.0312  LR: 0.00000471  
Epoch: [3][560/644] Elapsed 5m 19s (remain 0m 47s) Loss: 0.4100(0.3814) Grad: 209209.2969  LR: 0.00000449  
Epoch: [3][580/644] Elapsed 5m 25s (remain 0m 35s) Loss: 0.3205(0.3807) Grad: 144787.9219  LR: 0.00000426  
Epoch: [3][600/644] Elapsed 5m 30s (remain 0m 23s) Loss: 0.4776(0.3804) Grad: 135097.3281  LR: 0.00000404  
EVAL: [0/126] Loss: 0.9130(0.9130) 
EVAL: [20/126] Loss: 0.5774(0.6340) 
EVAL: [40/126] Loss: 0.6714(0.6361) 
EVAL: [60/126] Loss: 0.6374(

Epoch 3 - avg_train_loss: 0.3804  avg_val_loss: 0.6555
INFO:__main__:Epoch 3 - avg_train_loss: 0.3804  avg_val_loss: 0.6555
Epoch 3 - Score: 0.6725  Scores: [0.5240496618069295, 0.8209164666767678]
INFO:__main__:Epoch 3 - Score: 0.6725  Scores: [0.5240496618069295, 0.8209164666767678]


EVAL: [125/126] Loss: 0.3263(0.6555) 
Epoch: [3][620/644] Elapsed 6m 25s (remain 0m 14s) Loss: 0.3464(0.3810) Grad: 162427.5781  LR: 0.00000383  
Epoch: [3][640/644] Elapsed 6m 30s (remain 0m 1s) Loss: 0.3891(0.3803) Grad: 126692.9766  LR: 0.00000362  
Epoch: [3][643/644] Elapsed 6m 31s (remain 0m 0s) Loss: 0.3865(0.3798) Grad: 162131.6719  LR: 0.00000359  
EVAL: [0/126] Loss: 0.9002(0.9002) 
EVAL: [20/126] Loss: 0.5310(0.5993) 
EVAL: [40/126] Loss: 0.6265(0.6026) 
EVAL: [60/126] Loss: 0.5967(0.6052) 
EVAL: [80/126] Loss: 0.6605(0.6143) 
EVAL: [100/126] Loss: 0.5251(0.6201) 
EVAL: [120/126] Loss: 0.6720(0.6168) 


Epoch 3 - avg_train_loss: 0.3798  avg_val_loss: 0.6198
INFO:__main__:Epoch 3 - avg_train_loss: 0.3798  avg_val_loss: 0.6198
Epoch 3 - Score: 0.6370  Scores: [0.5064597010406201, 0.7674587478276101]
INFO:__main__:Epoch 3 - Score: 0.6370  Scores: [0.5064597010406201, 0.7674587478276101]


EVAL: [125/126] Loss: 0.3258(0.6198) 
Epoch: [4][0/644] Elapsed 0m 0s (remain 5m 34s) Loss: 0.4280(0.4280) Grad: 402150.4062  LR: 0.00000358  
Epoch: [4][20/644] Elapsed 0m 5s (remain 2m 38s) Loss: 0.2260(0.3479) Grad: 371091.1562  LR: 0.00000337  
Epoch: [4][40/644] Elapsed 0m 11s (remain 2m 50s) Loss: 0.3672(0.3196) Grad: 115652.3594  LR: 0.00000317  
Epoch: [4][60/644] Elapsed 0m 16s (remain 2m 36s) Loss: 0.2340(0.3152) Grad: 179772.2500  LR: 0.00000297  
Epoch: [4][80/644] Elapsed 0m 21s (remain 2m 27s) Loss: 0.4506(0.3185) Grad: 131153.3125  LR: 0.00000278  
Epoch: [4][100/644] Elapsed 0m 27s (remain 2m 27s) Loss: 0.3202(0.3204) Grad: 118387.9609  LR: 0.00000260  
Epoch: [4][120/644] Elapsed 0m 32s (remain 2m 20s) Loss: 0.3426(0.3217) Grad: 177536.8750  LR: 0.00000242  
Epoch: [4][140/644] Elapsed 0m 37s (remain 2m 12s) Loss: 0.4156(0.3230) Grad: 170061.6562  LR: 0.00000225  
EVAL: [0/126] Loss: 0.8963(0.8963) 
EVAL: [20/126] Loss: 0.5494(0.6150) 
EVAL: [40/126] Loss: 0.6593(0.619

Epoch 4 - avg_train_loss: 0.3216  avg_val_loss: 0.6399
INFO:__main__:Epoch 4 - avg_train_loss: 0.3216  avg_val_loss: 0.6399
Epoch 4 - Score: 0.6577  Scores: [0.5079750266077823, 0.8075207334421656]
INFO:__main__:Epoch 4 - Score: 0.6577  Scores: [0.5079750266077823, 0.8075207334421656]


Epoch: [4][160/644] Elapsed 1m 32s (remain 4m 36s) Loss: 0.3037(0.3202) Grad: 151549.5156  LR: 0.00000208  
Epoch: [4][180/644] Elapsed 1m 37s (remain 4m 8s) Loss: 0.2462(0.3199) Grad: 116890.8125  LR: 0.00000192  
Epoch: [4][200/644] Elapsed 1m 42s (remain 3m 45s) Loss: 0.3508(0.3212) Grad: 255915.4062  LR: 0.00000176  
Epoch: [4][220/644] Elapsed 1m 48s (remain 3m 27s) Loss: 0.3785(0.3232) Grad: 129130.0703  LR: 0.00000161  
Epoch: [4][240/644] Elapsed 1m 53s (remain 3m 9s) Loss: 0.3577(0.3247) Grad: 171348.3438  LR: 0.00000147  
Epoch: [4][260/644] Elapsed 1m 58s (remain 2m 53s) Loss: 0.2718(0.3246) Grad: 206861.1719  LR: 0.00000133  
Epoch: [4][280/644] Elapsed 2m 4s (remain 2m 40s) Loss: 0.2985(0.3237) Grad: 147276.4219  LR: 0.00000120  
Epoch: [4][300/644] Elapsed 2m 9s (remain 2m 27s) Loss: 0.3250(0.3245) Grad: 253012.9688  LR: 0.00000107  
EVAL: [0/126] Loss: 0.9068(0.9068) 
EVAL: [20/126] Loss: 0.5626(0.6250) 
EVAL: [40/126] Loss: 0.6739(0.6304) 
EVAL: [60/126] Loss: 0.6191(0.

Epoch 4 - avg_train_loss: 0.3245  avg_val_loss: 0.6503
INFO:__main__:Epoch 4 - avg_train_loss: 0.3245  avg_val_loss: 0.6503
Epoch 4 - Score: 0.6682  Scores: [0.5079637423313558, 0.8284540324753071]
INFO:__main__:Epoch 4 - Score: 0.6682  Scores: [0.5079637423313558, 0.8284540324753071]


EVAL: [125/126] Loss: 0.3276(0.6503) 
Epoch: [4][320/644] Elapsed 3m 4s (remain 3m 5s) Loss: 0.2494(0.3232) Grad: 144379.3125  LR: 0.00000095  
Epoch: [4][340/644] Elapsed 3m 9s (remain 2m 48s) Loss: 0.2118(0.3230) Grad: 167812.6406  LR: 0.00000084  
Epoch: [4][360/644] Elapsed 3m 14s (remain 2m 32s) Loss: 0.2608(0.3228) Grad: 99971.0547  LR: 0.00000073  
Epoch: [4][380/644] Elapsed 3m 20s (remain 2m 18s) Loss: 0.3001(0.3226) Grad: 118594.2188  LR: 0.00000064  
Epoch: [4][400/644] Elapsed 3m 25s (remain 2m 4s) Loss: 0.4296(0.3219) Grad: 248918.2344  LR: 0.00000054  
Epoch: [4][420/644] Elapsed 3m 30s (remain 1m 51s) Loss: 0.3368(0.3202) Grad: 170361.7656  LR: 0.00000046  
Epoch: [4][440/644] Elapsed 3m 36s (remain 1m 39s) Loss: 0.3443(0.3194) Grad: 146586.6719  LR: 0.00000038  
EVAL: [0/126] Loss: 0.8950(0.8950) 
EVAL: [20/126] Loss: 0.5474(0.6145) 
EVAL: [40/126] Loss: 0.6652(0.6197) 
EVAL: [60/126] Loss: 0.6082(0.6220) 
EVAL: [80/126] Loss: 0.6790(0.6318) 
EVAL: [100/126] Loss: 0.556

Epoch 4 - avg_train_loss: 0.3202  avg_val_loss: 0.6394
INFO:__main__:Epoch 4 - avg_train_loss: 0.3202  avg_val_loss: 0.6394
Epoch 4 - Score: 0.6569  Scores: [0.5058966803104886, 0.8078853958786153]
INFO:__main__:Epoch 4 - Score: 0.6569  Scores: [0.5058966803104886, 0.8078853958786153]


EVAL: [125/126] Loss: 0.3219(0.6394) 
Epoch: [4][460/644] Elapsed 4m 30s (remain 1m 47s) Loss: 0.4470(0.3197) Grad: 77983.2500  LR: 0.00000031  
Epoch: [4][480/644] Elapsed 4m 35s (remain 1m 33s) Loss: 0.3271(0.3185) Grad: 87095.3906  LR: 0.00000025  
Epoch: [4][500/644] Elapsed 4m 42s (remain 1m 20s) Loss: 0.2740(0.3185) Grad: 105069.0391  LR: 0.00000019  
Epoch: [4][520/644] Elapsed 4m 48s (remain 1m 8s) Loss: 0.3510(0.3177) Grad: 57797.2695  LR: 0.00000014  
Epoch: [4][540/644] Elapsed 4m 54s (remain 0m 56s) Loss: 0.2567(0.3170) Grad: 104474.2734  LR: 0.00000010  
Epoch: [4][560/644] Elapsed 4m 59s (remain 0m 44s) Loss: 0.3681(0.3170) Grad: 84193.3438  LR: 0.00000007  
Epoch: [4][580/644] Elapsed 5m 4s (remain 0m 33s) Loss: 0.4419(0.3176) Grad: 112914.7344  LR: 0.00000004  
Epoch: [4][600/644] Elapsed 5m 9s (remain 0m 22s) Loss: 0.4620(0.3171) Grad: 62149.9922  LR: 0.00000002  
EVAL: [0/126] Loss: 0.8981(0.8981) 
EVAL: [20/126] Loss: 0.5546(0.6229) 
EVAL: [40/126] Loss: 0.6700(0.626

Epoch 4 - avg_train_loss: 0.3171  avg_val_loss: 0.6471
INFO:__main__:Epoch 4 - avg_train_loss: 0.3171  avg_val_loss: 0.6471
Epoch 4 - Score: 0.6649  Scores: [0.5034612065339279, 0.8263858019754161]
INFO:__main__:Epoch 4 - Score: 0.6649  Scores: [0.5034612065339279, 0.8263858019754161]


Epoch: [4][620/644] Elapsed 6m 4s (remain 0m 13s) Loss: 0.4378(0.3179) Grad: 103890.6719  LR: 0.00000001  
Epoch: [4][640/644] Elapsed 6m 9s (remain 0m 1s) Loss: 0.3585(0.3181) Grad: 98777.0938  LR: 0.00000000  
Epoch: [4][643/644] Elapsed 6m 10s (remain 0m 0s) Loss: 0.2625(0.3181) Grad: 57529.7305  LR: 0.00000000  
EVAL: [0/126] Loss: 0.8980(0.8980) 
EVAL: [20/126] Loss: 0.5543(0.6226) 
EVAL: [40/126] Loss: 0.6697(0.6262) 
EVAL: [60/126] Loss: 0.6178(0.6292) 
EVAL: [80/126] Loss: 0.6927(0.6393) 
EVAL: [100/126] Loss: 0.5632(0.6467) 
EVAL: [120/126] Loss: 0.6905(0.6434) 
EVAL: [125/126] Loss: 0.3305(0.6468) 


Epoch 4 - avg_train_loss: 0.3181  avg_val_loss: 0.6468
INFO:__main__:Epoch 4 - avg_train_loss: 0.3181  avg_val_loss: 0.6468
Epoch 4 - Score: 0.6646  Scores: [0.5034708762384914, 0.8258184180747492]
INFO:__main__:Epoch 4 - Score: 0.6646  Scores: [0.5034708762384914, 0.8258184180747492]
Score: 0.6240  Scores: [0.500910605875325, 0.7470747147430271]
INFO:__main__:Score: 0.6240  Scores: [0.500910605875325, 0.7470747147430271]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropou

Epoch: [1][0/646] Elapsed 0m 0s (remain 5m 44s) Loss: 1.0962(1.0962) Grad: 47916.9062  LR: 0.00000008  
Epoch: [1][20/646] Elapsed 0m 6s (remain 3m 26s) Loss: 1.2837(1.0538) Grad: 78961.2891  LR: 0.00000163  
Epoch: [1][40/646] Elapsed 0m 12s (remain 3m 0s) Loss: 1.6603(1.0511) Grad: 98001.7422  LR: 0.00000318  
Epoch: [1][60/646] Elapsed 0m 17s (remain 2m 51s) Loss: 0.8157(1.0731) Grad: 33516.6680  LR: 0.00000473  
Epoch: [1][80/646] Elapsed 0m 24s (remain 2m 47s) Loss: 1.2855(1.0652) Grad: 88941.4453  LR: 0.00000628  
Epoch: [1][100/646] Elapsed 0m 28s (remain 2m 35s) Loss: 0.7740(1.0236) Grad: 142109.5781  LR: 0.00000783  
Epoch: [1][120/646] Elapsed 0m 34s (remain 2m 30s) Loss: 1.0339(0.9793) Grad: 126441.9297  LR: 0.00000938  
Epoch: [1][140/646] Elapsed 0m 40s (remain 2m 26s) Loss: 0.7559(0.9359) Grad: 145182.1250  LR: 0.00001093  
EVAL: [0/125] Loss: 0.8294(0.8294) 
EVAL: [20/125] Loss: 0.6167(0.6496) 
EVAL: [40/125] Loss: 0.7366(0.6669) 
EVAL: [60/125] Loss: 0.6140(0.6492) 
EVA

Epoch 1 - Save Best Score: 0.6557 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6557 Model
Epoch 1 - avg_train_loss: 0.9148  avg_val_loss: 0.6446
INFO:__main__:Epoch 1 - avg_train_loss: 0.9148  avg_val_loss: 0.6446
Epoch 1 - Score: 0.6557  Scores: [0.5886382048350085, 0.7227743773237063]
INFO:__main__:Epoch 1 - Score: 0.6557  Scores: [0.5886382048350085, 0.7227743773237063]


Epoch: [1][160/646] Elapsed 1m 37s (remain 4m 52s) Loss: 0.5670(0.8991) Grad: 152394.2812  LR: 0.00001248  
Epoch: [1][180/646] Elapsed 1m 43s (remain 4m 26s) Loss: 0.6746(0.8715) Grad: 70379.7891  LR: 0.00001403  
Epoch: [1][200/646] Elapsed 1m 50s (remain 4m 3s) Loss: 0.5211(0.8421) Grad: 80435.0781  LR: 0.00001558  
Epoch: [1][220/646] Elapsed 1m 57s (remain 3m 45s) Loss: 0.5829(0.8144) Grad: 108135.8672  LR: 0.00001713  
Epoch: [1][240/646] Elapsed 2m 4s (remain 3m 29s) Loss: 0.6666(0.7978) Grad: 76557.0781  LR: 0.00001868  
Epoch: [1][260/646] Elapsed 2m 10s (remain 3m 12s) Loss: 0.4963(0.7865) Grad: 55098.8203  LR: 0.00002000  
Epoch: [1][280/646] Elapsed 2m 17s (remain 2m 58s) Loss: 0.6697(0.7725) Grad: 71810.0781  LR: 0.00002000  
Epoch: [1][300/646] Elapsed 2m 23s (remain 2m 43s) Loss: 0.5706(0.7552) Grad: 125539.5469  LR: 0.00001998  
EVAL: [0/125] Loss: 0.8444(0.8444) 
EVAL: [20/125] Loss: 0.5404(0.6135) 
EVAL: [40/125] Loss: 0.6112(0.6184) 
EVAL: [60/125] Loss: 0.5469(0.604

Epoch 1 - Save Best Score: 0.6121 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6121 Model
Epoch 1 - avg_train_loss: 0.7552  avg_val_loss: 0.6007
INFO:__main__:Epoch 1 - avg_train_loss: 0.7552  avg_val_loss: 0.6007
Epoch 1 - Score: 0.6121  Scores: [0.624001513153704, 0.6001852025165643]
INFO:__main__:Epoch 1 - Score: 0.6121  Scores: [0.624001513153704, 0.6001852025165643]


Epoch: [1][320/646] Elapsed 3m 19s (remain 3m 21s) Loss: 0.5761(0.7468) Grad: 60163.6094  LR: 0.00001996  
Epoch: [1][340/646] Elapsed 3m 27s (remain 3m 5s) Loss: 0.2853(0.7335) Grad: 81945.2500  LR: 0.00001994  
Epoch: [1][360/646] Elapsed 3m 34s (remain 2m 49s) Loss: 0.8642(0.7279) Grad: 142182.9062  LR: 0.00001990  
Epoch: [1][380/646] Elapsed 3m 42s (remain 2m 34s) Loss: 0.3910(0.7206) Grad: 95507.7812  LR: 0.00001986  
Epoch: [1][400/646] Elapsed 3m 48s (remain 2m 19s) Loss: 0.5011(0.7131) Grad: 77260.2734  LR: 0.00001981  
Epoch: [1][420/646] Elapsed 3m 54s (remain 2m 5s) Loss: 0.6568(0.7066) Grad: 68078.5391  LR: 0.00001976  
Epoch: [1][440/646] Elapsed 4m 0s (remain 1m 51s) Loss: 0.4703(0.7006) Grad: 89752.0078  LR: 0.00001970  
EVAL: [0/125] Loss: 0.7205(0.7205) 
EVAL: [20/125] Loss: 0.5894(0.5125) 
EVAL: [40/125] Loss: 0.5855(0.5590) 
EVAL: [60/125] Loss: 0.5989(0.5411) 
EVAL: [80/125] Loss: 0.4057(0.5329) 
EVAL: [100/125] Loss: 0.6081(0.5212) 
EVAL: [120/125] Loss: 0.4465(0.

Epoch 1 - Save Best Score: 0.5374 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5374 Model


EVAL: [124/125] Loss: 0.5282(0.5239) 


Epoch 1 - avg_train_loss: 0.6972  avg_val_loss: 0.5239
INFO:__main__:Epoch 1 - avg_train_loss: 0.6972  avg_val_loss: 0.5239
Epoch 1 - Score: 0.5374  Scores: [0.455906037936632, 0.618852152160471]
INFO:__main__:Epoch 1 - Score: 0.5374  Scores: [0.455906037936632, 0.618852152160471]


Epoch: [1][460/646] Elapsed 5m 0s (remain 2m 0s) Loss: 0.4948(0.6941) Grad: 94346.6172  LR: 0.00001963  
Epoch: [1][480/646] Elapsed 5m 6s (remain 1m 45s) Loss: 0.7969(0.6880) Grad: 72203.4453  LR: 0.00001955  
Epoch: [1][500/646] Elapsed 5m 13s (remain 1m 30s) Loss: 0.5343(0.6806) Grad: 91829.5000  LR: 0.00001947  
Epoch: [1][520/646] Elapsed 5m 20s (remain 1m 16s) Loss: 0.5269(0.6745) Grad: 141303.1719  LR: 0.00001938  
Epoch: [1][540/646] Elapsed 5m 27s (remain 1m 3s) Loss: 0.5199(0.6682) Grad: 53925.6016  LR: 0.00001928  
Epoch: [1][560/646] Elapsed 5m 34s (remain 0m 50s) Loss: 0.6751(0.6625) Grad: 70926.7969  LR: 0.00001917  
Epoch: [1][580/646] Elapsed 5m 39s (remain 0m 37s) Loss: 0.4018(0.6588) Grad: 51239.4766  LR: 0.00001906  
Epoch: [1][600/646] Elapsed 5m 44s (remain 0m 25s) Loss: 0.6027(0.6540) Grad: 77705.4531  LR: 0.00001895  
EVAL: [0/125] Loss: 0.7392(0.7392) 
EVAL: [20/125] Loss: 0.4881(0.5187) 
EVAL: [40/125] Loss: 0.5481(0.5412) 
EVAL: [60/125] Loss: 0.5222(0.5249) 


Epoch 1 - Save Best Score: 0.5281 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5281 Model


EVAL: [124/125] Loss: 0.6019(0.5167) 


Epoch 1 - avg_train_loss: 0.6540  avg_val_loss: 0.5167
INFO:__main__:Epoch 1 - avg_train_loss: 0.6540  avg_val_loss: 0.5167
Epoch 1 - Score: 0.5281  Scores: [0.4885089334396162, 0.567701852213978]
INFO:__main__:Epoch 1 - Score: 0.5281  Scores: [0.4885089334396162, 0.567701852213978]


Epoch: [1][620/646] Elapsed 6m 43s (remain 0m 16s) Loss: 0.4883(0.6492) Grad: 72952.7734  LR: 0.00001882  
Epoch: [1][640/646] Elapsed 6m 49s (remain 0m 3s) Loss: 0.4744(0.6442) Grad: 53018.6328  LR: 0.00001869  
Epoch: [1][645/646] Elapsed 6m 51s (remain 0m 0s) Loss: 0.4116(0.6433) Grad: 86240.8828  LR: 0.00001866  
EVAL: [0/125] Loss: 0.6464(0.6464) 
EVAL: [20/125] Loss: 0.5004(0.4887) 
EVAL: [40/125] Loss: 0.5415(0.5196) 
EVAL: [60/125] Loss: 0.5628(0.5045) 
EVAL: [80/125] Loss: 0.3900(0.5010) 
EVAL: [100/125] Loss: 0.5763(0.4912) 
EVAL: [120/125] Loss: 0.4403(0.4947) 


Epoch 1 - Save Best Score: 0.5047 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5047 Model


EVAL: [124/125] Loss: 0.5380(0.4937) 


Epoch 1 - avg_train_loss: 0.6433  avg_val_loss: 0.4937
INFO:__main__:Epoch 1 - avg_train_loss: 0.6433  avg_val_loss: 0.4937
Epoch 1 - Score: 0.5047  Scores: [0.4467090605459824, 0.5626574941705766]
INFO:__main__:Epoch 1 - Score: 0.5047  Scores: [0.4467090605459824, 0.5626574941705766]


Epoch: [2][0/646] Elapsed 0m 0s (remain 5m 53s) Loss: 0.4394(0.4394) Grad: 358165.5938  LR: 0.00001865  
Epoch: [2][20/646] Elapsed 0m 5s (remain 2m 39s) Loss: 0.4013(0.4735) Grad: 170311.3750  LR: 0.00001851  
Epoch: [2][40/646] Elapsed 0m 11s (remain 2m 56s) Loss: 0.6144(0.4591) Grad: 262518.2188  LR: 0.00001837  
Epoch: [2][60/646] Elapsed 0m 19s (remain 3m 5s) Loss: 0.4490(0.4593) Grad: 258470.2500  LR: 0.00001822  
Epoch: [2][80/646] Elapsed 0m 25s (remain 2m 59s) Loss: 0.2344(0.4690) Grad: 192557.5312  LR: 0.00001806  
Epoch: [2][100/646] Elapsed 0m 33s (remain 2m 59s) Loss: 0.2629(0.4664) Grad: 378235.7188  LR: 0.00001790  
Epoch: [2][120/646] Elapsed 0m 38s (remain 2m 47s) Loss: 0.4292(0.4573) Grad: 488521.1250  LR: 0.00001773  
Epoch: [2][140/646] Elapsed 0m 44s (remain 2m 39s) Loss: 0.4581(0.4592) Grad: 93417.2656  LR: 0.00001755  
EVAL: [0/125] Loss: 0.6097(0.6097) 
EVAL: [20/125] Loss: 0.4644(0.4895) 
EVAL: [40/125] Loss: 0.5369(0.5074) 
EVAL: [60/125] Loss: 0.5068(0.4936) 

Epoch 2 - Save Best Score: 0.4969 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4969 Model
Epoch 2 - avg_train_loss: 0.4561  avg_val_loss: 0.4878
INFO:__main__:Epoch 2 - avg_train_loss: 0.4561  avg_val_loss: 0.4878
Epoch 2 - Score: 0.4969  Scores: [0.4474487113858622, 0.5462712075963629]
INFO:__main__:Epoch 2 - Score: 0.4969  Scores: [0.4474487113858622, 0.5462712075963629]


Epoch: [2][160/646] Elapsed 1m 40s (remain 5m 3s) Loss: 0.3386(0.4542) Grad: 100419.2500  LR: 0.00001737  
Epoch: [2][180/646] Elapsed 1m 46s (remain 4m 33s) Loss: 0.4023(0.4575) Grad: 172166.9375  LR: 0.00001719  
Epoch: [2][200/646] Elapsed 1m 53s (remain 4m 11s) Loss: 0.4606(0.4608) Grad: 153383.1719  LR: 0.00001700  
Epoch: [2][220/646] Elapsed 1m 59s (remain 3m 49s) Loss: 0.4814(0.4627) Grad: 179743.8438  LR: 0.00001680  
Epoch: [2][240/646] Elapsed 2m 6s (remain 3m 33s) Loss: 0.6661(0.4648) Grad: 266664.5312  LR: 0.00001660  
Epoch: [2][260/646] Elapsed 2m 12s (remain 3m 15s) Loss: 0.4795(0.4631) Grad: 166480.0781  LR: 0.00001640  
Epoch: [2][280/646] Elapsed 2m 20s (remain 3m 1s) Loss: 0.4511(0.4625) Grad: 213865.6094  LR: 0.00001619  
Epoch: [2][300/646] Elapsed 2m 26s (remain 2m 47s) Loss: 0.7329(0.4608) Grad: 115206.7969  LR: 0.00001597  
EVAL: [0/125] Loss: 0.6938(0.6938) 
EVAL: [20/125] Loss: 0.5205(0.4876) 
EVAL: [40/125] Loss: 0.4754(0.5124) 
EVAL: [60/125] Loss: 0.5673(0

Epoch 2 - avg_train_loss: 0.4608  avg_val_loss: 0.4920
INFO:__main__:Epoch 2 - avg_train_loss: 0.4608  avg_val_loss: 0.4920
Epoch 2 - Score: 0.5038  Scores: [0.4521954623152759, 0.5553484396229437]
INFO:__main__:Epoch 2 - Score: 0.5038  Scores: [0.4521954623152759, 0.5553484396229437]


EVAL: [124/125] Loss: 0.5222(0.4920) 
Epoch: [2][320/646] Elapsed 3m 14s (remain 3m 17s) Loss: 0.4794(0.4610) Grad: 161034.2969  LR: 0.00001575  
Epoch: [2][340/646] Elapsed 3m 19s (remain 2m 58s) Loss: 0.6256(0.4620) Grad: 125263.1406  LR: 0.00001553  
Epoch: [2][360/646] Elapsed 3m 24s (remain 2m 41s) Loss: 0.4242(0.4626) Grad: 145416.9219  LR: 0.00001530  
Epoch: [2][380/646] Elapsed 3m 30s (remain 2m 26s) Loss: 0.4823(0.4635) Grad: 130756.7422  LR: 0.00001507  
Epoch: [2][400/646] Elapsed 3m 36s (remain 2m 12s) Loss: 0.6108(0.4637) Grad: 243166.6250  LR: 0.00001484  
Epoch: [2][420/646] Elapsed 3m 42s (remain 1m 58s) Loss: 0.4186(0.4634) Grad: 172509.4531  LR: 0.00001460  
Epoch: [2][440/646] Elapsed 3m 48s (remain 1m 46s) Loss: 0.4685(0.4662) Grad: 159485.3125  LR: 0.00001436  
EVAL: [0/125] Loss: 0.6551(0.6551) 
EVAL: [20/125] Loss: 0.5057(0.4912) 
EVAL: [40/125] Loss: 0.5048(0.5188) 
EVAL: [60/125] Loss: 0.5510(0.5057) 
EVAL: [80/125] Loss: 0.3936(0.4997) 
EVAL: [100/125] Loss: 

Epoch 2 - avg_train_loss: 0.4668  avg_val_loss: 0.4949
INFO:__main__:Epoch 2 - avg_train_loss: 0.4668  avg_val_loss: 0.4949
Epoch 2 - Score: 0.5047  Scores: [0.4294810656480067, 0.5798986449598834]
INFO:__main__:Epoch 2 - Score: 0.5047  Scores: [0.4294810656480067, 0.5798986449598834]


Epoch: [2][460/646] Elapsed 4m 36s (remain 1m 50s) Loss: 0.5381(0.4687) Grad: 264298.3438  LR: 0.00001412  
Epoch: [2][480/646] Elapsed 4m 41s (remain 1m 36s) Loss: 0.3525(0.4669) Grad: 117266.2969  LR: 0.00001387  
Epoch: [2][500/646] Elapsed 4m 47s (remain 1m 23s) Loss: 0.5017(0.4682) Grad: 172950.8438  LR: 0.00001362  
Epoch: [2][520/646] Elapsed 4m 53s (remain 1m 10s) Loss: 0.3333(0.4670) Grad: 111377.5781  LR: 0.00001336  
Epoch: [2][540/646] Elapsed 4m 58s (remain 0m 57s) Loss: 0.5052(0.4656) Grad: 213337.7344  LR: 0.00001311  
Epoch: [2][560/646] Elapsed 5m 4s (remain 0m 46s) Loss: 0.3842(0.4646) Grad: 90014.6719  LR: 0.00001285  
Epoch: [2][580/646] Elapsed 5m 9s (remain 0m 34s) Loss: 0.4134(0.4639) Grad: 110919.6016  LR: 0.00001259  
Epoch: [2][600/646] Elapsed 5m 14s (remain 0m 23s) Loss: 0.4933(0.4618) Grad: 152137.9844  LR: 0.00001233  
EVAL: [0/125] Loss: 0.5970(0.5970) 
EVAL: [20/125] Loss: 0.4598(0.4584) 
EVAL: [40/125] Loss: 0.4953(0.4790) 
EVAL: [60/125] Loss: 0.5287(0

Epoch 2 - Save Best Score: 0.4680 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4680 Model


EVAL: [124/125] Loss: 0.4585(0.4574) 


Epoch 2 - avg_train_loss: 0.4618  avg_val_loss: 0.4574
INFO:__main__:Epoch 2 - avg_train_loss: 0.4618  avg_val_loss: 0.4574
Epoch 2 - Score: 0.4680  Scores: [0.42087563608646067, 0.5151692791295484]
INFO:__main__:Epoch 2 - Score: 0.4680  Scores: [0.42087563608646067, 0.5151692791295484]


Epoch: [2][620/646] Elapsed 6m 11s (remain 0m 14s) Loss: 0.8022(0.4620) Grad: 227595.3438  LR: 0.00001207  
Epoch: [2][640/646] Elapsed 6m 16s (remain 0m 2s) Loss: 0.4345(0.4608) Grad: 130811.2812  LR: 0.00001180  
Epoch: [2][645/646] Elapsed 6m 18s (remain 0m 0s) Loss: 0.4861(0.4616) Grad: 133433.6250  LR: 0.00001173  
EVAL: [0/125] Loss: 0.7387(0.7387) 
EVAL: [20/125] Loss: 0.5762(0.5260) 
EVAL: [40/125] Loss: 0.6019(0.5632) 
EVAL: [60/125] Loss: 0.6094(0.5524) 
EVAL: [80/125] Loss: 0.4210(0.5438) 
EVAL: [100/125] Loss: 0.6077(0.5360) 
EVAL: [120/125] Loss: 0.4630(0.5412) 
EVAL: [124/125] Loss: 0.5478(0.5397) 


Epoch 2 - avg_train_loss: 0.4616  avg_val_loss: 0.5397
INFO:__main__:Epoch 2 - avg_train_loss: 0.4616  avg_val_loss: 0.5397
Epoch 2 - Score: 0.5518  Scores: [0.450931078948441, 0.6525706505960777]
INFO:__main__:Epoch 2 - Score: 0.5518  Scores: [0.450931078948441, 0.6525706505960777]


Epoch: [3][0/646] Elapsed 0m 1s (remain 10m 47s) Loss: 0.5518(0.5518) Grad: 240411.9375  LR: 0.00001172  
Epoch: [3][20/646] Elapsed 0m 6s (remain 3m 21s) Loss: 0.3406(0.3847) Grad: 212907.9375  LR: 0.00001145  
Epoch: [3][40/646] Elapsed 0m 12s (remain 3m 0s) Loss: 0.5748(0.4096) Grad: 171135.1250  LR: 0.00001119  
Epoch: [3][60/646] Elapsed 0m 18s (remain 3m 1s) Loss: 0.4619(0.4057) Grad: 138248.3438  LR: 0.00001092  
Epoch: [3][80/646] Elapsed 0m 24s (remain 2m 54s) Loss: 0.4946(0.4163) Grad: 202905.7344  LR: 0.00001065  
Epoch: [3][100/646] Elapsed 0m 30s (remain 2m 46s) Loss: 0.3675(0.4131) Grad: 141487.9844  LR: 0.00001038  
Epoch: [3][120/646] Elapsed 0m 37s (remain 2m 41s) Loss: 0.4888(0.4116) Grad: 167706.4375  LR: 0.00001011  
Epoch: [3][140/646] Elapsed 0m 42s (remain 2m 32s) Loss: 0.4381(0.4098) Grad: 186384.1094  LR: 0.00000984  
EVAL: [0/125] Loss: 0.6695(0.6695) 
EVAL: [20/125] Loss: 0.4636(0.4920) 
EVAL: [40/125] Loss: 0.4786(0.5018) 
EVAL: [60/125] Loss: 0.4962(0.4942)

Epoch 3 - avg_train_loss: 0.4055  avg_val_loss: 0.4866
INFO:__main__:Epoch 3 - avg_train_loss: 0.4055  avg_val_loss: 0.4866
Epoch 3 - Score: 0.4966  Scores: [0.46829856461866387, 0.5248880434056509]
INFO:__main__:Epoch 3 - Score: 0.4966  Scores: [0.46829856461866387, 0.5248880434056509]


EVAL: [124/125] Loss: 0.4931(0.4866) 
Epoch: [3][160/646] Elapsed 1m 29s (remain 4m 30s) Loss: 0.3226(0.4020) Grad: 164429.3594  LR: 0.00000957  
Epoch: [3][180/646] Elapsed 1m 35s (remain 4m 4s) Loss: 0.4227(0.4011) Grad: 206457.5000  LR: 0.00000930  
Epoch: [3][200/646] Elapsed 1m 41s (remain 3m 44s) Loss: 0.4578(0.4013) Grad: 223139.0938  LR: 0.00000903  
Epoch: [3][220/646] Elapsed 1m 46s (remain 3m 24s) Loss: 0.2774(0.3978) Grad: 104725.4219  LR: 0.00000876  
Epoch: [3][240/646] Elapsed 1m 51s (remain 3m 7s) Loss: 0.5042(0.3985) Grad: 145764.5312  LR: 0.00000849  
Epoch: [3][260/646] Elapsed 1m 57s (remain 2m 53s) Loss: 0.5632(0.3972) Grad: 129514.9688  LR: 0.00000823  
Epoch: [3][280/646] Elapsed 2m 2s (remain 2m 38s) Loss: 0.4646(0.3960) Grad: 284230.7188  LR: 0.00000796  
Epoch: [3][300/646] Elapsed 2m 8s (remain 2m 26s) Loss: 0.3709(0.3955) Grad: 90721.6641  LR: 0.00000770  
EVAL: [0/125] Loss: 0.6669(0.6669) 
EVAL: [20/125] Loss: 0.4571(0.4804) 
EVAL: [40/125] Loss: 0.4736(0.

Epoch 3 - avg_train_loss: 0.3955  avg_val_loss: 0.4772
INFO:__main__:Epoch 3 - avg_train_loss: 0.3955  avg_val_loss: 0.4772
Epoch 3 - Score: 0.4879  Scores: [0.45439314290966437, 0.5213998833490485]
INFO:__main__:Epoch 3 - Score: 0.4879  Scores: [0.45439314290966437, 0.5213998833490485]


EVAL: [124/125] Loss: 0.5141(0.4772) 
Epoch: [3][320/646] Elapsed 2m 56s (remain 2m 58s) Loss: 0.3538(0.3954) Grad: 114272.5391  LR: 0.00000744  
Epoch: [3][340/646] Elapsed 3m 1s (remain 2m 42s) Loss: 0.4535(0.3966) Grad: 144127.0000  LR: 0.00000718  
Epoch: [3][360/646] Elapsed 3m 6s (remain 2m 27s) Loss: 0.3164(0.3955) Grad: 121962.2109  LR: 0.00000692  
Epoch: [3][380/646] Elapsed 3m 12s (remain 2m 13s) Loss: 0.5833(0.3956) Grad: 211415.1875  LR: 0.00000666  
Epoch: [3][400/646] Elapsed 3m 18s (remain 2m 1s) Loss: 0.2830(0.3944) Grad: 135041.2188  LR: 0.00000641  
Epoch: [3][420/646] Elapsed 3m 23s (remain 1m 48s) Loss: 0.3746(0.3949) Grad: 108763.6719  LR: 0.00000616  
Epoch: [3][440/646] Elapsed 3m 29s (remain 1m 37s) Loss: 0.3837(0.3939) Grad: 115788.5859  LR: 0.00000591  
EVAL: [0/125] Loss: 0.6986(0.6986) 
EVAL: [20/125] Loss: 0.4650(0.4902) 
EVAL: [40/125] Loss: 0.4757(0.4985) 
EVAL: [60/125] Loss: 0.4602(0.4905) 
EVAL: [80/125] Loss: 0.3695(0.4893) 
EVAL: [100/125] Loss: 0.5

Epoch 3 - avg_train_loss: 0.3931  avg_val_loss: 0.4832
INFO:__main__:Epoch 3 - avg_train_loss: 0.3931  avg_val_loss: 0.4832
Epoch 3 - Score: 0.4940  Scores: [0.46297780883026296, 0.5249372436043055]
INFO:__main__:Epoch 3 - Score: 0.4940  Scores: [0.46297780883026296, 0.5249372436043055]


Epoch: [3][460/646] Elapsed 4m 18s (remain 1m 43s) Loss: 0.3092(0.3939) Grad: 116548.8516  LR: 0.00000566  
Epoch: [3][480/646] Elapsed 4m 25s (remain 1m 31s) Loss: 0.3653(0.3950) Grad: 151933.8906  LR: 0.00000542  
Epoch: [3][500/646] Elapsed 4m 30s (remain 1m 18s) Loss: 0.2543(0.3947) Grad: 192943.1406  LR: 0.00000518  
Epoch: [3][520/646] Elapsed 4m 36s (remain 1m 6s) Loss: 0.4637(0.3957) Grad: 143102.7344  LR: 0.00000495  
Epoch: [3][540/646] Elapsed 4m 42s (remain 0m 54s) Loss: 0.2579(0.3957) Grad: 155838.0938  LR: 0.00000472  
Epoch: [3][560/646] Elapsed 4m 47s (remain 0m 43s) Loss: 0.2925(0.3952) Grad: 161342.2344  LR: 0.00000449  
Epoch: [3][580/646] Elapsed 4m 53s (remain 0m 32s) Loss: 0.3934(0.3945) Grad: 108192.3438  LR: 0.00000427  
Epoch: [3][600/646] Elapsed 4m 58s (remain 0m 22s) Loss: 0.2906(0.3938) Grad: 110536.9922  LR: 0.00000405  
EVAL: [0/125] Loss: 0.6540(0.6540) 
EVAL: [20/125] Loss: 0.4663(0.4876) 
EVAL: [40/125] Loss: 0.4834(0.4908) 
EVAL: [60/125] Loss: 0.4405

Epoch 3 - avg_train_loss: 0.3938  avg_val_loss: 0.4818
INFO:__main__:Epoch 3 - avg_train_loss: 0.3938  avg_val_loss: 0.4818
Epoch 3 - Score: 0.4912  Scores: [0.4650794722202968, 0.5173539115245985]
INFO:__main__:Epoch 3 - Score: 0.4912  Scores: [0.4650794722202968, 0.5173539115245985]


Epoch: [3][620/646] Elapsed 5m 46s (remain 0m 13s) Loss: 0.3653(0.3927) Grad: 131454.5469  LR: 0.00000383  
Epoch: [3][640/646] Elapsed 5m 52s (remain 0m 2s) Loss: 0.3774(0.3924) Grad: 140182.8281  LR: 0.00000362  
Epoch: [3][645/646] Elapsed 5m 54s (remain 0m 0s) Loss: 0.4371(0.3924) Grad: 228142.0625  LR: 0.00000357  
EVAL: [0/125] Loss: 0.6598(0.6598) 
EVAL: [20/125] Loss: 0.4601(0.4697) 
EVAL: [40/125] Loss: 0.4631(0.4809) 
EVAL: [60/125] Loss: 0.4635(0.4753) 
EVAL: [80/125] Loss: 0.3742(0.4739) 
EVAL: [100/125] Loss: 0.5236(0.4638) 
EVAL: [120/125] Loss: 0.4479(0.4691) 


Epoch 3 - avg_train_loss: 0.3924  avg_val_loss: 0.4689
INFO:__main__:Epoch 3 - avg_train_loss: 0.3924  avg_val_loss: 0.4689
Epoch 3 - Score: 0.4790  Scores: [0.4476808844544515, 0.5103217582307262]
INFO:__main__:Epoch 3 - Score: 0.4790  Scores: [0.4476808844544515, 0.5103217582307262]


EVAL: [124/125] Loss: 0.4689(0.4689) 
Epoch: [4][0/646] Elapsed 0m 0s (remain 6m 50s) Loss: 0.3712(0.3712) Grad: 265871.8750  LR: 0.00000356  
Epoch: [4][20/646] Elapsed 0m 8s (remain 3m 58s) Loss: 0.4884(0.3354) Grad: 407046.7188  LR: 0.00000336  
Epoch: [4][40/646] Elapsed 0m 13s (remain 3m 13s) Loss: 0.3603(0.3332) Grad: 137864.5938  LR: 0.00000316  
Epoch: [4][60/646] Elapsed 0m 19s (remain 3m 2s) Loss: 0.3982(0.3371) Grad: 140777.2500  LR: 0.00000296  
Epoch: [4][80/646] Elapsed 0m 25s (remain 2m 55s) Loss: 0.2669(0.3365) Grad: 162075.2031  LR: 0.00000277  
Epoch: [4][100/646] Elapsed 0m 30s (remain 2m 43s) Loss: 0.2784(0.3309) Grad: 126510.3203  LR: 0.00000259  
Epoch: [4][120/646] Elapsed 0m 35s (remain 2m 32s) Loss: 0.2899(0.3321) Grad: 306173.0625  LR: 0.00000241  
Epoch: [4][140/646] Elapsed 0m 41s (remain 2m 27s) Loss: 0.3571(0.3311) Grad: 187321.2969  LR: 0.00000224  
EVAL: [0/125] Loss: 0.6465(0.6465) 
EVAL: [20/125] Loss: 0.4641(0.4708) 
EVAL: [40/125] Loss: 0.4643(0.4803

Epoch 4 - avg_train_loss: 0.3291  avg_val_loss: 0.4689
INFO:__main__:Epoch 4 - avg_train_loss: 0.3291  avg_val_loss: 0.4689
Epoch 4 - Score: 0.4792  Scores: [0.4460660499704673, 0.5122674560184581]
INFO:__main__:Epoch 4 - Score: 0.4792  Scores: [0.4460660499704673, 0.5122674560184581]


Epoch: [4][160/646] Elapsed 1m 29s (remain 4m 28s) Loss: 0.3420(0.3295) Grad: 241021.7344  LR: 0.00000207  
Epoch: [4][180/646] Elapsed 1m 34s (remain 4m 1s) Loss: 0.3374(0.3309) Grad: 202847.3906  LR: 0.00000191  
Epoch: [4][200/646] Elapsed 1m 39s (remain 3m 39s) Loss: 0.3659(0.3292) Grad: 232438.1562  LR: 0.00000175  
Epoch: [4][220/646] Elapsed 1m 45s (remain 3m 21s) Loss: 0.2917(0.3255) Grad: 180956.1094  LR: 0.00000160  
Epoch: [4][240/646] Elapsed 1m 50s (remain 3m 5s) Loss: 0.2843(0.3279) Grad: 158148.2344  LR: 0.00000146  
Epoch: [4][260/646] Elapsed 1m 56s (remain 2m 51s) Loss: 0.3276(0.3276) Grad: 214479.6562  LR: 0.00000132  
Epoch: [4][280/646] Elapsed 2m 3s (remain 2m 40s) Loss: 0.4298(0.3290) Grad: 331310.5000  LR: 0.00000119  
Epoch: [4][300/646] Elapsed 2m 9s (remain 2m 28s) Loss: 0.3300(0.3293) Grad: 176918.3281  LR: 0.00000107  
EVAL: [0/125] Loss: 0.6587(0.6587) 
EVAL: [20/125] Loss: 0.4624(0.4754) 
EVAL: [40/125] Loss: 0.4701(0.4833) 
EVAL: [60/125] Loss: 0.4579(0.

Epoch 4 - avg_train_loss: 0.3293  avg_val_loss: 0.4731
INFO:__main__:Epoch 4 - avg_train_loss: 0.3293  avg_val_loss: 0.4731
Epoch 4 - Score: 0.4831  Scores: [0.4510254137924777, 0.5152574774005733]
INFO:__main__:Epoch 4 - Score: 0.4831  Scores: [0.4510254137924777, 0.5152574774005733]


EVAL: [124/125] Loss: 0.4632(0.4731) 
Epoch: [4][320/646] Elapsed 2m 56s (remain 2m 59s) Loss: 0.2879(0.3292) Grad: 332253.7500  LR: 0.00000095  
Epoch: [4][340/646] Elapsed 3m 1s (remain 2m 42s) Loss: 0.3371(0.3288) Grad: 136939.1562  LR: 0.00000084  
Epoch: [4][360/646] Elapsed 3m 7s (remain 2m 28s) Loss: 0.2690(0.3273) Grad: 135702.6875  LR: 0.00000073  
Epoch: [4][380/646] Elapsed 3m 13s (remain 2m 14s) Loss: 0.2238(0.3286) Grad: 139253.4688  LR: 0.00000063  
Epoch: [4][400/646] Elapsed 3m 18s (remain 2m 1s) Loss: 0.3357(0.3290) Grad: 115266.4141  LR: 0.00000054  
Epoch: [4][420/646] Elapsed 3m 24s (remain 1m 49s) Loss: 0.3051(0.3285) Grad: 104681.0156  LR: 0.00000046  
Epoch: [4][440/646] Elapsed 3m 29s (remain 1m 37s) Loss: 0.2944(0.3284) Grad: 195250.5781  LR: 0.00000038  
EVAL: [0/125] Loss: 0.6512(0.6512) 
EVAL: [20/125] Loss: 0.4617(0.4771) 
EVAL: [40/125] Loss: 0.4750(0.4847) 
EVAL: [60/125] Loss: 0.4562(0.4799) 
EVAL: [80/125] Loss: 0.3848(0.4792) 
EVAL: [100/125] Loss: 0.5

Epoch 4 - avg_train_loss: 0.3283  avg_val_loss: 0.4746
INFO:__main__:Epoch 4 - avg_train_loss: 0.3283  avg_val_loss: 0.4746
Epoch 4 - Score: 0.4844  Scores: [0.45692892973597954, 0.5118266667464544]
INFO:__main__:Epoch 4 - Score: 0.4844  Scores: [0.45692892973597954, 0.5118266667464544]


EVAL: [124/125] Loss: 0.4664(0.4746) 
Epoch: [4][460/646] Elapsed 4m 17s (remain 1m 43s) Loss: 0.2348(0.3281) Grad: 201474.3594  LR: 0.00000031  
Epoch: [4][480/646] Elapsed 4m 22s (remain 1m 30s) Loss: 0.4347(0.3278) Grad: 293634.8438  LR: 0.00000025  
Epoch: [4][500/646] Elapsed 4m 28s (remain 1m 17s) Loss: 0.4025(0.3271) Grad: 153521.8594  LR: 0.00000019  
Epoch: [4][520/646] Elapsed 4m 34s (remain 1m 5s) Loss: 0.4736(0.3273) Grad: 163064.0156  LR: 0.00000014  
Epoch: [4][540/646] Elapsed 4m 40s (remain 0m 54s) Loss: 0.3252(0.3272) Grad: 145172.1562  LR: 0.00000010  
Epoch: [4][560/646] Elapsed 4m 46s (remain 0m 43s) Loss: 0.3205(0.3285) Grad: 226732.8906  LR: 0.00000007  
Epoch: [4][580/646] Elapsed 4m 51s (remain 0m 32s) Loss: 0.2345(0.3276) Grad: 188870.2500  LR: 0.00000004  
Epoch: [4][600/646] Elapsed 4m 57s (remain 0m 22s) Loss: 0.4551(0.3279) Grad: 144283.2344  LR: 0.00000002  
EVAL: [0/125] Loss: 0.6532(0.6532) 
EVAL: [20/125] Loss: 0.4590(0.4772) 
EVAL: [40/125] Loss: 0.471

Epoch 4 - avg_train_loss: 0.3279  avg_val_loss: 0.4738
INFO:__main__:Epoch 4 - avg_train_loss: 0.3279  avg_val_loss: 0.4738
Epoch 4 - Score: 0.4837  Scores: [0.4529382240131592, 0.5144553599667427]
INFO:__main__:Epoch 4 - Score: 0.4837  Scores: [0.4529382240131592, 0.5144553599667427]


EVAL: [124/125] Loss: 0.4667(0.4738) 
Epoch: [4][620/646] Elapsed 5m 46s (remain 0m 13s) Loss: 0.3257(0.3284) Grad: 187685.0938  LR: 0.00000001  
Epoch: [4][640/646] Elapsed 5m 51s (remain 0m 2s) Loss: 0.3813(0.3283) Grad: 153730.6719  LR: 0.00000000  
Epoch: [4][645/646] Elapsed 5m 52s (remain 0m 0s) Loss: 0.3118(0.3285) Grad: 138919.6875  LR: 0.00000000  
EVAL: [0/125] Loss: 0.6532(0.6532) 
EVAL: [20/125] Loss: 0.4590(0.4772) 
EVAL: [40/125] Loss: 0.4717(0.4839) 
EVAL: [60/125] Loss: 0.4552(0.4793) 
EVAL: [80/125] Loss: 0.3834(0.4787) 
EVAL: [100/125] Loss: 0.5536(0.4690) 
EVAL: [120/125] Loss: 0.4597(0.4740) 
EVAL: [124/125] Loss: 0.4667(0.4739) 


Epoch 4 - avg_train_loss: 0.3285  avg_val_loss: 0.4739
INFO:__main__:Epoch 4 - avg_train_loss: 0.3285  avg_val_loss: 0.4739
Epoch 4 - Score: 0.4837  Scores: [0.4529650366650152, 0.5144514909948328]
INFO:__main__:Epoch 4 - Score: 0.4837  Scores: [0.4529650366650152, 0.5144514909948328]
Score: 0.4680  Scores: [0.42087563608646067, 0.5151692791295484]
INFO:__main__:Score: 0.4680  Scores: [0.42087563608646067, 0.5151692791295484]
Score: 0.5334  Scores: [0.4532268327730023, 0.6136549960452997]
INFO:__main__:Score: 0.5334  Scores: [0.4532268327730023, 0.6136549960452997]
