# About this notebook
- This notebook is a modified version of the PyTorch pipeline from Y.Nakama's starter NLP notebook for the Feedback Prize 3 competition, available [here](https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train). Don't forget to upvote his work!
- Inference notebook is [here](https://www.kaggle.com/mohammad2012191/debertav3-pytorch-baseline-inference-cv-0-467)

In [None]:
!nvidia-smi

Sat Sep  2 09:11:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import runtime



# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:
    """Experiment configuration, read as class attributes throughout the notebook."""

    # --- experiment bookkeeping ---
    exp = 'exp050'                  # experiment id; becomes part of OUTPUT_DIR
    is_exp = False
    competition = 'FB3'
    debug = False                   # when True: fewer epochs/folds and a subsampled train set
    train = True
    seed = 42

    # --- Weights & Biases ---
    wandb = False                   # gate for the wandb login/init cell and per-step logging
    _wandb_kernel = 'nakama'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    gradient_checkpointing = True
    pooling = 'ConcatPooling'       # pooling layer selected inside CustomModel
    n_layers = 14                   # number of last hidden layers concatenated by ConcatPooling
    freeze = True                   # freeze the first `freeze_top_num_layer` encoder layers
    freeze_top_num_layer = 10
    reinit = False

    # --- data ---
    target_cols = ['content', 'wording']
    max_len = 512                   # replaced later by the longest tokenized training text + 2
    batch_size = 8
    num_workers = 4
    n_fold = 4
    trn_fold = [0, 1, 2, 3]

    # --- optimisation ---
    apex = True                     # enables torch.cuda.amp autocast / GradScaler
    epochs = 4
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    min_lr = 1e-6
    lr_weight_decay = 0.95
    eps = 1e-6
    betas = (0.9, 0.98)
    weight_decay = 0.01
    gradient_accumulation_steps = 1
    max_grad_norm = 1000

    # --- learning-rate schedule ---
    scheduler = 'cosine'            # ['linear', 'cosine']
    batch_scheduler = True          # step the scheduler after every optimizer step
    num_cycles = 0.5
    num_warmup_steps_rate = 0.1

    # --- adversarial weight perturbation (AWP) ---
    awp = False
    nth_awp_start_epoch = 3         # first (1-based) epoch in which AWP is applied
    adv_lr = 1e-4
    adv_eps = 1e-2

    # --- logging / checkpointing cadence ---
    print_freq = 20                 # print progress every `print_freq` batches
    eval_steps = 100
    save_strategy = 'step'


# Debug runs are shortened to two epochs on the first fold only.
if CFG.debug:
    CFG.epochs, CFG.trn_fold = 2, [0]

# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# All artifacts of this experiment (training log, saved tokenizer, ...) are written below this folder.
OUTPUT_DIR = f'/content/drive/MyDrive/Kaggle/outputs/{CFG.exp}/'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:

    import wandb

    try:
        # On Kaggle the API key is read from the notebook secret labelled "wandb_api".
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except Exception:
        # Catch Exception rather than a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate; any other failure falls back to an anonymous run.
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        """Return the attributes of ``f`` whose names do not start with ``__`` as a dict."""
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public',
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install -q transformers')
!pip install transformers==4.31.0
os.system('pip install -q tokenizers')
!pip install tokenizers==0.13.3
!pip install sentencepiece


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.31.0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.31.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers==4.31.0)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Computed with NumPy directly: the ``squared=False`` argument of
    ``sklearn.metrics.mean_squared_error`` used previously is deprecated and
    removed in newer scikit-learn releases.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with ground truth.
        y_preds: 2-D array-like of the same shape with predictions.

    Returns:
        Tuple ``(mcrmse_score, scores)`` where ``scores`` is the list of
        per-column RMSE values and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.shape != y_preds.shape:
        # Fail loudly instead of letting NumPy broadcast mismatched arrays.
        raise ValueError(
            f"y_trues and y_preds must have the same shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # RMSE of every target column (axis 0 runs over samples).
    rmse_per_column = np.sqrt(np.mean(np.square(y_trues - y_preds), axis=0))
    scores = rmse_per_column.tolist()
    mcrmse_score = float(np.mean(scores))
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Competition metric: returns ``(mcrmse_score, per_column_scores)`` as computed by ``MCRMSE``."""
    return MCRMSE(y_trues, y_preds)


def get_logger(filename=OUTPUT_DIR+'train'):
    """Create the notebook logger writing to the console and to ``<filename>.log``.

    Args:
        filename: Path of the log file without the ``.log`` extension.

    Returns:
        The ``logging.Logger`` named after this module, at INFO level, with
        exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger returns the same object on every call, so re-running this cell
    # used to stack another pair of handlers and duplicate every message.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    # The recorded output shows each message a second time in `INFO:__main__:` form,
    # i.e. it was also emitted through the root logger; stop that propagation.
    logger.propagate = False
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

# Data Loading

In [None]:
# ====================================================
# Data Loading
# ====================================================
# Folder on the mounted Drive that holds the competition CSV files.
input_path = '/content/drive/MyDrive/Kaggle/inputs/'

# Prompt tables and the sample submission.
prompt_train = pd.read_csv(input_path+'prompts_train.csv')
prompt_test = pd.read_csv(input_path+'prompts_test.csv')
submission = pd.read_csv(input_path+'sample_submission.csv')

# Summaries, left-joined on prompt_id so every row carries its prompt columns.
train = pd.read_csv(input_path+'summaries_train.csv').merge(prompt_train, how='left', on='prompt_id')
test = pd.read_csv(input_path+'summaries_test.csv').merge(prompt_test, how='left', on='prompt_id')

# Quick look at what was loaded.
print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (7165, 8)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...


test.shape: (4, 6)


Unnamed: 0,student_id,prompt_id,text,prompt_question,prompt_title,prompt_text
0,000000ffffff,abc123,Example text 1,Summarize...,Example Title 1,Heading\nText...
1,111111eeeeee,def789,Example text 2,Summarize...,Example Title 2,Heading\nText...
2,222222cccccc,abc123,Example text 3,Summarize...,Example Title 1,Heading\nText...
3,333333dddddd,def789,Example text 4,Summarize...,Example Title 2,Heading\nText...


submission.shape: (4, 3)


Unnamed: 0,student_id,content,wording
0,000000ffffff,0.0,0.0
1,111111eeeeee,0.0,0.0
2,222222cccccc,0.0,0.0
3,333333dddddd,0.0,0.0


In [None]:
# oof_df=pd.read_pickle(input_path+'oof_df.pkl')

In [None]:
# Model input = prompt question, a literal ' [SEP] ' marker, then the student summary.
# NOTE: this overwrites `text` in place, so running the cell twice prepends the question twice.
for frame in (train, test):
    frame['text'] = frame['prompt_question'] + ' [SEP] ' + frame['text']

#################################################
# Variant that also includes prompt_text
#################################################

# # Compute the length of the "text" column and store it in a new "length" column
# train['length'] = train['text'].apply(len)
# # Prepend the "length" value to the "text" column
# train['text'] = train['length'].astype(str) + '[SEP]' + train['prompt_question'] + '[SEP]' +train['prompt_title'] + 'summary(' + train['text'] +') [SEP] source of summary('+train['prompt_text']+')'

# # Compute the length of the "text" column and store it in a new "length" column
# test['length'] = test['text'].apply(len)
# # Prepend the "length" value to the "text" column
# test['text'] = test['length'].astype(str) + '[SEP]' + test['prompt_question'] + '[SEP]' +test['prompt_title'] + 'summary(' + test['text'] +') [SEP] source of summary('+test['prompt_text']+')'


# CV split

In [None]:
# ====================================================
# CV split
# ====================================================
# Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
#     train.loc[val_index, 'fold'] = int(n)
# One validation fold per prompt: all summaries of a prompt are held out together.
id2fold = {
    "814d6b": 0,
    "39c16e": 1,
    "3b9047": 2,
    "ebad26": 3,
}

fold_of_row = train["prompt_id"].map(id2fold)
rows_without_fold = fold_of_row.isna()
if rows_without_fold.any():
    # A prompt_id missing from id2fold maps to NaN; name it explicitly instead of
    # failing later inside the integer cast.
    unknown_ids = sorted(train.loc[rows_without_fold, "prompt_id"].astype(str).unique())
    raise ValueError(f"prompt_id values without a fold assignment in id2fold: {unknown_ids}")
train["fold"] = fold_of_row.astype(int)
display(train.groupby('fold').size())

fold
0    1103
1    2057
2    2009
3    1996
dtype: int64

In [None]:
if CFG.debug:
    # Show fold sizes, shrink the training set to 3000 random rows, then show them again.
    fold_sizes_full = train.groupby('fold').size()
    display(fold_sizes_full)
    train = train.sample(n=3000, random_state=0).reset_index(drop=True)
    fold_sizes_debug = train.groupby('fold').size()
    display(fold_sizes_debug)

# tokenizer

In [None]:
# Echo the configured backbone checkpoint name.
CFG.model

'microsoft/deberta-v3-large'

In [None]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer of the configured backbone, attach it to CFG so helpers that
# receive the config can reach it, and save a copy with the experiment outputs.
CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
tokenizer = CFG.tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Inspect one assembled model input (prompt question + ' [SEP] ' + summary).
train['text'].iloc[8]

'Summarize at least 3 elements of an ideal tragedy, as described by Aristotle. [SEP] 1 element of an ideal tragedy is that it should be arranged on a complex plan.  Another element of an ideal tragedy is that it should only have one main issue. The last element of an ideal tragedy is that it should have a double thread plot and an opposite catastrophe for both good and bad.'

In [None]:
# Encode a sample string
text = 'unnko'
encoded = tokenizer(text, return_tensors='pt')

# Map the ids back to token strings to see how the text was split
first_sequence_ids = encoded['input_ids'][0]
decoded_tokens = tokenizer.convert_ids_to_tokens(first_sequence_ids)
decoded_text = " ".join(decoded_tokens)

print(f"Original text: {text}")
print(f"Encoded: {encoded}")
print(f"Decoded text: {decoded_text}")

Original text: unnko
Encoded: {'input_ids': tensor([[   1, 1655,  673, 4712,    2]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
Decoded text: [CLS] ▁un n ko [SEP]


# Dataset

In [None]:
# ====================================================
# Define max_len
# ====================================================
# Token count of every training text (no special tokens), with a progress bar.
tk0 = tqdm(train['text'].fillna("").values, total=len(train))
lengths = [
    len(tokenizer(sample, add_special_tokens=False)['input_ids'])
    for sample in tk0
]
# The longest text plus the two special tokens sets the padding length.
CFG.max_len = max(lengths) + 2 # cls & sep

LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/7165 [00:00<?, ?it/s]

max_len: 854
INFO:__main__:max_len: 854


In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize one text into fixed-length ``torch.long`` tensors.

    Args:
        cfg: Config object providing ``tokenizer`` and ``max_len``.
        text: Raw input string.

    Returns:
        The tokenizer output mapping with every value converted to a
        ``torch.long`` tensor; sequences are padded / truncated to
        ``cfg.max_len`` tokens.
    """
    inputs = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        # Use the config that was passed in; the module-level CFG was read here before,
        # which silently ignored the `cfg` argument.
        max_length=cfg.max_len,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    """Map-style dataset yielding ``(tokenized_inputs, target_tensor)`` pairs.

    Texts come from the ``text`` column of ``df`` and targets from the
    columns listed in ``cfg.target_cols``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts, self.labels = df['text'].values, df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Targets are returned as float tensors; inputs are tokenized on access.
        targets = torch.tensor(self.labels[item], dtype=torch.float)
        return prepare_input(self.cfg, self.texts[item]), targets


def collate(inputs):
    """Trim a padded batch to the length of its longest unpadded sequence.

    The mapping is modified in place and returned; every tensor is sliced
    along dimension 1 to the largest row sum of ``attention_mask``.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

# Model

In [None]:
#ref:https://github.com/shu421/kagglib/blob/main/nlp/model.py
# ====================================================
# Model
# ====================================================

def freeze(module):
    """Turn off gradient tracking for every parameter of ``module``."""
    for weight in module.parameters():
        weight.requires_grad_(False)
# =====================================================
# Pooling
# =====================================================
class MeanPooling(nn.Module):
    """Average the token embeddings over the positions kept by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so padded tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = torch.sum(last_hidden_state * mask, 1)
        # Clamp the count so an all-padding row does not divide by zero.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return token_sum / token_count



class AttentionPooling(nn.Module):
    """Pool token embeddings with learned attention weights.

    A small MLP scores every token; scores of masked positions are set to
    ``-inf`` before a softmax over the sequence, and the output is the
    resulting weighted sum of the token embeddings.

    Usage:
        self.pool = AttentionPooling(self.config.hidden_size)

    Args:
        in_dim: Size of the token embeddings.
        initializer_range: Standard deviation of the normal initialisation of
            the linear layers. The previous code read ``self.config.initializer_range``,
            which this class never defines; 0.02 is an assumed default - pass the
            backbone's value explicitly if it differs.
    """
    def __init__(self, in_dim, initializer_range=0.02):
        super().__init__()
        self.initializer_range = initializer_range
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

        # `apply` visits every sub-module; calling `_init_weights` on the Sequential
        # container itself matched none of the isinstance branches and did nothing.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module (Linear / Embedding / LayerNorm); other types are left as is."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum over dimension 1 of ``last_hidden_state``."""
        w = self.attention(last_hidden_state).float()
        # Out-of-place masking: masked positions get zero weight after the softmax.
        w = w.masked_fill(attention_mask.unsqueeze(-1) == 0, float("-inf"))
        w = torch.softmax(w, 1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings



class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden layers, returning the vector at sequence position 0.

    Layers ``layer_start`` onwards of the stacked hidden states are combined
    with ``layer_weights`` (a learnable all-ones parameter unless provided).
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One weight per pooled layer; the stack also holds the embedding output, hence +1.
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # assumes all_hidden_states is indexed as (layer, batch, seq, hidden) - the 4-D slice requires rank 4
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        per_layer = self.layer_weights[:, None, None, None].expand(pooled_layers.size())
        weighted_sum = (per_layer * pooled_layers).sum(dim=0)
        averaged = weighted_sum / self.layer_weights.sum()
        return averaged[:, 0]

class LSTMPooling(nn.Module):
    """Run an LSTM/GRU over the per-layer first-token vectors and return its last output.

    Args:
        num_layers: Number of hidden layers fed to the recurrent net (layers 1..num_layers).
        hidden_size: Size of each layer's hidden vector.
        hiddendim_lstm: Hidden size of the recurrent net.
        dropout_rate: Dropout applied to the pooled output.
        is_lstm: ``True`` for a unidirectional LSTM, ``False`` for a bidirectional GRU
            (whose output is therefore ``2 * hiddendim_lstm`` wide).
    """
    def __init__(self, num_layers, hidden_size, hiddendim_lstm, dropout_rate, is_lstm=True):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm

        if is_lstm:
            self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        else:
            self.lstm = nn.GRU(self.hidden_size, self.hiddendim_lstm, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, all_hidden_states):
        """Pool ``all_hidden_states`` (indexable by layer) into one vector per sample.

        Assumes each ``all_hidden_states[i]`` is (batch, seq_len, hidden_size) - TODO confirm.
        """
        # Stack along a new dim 1 so the layers form the sequence axis: (batch, layers, hidden).
        # Stacking on the last dim and then `.view`-ing reinterpreted memory and mixed
        # features across layers; `.squeeze()` also dropped a batch dimension of size 1.
        cls_per_layer = [all_hidden_states[layer_i][:, 0]
                         for layer_i in range(1, self.num_hidden_layers + 1)]
        hidden_states = torch.stack(cls_per_layer, dim=1)
        out, _ = self.lstm(hidden_states, None)
        # Keep the output of the last step, i.e. after the deepest layer was consumed.
        out = self.dropout(out[:, -1, :])
        return out

class ConcatPooling(nn.Module):
    """Concatenate the last ``n_layers`` hidden layers and keep sequence position 0.

    Layers are concatenated along the last dimension, deepest layer first.
    """

    def __init__(self, n_layers=4):
        super().__init__()
        self.n_layers = n_layers

    def forward(self, all_hidden_states):
        # depth 1 is the final layer, depth 2 the one before it, and so on.
        last_layers = [all_hidden_states[-depth] for depth in range(1, self.n_layers + 1)]
        joined = torch.cat(last_layers, -1)
        return joined[:, 0]

# GeM
class GeMText(nn.Module):
    """Generalized-mean pooling over dimension ``dim`` with a learnable exponent ``p``.

    Values are clamped to at least ``eps``, multiplied by the expanded attention
    mask, raised to ``p``, summed over ``dim``, divided by the mask sum and
    finally raised to ``1 / p``. ``cfg`` is accepted but not used.
    """

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1
        # x seems to be the last hidden state

    def forward(self, x, attention_mask):
        mask = attention_mask.unsqueeze(-1).expand(x.shape)
        powered_sum = (x.clamp(min=self.eps) * mask).pow(self.p).sum(self.dim)
        valid_count = mask.sum(self.dim).clip(min=self.eps)
        return (powered_sum / valid_count).pow(1 / self.p)

# ===========================================
# custom Model
# ===========================================

class CustomModel(nn.Module):
    """Transformer backbone followed by a pooling layer and a linear regression head.

    Args:
        cfg: Config object. Reads ``model``, ``pooling``, ``gradient_checkpointing``,
            ``freeze`` / ``freeze_top_num_layer``, ``n_layers`` (ConcatPooling),
            ``hidden_size`` (LSTMPooling) and, when present, ``target_cols``.
        config_path: Optional path of a saved backbone config, loaded with
            ``torch.load``. When ``None`` the config of ``cfg.model`` is fetched
            and all of its dropout probabilities are set to 0.
        pretrained: ``True`` loads the pretrained backbone weights; ``False``
            builds the backbone from the config alone.

    Raises:
        ValueError: If ``cfg.pooling`` is not one of the supported pooling names.
    """
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            # NOTE(review): torch.load unpickles the file - only load configs you trust.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture without loading pretrained weights.
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # One output per target column (2 for this notebook); fall back to 2 when
        # the config does not list its targets.
        n_targets = len(getattr(cfg, 'target_cols', ())) or 2

        if cfg.pooling == 'MeanPooling':
            # `feature()` already handled this name but no pooling layer was built for it.
            self.pool = MeanPooling()
            self.fc = nn.Linear(self.config.hidden_size, n_targets)
        elif cfg.pooling =='LSTMPooling':
            # Requires `cfg.hidden_size` (width of the LSTM output) to be defined.
            self.pool =  LSTMPooling(self.config.num_hidden_layers,
                                       self.config.hidden_size,
                                       self.cfg.hidden_size,
                                       0.1,
                                       is_lstm=True
                           )
            self.fc = nn.Linear(self.cfg.hidden_size, n_targets)
        elif cfg.pooling == "GeM":
            self.pool = GeMText()
            self.fc = nn.Linear(self.config.hidden_size, n_targets)
        elif cfg.pooling=='ConcatPooling':
            self.pool = ConcatPooling(n_layers=cfg.n_layers)
            self.fc = nn.Linear(cfg.n_layers*self.config.hidden_size, n_targets)
        elif cfg.pooling=='WeightedLayerPooling':
            self.pool = WeightedLayerPooling(self.config.num_hidden_layers)
            self.fc = nn.Linear(self.config.hidden_size, n_targets)
        else:
            # Fail at construction time rather than with a missing attribute later on.
            raise ValueError(
                f"Unsupported pooling {cfg.pooling!r}; expected one of 'MeanPooling', "
                "'LSTMPooling', 'GeM', 'ConcatPooling', 'WeightedLayerPooling'"
            )

        self._init_weights(self.fc)

        # Freeze
        if self.cfg.freeze:
            freeze(self.model.encoder.layer[:self.cfg.freeze_top_num_layer])

    def _init_weights(self, module):
        """Initialise a Linear / Embedding / LayerNorm module using ``config.initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone on ``inputs`` and return the pooled feature tensor."""
        outputs = self.model(**inputs)
        if self.cfg.pooling in ('MeanPooling', 'GeM'):
            # These poolers consume the last hidden state together with the mask.
            last_hidden_states = outputs[0]
            return self.pool(last_hidden_states, inputs['attention_mask'])
        if self.cfg.pooling in ('GRUPooling', 'LSTMPooling', 'ConcatPooling', 'WeightedLayerPooling'):
            # These poolers consume the stack of all hidden states.
            all_hidden_states = torch.stack(outputs[1])
            return self.pool(all_hidden_states)
        raise ValueError(f"Unsupported pooling {self.cfg.pooling!r}")

    def forward(self, inputs):
        """Return the regression outputs of the linear head for a batch of tokenized inputs."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output


# initialize layer
def reinit_bert(model):
    """Re-initialize the last encoder layer of ``model.model`` in place.

    Linear and Embedding weights are redrawn from a normal distribution with
    standard deviation ``model.config.initializer_range`` (biases and the
    padding row are zeroed); LayerNorm is reset to weight 1 and bias 0.

    Args:
        model: Wrapper exposing ``model.encoder.layer`` and ``config.initializer_range``.

    Returns:
        The same ``model`` object.

    Usage:
        model = reinit_bert(model)
    """
    std = model.config.initializer_range
    last_layers = model.model.encoder.layer[-1:]
    for sub in (m for layer in last_layers for m in layer.modules()):
        if isinstance(sub, nn.LayerNorm):
            sub.bias.data.zero_()
            sub.weight.data.fill_(1.0)
        elif isinstance(sub, nn.Linear):
            sub.weight.data.normal_(mean=0.0, std=std)
            if sub.bias is not None:
                sub.bias.data.zero_()
        elif isinstance(sub, nn.Embedding):
            sub.weight.data.normal_(mean=0.0, std=std)
            if sub.padding_idx is not None:
                sub.weight.data[sub.padding_idx].zero_()
    return model

# Loss

In [None]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    """Square root of the element-wise squared error (plus ``eps``), then reduced.

    ``reduction`` may be ``'mean'``, ``'sum'`` or ``'none'``; any other value
    also returns the unreduced tensor.
    """

    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        # eps keeps the square root differentiable where the error is exactly zero.
        per_element = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'sum':
            return per_element.sum()
        if self.reduction == 'mean':
            return per_element.mean()
        return per_element



class WeightedSmoothL1Loss(nn.Module):
    """Smooth L1 loss with a fixed weight per target column.

    Args:
        weights: Tensor (or sequence) broadcast against the element-wise loss,
            i.e. one weight per output column. Defaults to ``[0.5, 1.2]``.
    """
    def __init__(self, weights=None):
        super(WeightedSmoothL1Loss, self).__init__()
        if weights is None:
            # Built here rather than in the signature: a tensor default is created once at
            # definition time and previously depended on the module-level `device`.
            weights = torch.tensor([0.5, 1.2])
        # Non-persistent buffer: follows `.to(...)` / `.cuda()` without entering the state_dict.
        self.register_buffer('weights', torch.as_tensor(weights), persistent=False)

    def forward(self, inputs, targets):
        """
        inputs: network outputs (predictions)
        targets: ground-truth labels
        Returns the mean of the element-wise Smooth L1 loss multiplied by ``self.weights``.
        """
        # Element-wise Smooth L1 loss
        loss = nn.SmoothL1Loss(reduction='none')(inputs, targets)

        # Apply the weights on the same device as the loss, then average
        weighted_loss = torch.mean(loss * self.weights.to(loss.device))

        return weighted_loss


class MCRMSELoss(nn.Module):
    """Mean column-wise RMSE: RMSE over dimension 0 for each column, averaged over columns."""

    def __init__(self):
        super().__init__()

    def forward(self, y_true, y_pred):
        squared_error = torch.square(y_true - y_pred)
        rmse_per_column = torch.sqrt(torch.mean(squared_error, dim=0))
        return torch.mean(rmse_per_column, dim=0)

# AWP

In [None]:
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
from torch.nn.modules.loss import _Loss

class AWP:
    """Adversarial Weight Perturbation helper.

    Temporarily shifts selected parameters along their current gradient
    (bounded relative to each weight's magnitude), computes the loss on the
    perturbed model, and can restore the original weights afterwards.
    Only parameters that require grad, currently hold a gradient and whose
    name contains ``adv_param`` are touched.
    """

    def __init__(
        self,
        model: Module,
        criterion: _Loss,
        optimizer: Optimizer,
        apex: bool,
        adv_param: str="weight",
        adv_lr: float=1.0,
        adv_eps: float=0.01
    ) -> None:
        self.model, self.criterion, self.optimizer = model, criterion, optimizer
        self.apex = apex
        self.adv_param, self.adv_lr, self.adv_eps = adv_param, adv_lr, adv_eps
        # name -> original weights, and name -> (lower, upper) bounds for the perturbed weights
        self.backup = {}
        self.backup_eps = {}

    def _perturbable(self):
        """Yield ``(name, param)`` for every parameter eligible for perturbation."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                yield name, param

    def attack_backward(self, inputs: dict, label: Tensor) -> Tensor:
        """Perturb the weights and return the loss of the perturbed model.

        Gradients are cleared before returning; the caller runs ``backward`` on
        the returned loss and then calls ``_restore``.
        """
        with torch.cuda.amp.autocast(enabled=self.apex):
            self._save()
            self._attack_step() # push the weights towards a nearby, worse point
            adv_preds = self.model(inputs)
            flat_label = label.view(-1, 1)
            adv_loss = self.criterion(adv_preds.view(-1, 1), flat_label)
            # Entries whose label equals -1 are excluded from the loss.
            keep = (flat_label != -1)
            adv_loss = torch.masked_select(adv_loss, keep).mean()
            self.optimizer.zero_grad()
        return adv_loss

    def _attack_step(self) -> None:
        """Move each eligible weight along its gradient and clip it to the saved bounds."""
        e = 1e-6
        for name, param in self._perturbable():
            grad_norm = torch.norm(param.grad)
            weight_norm = torch.norm(param.data.detach())
            if grad_norm == 0 or torch.isnan(grad_norm):
                continue
            # Needs gradients from a backward pass executed just before this call.
            step = self.adv_lr * param.grad / (grad_norm + e) * (weight_norm + e)
            param.data.add_(step)
            lower, upper = self.backup_eps[name][0], self.backup_eps[name][1]
            param.data = torch.min(torch.max(param.data, lower), upper)

    def _save(self) -> None:
        """Back up eligible weights (once) together with their +/- ``adv_eps * |w|`` bounds."""
        for name, param in self._perturbable():
            if name in self.backup:
                continue
            self.backup[name] = param.data.clone()
            grad_eps = self.adv_eps * param.abs().detach()
            self.backup_eps[name] = (
                self.backup[name] - grad_eps,
                self.backup[name] + grad_eps,
            )

    def _restore(self) -> None:
        """Put the backed-up weights back and forget the backups."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup, self.backup_eps = {}, {}

# Helper functions

In [None]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the weighted sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as ``'<minutes>m <seconds>s'``."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return elapsed time since ``since`` and the remaining time estimated from ``percent``.

    ``percent`` is the completed fraction; the total duration is extrapolated
    as ``elapsed / percent``.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    """Train ``model`` for one epoch and return the average training loss.

    Args:
        fold: Fold index, used only to label the wandb metrics.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Model called as ``model(inputs)``.
        criterion: Loss called as ``criterion(y_preds, labels)``.
        optimizer: Optimizer stepped through the AMP ``GradScaler``.
        epoch: Zero-based epoch index (used for logging and the AWP start epoch).
        scheduler: LR scheduler, stepped after every optimizer step when
            ``CFG.batch_scheduler`` is set.
        device: Device the batch tensors are moved to.

    Returns:
        The sample-weighted average of the per-batch losses.
    """
    use_awp = CFG.awp and epoch+1 >= CFG.nth_awp_start_epoch
    if use_awp:
        LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    awp = AWP(
            model,
            criterion,
            optimizer,
            CFG.apex,
            adv_lr=CFG.adv_lr,
            adv_eps=CFG.adv_eps
        )
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    global_step = 0
    # Only known after the first optimizer step (matters when accumulating gradients).
    grad_norm = float('nan')
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()

        if use_awp:
            # The attack uses the gradients computed above, clears them, and the
            # adversarial loss then supplies the gradients that are actually applied.
            loss = awp.attack_backward(inputs, labels)
            scaler.scale(loss).backward()
            awp._restore()

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            # Gradients are still multiplied by the loss scale here; unscale them so
            # that clipping (and the reported norm) refer to the true gradients.
            # This runs once per optimizer step, after all backward passes.
            scaler.unscale_(optimizer)
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        # get_last_lr() returns the rate set by the most recent scheduler step;
        # get_lr() is not meant to be called outside of step().
        current_lr = scheduler.get_last_lr()[0]
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader),
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=current_lr))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": current_lr})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    """Evaluate ``model`` on every batch of ``valid_loader``.

    The model is switched to eval mode and the forward pass plus the loss
    computation run under ``torch.no_grad()``.

    Args:
        valid_loader: Iterable yielding ``(inputs, labels)`` batches; ``inputs``
            is passed through ``collate`` and must be a mapping of tensors.
        model: Callable invoked as ``model(inputs)``.
        criterion: Loss callable invoked as ``criterion(y_preds, labels)``.
        device: Device every input tensor and the labels are moved to.

    Returns:
        Tuple ``(average_loss, predictions)`` where ``predictions`` is the
        concatenation of the per-batch outputs as a NumPy array.
    """
    loss_meter = AverageMeter()
    model.eval()
    batch_outputs = []
    started_at = time.time()
    total_batches = len(valid_loader)

    for step, (inputs, labels) in enumerate(valid_loader):
        # Move the collated batch onto the target device, key by key.
        inputs = collate(inputs)
        for name in inputs:
            inputs[name] = inputs[name].to(device)
        labels = labels.to(device)

        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        # NOTE(review): the validation loss is scaled by the accumulation
        # factor exactly like the training loss, so the two logged values stay
        # comparable; confirm this scaling is intended for evaluation.
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps

        loss_meter.update(loss.item(), labels.size(0))
        batch_outputs.append(y_preds.cpu().numpy())

        is_last_batch = step == (total_batches - 1)
        if step % CFG.print_freq == 0 or is_last_batch:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, total_batches,
                          loss=loss_meter,
                          remain=timeSince(started_at, float(step+1)/total_batches)))

    return loss_meter.avg, np.concatenate(batch_outputs)





# def train_fn_by_step(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device, now_step):

#     # if CFG.awp and epoch+1 >= CFG.nth_awp_start_epoch:
#     #     LOGGER.info(f'AWP training with epoch {epoch+1}')
#     model.train()
#     # awp = AWP(
#     #         model,
#     #         criterion,
#     #         optimizer,
#     #         CFG.apex,
#     #         adv_lr=CFG.adv_lr,
#     #         adv_eps=CFG.adv_eps
#     #     )
#     if now_step==0:
#       scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
#       losses = AverageMeter()
#       start = end = time.time()
#       global_step = 0
#     for step, (inputs, labels) in enumerate(train_loader):
#         if now_step>step:
#           continue
#         inputs = collate(inputs)
#         for k, v in inputs.items():
#             inputs[k] = v.to(device)
#         labels = labels.to(device)
#         batch_size = labels.size(0)
#         with torch.cuda.amp.autocast(enabled=CFG.apex):
#             y_preds = model(inputs)
#             loss = criterion(y_preds, labels)
#         if CFG.gradient_accumulation_steps > 1:
#             loss = loss / CFG.gradient_accumulation_steps
#         losses.update(loss.item(), batch_size)
#         scaler.scale(loss).backward()
#         grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

#         # if CFG.awp and CFG.nth_awp_start_epoch <= epoch+1:
#         #     loss = awp.attack_backward(inputs, labels)
#         #     scaler.scale(loss).backward()
#         #     awp._restore()

#         if (step + 1) % CFG.gradient_accumulation_steps == 0:
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad()
#             global_step += 1
#             if CFG.batch_scheduler:
#                 scheduler.step()
#         end = time.time()
#         if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
#             print('Epoch: [{0}][{1}/{2}] '
#                   'Elapsed {remain:s} '
#                   'Loss: {loss.val:.4f}({loss.avg:.4f}) '
#                   'Grad: {grad_norm:.4f}  '
#                   'LR: {lr:.8f}  '
#                   .format(epoch+1, step, len(train_loader),
#                           remain=timeSince(start, float(step+1)/len(train_loader)),
#                           loss=losses,
#                           grad_norm=grad_norm,
#                           lr=scheduler.get_lr()[0]))

#         if CFG.wandb:
#             wandb.log({f"[fold{fold}] loss": losses.val,
#                        f"[fold{fold}] lr": scheduler.get_lr()[0]})
#         if step%CFG.eval_steps==0:
#           return losses.avg, step+1 ,epoch

#     return losses.avg, step+1 ,epoch+1




# train loop

In [None]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    """Train one cross-validation fold, validating once per epoch.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    split; every other row is used for training. After each epoch the model is
    scored with ``get_score`` and, whenever the score improves (lower is
    better), the model weights and validation predictions are saved to
    ``OUTPUT_DIR``.

    Args:
        folds: DataFrame containing a ``fold`` column and ``CFG.target_cols``.
        fold: Value of the ``fold`` column held out for validation.

    Returns:
        The validation rows with one ``pred_<target>`` column per entry of
        ``CFG.target_cols``, filled from the best saved checkpoint.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: encoder with decay, encoder without
        decay (bias / LayerNorm), and everything outside ``model.model``."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def get_optimizer_grouped_parameters(cfg, model):
        """Layerwise Learning Rate Decay.

        Alternative to ``get_optimizer_params`` (currently not used): starting
        from the last encoder layer with ``cfg.encoder_lr``, each earlier layer
        (and finally the embeddings) gets its learning rate multiplied by
        ``cfg.lr_weight_decay``.
        """
        model_type = "model"
        no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if model_type not in n],
                "lr": cfg.decoder_lr,
                "weight_decay": 0.0,
            },
        ]
        layers = [getattr(model, model_type).embeddings] + list(
            getattr(model, model_type).encoder.layer
        )
        layers.reverse()
        lr = cfg.encoder_lr
        for layer in layers:
            optimizer_grouped_parameters += [
                {
                    "params": [
                        p
                        for n, p in layer.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": cfg.weight_decay,
                    "lr": lr,
                },
                {
                    "params": [
                        p
                        for n, p in layer.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    "weight_decay": 0.0,
                    "lr": lr,
                },
            ]

            lr *= cfg.lr_weight_decay
        return optimizer_grouped_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    # To switch to layerwise LR decay, use instead:
    # optimizer_parameters = get_optimizer_grouped_parameters(CFG, model)

    # The optimizer must be created here: both the scheduler below and
    # train_fn need it, and it has to wrap this fold's freshly built model.
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``."""
        num_warmup_steps = int(cfg.num_warmup_steps_rate*num_train_steps)
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f"Unsupported CFG.scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # NOTE(review): this step count ignores CFG.gradient_accumulation_steps.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf
    best_checkpoint_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)

        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1,
                       f"[fold{fold}] avg_train_loss": avg_loss,
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})

        # Lower score is better; keep only the best checkpoint on disk.
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        best_checkpoint_path)

    # Attach the predictions stored alongside the best checkpoint.
    predictions = torch.load(best_checkpoint_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds












# ====================================================
# train loop by steps
# ====================================================
def train_loop_steps(folds, fold):
    """Train one cross-validation fold, validating every ``CFG.eval_steps`` steps.

    Rows of ``folds`` whose ``fold`` column equals ``fold`` form the validation
    split; every other row is used for training. Within each epoch the model is
    validated whenever ``step`` is a non-zero multiple of ``CFG.eval_steps`` and
    again on the last batch. Whenever the ``get_score`` result improves (lower
    is better), the model weights and validation predictions are saved to
    ``OUTPUT_DIR``.

    Args:
        folds: DataFrame containing a ``fold`` column and ``CFG.target_cols``.
        fold: Value of the ``fold`` column held out for validation.

    Returns:
        The validation rows with one ``pred_<target>`` column per entry of
        ``CFG.target_cols``, filled from the best saved checkpoint.

    Raises:
        ValueError: If ``CFG.scheduler`` is neither ``'linear'`` nor ``'cosine'``.
    """

    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values

    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    if CFG.reinit:
        model = reinit_bert(model)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)

    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build three parameter groups: encoder with decay, encoder without
        decay (bias / LayerNorm), and everything outside ``model.model``."""
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr,
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        """Return the warmup scheduler selected by ``cfg.scheduler``."""
        num_warmup_steps = int(cfg.num_warmup_steps_rate*num_train_steps)
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError(f"Unsupported CFG.scheduler: {cfg.scheduler!r} (expected 'linear' or 'cosine')")
        return scheduler

    # NOTE(review): this step count ignores CFG.gradient_accumulation_steps.
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = MCRMSELoss()
    #nn.SmoothL1Loss(reduction='mean')
    # WeightedSmoothL1Loss(reduction='mean') #

    best_score = np.inf
    best_checkpoint_path = OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth"

    def run_validation():
        """Score the current model on ``valid_loader``.

        Puts the model in eval mode (the caller switches back to train mode)
        and returns ``(average_loss, predictions)``.
        """
        losses_val = AverageMeter()
        model.eval()
        preds = []

        for val_step, (val_inputs, val_labels) in enumerate(valid_loader):
            val_inputs = collate(val_inputs)
            for k, v in val_inputs.items():
                val_inputs[k] = v.to(device)
            val_labels = val_labels.to(device)
            with torch.no_grad():
                val_preds = model(val_inputs)
                val_loss = criterion(val_preds, val_labels)
            # NOTE(review): scaled like the training loss so both logged
            # averages share one scale; confirm this is intended for eval.
            if CFG.gradient_accumulation_steps > 1:
                val_loss = val_loss / CFG.gradient_accumulation_steps
            losses_val.update(val_loss.item(), val_labels.size(0))
            preds.append(val_preds.to('cpu').numpy())

            if val_step % CFG.print_freq == 0 or val_step == (len(valid_loader)-1):
                print('EVAL: [{0}/{1}] '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      .format(val_step, len(valid_loader),
                              loss=losses_val))
        return losses_val.avg, np.concatenate(preds)

    for epoch in range(CFG.epochs):

        model.train()
        # AWP (adversarial weight perturbation) is not applied in this
        # step-wise loop.
        scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
        losses = AverageMeter()
        start = time.time()
        for step, (inputs, labels) in enumerate(train_loader):

            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            batch_size = labels.size(0)
            with torch.cuda.amp.autocast(enabled=CFG.apex):
                y_preds = model(inputs)
                loss = criterion(y_preds, labels)
            if CFG.gradient_accumulation_steps > 1:
                loss = loss / CFG.gradient_accumulation_steps
            losses.update(loss.item(), batch_size)
            scaler.scale(loss).backward()
            # NOTE(review): gradients are still multiplied by the GradScaler
            # factor here (no scaler.unscale_(optimizer) beforehand), so
            # CFG.max_grad_norm is compared against scaled norms. Left as is
            # because changing it alters tuned training behaviour.
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
            if (step + 1) % CFG.gradient_accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
                if CFG.batch_scheduler:
                    scheduler.step()
            if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
                print('Epoch: [{0}][{1}/{2}] '
                      'Elapsed {remain:s} '
                      'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                      'Grad: {grad_norm:.4f}  '
                      'LR: {lr:.8f}  '
                      .format(epoch+1, step, len(train_loader),
                              remain=timeSince(start, float(step+1)/len(train_loader)),
                              loss=losses,
                              grad_norm=grad_norm,
                              lr=scheduler.get_last_lr()[0]))
            if CFG.wandb:
                wandb.log({f"[fold{fold}] loss": losses.val,
                           f"[fold{fold}] lr": scheduler.get_last_lr()[0]})

            # Validate every CFG.eval_steps steps (never at step 0) and once
            # more on the final batch of the epoch.
            if (step % CFG.eval_steps == 0 and step != 0) or step == (len(train_loader)-1):

                # valid
                avg_val_loss, predictions = run_validation()

                # scoring
                score, scores = get_score(valid_labels, predictions)

                # Lower score is better; keep only the best checkpoint on disk.
                if best_score > score:
                    best_score = score
                    LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                    torch.save({'model': model.state_dict(),
                                'predictions': predictions},
                                best_checkpoint_path)

                LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {losses.avg:.4f}  avg_val_loss: {avg_val_loss:.4f}')
                LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
                # Per-evaluation wandb logging is disabled in this loop.
                model.train()

    # Attach the predictions stored alongside the best checkpoint.
    predictions = torch.load(best_checkpoint_path,
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return valid_folds


In [None]:
if __name__ == '__main__':

    def get_result(oof_df):
        """Log the overall score and the per-target scores of ``oof_df``.

        ``oof_df`` must hold the ``CFG.target_cols`` label columns and the
        matching ``pred_<target>`` prediction columns.
        """
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')

    if CFG.train:
        # Collect the per-fold frames and concatenate once after the loop.
        oof_frames = []
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                if CFG.save_strategy == 'epoch':
                    _oof_df = train_loop(train, fold)
                elif CFG.save_strategy == 'step':
                    _oof_df = train_loop_steps(train, fold)
                else:
                    # Without this guard an unknown strategy would either hit
                    # a NameError or silently re-append the previous fold.
                    raise ValueError(
                        f"Unsupported CFG.save_strategy: {CFG.save_strategy!r} (expected 'epoch' or 'step')"
                    )
                oof_frames.append(_oof_df)
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = pd.concat(oof_frames) if oof_frames else pd.DataFrame()
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')

    if CFG.wandb:
        wandb.finish()
    # Release the Colab runtime once the run has finished.
    runtime.unassign()

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.31.0",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

INFO:__main__:DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
 

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Epoch: [1][0/757] Elapsed 0m 4s (remain 52m 52s) Loss: 1.5537(1.5537) Grad: nan  LR: 0.00000007  
Epoch: [1][20/757] Elapsed 0m 9s (remain 5m 40s) Loss: 0.9675(1.2075) Grad: 68671.4297  LR: 0.00000139  
Epoch: [1][40/757] Elapsed 0m 16s (remain 4m 56s) Loss: 0.8705(1.1131) Grad: 33046.3516  LR: 0.00000271  
Epoch: [1][60/757] Elapsed 0m 23s (remain 4m 23s) Loss: 0.7938(0.9985) Grad: 105676.7188  LR: 0.00000403  
Epoch: [1][80/757] Elapsed 0m 29s (remain 4m 4s) Loss: 0.5006(0.9106) Grad: 157125.1875  LR: 0.00000535  
Epoch: [1][100/757] Elapsed 0m 35s (remain 3m 52s) Loss: 0.8357(0.8450) Grad: 108078.0234  LR: 0.00000667  
EVAL: [0/69] Loss: 0.9973(0.9973) 
EVAL: [20/69] Loss: 0.6788(0.8120) 
EVAL: [40/69] Loss: 0.6923(0.7748) 
EVAL: [60/69] Loss: 0.7016(0.7710) 


Epoch 1 - Save Best Score: 0.7962 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7962 Model


EVAL: [68/69] Loss: 1.0662(0.7812) 


Epoch 1 - avg_train_loss: 0.8450  avg_val_loss: 0.7812
INFO:__main__:Epoch 1 - avg_train_loss: 0.8450  avg_val_loss: 0.7812
Epoch 1 - Score: 0.7962  Scores: [0.620919700507766, 0.9714675402349685]
INFO:__main__:Epoch 1 - Score: 0.7962  Scores: [0.620919700507766, 0.9714675402349685]


Epoch: [1][120/757] Elapsed 1m 9s (remain 6m 3s) Loss: 0.5858(0.7992) Grad: 41981.4297  LR: 0.00000799  
Epoch: [1][140/757] Elapsed 1m 16s (remain 5m 34s) Loss: 0.4743(0.7814) Grad: 165765.8750  LR: 0.00000931  
Epoch: [1][160/757] Elapsed 1m 26s (remain 5m 18s) Loss: 0.5101(0.7526) Grad: 50537.9297  LR: 0.00001063  
Epoch: [1][180/757] Elapsed 1m 34s (remain 5m 1s) Loss: 0.4344(0.7317) Grad: 127915.9922  LR: 0.00001195  
Epoch: [1][200/757] Elapsed 1m 41s (remain 4m 40s) Loss: 0.7502(0.7113) Grad: 158955.3125  LR: 0.00001327  
EVAL: [0/69] Loss: 0.9727(0.9727) 
EVAL: [20/69] Loss: 0.6121(0.6857) 
EVAL: [40/69] Loss: 0.6641(0.6704) 
EVAL: [60/69] Loss: 0.6585(0.6768) 


Epoch 1 - Save Best Score: 0.6963 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6963 Model


EVAL: [68/69] Loss: 0.6890(0.6808) 


Epoch 1 - avg_train_loss: 0.7113  avg_val_loss: 0.6808
INFO:__main__:Epoch 1 - avg_train_loss: 0.7113  avg_val_loss: 0.6808
Epoch 1 - Score: 0.6963  Scores: [0.6600544556999662, 0.732628829876423]
INFO:__main__:Epoch 1 - Score: 0.6963  Scores: [0.6600544556999662, 0.732628829876423]


Epoch: [1][220/757] Elapsed 2m 18s (remain 5m 35s) Loss: 0.7200(0.7111) Grad: 100527.4375  LR: 0.00001459  
Epoch: [1][240/757] Elapsed 2m 24s (remain 5m 9s) Loss: 0.5354(0.7056) Grad: 94374.1328  LR: 0.00001591  
Epoch: [1][260/757] Elapsed 2m 32s (remain 4m 49s) Loss: 0.5137(0.6968) Grad: 70079.9609  LR: 0.00001723  
Epoch: [1][280/757] Elapsed 2m 38s (remain 4m 28s) Loss: 0.5981(0.6954) Grad: 132012.3594  LR: 0.00001855  
Epoch: [1][300/757] Elapsed 2m 46s (remain 4m 12s) Loss: 0.6120(0.6820) Grad: 37142.5703  LR: 0.00001987  
EVAL: [0/69] Loss: 0.8666(0.8666) 
EVAL: [20/69] Loss: 0.5448(0.6755) 
EVAL: [40/69] Loss: 0.5700(0.6457) 
EVAL: [60/69] Loss: 0.5054(0.6426) 


Epoch 1 - Save Best Score: 0.6693 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6693 Model


EVAL: [68/69] Loss: 0.9109(0.6524) 


Epoch 1 - avg_train_loss: 0.6820  avg_val_loss: 0.6524
INFO:__main__:Epoch 1 - avg_train_loss: 0.6820  avg_val_loss: 0.6524
Epoch 1 - Score: 0.6693  Scores: [0.575464938368635, 0.7631724073926609]
INFO:__main__:Epoch 1 - Score: 0.6693  Scores: [0.575464938368635, 0.7631724073926609]


Epoch: [1][320/757] Elapsed 3m 22s (remain 4m 35s) Loss: 0.6882(0.6739) Grad: 27017.9023  LR: 0.00002000  
Epoch: [1][340/757] Elapsed 3m 28s (remain 4m 14s) Loss: 0.4804(0.6663) Grad: 127385.3047  LR: 0.00001999  
Epoch: [1][360/757] Elapsed 3m 36s (remain 3m 57s) Loss: 0.6264(0.6605) Grad: 53224.7812  LR: 0.00001998  
Epoch: [1][380/757] Elapsed 3m 42s (remain 3m 39s) Loss: 0.6634(0.6543) Grad: 164498.6875  LR: 0.00001996  
Epoch: [1][400/757] Elapsed 3m 50s (remain 3m 24s) Loss: 0.3809(0.6500) Grad: 90853.7109  LR: 0.00001994  
EVAL: [0/69] Loss: 0.9781(0.9781) 
EVAL: [20/69] Loss: 0.8180(0.9164) 
EVAL: [40/69] Loss: 0.8329(0.8826) 
EVAL: [60/69] Loss: 0.8008(0.8790) 


Epoch 1 - avg_train_loss: 0.6500  avg_val_loss: 0.8929
INFO:__main__:Epoch 1 - avg_train_loss: 0.6500  avg_val_loss: 0.8929
Epoch 1 - Score: 0.9091  Scores: [0.774989143219485, 1.0432277561532863]
INFO:__main__:Epoch 1 - Score: 0.9091  Scores: [0.774989143219485, 1.0432277561532863]


EVAL: [68/69] Loss: 1.2707(0.8929) 
Epoch: [1][420/757] Elapsed 4m 15s (remain 3m 23s) Loss: 0.6171(0.6439) Grad: 129869.1797  LR: 0.00001991  
Epoch: [1][440/757] Elapsed 4m 22s (remain 3m 7s) Loss: 0.3656(0.6381) Grad: 97664.2656  LR: 0.00001987  
Epoch: [1][460/757] Elapsed 4m 27s (remain 2m 52s) Loss: 0.4706(0.6338) Grad: 174259.0312  LR: 0.00001983  
Epoch: [1][480/757] Elapsed 4m 34s (remain 2m 37s) Loss: 0.4695(0.6285) Grad: 49990.9414  LR: 0.00001979  
Epoch: [1][500/757] Elapsed 4m 40s (remain 2m 23s) Loss: 0.2923(0.6239) Grad: 82393.8516  LR: 0.00001974  
EVAL: [0/69] Loss: 0.8528(0.8528) 
EVAL: [20/69] Loss: 0.4803(0.6079) 
EVAL: [40/69] Loss: 0.5361(0.5796) 
EVAL: [60/69] Loss: 0.4832(0.5820) 


Epoch 1 - Save Best Score: 0.6051 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6051 Model


EVAL: [68/69] Loss: 0.7726(0.5890) 


Epoch 1 - avg_train_loss: 0.6239  avg_val_loss: 0.5890
INFO:__main__:Epoch 1 - avg_train_loss: 0.6239  avg_val_loss: 0.5890
Epoch 1 - Score: 0.6051  Scores: [0.5148969170483292, 0.6952711092882186]
INFO:__main__:Epoch 1 - Score: 0.6051  Scores: [0.5148969170483292, 0.6952711092882186]


Epoch: [1][520/757] Elapsed 5m 18s (remain 2m 24s) Loss: 0.7153(0.6204) Grad: 100640.8047  LR: 0.00001969  
Epoch: [1][540/757] Elapsed 5m 25s (remain 2m 9s) Loss: 0.4518(0.6179) Grad: 44761.3867  LR: 0.00001963  
Epoch: [1][560/757] Elapsed 5m 32s (remain 1m 56s) Loss: 0.5136(0.6131) Grad: 54779.1680  LR: 0.00001956  
Epoch: [1][580/757] Elapsed 5m 39s (remain 1m 42s) Loss: 0.6242(0.6102) Grad: 67016.7656  LR: 0.00001949  
Epoch: [1][600/757] Elapsed 5m 47s (remain 1m 30s) Loss: 0.4113(0.6054) Grad: 134095.8906  LR: 0.00001942  
EVAL: [0/69] Loss: 0.8854(0.8854) 
EVAL: [20/69] Loss: 0.6095(0.7178) 
EVAL: [40/69] Loss: 0.6761(0.6957) 
EVAL: [60/69] Loss: 0.5169(0.6928) 


Epoch 1 - avg_train_loss: 0.6054  avg_val_loss: 0.7034
INFO:__main__:Epoch 1 - avg_train_loss: 0.6054  avg_val_loss: 0.7034
Epoch 1 - Score: 0.7174  Scores: [0.5513658507065268, 0.883458645857631]
INFO:__main__:Epoch 1 - Score: 0.7174  Scores: [0.5513658507065268, 0.883458645857631]


EVAL: [68/69] Loss: 0.9583(0.7034) 
Epoch: [1][620/757] Elapsed 6m 11s (remain 1m 21s) Loss: 0.5222(0.6019) Grad: 43136.8008  LR: 0.00001934  
Epoch: [1][640/757] Elapsed 6m 18s (remain 1m 8s) Loss: 0.5022(0.6008) Grad: 56926.0352  LR: 0.00001925  
Epoch: [1][660/757] Elapsed 6m 23s (remain 0m 55s) Loss: 0.4040(0.5957) Grad: 102024.7266  LR: 0.00001916  
Epoch: [1][680/757] Elapsed 6m 29s (remain 0m 43s) Loss: 0.4312(0.5937) Grad: 88240.5078  LR: 0.00001907  
Epoch: [1][700/757] Elapsed 6m 35s (remain 0m 31s) Loss: 0.4430(0.5903) Grad: 85992.0078  LR: 0.00001897  
EVAL: [0/69] Loss: 0.8843(0.8843) 
EVAL: [20/69] Loss: 0.6085(0.7486) 
EVAL: [40/69] Loss: 0.6892(0.7133) 
EVAL: [60/69] Loss: 0.6047(0.7122) 


Epoch 1 - avg_train_loss: 0.5903  avg_val_loss: 0.7221
INFO:__main__:Epoch 1 - avg_train_loss: 0.5903  avg_val_loss: 0.7221
Epoch 1 - Score: 0.7394  Scores: [0.7176342606584862, 0.7611639583515515]
INFO:__main__:Epoch 1 - Score: 0.7394  Scores: [0.7176342606584862, 0.7611639583515515]


EVAL: [68/69] Loss: 0.9994(0.7221) 
Epoch: [1][720/757] Elapsed 7m 0s (remain 0m 20s) Loss: 0.3915(0.5880) Grad: 102911.9141  LR: 0.00001886  
Epoch: [1][740/757] Elapsed 7m 6s (remain 0m 9s) Loss: 0.4343(0.5881) Grad: 88266.3203  LR: 0.00001875  
Epoch: [1][756/757] Elapsed 7m 10s (remain 0m 0s) Loss: 0.4814(0.5873) Grad: 94147.8906  LR: 0.00001866  
EVAL: [0/69] Loss: 0.8214(0.8214) 
EVAL: [20/69] Loss: 0.5197(0.6536) 
EVAL: [40/69] Loss: 0.5896(0.6256) 
EVAL: [60/69] Loss: 0.5050(0.6223) 


Epoch 1 - avg_train_loss: 0.5873  avg_val_loss: 0.6329
INFO:__main__:Epoch 1 - avg_train_loss: 0.5873  avg_val_loss: 0.6329
Epoch 1 - Score: 0.6493  Scores: [0.5656459289589073, 0.733023356899246]
INFO:__main__:Epoch 1 - Score: 0.6493  Scores: [0.5656459289589073, 0.733023356899246]


EVAL: [68/69] Loss: 0.8925(0.6329) 
Epoch: [2][0/757] Elapsed 0m 0s (remain 6m 47s) Loss: 0.3687(0.3687) Grad: 287974.0312  LR: 0.00001866  
Epoch: [2][20/757] Elapsed 0m 6s (remain 3m 36s) Loss: 0.4493(0.4276) Grad: 71802.8672  LR: 0.00001854  
Epoch: [2][40/757] Elapsed 0m 14s (remain 4m 5s) Loss: 0.4792(0.4582) Grad: 105159.6250  LR: 0.00001842  
Epoch: [2][60/757] Elapsed 0m 19s (remain 3m 37s) Loss: 0.3741(0.4545) Grad: 111212.1953  LR: 0.00001829  
Epoch: [2][80/757] Elapsed 0m 25s (remain 3m 36s) Loss: 0.4856(0.4565) Grad: 129255.1719  LR: 0.00001816  
Epoch: [2][100/757] Elapsed 0m 31s (remain 3m 23s) Loss: 0.5203(0.4609) Grad: 63732.6289  LR: 0.00001803  
EVAL: [0/69] Loss: 0.8281(0.8281) 
EVAL: [20/69] Loss: 0.5003(0.6266) 
EVAL: [40/69] Loss: 0.5643(0.6019) 
EVAL: [60/69] Loss: 0.4852(0.6061) 


Epoch 2 - avg_train_loss: 0.4609  avg_val_loss: 0.6164
INFO:__main__:Epoch 2 - avg_train_loss: 0.4609  avg_val_loss: 0.6164
Epoch 2 - Score: 0.6327  Scores: [0.577246592914605, 0.6880942533933022]
INFO:__main__:Epoch 2 - Score: 0.6327  Scores: [0.577246592914605, 0.6880942533933022]


EVAL: [68/69] Loss: 0.8626(0.6164) 
Epoch: [2][120/757] Elapsed 0m 57s (remain 4m 59s) Loss: 0.5726(0.4650) Grad: 63649.9961  LR: 0.00001789  
Epoch: [2][140/757] Elapsed 1m 2s (remain 4m 35s) Loss: 0.6000(0.4684) Grad: 85794.6250  LR: 0.00001774  
Epoch: [2][160/757] Elapsed 1m 8s (remain 4m 12s) Loss: 0.4576(0.4703) Grad: 111967.5078  LR: 0.00001760  
Epoch: [2][180/757] Elapsed 1m 15s (remain 3m 59s) Loss: 0.5091(0.4679) Grad: 117580.5625  LR: 0.00001744  
Epoch: [2][200/757] Elapsed 1m 20s (remain 3m 43s) Loss: 0.3633(0.4682) Grad: 33121.9766  LR: 0.00001729  
EVAL: [0/69] Loss: 0.9162(0.9162) 
EVAL: [20/69] Loss: 0.7140(0.8377) 
EVAL: [40/69] Loss: 0.7650(0.8040) 
EVAL: [60/69] Loss: 0.6771(0.7990) 


Epoch 2 - avg_train_loss: 0.4682  avg_val_loss: 0.8111
INFO:__main__:Epoch 2 - avg_train_loss: 0.4682  avg_val_loss: 0.8111
Epoch 2 - Score: 0.8276  Scores: [0.698291471740013, 0.9569471521233143]
INFO:__main__:Epoch 2 - Score: 0.8276  Scores: [0.698291471740013, 0.9569471521233143]


EVAL: [68/69] Loss: 1.1410(0.8111) 
Epoch: [2][220/757] Elapsed 1m 46s (remain 4m 17s) Loss: 0.3301(0.4719) Grad: 88919.4219  LR: 0.00001713  
Epoch: [2][240/757] Elapsed 1m 51s (remain 3m 59s) Loss: 0.4307(0.4721) Grad: 119144.1484  LR: 0.00001696  
Epoch: [2][260/757] Elapsed 1m 57s (remain 3m 42s) Loss: 0.4600(0.4707) Grad: 108305.3828  LR: 0.00001680  
Epoch: [2][280/757] Elapsed 2m 3s (remain 3m 29s) Loss: 0.4384(0.4669) Grad: 108150.5156  LR: 0.00001663  
Epoch: [2][300/757] Elapsed 2m 9s (remain 3m 15s) Loss: 0.3670(0.4620) Grad: 88381.5234  LR: 0.00001645  
EVAL: [0/69] Loss: 0.8442(0.8442) 
EVAL: [20/69] Loss: 0.4403(0.5789) 
EVAL: [40/69] Loss: 0.5394(0.5556) 
EVAL: [60/69] Loss: 0.4706(0.5596) 


Epoch 2 - Save Best Score: 0.5847 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5847 Model


EVAL: [68/69] Loss: 0.7679(0.5671) 


Epoch 2 - avg_train_loss: 0.4620  avg_val_loss: 0.5671
INFO:__main__:Epoch 2 - avg_train_loss: 0.4620  avg_val_loss: 0.5671
Epoch 2 - Score: 0.5847  Scores: [0.499839020489442, 0.6695849672106007]
INFO:__main__:Epoch 2 - Score: 0.5847  Scores: [0.499839020489442, 0.6695849672106007]


Epoch: [2][320/757] Elapsed 2m 44s (remain 3m 42s) Loss: 0.4133(0.4613) Grad: 111139.0703  LR: 0.00001627  
Epoch: [2][340/757] Elapsed 2m 52s (remain 3m 30s) Loss: 0.4597(0.4628) Grad: 78878.0312  LR: 0.00001609  
Epoch: [2][360/757] Elapsed 2m 59s (remain 3m 17s) Loss: 0.5763(0.4634) Grad: 87678.0703  LR: 0.00001591  
Epoch: [2][380/757] Elapsed 3m 6s (remain 3m 4s) Loss: 0.5133(0.4632) Grad: 112476.2109  LR: 0.00001572  
Epoch: [2][400/757] Elapsed 3m 14s (remain 2m 52s) Loss: 0.4660(0.4635) Grad: 49936.1289  LR: 0.00001553  
EVAL: [0/69] Loss: 0.8596(0.8596) 
EVAL: [20/69] Loss: 0.5302(0.6327) 
EVAL: [40/69] Loss: 0.5317(0.6061) 
EVAL: [60/69] Loss: 0.4629(0.6045) 


Epoch 2 - avg_train_loss: 0.4635  avg_val_loss: 0.6106
INFO:__main__:Epoch 2 - avg_train_loss: 0.4635  avg_val_loss: 0.6106
Epoch 2 - Score: 0.6269  Scores: [0.5335470056543838, 0.7202977083597867]
INFO:__main__:Epoch 2 - Score: 0.6269  Scores: [0.5335470056543838, 0.7202977083597867]


EVAL: [68/69] Loss: 0.8163(0.6106) 
Epoch: [2][420/757] Elapsed 3m 39s (remain 2m 55s) Loss: 0.3523(0.4625) Grad: 68166.6172  LR: 0.00001534  
Epoch: [2][440/757] Elapsed 3m 45s (remain 2m 41s) Loss: 0.3688(0.4636) Grad: 16014.7070  LR: 0.00001514  
Epoch: [2][460/757] Elapsed 3m 51s (remain 2m 28s) Loss: 0.4042(0.4613) Grad: 132911.9531  LR: 0.00001494  
Epoch: [2][480/757] Elapsed 3m 58s (remain 2m 16s) Loss: 0.3723(0.4625) Grad: 99177.3359  LR: 0.00001474  
Epoch: [2][500/757] Elapsed 4m 3s (remain 2m 4s) Loss: 0.4070(0.4607) Grad: 90039.0469  LR: 0.00001454  
EVAL: [0/69] Loss: 0.8325(0.8325) 
EVAL: [20/69] Loss: 0.5719(0.6764) 
EVAL: [40/69] Loss: 0.5930(0.6477) 
EVAL: [60/69] Loss: 0.4994(0.6460) 


Epoch 2 - avg_train_loss: 0.4607  avg_val_loss: 0.6543
INFO:__main__:Epoch 2 - avg_train_loss: 0.4607  avg_val_loss: 0.6543
Epoch 2 - Score: 0.6700  Scores: [0.5812153989248743, 0.758703649557258]
INFO:__main__:Epoch 2 - Score: 0.6700  Scores: [0.5812153989248743, 0.758703649557258]


EVAL: [68/69] Loss: 0.9251(0.6543) 
Epoch: [2][520/757] Elapsed 4m 28s (remain 2m 1s) Loss: 0.4341(0.4590) Grad: 111377.7422  LR: 0.00001433  
Epoch: [2][540/757] Elapsed 4m 34s (remain 1m 49s) Loss: 0.4341(0.4582) Grad: 41992.0352  LR: 0.00001412  
Epoch: [2][560/757] Elapsed 4m 40s (remain 1m 37s) Loss: 0.8107(0.4589) Grad: 55153.1133  LR: 0.00001391  
Epoch: [2][580/757] Elapsed 4m 46s (remain 1m 26s) Loss: 0.3470(0.4589) Grad: 70645.5547  LR: 0.00001370  
Epoch: [2][600/757] Elapsed 4m 52s (remain 1m 16s) Loss: 0.2749(0.4587) Grad: 84761.2422  LR: 0.00001348  
EVAL: [0/69] Loss: 0.8122(0.8122) 
EVAL: [20/69] Loss: 0.4611(0.5869) 
EVAL: [40/69] Loss: 0.5229(0.5619) 
EVAL: [60/69] Loss: 0.4475(0.5644) 


Epoch 2 - avg_train_loss: 0.4587  avg_val_loss: 0.5725
INFO:__main__:Epoch 2 - avg_train_loss: 0.4587  avg_val_loss: 0.5725
Epoch 2 - Score: 0.5895  Scores: [0.5050405826899855, 0.6739346533024448]
INFO:__main__:Epoch 2 - Score: 0.5895  Scores: [0.5050405826899855, 0.6739346533024448]


EVAL: [68/69] Loss: 0.7872(0.5725) 
Epoch: [2][620/757] Elapsed 5m 17s (remain 1m 9s) Loss: 0.3757(0.4589) Grad: 128753.6250  LR: 0.00001327  
Epoch: [2][640/757] Elapsed 5m 24s (remain 0m 58s) Loss: 0.5301(0.4583) Grad: 80536.7891  LR: 0.00001305  
Epoch: [2][660/757] Elapsed 5m 29s (remain 0m 47s) Loss: 0.2939(0.4564) Grad: 91470.5547  LR: 0.00001283  
Epoch: [2][680/757] Elapsed 5m 34s (remain 0m 37s) Loss: 0.5152(0.4566) Grad: 124759.4688  LR: 0.00001261  
Epoch: [2][700/757] Elapsed 5m 41s (remain 0m 27s) Loss: 0.3355(0.4554) Grad: 36888.8828  LR: 0.00001238  
EVAL: [0/69] Loss: 0.8346(0.8346) 
EVAL: [20/69] Loss: 0.5502(0.6702) 
EVAL: [40/69] Loss: 0.6069(0.6412) 
EVAL: [60/69] Loss: 0.4915(0.6392) 


Epoch 2 - avg_train_loss: 0.4554  avg_val_loss: 0.6489
INFO:__main__:Epoch 2 - avg_train_loss: 0.4554  avg_val_loss: 0.6489
Epoch 2 - Score: 0.6652  Scores: [0.6131083048261786, 0.7172045831907733]
INFO:__main__:Epoch 2 - Score: 0.6652  Scores: [0.6131083048261786, 0.7172045831907733]


EVAL: [68/69] Loss: 0.8895(0.6489) 
Epoch: [2][720/757] Elapsed 6m 5s (remain 0m 18s) Loss: 0.4132(0.4552) Grad: 119591.4922  LR: 0.00001216  
Epoch: [2][740/757] Elapsed 6m 11s (remain 0m 8s) Loss: 0.5415(0.4561) Grad: 111069.4609  LR: 0.00001193  
Epoch: [2][756/757] Elapsed 6m 16s (remain 0m 0s) Loss: 0.5606(0.4577) Grad: 142479.6094  LR: 0.00001175  
EVAL: [0/69] Loss: 0.9146(0.9146) 
EVAL: [20/69] Loss: 0.6503(0.7479) 
EVAL: [40/69] Loss: 0.6794(0.7256) 
EVAL: [60/69] Loss: 0.5680(0.7210) 


Epoch 2 - avg_train_loss: 0.4577  avg_val_loss: 0.7293
INFO:__main__:Epoch 2 - avg_train_loss: 0.4577  avg_val_loss: 0.7293
Epoch 2 - Score: 0.7438  Scores: [0.5914537965768178, 0.8961043781784581]
INFO:__main__:Epoch 2 - Score: 0.7438  Scores: [0.5914537965768178, 0.8961043781784581]


EVAL: [68/69] Loss: 0.9873(0.7293) 
Epoch: [3][0/757] Elapsed 0m 0s (remain 7m 26s) Loss: 0.5661(0.5661) Grad: inf  LR: 0.00001174  
Epoch: [3][20/757] Elapsed 0m 6s (remain 3m 44s) Loss: 0.3756(0.4305) Grad: 58231.0703  LR: 0.00001151  
Epoch: [3][40/757] Elapsed 0m 12s (remain 3m 41s) Loss: 0.4826(0.4172) Grad: 65025.7930  LR: 0.00001129  
Epoch: [3][60/757] Elapsed 0m 17s (remain 3m 24s) Loss: 0.3408(0.4056) Grad: 96298.7344  LR: 0.00001106  
Epoch: [3][80/757] Elapsed 0m 24s (remain 3m 26s) Loss: 0.4412(0.4000) Grad: 20876.6367  LR: 0.00001083  
Epoch: [3][100/757] Elapsed 0m 31s (remain 3m 23s) Loss: 0.3132(0.3983) Grad: 83953.1328  LR: 0.00001060  
EVAL: [0/69] Loss: 0.8304(0.8304) 
EVAL: [20/69] Loss: 0.5255(0.6291) 
EVAL: [40/69] Loss: 0.5526(0.6062) 
EVAL: [60/69] Loss: 0.4659(0.6071) 
EVAL: [68/69] Loss: 0.8226(0.6142) 


Epoch 3 - avg_train_loss: 0.3983  avg_val_loss: 0.6142
INFO:__main__:Epoch 3 - avg_train_loss: 0.3983  avg_val_loss: 0.6142
Epoch 3 - Score: 0.6302  Scores: [0.597555554117201, 0.6628500659089074]
INFO:__main__:Epoch 3 - Score: 0.6302  Scores: [0.597555554117201, 0.6628500659089074]


Epoch: [3][120/757] Elapsed 0m 57s (remain 5m 4s) Loss: 0.5146(0.3959) Grad: 50183.6484  LR: 0.00001037  
Epoch: [3][140/757] Elapsed 1m 4s (remain 4m 40s) Loss: 0.5124(0.4010) Grad: 98125.8828  LR: 0.00001014  
Epoch: [3][160/757] Elapsed 1m 9s (remain 4m 18s) Loss: 0.3699(0.4077) Grad: 49936.0469  LR: 0.00000991  
Epoch: [3][180/757] Elapsed 1m 16s (remain 4m 3s) Loss: 0.2573(0.4042) Grad: 114750.6484  LR: 0.00000968  
Epoch: [3][200/757] Elapsed 1m 22s (remain 3m 47s) Loss: 0.6557(0.4027) Grad: 114800.0156  LR: 0.00000945  
EVAL: [0/69] Loss: 0.8625(0.8625) 
EVAL: [20/69] Loss: 0.5370(0.6380) 
EVAL: [40/69] Loss: 0.5747(0.6171) 
EVAL: [60/69] Loss: 0.5017(0.6213) 


Epoch 3 - avg_train_loss: 0.4027  avg_val_loss: 0.6268
INFO:__main__:Epoch 3 - avg_train_loss: 0.4027  avg_val_loss: 0.6268
Epoch 3 - Score: 0.6429  Scores: [0.6432834388767702, 0.6424885186575373]
INFO:__main__:Epoch 3 - Score: 0.6429  Scores: [0.6432834388767702, 0.6424885186575373]


EVAL: [68/69] Loss: 0.8047(0.6268) 
Epoch: [3][220/757] Elapsed 1m 47s (remain 4m 21s) Loss: 0.4329(0.3999) Grad: 62004.2617  LR: 0.00000922  
Epoch: [3][240/757] Elapsed 1m 53s (remain 4m 3s) Loss: 0.4870(0.3992) Grad: 113558.1641  LR: 0.00000899  
Epoch: [3][260/757] Elapsed 1m 59s (remain 3m 46s) Loss: 0.3859(0.3980) Grad: 95498.9844  LR: 0.00000876  
Epoch: [3][280/757] Elapsed 2m 5s (remain 3m 32s) Loss: 0.4758(0.3996) Grad: 115128.0938  LR: 0.00000853  
Epoch: [3][300/757] Elapsed 2m 11s (remain 3m 19s) Loss: 0.2705(0.3970) Grad: 37827.0742  LR: 0.00000830  
EVAL: [0/69] Loss: 0.8549(0.8549) 
EVAL: [20/69] Loss: 0.4967(0.6075) 
EVAL: [40/69] Loss: 0.5346(0.5850) 
EVAL: [60/69] Loss: 0.4561(0.5877) 


Epoch 3 - avg_train_loss: 0.3970  avg_val_loss: 0.5935
INFO:__main__:Epoch 3 - avg_train_loss: 0.3970  avg_val_loss: 0.5935
Epoch 3 - Score: 0.6106  Scores: [0.554858686077762, 0.6663781200644767]
INFO:__main__:Epoch 3 - Score: 0.6106  Scores: [0.554858686077762, 0.6663781200644767]


EVAL: [68/69] Loss: 0.7913(0.5935) 
Epoch: [3][320/757] Elapsed 2m 36s (remain 3m 32s) Loss: 0.4606(0.3957) Grad: 114817.8516  LR: 0.00000808  
Epoch: [3][340/757] Elapsed 2m 41s (remain 3m 17s) Loss: 0.2713(0.3936) Grad: 55370.0000  LR: 0.00000785  
Epoch: [3][360/757] Elapsed 2m 47s (remain 3m 3s) Loss: 0.3282(0.3937) Grad: 57053.3984  LR: 0.00000763  
Epoch: [3][380/757] Elapsed 2m 54s (remain 2m 51s) Loss: 0.4813(0.3951) Grad: 161484.1250  LR: 0.00000740  
Epoch: [3][400/757] Elapsed 3m 1s (remain 2m 41s) Loss: 0.4269(0.3951) Grad: 83393.9062  LR: 0.00000718  
EVAL: [0/69] Loss: 0.9043(0.9043) 
EVAL: [20/69] Loss: 0.4505(0.5495) 
EVAL: [40/69] Loss: 0.4671(0.5315) 
EVAL: [60/69] Loss: 0.4579(0.5400) 
EVAL: [68/69] Loss: 0.6464(0.5434) 


Epoch 3 - Save Best Score: 0.5620 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.5620 Model
Epoch 3 - avg_train_loss: 0.3951  avg_val_loss: 0.5434
INFO:__main__:Epoch 3 - avg_train_loss: 0.3951  avg_val_loss: 0.5434
Epoch 3 - Score: 0.5620  Scores: [0.4696775964961353, 0.6542880645733561]
INFO:__main__:Epoch 3 - Score: 0.5620  Scores: [0.4696775964961353, 0.6542880645733561]


Epoch: [3][420/757] Elapsed 3m 34s (remain 2m 51s) Loss: 0.3928(0.3957) Grad: 74931.4297  LR: 0.00000696  
Epoch: [3][440/757] Elapsed 3m 42s (remain 2m 39s) Loss: 0.3961(0.3946) Grad: 37368.2422  LR: 0.00000674  
Epoch: [3][460/757] Elapsed 3m 49s (remain 2m 27s) Loss: 0.4414(0.3940) Grad: 134098.4688  LR: 0.00000653  
Epoch: [3][480/757] Elapsed 3m 56s (remain 2m 15s) Loss: 0.3727(0.3927) Grad: 110700.4062  LR: 0.00000631  
Epoch: [3][500/757] Elapsed 4m 3s (remain 2m 4s) Loss: 0.3192(0.3910) Grad: 29681.2070  LR: 0.00000610  
EVAL: [0/69] Loss: 0.8725(0.8725) 
EVAL: [20/69] Loss: 0.4914(0.6234) 
EVAL: [40/69] Loss: 0.5639(0.6010) 
EVAL: [60/69] Loss: 0.4972(0.6063) 


Epoch 3 - avg_train_loss: 0.3910  avg_val_loss: 0.6109
INFO:__main__:Epoch 3 - avg_train_loss: 0.3910  avg_val_loss: 0.6109
Epoch 3 - Score: 0.6278  Scores: [0.6042220770717956, 0.6513361584523599]
INFO:__main__:Epoch 3 - Score: 0.6278  Scores: [0.6042220770717956, 0.6513361584523599]


EVAL: [68/69] Loss: 0.7569(0.6109) 
Epoch: [3][520/757] Elapsed 4m 28s (remain 2m 1s) Loss: 0.3116(0.3907) Grad: 42972.9180  LR: 0.00000589  
Epoch: [3][540/757] Elapsed 4m 34s (remain 1m 49s) Loss: 0.4449(0.3894) Grad: 114525.5078  LR: 0.00000568  
Epoch: [3][560/757] Elapsed 4m 40s (remain 1m 37s) Loss: 0.4038(0.3886) Grad: 61319.4336  LR: 0.00000547  
Epoch: [3][580/757] Elapsed 4m 46s (remain 1m 26s) Loss: 0.3001(0.3884) Grad: 70362.5156  LR: 0.00000527  
Epoch: [3][600/757] Elapsed 4m 53s (remain 1m 16s) Loss: 0.3737(0.3871) Grad: 100383.2422  LR: 0.00000507  
EVAL: [0/69] Loss: 0.8602(0.8602) 
EVAL: [20/69] Loss: 0.5182(0.6405) 
EVAL: [40/69] Loss: 0.5832(0.6156) 
EVAL: [60/69] Loss: 0.4920(0.6198) 


Epoch 3 - avg_train_loss: 0.3871  avg_val_loss: 0.6258
INFO:__main__:Epoch 3 - avg_train_loss: 0.3871  avg_val_loss: 0.6258
Epoch 3 - Score: 0.6434  Scores: [0.6406655451146763, 0.6462191635233121]
INFO:__main__:Epoch 3 - Score: 0.6434  Scores: [0.6406655451146763, 0.6462191635233121]


EVAL: [68/69] Loss: 0.8352(0.6258) 
Epoch: [3][620/757] Elapsed 5m 17s (remain 1m 9s) Loss: 0.2625(0.3862) Grad: 99973.3984  LR: 0.00000487  
Epoch: [3][640/757] Elapsed 5m 23s (remain 0m 58s) Loss: 0.3556(0.3860) Grad: 23455.4297  LR: 0.00000467  
Epoch: [3][660/757] Elapsed 5m 28s (remain 0m 47s) Loss: 0.4740(0.3857) Grad: 91306.2578  LR: 0.00000448  
Epoch: [3][680/757] Elapsed 5m 35s (remain 0m 37s) Loss: 0.2700(0.3850) Grad: 83987.5781  LR: 0.00000429  
Epoch: [3][700/757] Elapsed 5m 42s (remain 0m 27s) Loss: 0.4718(0.3856) Grad: 141228.2344  LR: 0.00000410  
EVAL: [0/69] Loss: 0.8485(0.8485) 
EVAL: [20/69] Loss: 0.4512(0.5769) 
EVAL: [40/69] Loss: 0.4852(0.5547) 
EVAL: [60/69] Loss: 0.4543(0.5637) 


Epoch 3 - avg_train_loss: 0.3856  avg_val_loss: 0.5684
INFO:__main__:Epoch 3 - avg_train_loss: 0.3856  avg_val_loss: 0.5684
Epoch 3 - Score: 0.5855  Scores: [0.5255886211415906, 0.6454563075600627]
INFO:__main__:Epoch 3 - Score: 0.5855  Scores: [0.5255886211415906, 0.6454563075600627]


EVAL: [68/69] Loss: 0.7084(0.5684) 
Epoch: [3][720/757] Elapsed 6m 8s (remain 0m 18s) Loss: 0.4468(0.3864) Grad: 122619.6250  LR: 0.00000392  
Epoch: [3][740/757] Elapsed 6m 14s (remain 0m 8s) Loss: 0.3727(0.3852) Grad: 62898.5547  LR: 0.00000373  
Epoch: [3][756/757] Elapsed 6m 19s (remain 0m 0s) Loss: 0.2626(0.3850) Grad: 44894.0039  LR: 0.00000359  
EVAL: [0/69] Loss: 0.8534(0.8534) 
EVAL: [20/69] Loss: 0.5310(0.6643) 
EVAL: [40/69] Loss: 0.6032(0.6373) 
EVAL: [60/69] Loss: 0.5083(0.6415) 
EVAL: [68/69] Loss: 0.8447(0.6478) 


Epoch 3 - avg_train_loss: 0.3850  avg_val_loss: 0.6478
INFO:__main__:Epoch 3 - avg_train_loss: 0.3850  avg_val_loss: 0.6478
Epoch 3 - Score: 0.6648  Scores: [0.6742329929956432, 0.6553286727514578]
INFO:__main__:Epoch 3 - Score: 0.6648  Scores: [0.6742329929956432, 0.6553286727514578]


Epoch: [4][0/757] Elapsed 0m 0s (remain 9m 47s) Loss: 0.3135(0.3135) Grad: inf  LR: 0.00000358  
Epoch: [4][20/757] Elapsed 0m 7s (remain 4m 16s) Loss: 0.4162(0.3277) Grad: 36547.0898  LR: 0.00000341  
Epoch: [4][40/757] Elapsed 0m 12s (remain 3m 37s) Loss: 0.2305(0.3183) Grad: 80999.4219  LR: 0.00000324  
Epoch: [4][60/757] Elapsed 0m 18s (remain 3m 35s) Loss: 0.2009(0.3262) Grad: 31216.2910  LR: 0.00000307  
Epoch: [4][80/757] Elapsed 0m 24s (remain 3m 22s) Loss: 0.3472(0.3256) Grad: 131092.8281  LR: 0.00000290  
Epoch: [4][100/757] Elapsed 0m 29s (remain 3m 11s) Loss: 0.3021(0.3272) Grad: 94105.4844  LR: 0.00000274  
EVAL: [0/69] Loss: 0.8587(0.8587) 
EVAL: [20/69] Loss: 0.5267(0.6390) 
EVAL: [40/69] Loss: 0.5646(0.6173) 
EVAL: [60/69] Loss: 0.4804(0.6219) 
EVAL: [68/69] Loss: 0.7900(0.6260) 


Epoch 4 - avg_train_loss: 0.3272  avg_val_loss: 0.6260
INFO:__main__:Epoch 4 - avg_train_loss: 0.3272  avg_val_loss: 0.6260
Epoch 4 - Score: 0.6428  Scores: [0.6399081021178176, 0.6456855686475346]
INFO:__main__:Epoch 4 - Score: 0.6428  Scores: [0.6399081021178176, 0.6456855686475346]


Epoch: [4][120/757] Elapsed 0m 54s (remain 4m 47s) Loss: 0.2634(0.3270) Grad: 78197.8594  LR: 0.00000259  
Epoch: [4][140/757] Elapsed 1m 0s (remain 4m 22s) Loss: 0.2583(0.3241) Grad: 112097.7969  LR: 0.00000243  
Epoch: [4][160/757] Elapsed 1m 6s (remain 4m 5s) Loss: 0.3826(0.3218) Grad: 35503.4492  LR: 0.00000229  
Epoch: [4][180/757] Elapsed 1m 12s (remain 3m 50s) Loss: 0.2777(0.3198) Grad: 89968.4453  LR: 0.00000214  
Epoch: [4][200/757] Elapsed 1m 19s (remain 3m 40s) Loss: 0.2382(0.3190) Grad: 90048.8281  LR: 0.00000200  
EVAL: [0/69] Loss: 0.8498(0.8498) 
EVAL: [20/69] Loss: 0.5186(0.6494) 
EVAL: [40/69] Loss: 0.5774(0.6244) 
EVAL: [60/69] Loss: 0.4857(0.6283) 
EVAL: [68/69] Loss: 0.8197(0.6339) 


Epoch 4 - avg_train_loss: 0.3190  avg_val_loss: 0.6339
INFO:__main__:Epoch 4 - avg_train_loss: 0.3190  avg_val_loss: 0.6339
Epoch 4 - Score: 0.6508  Scores: [0.6409311135406903, 0.6605965254943751]
INFO:__main__:Epoch 4 - Score: 0.6508  Scores: [0.6409311135406903, 0.6605965254943751]


Epoch: [4][220/757] Elapsed 1m 45s (remain 4m 15s) Loss: 0.2903(0.3203) Grad: 34002.8281  LR: 0.00000187  
Epoch: [4][240/757] Elapsed 1m 50s (remain 3m 56s) Loss: 0.3546(0.3194) Grad: 100486.3359  LR: 0.00000173  
Epoch: [4][260/757] Elapsed 1m 56s (remain 3m 42s) Loss: 0.2973(0.3196) Grad: 69462.8125  LR: 0.00000161  
Epoch: [4][280/757] Elapsed 2m 2s (remain 3m 27s) Loss: 0.1777(0.3202) Grad: 35124.6289  LR: 0.00000148  
Epoch: [4][300/757] Elapsed 2m 9s (remain 3m 15s) Loss: 0.2926(0.3225) Grad: 81486.5000  LR: 0.00000136  
EVAL: [0/69] Loss: 0.8330(0.8330) 
EVAL: [20/69] Loss: 0.4789(0.6069) 
EVAL: [40/69] Loss: 0.5310(0.5825) 
EVAL: [60/69] Loss: 0.4692(0.5891) 
EVAL: [68/69] Loss: 0.7590(0.5944) 


Epoch 4 - avg_train_loss: 0.3225  avg_val_loss: 0.5944
INFO:__main__:Epoch 4 - avg_train_loss: 0.3225  avg_val_loss: 0.5944
Epoch 4 - Score: 0.6114  Scores: [0.5757918663349642, 0.6469777333143748]
INFO:__main__:Epoch 4 - Score: 0.6114  Scores: [0.5757918663349642, 0.6469777333143748]


Epoch: [4][320/757] Elapsed 2m 34s (remain 3m 30s) Loss: 0.2429(0.3212) Grad: 34225.3555  LR: 0.00000125  
Epoch: [4][340/757] Elapsed 2m 40s (remain 3m 15s) Loss: 0.4126(0.3215) Grad: 34220.0352  LR: 0.00000114  
Epoch: [4][360/757] Elapsed 2m 46s (remain 3m 3s) Loss: 0.2441(0.3202) Grad: 86966.6094  LR: 0.00000104  
Epoch: [4][380/757] Elapsed 2m 52s (remain 2m 50s) Loss: 0.2777(0.3206) Grad: 14088.7539  LR: 0.00000094  
Epoch: [4][400/757] Elapsed 2m 57s (remain 2m 37s) Loss: 0.2316(0.3190) Grad: 103651.4297  LR: 0.00000084  
EVAL: [0/69] Loss: 0.8436(0.8436) 
EVAL: [20/69] Loss: 0.5052(0.6297) 
EVAL: [40/69] Loss: 0.5506(0.6053) 
EVAL: [60/69] Loss: 0.4799(0.6113) 
EVAL: [68/69] Loss: 0.7908(0.6165) 


Epoch 4 - avg_train_loss: 0.3190  avg_val_loss: 0.6165
INFO:__main__:Epoch 4 - avg_train_loss: 0.3190  avg_val_loss: 0.6165
Epoch 4 - Score: 0.6335  Scores: [0.6202321773305908, 0.6468487448022954]
INFO:__main__:Epoch 4 - Score: 0.6335  Scores: [0.6202321773305908, 0.6468487448022954]


Epoch: [4][420/757] Elapsed 3m 22s (remain 2m 41s) Loss: 0.2748(0.3176) Grad: 75766.1484  LR: 0.00000075  
Epoch: [4][440/757] Elapsed 3m 28s (remain 2m 29s) Loss: 0.2355(0.3162) Grad: 46938.3320  LR: 0.00000067  
Epoch: [4][460/757] Elapsed 3m 34s (remain 2m 17s) Loss: 0.3780(0.3172) Grad: 112196.9141  LR: 0.00000059  
Epoch: [4][480/757] Elapsed 3m 40s (remain 2m 6s) Loss: 0.2972(0.3168) Grad: 18754.9102  LR: 0.00000051  
Epoch: [4][500/757] Elapsed 3m 45s (remain 1m 55s) Loss: 0.3231(0.3149) Grad: 111596.7266  LR: 0.00000044  
EVAL: [0/69] Loss: 0.8394(0.8394) 
EVAL: [20/69] Loss: 0.5185(0.6332) 
EVAL: [40/69] Loss: 0.5597(0.6098) 
EVAL: [60/69] Loss: 0.4921(0.6156) 
EVAL: [68/69] Loss: 0.8019(0.6210) 


Epoch 4 - avg_train_loss: 0.3149  avg_val_loss: 0.6210
INFO:__main__:Epoch 4 - avg_train_loss: 0.3149  avg_val_loss: 0.6210
Epoch 4 - Score: 0.6376  Scores: [0.6328071942190491, 0.6423386228267831]
INFO:__main__:Epoch 4 - Score: 0.6376  Scores: [0.6328071942190491, 0.6423386228267831]


Epoch: [4][520/757] Elapsed 4m 11s (remain 1m 53s) Loss: 0.4002(0.3133) Grad: 103209.5859  LR: 0.00000038  
Epoch: [4][540/757] Elapsed 4m 17s (remain 1m 42s) Loss: 0.2956(0.3125) Grad: 28921.5957  LR: 0.00000032  
Epoch: [4][560/757] Elapsed 4m 24s (remain 1m 32s) Loss: 0.1525(0.3120) Grad: 74576.2266  LR: 0.00000026  
Epoch: [4][580/757] Elapsed 4m 30s (remain 1m 21s) Loss: 0.2682(0.3110) Grad: 83888.6250  LR: 0.00000021  
Epoch: [4][600/757] Elapsed 4m 35s (remain 1m 11s) Loss: 0.2101(0.3103) Grad: 72602.2734  LR: 0.00000017  
EVAL: [0/69] Loss: 0.8395(0.8395) 
EVAL: [20/69] Loss: 0.5208(0.6372) 
EVAL: [40/69] Loss: 0.5633(0.6137) 
EVAL: [60/69] Loss: 0.4945(0.6194) 
EVAL: [68/69] Loss: 0.8074(0.6248) 


Epoch 4 - avg_train_loss: 0.3103  avg_val_loss: 0.6248
INFO:__main__:Epoch 4 - avg_train_loss: 0.3103  avg_val_loss: 0.6248
Epoch 4 - Score: 0.6414  Scores: [0.6391458203580318, 0.6435856889320621]
INFO:__main__:Epoch 4 - Score: 0.6414  Scores: [0.6391458203580318, 0.6435856889320621]


Epoch: [4][620/757] Elapsed 5m 1s (remain 1m 5s) Loss: 0.3084(0.3096) Grad: 29691.9727  LR: 0.00000013  
Epoch: [4][640/757] Elapsed 5m 7s (remain 0m 55s) Loss: 0.2712(0.3095) Grad: 78196.9062  LR: 0.00000009  
Epoch: [4][660/757] Elapsed 5m 13s (remain 0m 45s) Loss: 0.2514(0.3093) Grad: 77824.9375  LR: 0.00000006  
Epoch: [4][680/757] Elapsed 5m 18s (remain 0m 35s) Loss: 0.2562(0.3092) Grad: 52896.4180  LR: 0.00000004  
Epoch: [4][700/757] Elapsed 5m 25s (remain 0m 25s) Loss: 0.3179(0.3089) Grad: 53603.5898  LR: 0.00000002  
EVAL: [0/69] Loss: 0.8367(0.8367) 
EVAL: [20/69] Loss: 0.5133(0.6310) 
EVAL: [40/69] Loss: 0.5563(0.6075) 
EVAL: [60/69] Loss: 0.4897(0.6134) 
EVAL: [68/69] Loss: 0.7991(0.6189) 


Epoch 4 - avg_train_loss: 0.3089  avg_val_loss: 0.6189
INFO:__main__:Epoch 4 - avg_train_loss: 0.3089  avg_val_loss: 0.6189
Epoch 4 - Score: 0.6355  Scores: [0.6276457922114124, 0.6433406051610769]
INFO:__main__:Epoch 4 - Score: 0.6355  Scores: [0.6276457922114124, 0.6433406051610769]


Epoch: [4][720/757] Elapsed 5m 50s (remain 0m 17s) Loss: 0.1940(0.3089) Grad: 25051.0527  LR: 0.00000001  
Epoch: [4][740/757] Elapsed 5m 55s (remain 0m 7s) Loss: 0.2684(0.3085) Grad: 86207.8047  LR: 0.00000000  
Epoch: [4][756/757] Elapsed 6m 1s (remain 0m 0s) Loss: 0.4032(0.3090) Grad: 48260.7109  LR: 0.00000000  
EVAL: [0/69] Loss: 0.8363(0.8363) 
EVAL: [20/69] Loss: 0.5128(0.6305) 
EVAL: [40/69] Loss: 0.5555(0.6070) 
EVAL: [60/69] Loss: 0.4887(0.6129) 
EVAL: [68/69] Loss: 0.7987(0.6183) 


Epoch 4 - avg_train_loss: 0.3090  avg_val_loss: 0.6183
INFO:__main__:Epoch 4 - avg_train_loss: 0.3090  avg_val_loss: 0.6183
Epoch 4 - Score: 0.6349  Scores: [0.6264878019677703, 0.6433960695349369]
INFO:__main__:Epoch 4 - Score: 0.6349  Scores: [0.6264878019677703, 0.6433960695349369]
Score: 0.5620  Scores: [0.4696775964961353, 0.6542880645733561]
INFO:__main__:Score: 0.5620  Scores: [0.4696775964961353, 0.6542880645733561]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_drop

Epoch: [1][0/638] Elapsed 0m 0s (remain 6m 29s) Loss: 1.7928(1.7928) Grad: inf  LR: 0.00000008  
Epoch: [1][20/638] Elapsed 0m 6s (remain 3m 8s) Loss: 0.9242(1.4728) Grad: 73931.8359  LR: 0.00000165  
Epoch: [1][40/638] Elapsed 0m 12s (remain 3m 7s) Loss: 0.6889(1.2745) Grad: 124041.9062  LR: 0.00000322  
Epoch: [1][60/638] Elapsed 0m 18s (remain 2m 53s) Loss: 0.3317(1.1080) Grad: 41657.4570  LR: 0.00000478  
Epoch: [1][80/638] Elapsed 0m 23s (remain 2m 40s) Loss: 0.6423(0.9876) Grad: 147773.2969  LR: 0.00000635  
Epoch: [1][100/638] Elapsed 0m 29s (remain 2m 37s) Loss: 0.6315(0.9351) Grad: 104223.5312  LR: 0.00000792  
EVAL: [0/129] Loss: 0.7123(0.7123) 
EVAL: [20/129] Loss: 0.6607(0.6891) 
EVAL: [40/129] Loss: 0.5857(0.6672) 
EVAL: [60/129] Loss: 0.7722(0.6796) 
EVAL: [80/129] Loss: 0.5899(0.6735) 
EVAL: [100/129] Loss: 0.8866(0.6748) 
EVAL: [120/129] Loss: 0.7514(0.6767) 


Epoch 1 - Save Best Score: 0.6877 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6877 Model


EVAL: [128/129] Loss: 0.5801(0.6759) 


Epoch 1 - avg_train_loss: 0.9351  avg_val_loss: 0.6759
INFO:__main__:Epoch 1 - avg_train_loss: 0.9351  avg_val_loss: 0.6759
Epoch 1 - Score: 0.6877  Scores: [0.5941866172989504, 0.7813007276528336]
INFO:__main__:Epoch 1 - Score: 0.6877  Scores: [0.5941866172989504, 0.7813007276528336]


Epoch: [1][120/638] Elapsed 1m 24s (remain 5m 59s) Loss: 0.8977(0.8900) Grad: 122112.6328  LR: 0.00000949  
Epoch: [1][140/638] Elapsed 1m 32s (remain 5m 27s) Loss: 0.7931(0.8594) Grad: 156009.8750  LR: 0.00001106  
Epoch: [1][160/638] Elapsed 1m 40s (remain 4m 57s) Loss: 0.5448(0.8269) Grad: 93657.6406  LR: 0.00001263  
Epoch: [1][180/638] Elapsed 1m 48s (remain 4m 34s) Loss: 1.1154(0.8010) Grad: 157627.5000  LR: 0.00001420  
Epoch: [1][200/638] Elapsed 1m 55s (remain 4m 11s) Loss: 0.5107(0.7825) Grad: 44006.8945  LR: 0.00001576  
EVAL: [0/129] Loss: 0.5044(0.5044) 
EVAL: [20/129] Loss: 0.4473(0.5436) 
EVAL: [40/129] Loss: 0.4501(0.5286) 
EVAL: [60/129] Loss: 0.6141(0.5355) 
EVAL: [80/129] Loss: 0.4873(0.5327) 
EVAL: [100/129] Loss: 0.7122(0.5326) 
EVAL: [120/129] Loss: 0.4753(0.5315) 


Epoch 1 - Save Best Score: 0.5402 Model


EVAL: [128/129] Loss: 0.4862(0.5307) 


INFO:__main__:Epoch 1 - Save Best Score: 0.5402 Model
Epoch 1 - avg_train_loss: 0.7825  avg_val_loss: 0.5307
INFO:__main__:Epoch 1 - avg_train_loss: 0.7825  avg_val_loss: 0.5307
Epoch 1 - Score: 0.5402  Scores: [0.4684724884934052, 0.6120119689577687]
INFO:__main__:Epoch 1 - Score: 0.5402  Scores: [0.4684724884934052, 0.6120119689577687]


Epoch: [1][220/638] Elapsed 2m 46s (remain 5m 13s) Loss: 0.7542(0.7640) Grad: 98741.1875  LR: 0.00001733  
Epoch: [1][240/638] Elapsed 2m 55s (remain 4m 48s) Loss: 0.7386(0.7526) Grad: 85809.2188  LR: 0.00001890  
Epoch: [1][260/638] Elapsed 3m 2s (remain 4m 23s) Loss: 0.5070(0.7391) Grad: 87688.2422  LR: 0.00002000  
Epoch: [1][280/638] Elapsed 3m 11s (remain 4m 3s) Loss: 0.5572(0.7253) Grad: 96861.3906  LR: 0.00001999  
Epoch: [1][300/638] Elapsed 3m 17s (remain 3m 41s) Loss: 0.5408(0.7149) Grad: 132200.6406  LR: 0.00001998  
EVAL: [0/129] Loss: 0.4753(0.4753) 
EVAL: [20/129] Loss: 0.4291(0.5637) 
EVAL: [40/129] Loss: 0.4809(0.5481) 
EVAL: [60/129] Loss: 0.5972(0.5527) 
EVAL: [80/129] Loss: 0.5422(0.5497) 
EVAL: [100/129] Loss: 0.6959(0.5490) 
EVAL: [120/129] Loss: 0.5244(0.5496) 
EVAL: [128/129] Loss: 0.5758(0.5487) 


Epoch 1 - avg_train_loss: 0.7149  avg_val_loss: 0.5487
INFO:__main__:Epoch 1 - avg_train_loss: 0.7149  avg_val_loss: 0.5487
Epoch 1 - Score: 0.5568  Scores: [0.5038539849549115, 0.6096580356735963]
INFO:__main__:Epoch 1 - Score: 0.5568  Scores: [0.5038539849549115, 0.6096580356735963]


Epoch: [1][320/638] Elapsed 3m 57s (remain 3m 54s) Loss: 0.5633(0.7093) Grad: 103164.0469  LR: 0.00001996  
Epoch: [1][340/638] Elapsed 4m 3s (remain 3m 32s) Loss: 0.5155(0.6997) Grad: 145962.6094  LR: 0.00001993  
Epoch: [1][360/638] Elapsed 4m 9s (remain 3m 11s) Loss: 0.8483(0.6977) Grad: 180647.0625  LR: 0.00001990  
Epoch: [1][380/638] Elapsed 4m 15s (remain 2m 52s) Loss: 0.6629(0.6936) Grad: 45795.5312  LR: 0.00001985  
Epoch: [1][400/638] Elapsed 4m 22s (remain 2m 35s) Loss: 0.6918(0.6878) Grad: 141685.2344  LR: 0.00001980  
EVAL: [0/129] Loss: 0.4545(0.4545) 
EVAL: [20/129] Loss: 0.4114(0.5311) 
EVAL: [40/129] Loss: 0.4785(0.5203) 
EVAL: [60/129] Loss: 0.5926(0.5266) 
EVAL: [80/129] Loss: 0.5355(0.5260) 
EVAL: [100/129] Loss: 0.6538(0.5258) 
EVAL: [120/129] Loss: 0.5210(0.5250) 


Epoch 1 - Save Best Score: 0.5313 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5313 Model


EVAL: [128/129] Loss: 0.5796(0.5240) 


Epoch 1 - avg_train_loss: 0.6878  avg_val_loss: 0.5240
INFO:__main__:Epoch 1 - avg_train_loss: 0.6878  avg_val_loss: 0.5240
Epoch 1 - Score: 0.5313  Scores: [0.44217592373661085, 0.6204364272345684]
INFO:__main__:Epoch 1 - Score: 0.5313  Scores: [0.44217592373661085, 0.6204364272345684]


Epoch: [1][420/638] Elapsed 5m 17s (remain 2m 43s) Loss: 0.6544(0.6812) Grad: 36524.0195  LR: 0.00001974  
Epoch: [1][440/638] Elapsed 5m 25s (remain 2m 25s) Loss: 0.7276(0.6767) Grad: 129717.0938  LR: 0.00001968  
Epoch: [1][460/638] Elapsed 5m 31s (remain 2m 7s) Loss: 0.5094(0.6734) Grad: 91582.2031  LR: 0.00001961  
Epoch: [1][480/638] Elapsed 5m 40s (remain 1m 51s) Loss: 0.5858(0.6681) Grad: 168483.5469  LR: 0.00001953  
Epoch: [1][500/638] Elapsed 5m 47s (remain 1m 34s) Loss: 0.4493(0.6616) Grad: 99144.8906  LR: 0.00001944  
EVAL: [0/129] Loss: 0.5187(0.5187) 
EVAL: [20/129] Loss: 0.4726(0.6092) 
EVAL: [40/129] Loss: 0.5382(0.6018) 
EVAL: [60/129] Loss: 0.6553(0.6074) 
EVAL: [80/129] Loss: 0.6020(0.6062) 
EVAL: [100/129] Loss: 0.7086(0.6061) 
EVAL: [120/129] Loss: 0.5642(0.6045) 


Epoch 1 - avg_train_loss: 0.6616  avg_val_loss: 0.6046


EVAL: [128/129] Loss: 0.5971(0.6046) 


INFO:__main__:Epoch 1 - avg_train_loss: 0.6616  avg_val_loss: 0.6046
Epoch 1 - Score: 0.6117  Scores: [0.6585878652738989, 0.5648014885258523]
INFO:__main__:Epoch 1 - Score: 0.6117  Scores: [0.6585878652738989, 0.5648014885258523]


Epoch: [1][520/638] Elapsed 6m 27s (remain 1m 26s) Loss: 0.5106(0.6574) Grad: 69607.3281  LR: 0.00001935  
Epoch: [1][540/638] Elapsed 6m 33s (remain 1m 10s) Loss: 0.3942(0.6535) Grad: 96865.4453  LR: 0.00001925  
Epoch: [1][560/638] Elapsed 6m 39s (remain 0m 54s) Loss: 0.4718(0.6493) Grad: 98459.2734  LR: 0.00001914  
Epoch: [1][580/638] Elapsed 6m 45s (remain 0m 39s) Loss: 0.4483(0.6443) Grad: 55961.8398  LR: 0.00001902  
Epoch: [1][600/638] Elapsed 6m 50s (remain 0m 25s) Loss: 0.3310(0.6396) Grad: 121113.6406  LR: 0.00001890  
EVAL: [0/129] Loss: 0.4340(0.4340) 
EVAL: [20/129] Loss: 0.4905(0.6027) 
EVAL: [40/129] Loss: 0.5478(0.5985) 
EVAL: [60/129] Loss: 0.6570(0.6027) 
EVAL: [80/129] Loss: 0.5889(0.6022) 
EVAL: [100/129] Loss: 0.7277(0.6001) 
EVAL: [120/129] Loss: 0.4256(0.5978) 
EVAL: [128/129] Loss: 0.4926(0.5978) 


Epoch 1 - avg_train_loss: 0.6396  avg_val_loss: 0.5978
INFO:__main__:Epoch 1 - avg_train_loss: 0.6396  avg_val_loss: 0.5978
Epoch 1 - Score: 0.6085  Scores: [0.601681774723087, 0.6152473505413603]
INFO:__main__:Epoch 1 - Score: 0.6085  Scores: [0.601681774723087, 0.6152473505413603]


Epoch: [1][620/638] Elapsed 7m 30s (remain 0m 12s) Loss: 0.3765(0.6334) Grad: 41720.5820  LR: 0.00001878  
Epoch: [1][637/638] Elapsed 7m 35s (remain 0m 0s) Loss: 0.5783(0.6301) Grad: 99524.3516  LR: 0.00001866  
EVAL: [0/129] Loss: 0.3956(0.3956) 
EVAL: [20/129] Loss: 0.3661(0.5056) 
EVAL: [40/129] Loss: 0.4843(0.5040) 
EVAL: [60/129] Loss: 0.5421(0.5057) 
EVAL: [80/129] Loss: 0.5265(0.5055) 
EVAL: [100/129] Loss: 0.5831(0.5034) 
EVAL: [120/129] Loss: 0.4199(0.5025) 
EVAL: [128/129] Loss: 0.4906(0.5015) 


Epoch 1 - Save Best Score: 0.5097 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5097 Model
Epoch 1 - avg_train_loss: 0.6301  avg_val_loss: 0.5015
INFO:__main__:Epoch 1 - avg_train_loss: 0.6301  avg_val_loss: 0.5015
Epoch 1 - Score: 0.5097  Scores: [0.49469295566483584, 0.5247229583138888]
INFO:__main__:Epoch 1 - Score: 0.5097  Scores: [0.49469295566483584, 0.5247229583138888]


Epoch: [2][0/638] Elapsed 0m 0s (remain 8m 8s) Loss: 0.6612(0.6612) Grad: inf  LR: 0.00001865  
Epoch: [2][20/638] Elapsed 0m 7s (remain 3m 30s) Loss: 0.3516(0.4670) Grad: 124418.4766  LR: 0.00001851  
Epoch: [2][40/638] Elapsed 0m 13s (remain 3m 20s) Loss: 0.8344(0.4955) Grad: 78472.8125  LR: 0.00001837  
Epoch: [2][60/638] Elapsed 0m 21s (remain 3m 21s) Loss: 0.4797(0.5037) Grad: 59967.4141  LR: 0.00001822  
Epoch: [2][80/638] Elapsed 0m 28s (remain 3m 15s) Loss: 0.3907(0.5046) Grad: 128427.4531  LR: 0.00001806  
Epoch: [2][100/638] Elapsed 0m 35s (remain 3m 8s) Loss: 0.6592(0.4959) Grad: 149838.2969  LR: 0.00001789  
EVAL: [0/129] Loss: 0.3925(0.3925) 
EVAL: [20/129] Loss: 0.3884(0.4762) 
EVAL: [40/129] Loss: 0.4871(0.4709) 
EVAL: [60/129] Loss: 0.4854(0.4738) 
EVAL: [80/129] Loss: 0.4455(0.4683) 
EVAL: [100/129] Loss: 0.6384(0.4664) 
EVAL: [120/129] Loss: 0.3753(0.4663) 
EVAL: [128/129] Loss: 0.4113(0.4655) 


Epoch 2 - Save Best Score: 0.4748 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4748 Model
Epoch 2 - avg_train_loss: 0.4959  avg_val_loss: 0.4655
INFO:__main__:Epoch 2 - avg_train_loss: 0.4959  avg_val_loss: 0.4655
Epoch 2 - Score: 0.4748  Scores: [0.41408544678905906, 0.5356130196072978]
INFO:__main__:Epoch 2 - Score: 0.4748  Scores: [0.41408544678905906, 0.5356130196072978]


Epoch: [2][120/638] Elapsed 1m 26s (remain 6m 8s) Loss: 0.4336(0.5056) Grad: 100014.2969  LR: 0.00001772  
Epoch: [2][140/638] Elapsed 1m 33s (remain 5m 29s) Loss: 0.6057(0.5013) Grad: 29475.6348  LR: 0.00001754  
Epoch: [2][160/638] Elapsed 1m 39s (remain 4m 55s) Loss: 0.5514(0.5027) Grad: 55937.9180  LR: 0.00001736  
Epoch: [2][180/638] Elapsed 1m 47s (remain 4m 31s) Loss: 0.4839(0.5055) Grad: 102982.2891  LR: 0.00001717  
Epoch: [2][200/638] Elapsed 1m 53s (remain 4m 7s) Loss: 0.4516(0.5008) Grad: 79359.6562  LR: 0.00001698  
EVAL: [0/129] Loss: 0.4322(0.4322) 
EVAL: [20/129] Loss: 0.4594(0.5616) 
EVAL: [40/129] Loss: 0.4958(0.5557) 
EVAL: [60/129] Loss: 0.6337(0.5621) 
EVAL: [80/129] Loss: 0.5231(0.5609) 
EVAL: [100/129] Loss: 0.7335(0.5602) 
EVAL: [120/129] Loss: 0.3949(0.5581) 
EVAL: [128/129] Loss: 0.4843(0.5575) 


Epoch 2 - avg_train_loss: 0.5008  avg_val_loss: 0.5575
INFO:__main__:Epoch 2 - avg_train_loss: 0.5008  avg_val_loss: 0.5575
Epoch 2 - Score: 0.5686  Scores: [0.5527937681334664, 0.5844836061253071]
INFO:__main__:Epoch 2 - Score: 0.5686  Scores: [0.5527937681334664, 0.5844836061253071]


Epoch: [2][220/638] Elapsed 2m 34s (remain 4m 51s) Loss: 0.4405(0.5079) Grad: 94389.2344  LR: 0.00001678  
Epoch: [2][240/638] Elapsed 2m 41s (remain 4m 25s) Loss: 0.5394(0.5041) Grad: 106489.2031  LR: 0.00001658  
Epoch: [2][260/638] Elapsed 2m 46s (remain 4m 1s) Loss: 0.4748(0.5082) Grad: 28831.3965  LR: 0.00001637  
Epoch: [2][280/638] Elapsed 2m 53s (remain 3m 40s) Loss: 0.4181(0.5064) Grad: 129344.8984  LR: 0.00001616  
Epoch: [2][300/638] Elapsed 2m 58s (remain 3m 20s) Loss: 0.3861(0.5053) Grad: 99127.4766  LR: 0.00001594  
EVAL: [0/129] Loss: 0.4210(0.4210) 
EVAL: [20/129] Loss: 0.4008(0.5238) 
EVAL: [40/129] Loss: 0.4747(0.5189) 
EVAL: [60/129] Loss: 0.5533(0.5229) 
EVAL: [80/129] Loss: 0.5189(0.5194) 
EVAL: [100/129] Loss: 0.6669(0.5175) 
EVAL: [120/129] Loss: 0.4087(0.5169) 
EVAL: [128/129] Loss: 0.4631(0.5160) 


Epoch 2 - avg_train_loss: 0.5053  avg_val_loss: 0.5160
INFO:__main__:Epoch 2 - avg_train_loss: 0.5053  avg_val_loss: 0.5160
Epoch 2 - Score: 0.5248  Scores: [0.5152168228574768, 0.5342970662558599]
INFO:__main__:Epoch 2 - Score: 0.5248  Scores: [0.5152168228574768, 0.5342970662558599]


Epoch: [2][320/638] Elapsed 3m 39s (remain 3m 36s) Loss: 0.6586(0.5029) Grad: 61167.5039  LR: 0.00001572  
Epoch: [2][340/638] Elapsed 3m 45s (remain 3m 16s) Loss: 0.4424(0.5013) Grad: 102761.9688  LR: 0.00001549  
Epoch: [2][360/638] Elapsed 3m 51s (remain 2m 57s) Loss: 0.4547(0.5016) Grad: 160541.3750  LR: 0.00001526  
Epoch: [2][380/638] Elapsed 3m 58s (remain 2m 40s) Loss: 0.2728(0.5034) Grad: 145644.7344  LR: 0.00001503  
Epoch: [2][400/638] Elapsed 4m 3s (remain 2m 24s) Loss: 0.6314(0.5048) Grad: 40520.0234  LR: 0.00001479  
EVAL: [0/129] Loss: 0.4534(0.4534) 
EVAL: [20/129] Loss: 0.3934(0.5465) 
EVAL: [40/129] Loss: 0.5803(0.5516) 
EVAL: [60/129] Loss: 0.5668(0.5524) 
EVAL: [80/129] Loss: 0.5433(0.5526) 
EVAL: [100/129] Loss: 0.6468(0.5513) 
EVAL: [120/129] Loss: 0.5075(0.5502) 


Epoch 2 - avg_train_loss: 0.5048  avg_val_loss: 0.5489


EVAL: [128/129] Loss: 0.5501(0.5489) 


INFO:__main__:Epoch 2 - avg_train_loss: 0.5048  avg_val_loss: 0.5489
Epoch 2 - Score: 0.5565  Scores: [0.5987317286101476, 0.5142734742778162]
INFO:__main__:Epoch 2 - Score: 0.5565  Scores: [0.5987317286101476, 0.5142734742778162]


Epoch: [2][420/638] Elapsed 4m 43s (remain 2m 26s) Loss: 0.3661(0.5011) Grad: 17324.8594  LR: 0.00001455  
Epoch: [2][440/638] Elapsed 4m 49s (remain 2m 9s) Loss: 0.4738(0.4995) Grad: 38613.5664  LR: 0.00001430  
Epoch: [2][460/638] Elapsed 4m 54s (remain 1m 53s) Loss: 0.3722(0.4983) Grad: 60273.8672  LR: 0.00001405  
Epoch: [2][480/638] Elapsed 5m 0s (remain 1m 38s) Loss: 0.6529(0.4979) Grad: 140005.7500  LR: 0.00001380  
Epoch: [2][500/638] Elapsed 5m 6s (remain 1m 23s) Loss: 0.4597(0.4980) Grad: 81888.4688  LR: 0.00001355  
EVAL: [0/129] Loss: 0.3950(0.3950) 
EVAL: [20/129] Loss: 0.3770(0.4587) 
EVAL: [40/129] Loss: 0.4346(0.4505) 
EVAL: [60/129] Loss: 0.5235(0.4566) 
EVAL: [80/129] Loss: 0.4489(0.4531) 
EVAL: [100/129] Loss: 0.5823(0.4504) 
EVAL: [120/129] Loss: 0.3631(0.4479) 


Epoch 2 - Save Best Score: 0.4553 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4553 Model


EVAL: [128/129] Loss: 0.3997(0.4468) 


Epoch 2 - avg_train_loss: 0.4980  avg_val_loss: 0.4468
INFO:__main__:Epoch 2 - avg_train_loss: 0.4980  avg_val_loss: 0.4468
Epoch 2 - Score: 0.4553  Scores: [0.39657923131630285, 0.5140009871571574]
INFO:__main__:Epoch 2 - Score: 0.4553  Scores: [0.39657923131630285, 0.5140009871571574]


Epoch: [2][520/638] Elapsed 5m 56s (remain 1m 19s) Loss: 0.5748(0.4953) Grad: 36236.9805  LR: 0.00001329  
Epoch: [2][540/638] Elapsed 6m 4s (remain 1m 5s) Loss: 0.4433(0.4942) Grad: 102035.2734  LR: 0.00001303  
Epoch: [2][560/638] Elapsed 6m 11s (remain 0m 50s) Loss: 0.5505(0.4932) Grad: 100397.0547  LR: 0.00001277  
Epoch: [2][580/638] Elapsed 6m 18s (remain 0m 37s) Loss: 0.3749(0.4927) Grad: 43001.4648  LR: 0.00001251  
Epoch: [2][600/638] Elapsed 6m 26s (remain 0m 23s) Loss: 0.4557(0.4933) Grad: 37034.3828  LR: 0.00001224  
EVAL: [0/129] Loss: 0.4242(0.4242) 
EVAL: [20/129] Loss: 0.4125(0.5208) 
EVAL: [40/129] Loss: 0.4605(0.5124) 
EVAL: [60/129] Loss: 0.5610(0.5169) 
EVAL: [80/129] Loss: 0.5316(0.5150) 
EVAL: [100/129] Loss: 0.6366(0.5141) 
EVAL: [120/129] Loss: 0.4909(0.5128) 
EVAL: [128/129] Loss: 0.5424(0.5116) 


Epoch 2 - avg_train_loss: 0.4933  avg_val_loss: 0.5116
INFO:__main__:Epoch 2 - avg_train_loss: 0.4933  avg_val_loss: 0.5116
Epoch 2 - Score: 0.5189  Scores: [0.4849963266202855, 0.5527316095271508]
INFO:__main__:Epoch 2 - Score: 0.5189  Scores: [0.4849963266202855, 0.5527316095271508]


Epoch: [2][620/638] Elapsed 7m 6s (remain 0m 11s) Loss: 0.3352(0.4905) Grad: 24107.2969  LR: 0.00001198  
Epoch: [2][637/638] Elapsed 7m 11s (remain 0m 0s) Loss: 0.5808(0.4897) Grad: 49863.6211  LR: 0.00001175  
EVAL: [0/129] Loss: 0.4203(0.4203) 
EVAL: [20/129] Loss: 0.3834(0.4880) 
EVAL: [40/129] Loss: 0.4452(0.4848) 
EVAL: [60/129] Loss: 0.5383(0.4893) 
EVAL: [80/129] Loss: 0.4934(0.4870) 
EVAL: [100/129] Loss: 0.6150(0.4839) 
EVAL: [120/129] Loss: 0.3883(0.4828) 
EVAL: [128/129] Loss: 0.4381(0.4815) 


Epoch 2 - avg_train_loss: 0.4897  avg_val_loss: 0.4815
INFO:__main__:Epoch 2 - avg_train_loss: 0.4897  avg_val_loss: 0.4815
Epoch 2 - Score: 0.4897  Scores: [0.4653739989384856, 0.5139382055253701]
INFO:__main__:Epoch 2 - Score: 0.4897  Scores: [0.4653739989384856, 0.5139382055253701]


Epoch: [3][0/638] Elapsed 0m 0s (remain 6m 7s) Loss: 0.3209(0.3209) Grad: inf  LR: 0.00001173  
Epoch: [3][20/638] Elapsed 0m 6s (remain 3m 14s) Loss: 0.7129(0.4415) Grad: 93339.6094  LR: 0.00001146  
Epoch: [3][40/638] Elapsed 0m 13s (remain 3m 15s) Loss: 0.4625(0.4273) Grad: 106073.2344  LR: 0.00001119  
Epoch: [3][60/638] Elapsed 0m 19s (remain 3m 2s) Loss: 0.5231(0.4312) Grad: 145257.1562  LR: 0.00001092  
Epoch: [3][80/638] Elapsed 0m 25s (remain 2m 54s) Loss: 0.5893(0.4328) Grad: 110735.6484  LR: 0.00001065  
Epoch: [3][100/638] Elapsed 0m 32s (remain 2m 52s) Loss: 0.5153(0.4265) Grad: 79578.7500  LR: 0.00001038  
EVAL: [0/129] Loss: 0.3884(0.3884) 
EVAL: [20/129] Loss: 0.3853(0.4878) 
EVAL: [40/129] Loss: 0.4564(0.4809) 
EVAL: [60/129] Loss: 0.5494(0.4868) 
EVAL: [80/129] Loss: 0.4768(0.4847) 
EVAL: [100/129] Loss: 0.6270(0.4839) 
EVAL: [120/129] Loss: 0.3854(0.4820) 


Epoch 3 - avg_train_loss: 0.4265  avg_val_loss: 0.4813
INFO:__main__:Epoch 3 - avg_train_loss: 0.4265  avg_val_loss: 0.4813
Epoch 3 - Score: 0.4899  Scores: [0.4549166558224774, 0.524821493164634]


EVAL: [128/129] Loss: 0.4814(0.4813) 


INFO:__main__:Epoch 3 - Score: 0.4899  Scores: [0.4549166558224774, 0.524821493164634]


Epoch: [3][120/638] Elapsed 1m 12s (remain 5m 7s) Loss: 0.3588(0.4211) Grad: 29491.8711  LR: 0.00001010  
Epoch: [3][140/638] Elapsed 1m 18s (remain 4m 36s) Loss: 0.4642(0.4167) Grad: 86694.6172  LR: 0.00000983  
Epoch: [3][160/638] Elapsed 1m 25s (remain 4m 12s) Loss: 0.2477(0.4145) Grad: 58748.3945  LR: 0.00000956  
Epoch: [3][180/638] Elapsed 1m 30s (remain 3m 48s) Loss: 0.4196(0.4092) Grad: 87592.3672  LR: 0.00000928  
Epoch: [3][200/638] Elapsed 1m 37s (remain 3m 32s) Loss: 0.4953(0.4111) Grad: 107732.6250  LR: 0.00000901  
EVAL: [0/129] Loss: 0.4305(0.4305) 
EVAL: [20/129] Loss: 0.4100(0.5255) 
EVAL: [40/129] Loss: 0.4795(0.5220) 
EVAL: [60/129] Loss: 0.5760(0.5264) 
EVAL: [80/129] Loss: 0.5149(0.5249) 
EVAL: [100/129] Loss: 0.6901(0.5229) 
EVAL: [120/129] Loss: 0.3979(0.5222) 
EVAL: [128/129] Loss: 0.4737(0.5210) 


Epoch 3 - avg_train_loss: 0.4111  avg_val_loss: 0.5210
INFO:__main__:Epoch 3 - avg_train_loss: 0.4111  avg_val_loss: 0.5210
Epoch 3 - Score: 0.5302  Scores: [0.540328203514784, 0.5201680664352204]
INFO:__main__:Epoch 3 - Score: 0.5302  Scores: [0.540328203514784, 0.5201680664352204]


Epoch: [3][220/638] Elapsed 2m 17s (remain 4m 19s) Loss: 0.3271(0.4084) Grad: 140677.5781  LR: 0.00000874  
Epoch: [3][240/638] Elapsed 2m 23s (remain 3m 55s) Loss: 0.4580(0.4074) Grad: 97857.3516  LR: 0.00000847  
Epoch: [3][260/638] Elapsed 2m 29s (remain 3m 36s) Loss: 0.4402(0.4057) Grad: 82135.8047  LR: 0.00000820  
Epoch: [3][280/638] Elapsed 2m 35s (remain 3m 17s) Loss: 0.2785(0.4011) Grad: 50627.7109  LR: 0.00000793  
Epoch: [3][300/638] Elapsed 2m 41s (remain 3m 1s) Loss: 0.3824(0.4033) Grad: 93045.5781  LR: 0.00000766  
EVAL: [0/129] Loss: 0.4172(0.4172) 
EVAL: [20/129] Loss: 0.4210(0.5053) 
EVAL: [40/129] Loss: 0.4856(0.4961) 
EVAL: [60/129] Loss: 0.5280(0.5008) 
EVAL: [80/129] Loss: 0.5004(0.4985) 
EVAL: [100/129] Loss: 0.6188(0.4967) 
EVAL: [120/129] Loss: 0.4614(0.4950) 
EVAL: [128/129] Loss: 0.5404(0.4942) 


Epoch 3 - avg_train_loss: 0.4033  avg_val_loss: 0.4942
INFO:__main__:Epoch 3 - avg_train_loss: 0.4033  avg_val_loss: 0.4942
Epoch 3 - Score: 0.5018  Scores: [0.43585555395894376, 0.5677010284190165]
INFO:__main__:Epoch 3 - Score: 0.5018  Scores: [0.43585555395894376, 0.5677010284190165]


Epoch: [3][320/638] Elapsed 3m 22s (remain 3m 20s) Loss: 0.2958(0.4025) Grad: 87362.6016  LR: 0.00000740  
Epoch: [3][340/638] Elapsed 3m 28s (remain 3m 1s) Loss: 0.5116(0.4028) Grad: 87343.5469  LR: 0.00000714  
Epoch: [3][360/638] Elapsed 3m 35s (remain 2m 45s) Loss: 0.3004(0.4011) Grad: 93132.7422  LR: 0.00000688  
Epoch: [3][380/638] Elapsed 3m 41s (remain 2m 29s) Loss: 0.5102(0.4009) Grad: 78919.1172  LR: 0.00000662  
Epoch: [3][400/638] Elapsed 3m 46s (remain 2m 14s) Loss: 0.4831(0.4008) Grad: 37620.7500  LR: 0.00000636  
EVAL: [0/129] Loss: 0.4099(0.4099) 
EVAL: [20/129] Loss: 0.3909(0.4993) 
EVAL: [40/129] Loss: 0.4714(0.4930) 
EVAL: [60/129] Loss: 0.5125(0.4948) 
EVAL: [80/129] Loss: 0.5278(0.4927) 
EVAL: [100/129] Loss: 0.6023(0.4904) 
EVAL: [120/129] Loss: 0.4436(0.4897) 
EVAL: [128/129] Loss: 0.4839(0.4878) 


Epoch 3 - avg_train_loss: 0.4008  avg_val_loss: 0.4878
INFO:__main__:Epoch 3 - avg_train_loss: 0.4008  avg_val_loss: 0.4878
Epoch 3 - Score: 0.4963  Scores: [0.4278203547016135, 0.5647347634266443]
INFO:__main__:Epoch 3 - Score: 0.4963  Scores: [0.4278203547016135, 0.5647347634266443]


Epoch: [3][420/638] Elapsed 4m 28s (remain 2m 18s) Loss: 0.3264(0.4006) Grad: 62476.6875  LR: 0.00000611  
Epoch: [3][440/638] Elapsed 4m 33s (remain 2m 2s) Loss: 0.3738(0.3992) Grad: 122423.5625  LR: 0.00000586  
Epoch: [3][460/638] Elapsed 4m 39s (remain 1m 47s) Loss: 0.2593(0.3980) Grad: 48996.7148  LR: 0.00000561  
Epoch: [3][480/638] Elapsed 4m 45s (remain 1m 33s) Loss: 0.4636(0.3990) Grad: 109069.4297  LR: 0.00000537  
Epoch: [3][500/638] Elapsed 4m 50s (remain 1m 19s) Loss: 0.4808(0.3976) Grad: 46610.7070  LR: 0.00000513  
EVAL: [0/129] Loss: 0.3805(0.3805) 
EVAL: [20/129] Loss: 0.3663(0.4847) 
EVAL: [40/129] Loss: 0.4602(0.4784) 
EVAL: [60/129] Loss: 0.5209(0.4835) 
EVAL: [80/129] Loss: 0.4979(0.4817) 
EVAL: [100/129] Loss: 0.6113(0.4802) 
EVAL: [120/129] Loss: 0.3956(0.4783) 
EVAL: [128/129] Loss: 0.4616(0.4773) 


Epoch 3 - avg_train_loss: 0.3976  avg_val_loss: 0.4773
INFO:__main__:Epoch 3 - avg_train_loss: 0.3976  avg_val_loss: 0.4773
Epoch 3 - Score: 0.4855  Scores: [0.4551100163930621, 0.5159707249415676]
INFO:__main__:Epoch 3 - Score: 0.4855  Scores: [0.4551100163930621, 0.5159707249415676]


Epoch: [3][520/638] Elapsed 5m 31s (remain 1m 14s) Loss: 0.4234(0.3958) Grad: 47267.3906  LR: 0.00000489  
Epoch: [3][540/638] Elapsed 5m 37s (remain 1m 0s) Loss: 0.2836(0.3956) Grad: 135682.9531  LR: 0.00000466  
Epoch: [3][560/638] Elapsed 5m 43s (remain 0m 47s) Loss: 0.3187(0.3968) Grad: 83405.2812  LR: 0.00000443  
Epoch: [3][580/638] Elapsed 5m 49s (remain 0m 34s) Loss: 0.4357(0.3975) Grad: 40544.3398  LR: 0.00000420  
Epoch: [3][600/638] Elapsed 5m 55s (remain 0m 21s) Loss: 0.3435(0.3976) Grad: 15592.2256  LR: 0.00000398  
EVAL: [0/129] Loss: 0.4088(0.4088) 
EVAL: [20/129] Loss: 0.3882(0.4879) 
EVAL: [40/129] Loss: 0.4588(0.4810) 
EVAL: [60/129] Loss: 0.5079(0.4840) 
EVAL: [80/129] Loss: 0.5085(0.4810) 
EVAL: [100/129] Loss: 0.6002(0.4784) 
EVAL: [120/129] Loss: 0.4189(0.4781) 
EVAL: [128/129] Loss: 0.4473(0.4765) 


Epoch 3 - avg_train_loss: 0.3976  avg_val_loss: 0.4765
INFO:__main__:Epoch 3 - avg_train_loss: 0.3976  avg_val_loss: 0.4765
Epoch 3 - Score: 0.4850  Scores: [0.42907872906950234, 0.5408580422612778]
INFO:__main__:Epoch 3 - Score: 0.4850  Scores: [0.42907872906950234, 0.5408580422612778]


Epoch: [3][620/638] Elapsed 6m 35s (remain 0m 10s) Loss: 0.4291(0.3966) Grad: 92476.8672  LR: 0.00000377  
Epoch: [3][637/638] Elapsed 6m 41s (remain 0m 0s) Loss: 0.3304(0.3964) Grad: 64883.0977  LR: 0.00000359  
EVAL: [0/129] Loss: 0.3920(0.3920) 
EVAL: [20/129] Loss: 0.3914(0.4939) 
EVAL: [40/129] Loss: 0.4474(0.4861) 
EVAL: [60/129] Loss: 0.5324(0.4900) 
EVAL: [80/129] Loss: 0.4951(0.4872) 
EVAL: [100/129] Loss: 0.6285(0.4855) 
EVAL: [120/129] Loss: 0.4166(0.4844) 
EVAL: [128/129] Loss: 0.4455(0.4832) 


Epoch 3 - avg_train_loss: 0.3964  avg_val_loss: 0.4832
INFO:__main__:Epoch 3 - avg_train_loss: 0.3964  avg_val_loss: 0.4832
Epoch 3 - Score: 0.4916  Scores: [0.45932021332344963, 0.5239655374496374]
INFO:__main__:Epoch 3 - Score: 0.4916  Scores: [0.45932021332344963, 0.5239655374496374]


Epoch: [4][0/638] Elapsed 0m 0s (remain 5m 58s) Loss: 0.3063(0.3063) Grad: inf  LR: 0.00000358  
Epoch: [4][20/638] Elapsed 0m 8s (remain 3m 55s) Loss: 0.3600(0.3460) Grad: 127093.6719  LR: 0.00000337  
Epoch: [4][40/638] Elapsed 0m 14s (remain 3m 32s) Loss: 0.4768(0.3516) Grad: 73118.9062  LR: 0.00000317  
Epoch: [4][60/638] Elapsed 0m 20s (remain 3m 11s) Loss: 0.4016(0.3470) Grad: 131103.4062  LR: 0.00000297  
Epoch: [4][80/638] Elapsed 0m 27s (remain 3m 6s) Loss: 0.2992(0.3353) Grad: 59670.3516  LR: 0.00000278  
Epoch: [4][100/638] Elapsed 0m 34s (remain 3m 2s) Loss: 0.2748(0.3293) Grad: 54833.2656  LR: 0.00000259  
EVAL: [0/129] Loss: 0.4225(0.4225) 
EVAL: [20/129] Loss: 0.4173(0.4916) 
EVAL: [40/129] Loss: 0.4474(0.4804) 
EVAL: [60/129] Loss: 0.5245(0.4845) 
EVAL: [80/129] Loss: 0.4831(0.4799) 
EVAL: [100/129] Loss: 0.6179(0.4782) 
EVAL: [120/129] Loss: 0.4366(0.4774) 
EVAL: [128/129] Loss: 0.4664(0.4764) 


Epoch 4 - avg_train_loss: 0.3293  avg_val_loss: 0.4764
INFO:__main__:Epoch 4 - avg_train_loss: 0.3293  avg_val_loss: 0.4764
Epoch 4 - Score: 0.4849  Scores: [0.40905108826398484, 0.5607250834839641]
INFO:__main__:Epoch 4 - Score: 0.4849  Scores: [0.40905108826398484, 0.5607250834839641]


Epoch: [4][120/638] Elapsed 1m 14s (remain 5m 18s) Loss: 0.2945(0.3245) Grad: 42176.6055  LR: 0.00000241  
Epoch: [4][140/638] Elapsed 1m 20s (remain 4m 45s) Loss: 0.3611(0.3254) Grad: 87224.4375  LR: 0.00000224  
Epoch: [4][160/638] Elapsed 1m 26s (remain 4m 15s) Loss: 0.2861(0.3260) Grad: 30667.7168  LR: 0.00000207  
Epoch: [4][180/638] Elapsed 1m 32s (remain 3m 53s) Loss: 0.4677(0.3254) Grad: 91649.4375  LR: 0.00000190  
Epoch: [4][200/638] Elapsed 1m 38s (remain 3m 34s) Loss: 0.3353(0.3234) Grad: 29723.8438  LR: 0.00000175  
EVAL: [0/129] Loss: 0.3830(0.3830) 
EVAL: [20/129] Loss: 0.3853(0.4896) 
EVAL: [40/129] Loss: 0.4466(0.4821) 
EVAL: [60/129] Loss: 0.5236(0.4867) 
EVAL: [80/129] Loss: 0.4907(0.4839) 
EVAL: [100/129] Loss: 0.6263(0.4826) 
EVAL: [120/129] Loss: 0.4089(0.4816) 


Epoch 4 - avg_train_loss: 0.3234  avg_val_loss: 0.4802
INFO:__main__:Epoch 4 - avg_train_loss: 0.3234  avg_val_loss: 0.4802
Epoch 4 - Score: 0.4888  Scores: [0.45420749839480856, 0.523330611791859]
INFO:__main__:Epoch 4 - Score: 0.4888  Scores: [0.45420749839480856, 0.523330611791859]


EVAL: [128/129] Loss: 0.4529(0.4802) 
Epoch: [4][220/638] Elapsed 2m 19s (remain 4m 22s) Loss: 0.3974(0.3247) Grad: 33767.4570  LR: 0.00000159  
Epoch: [4][240/638] Elapsed 2m 25s (remain 3m 59s) Loss: 0.2928(0.3259) Grad: 53350.2109  LR: 0.00000145  
Epoch: [4][260/638] Elapsed 2m 31s (remain 3m 38s) Loss: 0.3847(0.3259) Grad: 102155.1875  LR: 0.00000131  
Epoch: [4][280/638] Elapsed 2m 36s (remain 3m 19s) Loss: 0.3498(0.3274) Grad: 108869.5469  LR: 0.00000118  
Epoch: [4][300/638] Elapsed 2m 42s (remain 3m 2s) Loss: 0.2576(0.3280) Grad: 106738.2578  LR: 0.00000105  
EVAL: [0/129] Loss: 0.3935(0.3935) 
EVAL: [20/129] Loss: 0.3960(0.4919) 
EVAL: [40/129] Loss: 0.4466(0.4846) 
EVAL: [60/129] Loss: 0.5182(0.4885) 
EVAL: [80/129] Loss: 0.4885(0.4852) 
EVAL: [100/129] Loss: 0.6250(0.4834) 
EVAL: [120/129] Loss: 0.4270(0.4830) 
EVAL: [128/129] Loss: 0.4595(0.4813) 


Epoch 4 - avg_train_loss: 0.3280  avg_val_loss: 0.4813
INFO:__main__:Epoch 4 - avg_train_loss: 0.3280  avg_val_loss: 0.4813
Epoch 4 - Score: 0.4901  Scores: [0.44025161539644725, 0.5399499250994078]
INFO:__main__:Epoch 4 - Score: 0.4901  Scores: [0.44025161539644725, 0.5399499250994078]


Epoch: [4][320/638] Elapsed 3m 23s (remain 3m 20s) Loss: 0.2976(0.3273) Grad: 48888.3867  LR: 0.00000094  
Epoch: [4][340/638] Elapsed 3m 28s (remain 3m 1s) Loss: 0.2231(0.3260) Grad: 99671.5078  LR: 0.00000082  
Epoch: [4][360/638] Elapsed 3m 34s (remain 2m 44s) Loss: 0.2091(0.3251) Grad: 74102.6797  LR: 0.00000072  
Epoch: [4][380/638] Elapsed 3m 41s (remain 2m 29s) Loss: 0.3320(0.3245) Grad: 63740.9961  LR: 0.00000062  
Epoch: [4][400/638] Elapsed 3m 47s (remain 2m 14s) Loss: 0.4404(0.3250) Grad: 69658.7109  LR: 0.00000053  
EVAL: [0/129] Loss: 0.3918(0.3918) 
EVAL: [20/129] Loss: 0.4008(0.4990) 
EVAL: [40/129] Loss: 0.4530(0.4916) 
EVAL: [60/129] Loss: 0.5311(0.4961) 
EVAL: [80/129] Loss: 0.4925(0.4930) 
EVAL: [100/129] Loss: 0.6371(0.4916) 
EVAL: [120/129] Loss: 0.4251(0.4909) 
EVAL: [128/129] Loss: 0.4638(0.4895) 


Epoch 4 - avg_train_loss: 0.3250  avg_val_loss: 0.4895
INFO:__main__:Epoch 4 - avg_train_loss: 0.3250  avg_val_loss: 0.4895
Epoch 4 - Score: 0.4981  Scores: [0.46424450045286014, 0.5320050883354167]
INFO:__main__:Epoch 4 - Score: 0.4981  Scores: [0.46424450045286014, 0.5320050883354167]


Epoch: [4][420/638] Elapsed 4m 28s (remain 2m 18s) Loss: 0.3295(0.3240) Grad: 81132.6016  LR: 0.00000044  
Epoch: [4][440/638] Elapsed 4m 33s (remain 2m 2s) Loss: 0.3334(0.3229) Grad: 70532.6328  LR: 0.00000037  
Epoch: [4][460/638] Elapsed 4m 40s (remain 1m 47s) Loss: 0.3886(0.3244) Grad: 25327.3105  LR: 0.00000030  
Epoch: [4][480/638] Elapsed 4m 45s (remain 1m 33s) Loss: 0.3759(0.3232) Grad: 85061.3125  LR: 0.00000024  
Epoch: [4][500/638] Elapsed 4m 51s (remain 1m 19s) Loss: 0.1783(0.3216) Grad: 29586.0898  LR: 0.00000018  
EVAL: [0/129] Loss: 0.3931(0.3931) 
EVAL: [20/129] Loss: 0.4021(0.4995) 
EVAL: [40/129] Loss: 0.4541(0.4919) 
EVAL: [60/129] Loss: 0.5291(0.4963) 
EVAL: [80/129] Loss: 0.4955(0.4932) 
EVAL: [100/129] Loss: 0.6324(0.4917) 
EVAL: [120/129] Loss: 0.4315(0.4909) 
EVAL: [128/129] Loss: 0.4714(0.4895) 


Epoch 4 - avg_train_loss: 0.3216  avg_val_loss: 0.4895
INFO:__main__:Epoch 4 - avg_train_loss: 0.3216  avg_val_loss: 0.4895
Epoch 4 - Score: 0.4981  Scores: [0.45869916308113284, 0.5374184212298437]
INFO:__main__:Epoch 4 - Score: 0.4981  Scores: [0.45869916308113284, 0.5374184212298437]


Epoch: [4][520/638] Elapsed 5m 32s (remain 1m 14s) Loss: 0.3700(0.3205) Grad: 32030.2441  LR: 0.00000013  
Epoch: [4][540/638] Elapsed 5m 38s (remain 1m 0s) Loss: 0.3107(0.3211) Grad: 71197.0625  LR: 0.00000009  
Epoch: [4][560/638] Elapsed 5m 44s (remain 0m 47s) Loss: 0.3491(0.3208) Grad: 100904.8438  LR: 0.00000006  
Epoch: [4][580/638] Elapsed 5m 49s (remain 0m 34s) Loss: 0.2954(0.3209) Grad: 23119.6855  LR: 0.00000003  
Epoch: [4][600/638] Elapsed 5m 55s (remain 0m 21s) Loss: 0.2214(0.3206) Grad: 55732.7695  LR: 0.00000001  
EVAL: [0/129] Loss: 0.3894(0.3894) 
EVAL: [20/129] Loss: 0.3989(0.4970) 
EVAL: [40/129] Loss: 0.4537(0.4896) 
EVAL: [60/129] Loss: 0.5277(0.4940) 
EVAL: [80/129] Loss: 0.4920(0.4909) 
EVAL: [100/129] Loss: 0.6315(0.4894) 
EVAL: [120/129] Loss: 0.4250(0.4886) 
EVAL: [128/129] Loss: 0.4652(0.4872) 


Epoch 4 - avg_train_loss: 0.3206  avg_val_loss: 0.4872
INFO:__main__:Epoch 4 - avg_train_loss: 0.3206  avg_val_loss: 0.4872
Epoch 4 - Score: 0.4958  Scores: [0.4582959832415054, 0.533280909073534]
INFO:__main__:Epoch 4 - Score: 0.4958  Scores: [0.4582959832415054, 0.533280909073534]


Epoch: [4][620/638] Elapsed 6m 36s (remain 0m 10s) Loss: 0.3120(0.3201) Grad: 76837.8594  LR: 0.00000000  
Epoch: [4][637/638] Elapsed 6m 40s (remain 0m 0s) Loss: 0.2524(0.3203) Grad: 83888.0078  LR: 0.00000000  
EVAL: [0/129] Loss: 0.3893(0.3893) 
EVAL: [20/129] Loss: 0.3988(0.4968) 
EVAL: [40/129] Loss: 0.4536(0.4895) 
EVAL: [60/129] Loss: 0.5276(0.4939) 
EVAL: [80/129] Loss: 0.4919(0.4908) 
EVAL: [100/129] Loss: 0.6315(0.4892) 
EVAL: [120/129] Loss: 0.4247(0.4885) 


Epoch 4 - avg_train_loss: 0.3203  avg_val_loss: 0.4870


EVAL: [128/129] Loss: 0.4650(0.4870) 


INFO:__main__:Epoch 4 - avg_train_loss: 0.3203  avg_val_loss: 0.4870
Epoch 4 - Score: 0.4957  Scores: [0.4581102390825373, 0.5331919492014074]
INFO:__main__:Epoch 4 - Score: 0.4957  Scores: [0.4581102390825373, 0.5331919492014074]
Score: 0.4553  Scores: [0.39657923131630285, 0.5140009871571574]
INFO:__main__:Score: 0.4553  Scores: [0.39657923131630285, 0.5140009871571574]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hid

Epoch: [1][0/644] Elapsed 0m 0s (remain 6m 41s) Loss: 1.3264(1.3264) Grad: inf  LR: 0.00000008  
Epoch: [1][20/644] Elapsed 0m 5s (remain 2m 54s) Loss: 0.9045(1.1382) Grad: 74695.6172  LR: 0.00000163  
Epoch: [1][40/644] Elapsed 0m 13s (remain 3m 22s) Loss: 0.9928(1.0145) Grad: 62651.7617  LR: 0.00000319  
Epoch: [1][60/644] Elapsed 0m 19s (remain 3m 10s) Loss: 0.9307(0.9130) Grad: 82017.3984  LR: 0.00000475  
Epoch: [1][80/644] Elapsed 0m 25s (remain 2m 56s) Loss: 0.6734(0.8496) Grad: 144640.3281  LR: 0.00000630  
Epoch: [1][100/644] Elapsed 0m 32s (remain 2m 52s) Loss: 0.4412(0.7985) Grad: 140235.6719  LR: 0.00000786  
EVAL: [0/126] Loss: 0.8976(0.8976) 
EVAL: [20/126] Loss: 0.6887(0.6686) 
EVAL: [40/126] Loss: 0.8142(0.6836) 
EVAL: [60/126] Loss: 0.7165(0.6851) 
EVAL: [80/126] Loss: 0.7320(0.6830) 
EVAL: [100/126] Loss: 0.6229(0.6815) 
EVAL: [120/126] Loss: 0.7582(0.6792) 


Epoch 1 - Save Best Score: 0.6921 Model


EVAL: [125/126] Loss: 0.5538(0.6797) 


INFO:__main__:Epoch 1 - Save Best Score: 0.6921 Model
Epoch 1 - avg_train_loss: 0.7985  avg_val_loss: 0.6797
INFO:__main__:Epoch 1 - avg_train_loss: 0.7985  avg_val_loss: 0.6797
Epoch 1 - Score: 0.6921  Scores: [0.6099899285851682, 0.7742981960593097]
INFO:__main__:Epoch 1 - Score: 0.6921  Scores: [0.6099899285851682, 0.7742981960593097]


Epoch: [1][120/644] Elapsed 1m 38s (remain 7m 5s) Loss: 0.6542(0.7599) Grad: 52970.4727  LR: 0.00000942  
Epoch: [1][140/644] Elapsed 1m 44s (remain 6m 13s) Loss: 0.6616(0.7395) Grad: 43267.3633  LR: 0.00001097  
Epoch: [1][160/644] Elapsed 1m 52s (remain 5m 38s) Loss: 0.7456(0.7268) Grad: 163078.1719  LR: 0.00001253  
Epoch: [1][180/644] Elapsed 1m 58s (remain 5m 4s) Loss: 0.6755(0.7144) Grad: 45909.1680  LR: 0.00001409  
Epoch: [1][200/644] Elapsed 2m 5s (remain 4m 36s) Loss: 0.4381(0.7036) Grad: 108774.7344  LR: 0.00001564  
EVAL: [0/126] Loss: 0.9471(0.9471) 
EVAL: [20/126] Loss: 0.6929(0.6885) 
EVAL: [40/126] Loss: 0.7643(0.6936) 
EVAL: [60/126] Loss: 0.7371(0.6973) 
EVAL: [80/126] Loss: 0.7656(0.6996) 
EVAL: [100/126] Loss: 0.6396(0.7007) 
EVAL: [120/126] Loss: 0.7675(0.6994) 
EVAL: [125/126] Loss: 0.4201(0.7005) 


Epoch 1 - avg_train_loss: 0.7036  avg_val_loss: 0.7005
INFO:__main__:Epoch 1 - avg_train_loss: 0.7036  avg_val_loss: 0.7005
Epoch 1 - Score: 0.7152  Scores: [0.5544607260659464, 0.8759082492970862]
INFO:__main__:Epoch 1 - Score: 0.7152  Scores: [0.5544607260659464, 0.8759082492970862]


Epoch: [1][220/644] Elapsed 3m 0s (remain 5m 46s) Loss: 0.5282(0.6865) Grad: 93029.1172  LR: 0.00001720  
Epoch: [1][240/644] Elapsed 3m 6s (remain 5m 11s) Loss: 0.4210(0.6747) Grad: 146271.8594  LR: 0.00001875  
Epoch: [1][260/644] Elapsed 3m 11s (remain 4m 41s) Loss: 0.3836(0.6626) Grad: 60862.6094  LR: 0.00002000  
Epoch: [1][280/644] Elapsed 3m 17s (remain 4m 15s) Loss: 0.6772(0.6548) Grad: 67126.1328  LR: 0.00001999  
Epoch: [1][300/644] Elapsed 3m 23s (remain 3m 51s) Loss: 0.5382(0.6463) Grad: 140766.0781  LR: 0.00001998  
EVAL: [0/126] Loss: 0.9821(0.9821) 
EVAL: [20/126] Loss: 0.6549(0.6917) 
EVAL: [40/126] Loss: 0.7143(0.6922) 
EVAL: [60/126] Loss: 0.6787(0.6939) 
EVAL: [80/126] Loss: 0.7008(0.7002) 
EVAL: [100/126] Loss: 0.6705(0.7047) 
EVAL: [120/126] Loss: 0.8033(0.7021) 


Epoch 1 - avg_train_loss: 0.6463  avg_val_loss: 0.7038


EVAL: [125/126] Loss: 0.4481(0.7038) 


INFO:__main__:Epoch 1 - avg_train_loss: 0.6463  avg_val_loss: 0.7038
Epoch 1 - Score: 0.7170  Scores: [0.6491717728996442, 0.7848484966306243]
INFO:__main__:Epoch 1 - Score: 0.7170  Scores: [0.6491717728996442, 0.7848484966306243]


Epoch: [1][320/644] Elapsed 4m 17s (remain 4m 19s) Loss: 0.5168(0.6397) Grad: 96698.1953  LR: 0.00001996  
Epoch: [1][340/644] Elapsed 4m 23s (remain 3m 54s) Loss: 0.5752(0.6374) Grad: 105560.3594  LR: 0.00001994  
Epoch: [1][360/644] Elapsed 4m 28s (remain 3m 30s) Loss: 0.6843(0.6310) Grad: 75834.4375  LR: 0.00001990  
Epoch: [1][380/644] Elapsed 4m 34s (remain 3m 9s) Loss: 0.6686(0.6240) Grad: 135658.9531  LR: 0.00001986  
Epoch: [1][400/644] Elapsed 4m 40s (remain 2m 49s) Loss: 0.6137(0.6209) Grad: 74423.2500  LR: 0.00001981  
EVAL: [0/126] Loss: 0.9682(0.9682) 
EVAL: [20/126] Loss: 0.5046(0.6255) 
EVAL: [40/126] Loss: 0.6274(0.6365) 
EVAL: [60/126] Loss: 0.5782(0.6371) 
EVAL: [80/126] Loss: 0.5240(0.6409) 
EVAL: [100/126] Loss: 0.5469(0.6382) 
EVAL: [120/126] Loss: 0.7350(0.6347) 
EVAL: [125/126] Loss: 0.5112(0.6351) 


Epoch 1 - Save Best Score: 0.6499 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6499 Model
Epoch 1 - avg_train_loss: 0.6209  avg_val_loss: 0.6351
INFO:__main__:Epoch 1 - avg_train_loss: 0.6209  avg_val_loss: 0.6351
Epoch 1 - Score: 0.6499  Scores: [0.5384688562754499, 0.7613674695856866]
INFO:__main__:Epoch 1 - Score: 0.6499  Scores: [0.5384688562754499, 0.7613674695856866]


Epoch: [1][420/644] Elapsed 5m 50s (remain 3m 5s) Loss: 0.4560(0.6198) Grad: 31482.1914  LR: 0.00001975  
Epoch: [1][440/644] Elapsed 5m 58s (remain 2m 44s) Loss: 0.5471(0.6149) Grad: 122224.5234  LR: 0.00001969  
Epoch: [1][460/644] Elapsed 6m 5s (remain 2m 24s) Loss: 0.3818(0.6104) Grad: 93004.7109  LR: 0.00001962  
Epoch: [1][480/644] Elapsed 6m 12s (remain 2m 6s) Loss: 0.7438(0.6106) Grad: 142500.8750  LR: 0.00001954  
Epoch: [1][500/644] Elapsed 6m 19s (remain 1m 48s) Loss: 0.5306(0.6070) Grad: 88711.5781  LR: 0.00001946  
EVAL: [0/126] Loss: 0.8586(0.8586) 
EVAL: [20/126] Loss: 0.5525(0.6152) 
EVAL: [40/126] Loss: 0.6530(0.6160) 
EVAL: [60/126] Loss: 0.6253(0.6217) 
EVAL: [80/126] Loss: 0.6463(0.6342) 
EVAL: [100/126] Loss: 0.5651(0.6417) 
EVAL: [120/126] Loss: 0.7511(0.6380) 


Epoch 1 - avg_train_loss: 0.6070  avg_val_loss: 0.6416


EVAL: [125/126] Loss: 0.3671(0.6416) 


INFO:__main__:Epoch 1 - avg_train_loss: 0.6070  avg_val_loss: 0.6416
Epoch 1 - Score: 0.6602  Scores: [0.5143440204537282, 0.806095121363933]
INFO:__main__:Epoch 1 - Score: 0.6602  Scores: [0.5143440204537282, 0.806095121363933]


Epoch: [1][520/644] Elapsed 7m 13s (remain 1m 42s) Loss: 0.4084(0.6021) Grad: 64478.1953  LR: 0.00001937  
Epoch: [1][540/644] Elapsed 7m 21s (remain 1m 24s) Loss: 0.3142(0.5987) Grad: 58019.0977  LR: 0.00001927  
Epoch: [1][560/644] Elapsed 7m 27s (remain 1m 6s) Loss: 0.4601(0.5957) Grad: 129868.9922  LR: 0.00001917  
Epoch: [1][580/644] Elapsed 7m 33s (remain 0m 49s) Loss: 0.3318(0.5911) Grad: 88167.3438  LR: 0.00001905  
Epoch: [1][600/644] Elapsed 7m 39s (remain 0m 32s) Loss: 0.5228(0.5897) Grad: 104983.8516  LR: 0.00001894  
EVAL: [0/126] Loss: 1.1148(1.1148) 
EVAL: [20/126] Loss: 0.9420(0.9524) 
EVAL: [40/126] Loss: 0.8892(0.9383) 
EVAL: [60/126] Loss: 0.9955(0.9462) 
EVAL: [80/126] Loss: 1.1003(0.9615) 
EVAL: [100/126] Loss: 0.8844(0.9667) 
EVAL: [120/126] Loss: 0.9428(0.9603) 


Epoch 1 - avg_train_loss: 0.5897  avg_val_loss: 0.9646


EVAL: [125/126] Loss: 0.7221(0.9646) 


INFO:__main__:Epoch 1 - avg_train_loss: 0.5897  avg_val_loss: 0.9646
Epoch 1 - Score: 0.9803  Scores: [0.6721479052182132, 1.2884631453507542]
INFO:__main__:Epoch 1 - Score: 0.9803  Scores: [0.6721479052182132, 1.2884631453507542]


Epoch: [1][620/644] Elapsed 8m 34s (remain 0m 19s) Loss: 0.5828(0.5904) Grad: 107000.6328  LR: 0.00001881  
Epoch: [1][640/644] Elapsed 8m 39s (remain 0m 2s) Loss: 0.5294(0.5879) Grad: 97805.0391  LR: 0.00001868  
Epoch: [1][643/644] Elapsed 8m 40s (remain 0m 0s) Loss: 0.4689(0.5874) Grad: 82164.7500  LR: 0.00001866  
EVAL: [0/126] Loss: 0.9740(0.9740) 
EVAL: [20/126] Loss: 0.5809(0.6601) 
EVAL: [40/126] Loss: 0.6432(0.6566) 
EVAL: [60/126] Loss: 0.6717(0.6651) 
EVAL: [80/126] Loss: 0.7241(0.6761) 
EVAL: [100/126] Loss: 0.5871(0.6798) 
EVAL: [120/126] Loss: 0.7150(0.6727) 
EVAL: [125/126] Loss: 0.4305(0.6763) 


Epoch 1 - avg_train_loss: 0.5874  avg_val_loss: 0.6763
INFO:__main__:Epoch 1 - avg_train_loss: 0.5874  avg_val_loss: 0.6763
Epoch 1 - Score: 0.6950  Scores: [0.5928731360045844, 0.7971294894003168]
INFO:__main__:Epoch 1 - Score: 0.6950  Scores: [0.5928731360045844, 0.7971294894003168]


Epoch: [2][0/644] Elapsed 0m 0s (remain 8m 7s) Loss: 0.4134(0.4134) Grad: inf  LR: 0.00001865  
Epoch: [2][20/644] Elapsed 0m 6s (remain 3m 16s) Loss: 0.5437(0.5113) Grad: 152587.5625  LR: 0.00001851  
Epoch: [2][40/644] Elapsed 0m 13s (remain 3m 16s) Loss: 0.5319(0.5185) Grad: 93950.6562  LR: 0.00001837  
Epoch: [2][60/644] Elapsed 0m 20s (remain 3m 20s) Loss: 0.4984(0.4947) Grad: 109105.3438  LR: 0.00001822  
Epoch: [2][80/644] Elapsed 0m 26s (remain 3m 1s) Loss: 0.5948(0.4982) Grad: 93277.1172  LR: 0.00001806  
Epoch: [2][100/644] Elapsed 0m 31s (remain 2m 48s) Loss: 0.6493(0.5039) Grad: 81536.0547  LR: 0.00001790  
EVAL: [0/126] Loss: 0.9975(0.9975) 
EVAL: [20/126] Loss: 0.7219(0.7664) 
EVAL: [40/126] Loss: 0.7455(0.7536) 
EVAL: [60/126] Loss: 0.7666(0.7611) 
EVAL: [80/126] Loss: 0.8360(0.7768) 
EVAL: [100/126] Loss: 0.7205(0.7849) 
EVAL: [120/126] Loss: 0.8391(0.7790) 
EVAL: [125/126] Loss: 0.5186(0.7832) 


Epoch 2 - avg_train_loss: 0.5039  avg_val_loss: 0.7832
INFO:__main__:Epoch 2 - avg_train_loss: 0.5039  avg_val_loss: 0.7832
Epoch 2 - Score: 0.8017  Scores: [0.669156745563339, 0.9342042792795642]
INFO:__main__:Epoch 2 - Score: 0.8017  Scores: [0.669156745563339, 0.9342042792795642]


Epoch: [2][120/644] Elapsed 1m 26s (remain 6m 15s) Loss: 0.3648(0.4959) Grad: 55104.7227  LR: 0.00001773  
Epoch: [2][140/644] Elapsed 1m 32s (remain 5m 29s) Loss: 0.5354(0.4956) Grad: 88857.7109  LR: 0.00001755  
Epoch: [2][160/644] Elapsed 1m 38s (remain 4m 54s) Loss: 0.3666(0.4965) Grad: 76352.4531  LR: 0.00001737  
Epoch: [2][180/644] Elapsed 1m 44s (remain 4m 27s) Loss: 0.4886(0.4934) Grad: 79159.5078  LR: 0.00001719  
Epoch: [2][200/644] Elapsed 1m 49s (remain 4m 1s) Loss: 0.3277(0.4895) Grad: 87295.9922  LR: 0.00001700  
EVAL: [0/126] Loss: 0.8772(0.8772) 
EVAL: [20/126] Loss: 0.5010(0.5862) 
EVAL: [40/126] Loss: 0.6247(0.5909) 
EVAL: [60/126] Loss: 0.5758(0.5971) 
EVAL: [80/126] Loss: 0.5853(0.6079) 
EVAL: [100/126] Loss: 0.5226(0.6133) 
EVAL: [120/126] Loss: 0.7066(0.6103) 
EVAL: [125/126] Loss: 0.3333(0.6129) 


Epoch 2 - Save Best Score: 0.6319 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6319 Model
Epoch 2 - avg_train_loss: 0.4895  avg_val_loss: 0.6129
INFO:__main__:Epoch 2 - avg_train_loss: 0.4895  avg_val_loss: 0.6129
Epoch 2 - Score: 0.6319  Scores: [0.49017247465103336, 0.7735725576612498]
INFO:__main__:Epoch 2 - Score: 0.6319  Scores: [0.49017247465103336, 0.7735725576612498]


Epoch: [2][220/644] Elapsed 2m 53s (remain 5m 31s) Loss: 0.3720(0.4854) Grad: 68589.1094  LR: 0.00001680  
Epoch: [2][240/644] Elapsed 3m 1s (remain 5m 4s) Loss: 0.7388(0.4823) Grad: 120981.4609  LR: 0.00001660  
Epoch: [2][260/644] Elapsed 3m 7s (remain 4m 35s) Loss: 0.3728(0.4793) Grad: 106112.7031  LR: 0.00001639  
Epoch: [2][280/644] Elapsed 3m 14s (remain 4m 11s) Loss: 0.3936(0.4749) Grad: 93605.8203  LR: 0.00001618  
Epoch: [2][300/644] Elapsed 3m 22s (remain 3m 51s) Loss: 0.4512(0.4725) Grad: 66709.4531  LR: 0.00001597  
EVAL: [0/126] Loss: 1.0468(1.0468) 
EVAL: [20/126] Loss: 0.7730(0.8097) 
EVAL: [40/126] Loss: 0.7908(0.8032) 
EVAL: [60/126] Loss: 0.8314(0.8102) 
EVAL: [80/126] Loss: 0.9060(0.8257) 
EVAL: [100/126] Loss: 0.7787(0.8327) 
EVAL: [120/126] Loss: 0.8585(0.8269) 
EVAL: [125/126] Loss: 0.5576(0.8309) 


Epoch 2 - avg_train_loss: 0.4725  avg_val_loss: 0.8309
INFO:__main__:Epoch 2 - avg_train_loss: 0.4725  avg_val_loss: 0.8309
Epoch 2 - Score: 0.8479  Scores: [0.5990875713981092, 1.096701875818099]
INFO:__main__:Epoch 2 - Score: 0.8479  Scores: [0.5990875713981092, 1.096701875818099]


Epoch: [2][320/644] Elapsed 4m 17s (remain 4m 18s) Loss: 0.2752(0.4713) Grad: 88750.3828  LR: 0.00001575  
Epoch: [2][340/644] Elapsed 4m 23s (remain 3m 54s) Loss: 0.3373(0.4702) Grad: 49061.1875  LR: 0.00001553  
Epoch: [2][360/644] Elapsed 4m 29s (remain 3m 30s) Loss: 0.4466(0.4731) Grad: 66602.6172  LR: 0.00001530  
Epoch: [2][380/644] Elapsed 4m 34s (remain 3m 9s) Loss: 0.3153(0.4713) Grad: 28827.2910  LR: 0.00001507  
Epoch: [2][400/644] Elapsed 4m 40s (remain 2m 50s) Loss: 0.4241(0.4698) Grad: 50381.6992  LR: 0.00001483  
EVAL: [0/126] Loss: 0.9632(0.9632) 
EVAL: [20/126] Loss: 0.4890(0.6097) 
EVAL: [40/126] Loss: 0.6419(0.6128) 
EVAL: [60/126] Loss: 0.5716(0.6192) 
EVAL: [80/126] Loss: 0.6037(0.6357) 
EVAL: [100/126] Loss: 0.5027(0.6427) 
EVAL: [120/126] Loss: 0.7404(0.6392) 
EVAL: [125/126] Loss: 0.3526(0.6438) 


Epoch 2 - avg_train_loss: 0.4698  avg_val_loss: 0.6438
INFO:__main__:Epoch 2 - avg_train_loss: 0.4698  avg_val_loss: 0.6438
Epoch 2 - Score: 0.6651  Scores: [0.5332088358867784, 0.7969808873622473]
INFO:__main__:Epoch 2 - Score: 0.6651  Scores: [0.5332088358867784, 0.7969808873622473]


Epoch: [2][420/644] Elapsed 5m 35s (remain 2m 57s) Loss: 0.4159(0.4712) Grad: 150135.8438  LR: 0.00001459  
Epoch: [2][440/644] Elapsed 5m 41s (remain 2m 37s) Loss: 0.6357(0.4727) Grad: 28121.9023  LR: 0.00001435  
Epoch: [2][460/644] Elapsed 5m 47s (remain 2m 17s) Loss: 0.4565(0.4748) Grad: 88905.7344  LR: 0.00001410  
Epoch: [2][480/644] Elapsed 5m 52s (remain 1m 59s) Loss: 0.4287(0.4723) Grad: 113873.2031  LR: 0.00001386  
Epoch: [2][500/644] Elapsed 5m 58s (remain 1m 42s) Loss: 0.4682(0.4705) Grad: 124304.4844  LR: 0.00001361  
EVAL: [0/126] Loss: 0.9298(0.9298) 
EVAL: [20/126] Loss: 0.6265(0.6938) 
EVAL: [40/126] Loss: 0.7219(0.6845) 
EVAL: [60/126] Loss: 0.6457(0.6957) 
EVAL: [80/126] Loss: 0.7283(0.7164) 
EVAL: [100/126] Loss: 0.6141(0.7244) 
EVAL: [120/126] Loss: 0.7842(0.7195) 
EVAL: [125/126] Loss: 0.4915(0.7246) 


Epoch 2 - avg_train_loss: 0.4705  avg_val_loss: 0.7246
INFO:__main__:Epoch 2 - avg_train_loss: 0.4705  avg_val_loss: 0.7246
Epoch 2 - Score: 0.7479  Scores: [0.5888310056680188, 0.9068742079109733]
INFO:__main__:Epoch 2 - Score: 0.7479  Scores: [0.5888310056680188, 0.9068742079109733]


Epoch: [2][520/644] Elapsed 6m 54s (remain 1m 37s) Loss: 0.4708(0.4704) Grad: 148212.2969  LR: 0.00001335  
Epoch: [2][540/644] Elapsed 6m 59s (remain 1m 19s) Loss: 0.3886(0.4692) Grad: 26278.0137  LR: 0.00001310  
Epoch: [2][560/644] Elapsed 7m 5s (remain 1m 2s) Loss: 0.4251(0.4676) Grad: 91927.0938  LR: 0.00001284  
Epoch: [2][580/644] Elapsed 7m 11s (remain 0m 46s) Loss: 0.4113(0.4659) Grad: 91210.8750  LR: 0.00001258  
Epoch: [2][600/644] Elapsed 7m 16s (remain 0m 31s) Loss: 0.3185(0.4641) Grad: 79207.8750  LR: 0.00001231  
EVAL: [0/126] Loss: 0.8870(0.8870) 
EVAL: [20/126] Loss: 0.5715(0.6524) 
EVAL: [40/126] Loss: 0.6369(0.6406) 
EVAL: [60/126] Loss: 0.6321(0.6511) 
EVAL: [80/126] Loss: 0.6976(0.6681) 
EVAL: [100/126] Loss: 0.5742(0.6763) 
EVAL: [120/126] Loss: 0.7473(0.6703) 
EVAL: [125/126] Loss: 0.4804(0.6748) 


Epoch 2 - avg_train_loss: 0.4641  avg_val_loss: 0.6748
INFO:__main__:Epoch 2 - avg_train_loss: 0.4641  avg_val_loss: 0.6748
Epoch 2 - Score: 0.6965  Scores: [0.5618424795923059, 0.8311213403570183]
INFO:__main__:Epoch 2 - Score: 0.6965  Scores: [0.5618424795923059, 0.8311213403570183]


Epoch: [2][620/644] Elapsed 8m 11s (remain 0m 18s) Loss: 0.3680(0.4634) Grad: 156715.0312  LR: 0.00001205  
Epoch: [2][640/644] Elapsed 8m 17s (remain 0m 2s) Loss: 0.4549(0.4619) Grad: 51474.0742  LR: 0.00001178  
Epoch: [2][643/644] Elapsed 8m 17s (remain 0m 0s) Loss: 0.5043(0.4621) Grad: 56687.3555  LR: 0.00001174  
EVAL: [0/126] Loss: 0.8659(0.8659) 
EVAL: [20/126] Loss: 0.5608(0.6239) 
EVAL: [40/126] Loss: 0.6334(0.6177) 
EVAL: [60/126] Loss: 0.6063(0.6253) 
EVAL: [80/126] Loss: 0.6680(0.6376) 
EVAL: [100/126] Loss: 0.5932(0.6452) 
EVAL: [120/126] Loss: 0.7157(0.6403) 
EVAL: [125/126] Loss: 0.4268(0.6436) 


Epoch 2 - avg_train_loss: 0.4621  avg_val_loss: 0.6436
INFO:__main__:Epoch 2 - avg_train_loss: 0.4621  avg_val_loss: 0.6436
Epoch 2 - Score: 0.6620  Scores: [0.536640420919037, 0.7873234830285725]
INFO:__main__:Epoch 2 - Score: 0.6620  Scores: [0.536640420919037, 0.7873234830285725]


Epoch: [3][0/644] Elapsed 0m 0s (remain 6m 11s) Loss: 0.3033(0.3033) Grad: inf  LR: 0.00001173  
Epoch: [3][20/644] Elapsed 0m 6s (remain 3m 7s) Loss: 0.3878(0.4128) Grad: 124841.0938  LR: 0.00001146  
Epoch: [3][40/644] Elapsed 0m 12s (remain 3m 6s) Loss: 0.4397(0.3980) Grad: 112090.6719  LR: 0.00001120  
Epoch: [3][60/644] Elapsed 0m 17s (remain 2m 47s) Loss: 0.4964(0.3978) Grad: 36498.3320  LR: 0.00001093  
Epoch: [3][80/644] Elapsed 0m 23s (remain 2m 40s) Loss: 0.4070(0.3957) Grad: 25304.5059  LR: 0.00001066  
Epoch: [3][100/644] Elapsed 0m 29s (remain 2m 40s) Loss: 0.4022(0.3950) Grad: 106133.1172  LR: 0.00001039  
EVAL: [0/126] Loss: 0.8378(0.8378) 
EVAL: [20/126] Loss: 0.5259(0.6033) 
EVAL: [40/126] Loss: 0.6248(0.5977) 
EVAL: [60/126] Loss: 0.5929(0.6072) 
EVAL: [80/126] Loss: 0.6010(0.6177) 
EVAL: [100/126] Loss: 0.5685(0.6254) 
EVAL: [120/126] Loss: 0.7156(0.6207) 
EVAL: [125/126] Loss: 0.4288(0.6237) 


Epoch 3 - avg_train_loss: 0.3950  avg_val_loss: 0.6237
INFO:__main__:Epoch 3 - avg_train_loss: 0.3950  avg_val_loss: 0.6237
Epoch 3 - Score: 0.6399  Scores: [0.5477819397762593, 0.732000473956585]
INFO:__main__:Epoch 3 - Score: 0.6399  Scores: [0.5477819397762593, 0.732000473956585]


Epoch: [3][120/644] Elapsed 1m 24s (remain 6m 3s) Loss: 0.5064(0.3961) Grad: 87243.2266  LR: 0.00001012  
Epoch: [3][140/644] Elapsed 1m 29s (remain 5m 19s) Loss: 0.6545(0.3947) Grad: 76375.4375  LR: 0.00000984  
Epoch: [3][160/644] Elapsed 1m 35s (remain 4m 46s) Loss: 0.4525(0.3906) Grad: 112752.9453  LR: 0.00000957  
Epoch: [3][180/644] Elapsed 1m 40s (remain 4m 17s) Loss: 0.3403(0.3868) Grad: 62242.6367  LR: 0.00000930  
Epoch: [3][200/644] Elapsed 1m 46s (remain 3m 54s) Loss: 0.3880(0.3842) Grad: 100434.0781  LR: 0.00000903  
EVAL: [0/126] Loss: 0.9170(0.9170) 
EVAL: [20/126] Loss: 0.5788(0.6553) 
EVAL: [40/126] Loss: 0.6572(0.6478) 
EVAL: [60/126] Loss: 0.6172(0.6566) 
EVAL: [80/126] Loss: 0.6983(0.6740) 
EVAL: [100/126] Loss: 0.5686(0.6825) 
EVAL: [120/126] Loss: 0.7187(0.6768) 
EVAL: [125/126] Loss: 0.4259(0.6814) 


Epoch 3 - avg_train_loss: 0.3842  avg_val_loss: 0.6814
INFO:__main__:Epoch 3 - avg_train_loss: 0.3842  avg_val_loss: 0.6814
Epoch 3 - Score: 0.7038  Scores: [0.5343915157753103, 0.873208101071687]
INFO:__main__:Epoch 3 - Score: 0.7038  Scores: [0.5343915157753103, 0.873208101071687]


Epoch: [3][220/644] Elapsed 2m 41s (remain 5m 8s) Loss: 0.4153(0.3821) Grad: 145395.2812  LR: 0.00000876  
Epoch: [3][240/644] Elapsed 2m 46s (remain 4m 38s) Loss: 0.2841(0.3830) Grad: 99802.6016  LR: 0.00000850  
Epoch: [3][260/644] Elapsed 2m 52s (remain 4m 12s) Loss: 0.2499(0.3839) Grad: 19409.1387  LR: 0.00000823  
Epoch: [3][280/644] Elapsed 2m 58s (remain 3m 50s) Loss: 0.5797(0.3851) Grad: 69679.5547  LR: 0.00000796  
Epoch: [3][300/644] Elapsed 3m 5s (remain 3m 31s) Loss: 0.4708(0.3865) Grad: 93948.9609  LR: 0.00000770  
EVAL: [0/126] Loss: 0.8658(0.8658) 
EVAL: [20/126] Loss: 0.5420(0.5976) 
EVAL: [40/126] Loss: 0.6169(0.6048) 
EVAL: [60/126] Loss: 0.5600(0.6102) 
EVAL: [80/126] Loss: 0.5903(0.6201) 
EVAL: [100/126] Loss: 0.5290(0.6272) 
EVAL: [120/126] Loss: 0.6694(0.6253) 
EVAL: [125/126] Loss: 0.3445(0.6277) 


Epoch 3 - avg_train_loss: 0.3865  avg_val_loss: 0.6277
INFO:__main__:Epoch 3 - avg_train_loss: 0.3865  avg_val_loss: 0.6277
Epoch 3 - Score: 0.6469  Scores: [0.5070012616208033, 0.7868498681076079]
INFO:__main__:Epoch 3 - Score: 0.6469  Scores: [0.5070012616208033, 0.7868498681076079]


Epoch: [3][320/644] Elapsed 4m 1s (remain 4m 2s) Loss: 0.4593(0.3864) Grad: 70667.4531  LR: 0.00000744  
Epoch: [3][340/644] Elapsed 4m 6s (remain 3m 38s) Loss: 0.4529(0.3875) Grad: 97906.3906  LR: 0.00000718  
Epoch: [3][360/644] Elapsed 4m 11s (remain 3m 17s) Loss: 0.4231(0.3876) Grad: 82499.5234  LR: 0.00000692  
Epoch: [3][380/644] Elapsed 4m 18s (remain 2m 58s) Loss: 0.3671(0.3875) Grad: 59540.7422  LR: 0.00000666  
Epoch: [3][400/644] Elapsed 4m 23s (remain 2m 39s) Loss: 0.4898(0.3857) Grad: 114304.9766  LR: 0.00000641  
EVAL: [0/126] Loss: 0.8277(0.8277) 
EVAL: [20/126] Loss: 0.5322(0.5808) 
EVAL: [40/126] Loss: 0.6059(0.5863) 
EVAL: [60/126] Loss: 0.5754(0.5889) 
EVAL: [80/126] Loss: 0.5748(0.5953) 
EVAL: [100/126] Loss: 0.5372(0.6001) 
EVAL: [120/126] Loss: 0.6476(0.5986) 
EVAL: [125/126] Loss: 0.3685(0.6002) 


Epoch 3 - Save Best Score: 0.6162 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.6162 Model
Epoch 3 - avg_train_loss: 0.3857  avg_val_loss: 0.6002
INFO:__main__:Epoch 3 - avg_train_loss: 0.3857  avg_val_loss: 0.6002
Epoch 3 - Score: 0.6162  Scores: [0.5003494744174944, 0.7320634278404903]
INFO:__main__:Epoch 3 - Score: 0.6162  Scores: [0.5003494744174944, 0.7320634278404903]


Epoch: [3][420/644] Elapsed 5m 37s (remain 2m 58s) Loss: 0.6725(0.3847) Grad: 89854.7578  LR: 0.00000616  
Epoch: [3][440/644] Elapsed 5m 43s (remain 2m 38s) Loss: 0.1889(0.3844) Grad: 27667.9922  LR: 0.00000591  
Epoch: [3][460/644] Elapsed 5m 50s (remain 2m 19s) Loss: 0.4089(0.3837) Grad: 69401.5547  LR: 0.00000566  
Epoch: [3][480/644] Elapsed 5m 57s (remain 2m 1s) Loss: 0.4020(0.3834) Grad: 76431.7109  LR: 0.00000542  
Epoch: [3][500/644] Elapsed 6m 3s (remain 1m 43s) Loss: 0.4465(0.3832) Grad: 110995.8594  LR: 0.00000518  
EVAL: [0/126] Loss: 0.8553(0.8553) 
EVAL: [20/126] Loss: 0.5900(0.6431) 
EVAL: [40/126] Loss: 0.6630(0.6339) 
EVAL: [60/126] Loss: 0.6513(0.6384) 
EVAL: [80/126] Loss: 0.6554(0.6486) 
EVAL: [100/126] Loss: 0.6253(0.6555) 
EVAL: [120/126] Loss: 0.7324(0.6512) 
EVAL: [125/126] Loss: 0.4306(0.6539) 


Epoch 3 - avg_train_loss: 0.3832  avg_val_loss: 0.6539
INFO:__main__:Epoch 3 - avg_train_loss: 0.3832  avg_val_loss: 0.6539
Epoch 3 - Score: 0.6698  Scores: [0.5507786203927757, 0.7888251044316367]
INFO:__main__:Epoch 3 - Score: 0.6698  Scores: [0.5507786203927757, 0.7888251044316367]


Epoch: [3][520/644] Elapsed 6m 59s (remain 1m 38s) Loss: 0.3722(0.3822) Grad: 45632.9766  LR: 0.00000495  
Epoch: [3][540/644] Elapsed 7m 4s (remain 1m 20s) Loss: 0.3471(0.3816) Grad: 62076.5703  LR: 0.00000471  
Epoch: [3][560/644] Elapsed 7m 9s (remain 1m 3s) Loss: 0.3510(0.3817) Grad: 51229.0000  LR: 0.00000449  
Epoch: [3][580/644] Elapsed 7m 15s (remain 0m 47s) Loss: 0.3492(0.3818) Grad: 54669.2695  LR: 0.00000426  
Epoch: [3][600/644] Elapsed 7m 21s (remain 0m 31s) Loss: 0.3070(0.3822) Grad: 34296.8047  LR: 0.00000404  
EVAL: [0/126] Loss: 0.9005(0.9005) 
EVAL: [20/126] Loss: 0.5511(0.6370) 
EVAL: [40/126] Loss: 0.6483(0.6298) 
EVAL: [60/126] Loss: 0.6062(0.6386) 
EVAL: [80/126] Loss: 0.6368(0.6550) 
EVAL: [100/126] Loss: 0.5614(0.6632) 
EVAL: [120/126] Loss: 0.7283(0.6581) 


Epoch 3 - avg_train_loss: 0.3822  avg_val_loss: 0.6627
INFO:__main__:Epoch 3 - avg_train_loss: 0.3822  avg_val_loss: 0.6627
Epoch 3 - Score: 0.6842  Scores: [0.538281419690178, 0.8301125693156368]
INFO:__main__:Epoch 3 - Score: 0.6842  Scores: [0.538281419690178, 0.8301125693156368]


EVAL: [125/126] Loss: 0.4370(0.6627) 
Epoch: [3][620/644] Elapsed 8m 15s (remain 0m 18s) Loss: 0.3542(0.3819) Grad: 74957.4141  LR: 0.00000383  
Epoch: [3][640/644] Elapsed 8m 21s (remain 0m 2s) Loss: 0.3916(0.3816) Grad: 28139.7500  LR: 0.00000362  
Epoch: [3][643/644] Elapsed 8m 22s (remain 0m 0s) Loss: 0.3490(0.3813) Grad: 109343.4219  LR: 0.00000359  
EVAL: [0/126] Loss: 0.8640(0.8640) 
EVAL: [20/126] Loss: 0.5266(0.5963) 
EVAL: [40/126] Loss: 0.6242(0.5971) 
EVAL: [60/126] Loss: 0.5761(0.6040) 
EVAL: [80/126] Loss: 0.5927(0.6173) 
EVAL: [100/126] Loss: 0.5225(0.6253) 
EVAL: [120/126] Loss: 0.6858(0.6225) 
EVAL: [125/126] Loss: 0.3596(0.6262) 


Epoch 3 - avg_train_loss: 0.3813  avg_val_loss: 0.6262
INFO:__main__:Epoch 3 - avg_train_loss: 0.3813  avg_val_loss: 0.6262
Epoch 3 - Score: 0.6467  Scores: [0.4822970715450143, 0.8111770068456532]
INFO:__main__:Epoch 3 - Score: 0.6467  Scores: [0.4822970715450143, 0.8111770068456532]


Epoch: [4][0/644] Elapsed 0m 0s (remain 9m 9s) Loss: 0.3104(0.3104) Grad: inf  LR: 0.00000358  
Epoch: [4][20/644] Elapsed 0m 5s (remain 2m 48s) Loss: 0.2972(0.2949) Grad: 106564.5234  LR: 0.00000337  
Epoch: [4][40/644] Elapsed 0m 10s (remain 2m 41s) Loss: 0.2317(0.3085) Grad: 20741.9375  LR: 0.00000317  
Epoch: [4][60/644] Elapsed 0m 17s (remain 2m 49s) Loss: 0.1988(0.3078) Grad: 54788.0703  LR: 0.00000297  
Epoch: [4][80/644] Elapsed 0m 23s (remain 2m 42s) Loss: 0.3226(0.3133) Grad: 63291.1016  LR: 0.00000278  
Epoch: [4][100/644] Elapsed 0m 28s (remain 2m 35s) Loss: 0.2665(0.3154) Grad: 116032.0703  LR: 0.00000260  
EVAL: [0/126] Loss: 0.8748(0.8748) 
EVAL: [20/126] Loss: 0.5574(0.6253) 
EVAL: [40/126] Loss: 0.6383(0.6214) 
EVAL: [60/126] Loss: 0.6108(0.6265) 
EVAL: [80/126] Loss: 0.6374(0.6385) 
EVAL: [100/126] Loss: 0.5671(0.6462) 
EVAL: [120/126] Loss: 0.6900(0.6417) 
EVAL: [125/126] Loss: 0.3941(0.6452) 


Epoch 4 - avg_train_loss: 0.3154  avg_val_loss: 0.6452
INFO:__main__:Epoch 4 - avg_train_loss: 0.3154  avg_val_loss: 0.6452
Epoch 4 - Score: 0.6639  Scores: [0.5068854449214184, 0.8208968213848035]
INFO:__main__:Epoch 4 - Score: 0.6639  Scores: [0.5068854449214184, 0.8208968213848035]


Epoch: [4][120/644] Elapsed 1m 24s (remain 6m 3s) Loss: 0.2526(0.3139) Grad: 55007.5234  LR: 0.00000242  
Epoch: [4][140/644] Elapsed 1m 29s (remain 5m 19s) Loss: 0.3434(0.3169) Grad: 24284.6875  LR: 0.00000225  
Epoch: [4][160/644] Elapsed 1m 35s (remain 4m 45s) Loss: 0.2536(0.3174) Grad: 50165.6133  LR: 0.00000208  
Epoch: [4][180/644] Elapsed 1m 41s (remain 4m 18s) Loss: 0.3269(0.3177) Grad: 66724.2891  LR: 0.00000192  
Epoch: [4][200/644] Elapsed 1m 46s (remain 3m 54s) Loss: 0.3178(0.3156) Grad: 54594.1445  LR: 0.00000176  
EVAL: [0/126] Loss: 0.8710(0.8710) 
EVAL: [20/126] Loss: 0.5543(0.6228) 
EVAL: [40/126] Loss: 0.6396(0.6196) 
EVAL: [60/126] Loss: 0.5956(0.6257) 
EVAL: [80/126] Loss: 0.6360(0.6390) 
EVAL: [100/126] Loss: 0.5655(0.6475) 
EVAL: [120/126] Loss: 0.6962(0.6432) 
EVAL: [125/126] Loss: 0.3849(0.6469) 


Epoch 4 - avg_train_loss: 0.3156  avg_val_loss: 0.6469
INFO:__main__:Epoch 4 - avg_train_loss: 0.3156  avg_val_loss: 0.6469
Epoch 4 - Score: 0.6668  Scores: [0.500369977854, 0.833242454355808]
INFO:__main__:Epoch 4 - Score: 0.6668  Scores: [0.500369977854, 0.833242454355808]


Epoch: [4][220/644] Elapsed 2m 41s (remain 5m 9s) Loss: 0.2685(0.3149) Grad: 34128.9219  LR: 0.00000161  
Epoch: [4][240/644] Elapsed 2m 47s (remain 4m 39s) Loss: 0.4392(0.3143) Grad: 66510.3359  LR: 0.00000147  
Epoch: [4][260/644] Elapsed 2m 52s (remain 4m 13s) Loss: 0.3251(0.3133) Grad: 51821.5508  LR: 0.00000133  
Epoch: [4][280/644] Elapsed 2m 59s (remain 3m 51s) Loss: 0.3442(0.3120) Grad: 37440.6172  LR: 0.00000120  
Epoch: [4][300/644] Elapsed 3m 4s (remain 3m 30s) Loss: 0.2735(0.3118) Grad: 142818.3281  LR: 0.00000107  
EVAL: [0/126] Loss: 0.8639(0.8639) 
EVAL: [20/126] Loss: 0.5351(0.6101) 
EVAL: [40/126] Loss: 0.6335(0.6073) 
EVAL: [60/126] Loss: 0.5825(0.6127) 
EVAL: [80/126] Loss: 0.6132(0.6253) 
EVAL: [100/126] Loss: 0.5595(0.6335) 
EVAL: [120/126] Loss: 0.6892(0.6295) 


Epoch 4 - avg_train_loss: 0.3118  avg_val_loss: 0.6329


EVAL: [125/126] Loss: 0.3652(0.6329) 


INFO:__main__:Epoch 4 - avg_train_loss: 0.3118  avg_val_loss: 0.6329
Epoch 4 - Score: 0.6522  Scores: [0.4915504909079061, 0.8127894201394962]
INFO:__main__:Epoch 4 - Score: 0.6522  Scores: [0.4915504909079061, 0.8127894201394962]


Epoch: [4][320/644] Elapsed 3m 59s (remain 4m 0s) Loss: 0.2687(0.3108) Grad: 107384.4297  LR: 0.00000095  
Epoch: [4][340/644] Elapsed 4m 5s (remain 3m 37s) Loss: 0.1922(0.3105) Grad: 48501.5273  LR: 0.00000084  
Epoch: [4][360/644] Elapsed 4m 10s (remain 3m 16s) Loss: 0.2468(0.3089) Grad: 34780.6445  LR: 0.00000073  
Epoch: [4][380/644] Elapsed 4m 15s (remain 2m 56s) Loss: 0.3478(0.3074) Grad: 73973.9844  LR: 0.00000064  
Epoch: [4][400/644] Elapsed 4m 22s (remain 2m 39s) Loss: 0.2794(0.3082) Grad: 57768.0859  LR: 0.00000054  
EVAL: [0/126] Loss: 0.8832(0.8832) 
EVAL: [20/126] Loss: 0.5427(0.6236) 
EVAL: [40/126] Loss: 0.6429(0.6193) 
EVAL: [60/126] Loss: 0.5913(0.6258) 
EVAL: [80/126] Loss: 0.6290(0.6397) 
EVAL: [100/126] Loss: 0.5669(0.6482) 
EVAL: [120/126] Loss: 0.7025(0.6433) 
EVAL: [125/126] Loss: 0.3930(0.6472) 


Epoch 4 - avg_train_loss: 0.3082  avg_val_loss: 0.6472
INFO:__main__:Epoch 4 - avg_train_loss: 0.3082  avg_val_loss: 0.6472
Epoch 4 - Score: 0.6675  Scores: [0.5035641040939466, 0.8314628262750612]
INFO:__main__:Epoch 4 - Score: 0.6675  Scores: [0.5035641040939466, 0.8314628262750612]


Epoch: [4][420/644] Elapsed 5m 16s (remain 2m 47s) Loss: 0.4876(0.3092) Grad: 97232.0078  LR: 0.00000046  
Epoch: [4][440/644] Elapsed 5m 22s (remain 2m 28s) Loss: 0.4361(0.3094) Grad: 33879.2617  LR: 0.00000038  
Epoch: [4][460/644] Elapsed 5m 28s (remain 2m 10s) Loss: 0.3677(0.3099) Grad: 36180.7695  LR: 0.00000031  
Epoch: [4][480/644] Elapsed 5m 34s (remain 1m 53s) Loss: 0.3284(0.3107) Grad: 74766.4766  LR: 0.00000025  
Epoch: [4][500/644] Elapsed 5m 42s (remain 1m 37s) Loss: 0.3397(0.3105) Grad: 22226.3105  LR: 0.00000019  
EVAL: [0/126] Loss: 0.8847(0.8847) 
EVAL: [20/126] Loss: 0.5281(0.6143) 
EVAL: [40/126] Loss: 0.6332(0.6108) 
EVAL: [60/126] Loss: 0.5798(0.6171) 
EVAL: [80/126] Loss: 0.6178(0.6305) 
EVAL: [100/126] Loss: 0.5571(0.6388) 
EVAL: [120/126] Loss: 0.6942(0.6342) 
EVAL: [125/126] Loss: 0.3805(0.6379) 


Epoch 4 - avg_train_loss: 0.3105  avg_val_loss: 0.6379
INFO:__main__:Epoch 4 - avg_train_loss: 0.3105  avg_val_loss: 0.6379
Epoch 4 - Score: 0.6579  Scores: [0.5040516159071486, 0.8116504518276794]
INFO:__main__:Epoch 4 - Score: 0.6579  Scores: [0.5040516159071486, 0.8116504518276794]


Epoch: [4][520/644] Elapsed 6m 38s (remain 1m 33s) Loss: 0.3618(0.3119) Grad: 77555.3906  LR: 0.00000014  
Epoch: [4][540/644] Elapsed 6m 43s (remain 1m 16s) Loss: 0.4033(0.3121) Grad: 63056.1211  LR: 0.00000010  
Epoch: [4][560/644] Elapsed 6m 49s (remain 1m 0s) Loss: 0.2715(0.3124) Grad: 86758.2266  LR: 0.00000007  
Epoch: [4][580/644] Elapsed 6m 54s (remain 0m 44s) Loss: 0.3118(0.3123) Grad: 48254.1641  LR: 0.00000004  
Epoch: [4][600/644] Elapsed 6m 59s (remain 0m 30s) Loss: 0.5087(0.3120) Grad: 40863.5898  LR: 0.00000002  
EVAL: [0/126] Loss: 0.8813(0.8813) 
EVAL: [20/126] Loss: 0.5284(0.6119) 
EVAL: [40/126] Loss: 0.6326(0.6091) 
EVAL: [60/126] Loss: 0.5782(0.6151) 
EVAL: [80/126] Loss: 0.6155(0.6283) 
EVAL: [100/126] Loss: 0.5550(0.6366) 
EVAL: [120/126] Loss: 0.6901(0.6322) 
EVAL: [125/126] Loss: 0.3739(0.6359) 


Epoch 4 - avg_train_loss: 0.3120  avg_val_loss: 0.6359
INFO:__main__:Epoch 4 - avg_train_loss: 0.3120  avg_val_loss: 0.6359
Epoch 4 - Score: 0.6558  Scores: [0.49700087437481344, 0.8145193845838]
INFO:__main__:Epoch 4 - Score: 0.6558  Scores: [0.49700087437481344, 0.8145193845838]


Epoch: [4][620/644] Elapsed 7m 55s (remain 0m 17s) Loss: 0.2644(0.3116) Grad: 78095.2812  LR: 0.00000001  
Epoch: [4][640/644] Elapsed 8m 1s (remain 0m 2s) Loss: 0.2652(0.3115) Grad: 118530.3438  LR: 0.00000000  
Epoch: [4][643/644] Elapsed 8m 2s (remain 0m 0s) Loss: 0.2899(0.3112) Grad: 53183.5781  LR: 0.00000000  
EVAL: [0/126] Loss: 0.8816(0.8816) 
EVAL: [20/126] Loss: 0.5291(0.6125) 
EVAL: [40/126] Loss: 0.6331(0.6096) 
EVAL: [60/126] Loss: 0.5788(0.6156) 
EVAL: [80/126] Loss: 0.6164(0.6289) 
EVAL: [100/126] Loss: 0.5557(0.6372) 
EVAL: [120/126] Loss: 0.6906(0.6328) 
EVAL: [125/126] Loss: 0.3747(0.6364) 


Epoch 4 - avg_train_loss: 0.3112  avg_val_loss: 0.6364
INFO:__main__:Epoch 4 - avg_train_loss: 0.3112  avg_val_loss: 0.6364
Epoch 4 - Score: 0.6563  Scores: [0.4973836701470568, 0.8153007805272662]
INFO:__main__:Epoch 4 - Score: 0.6563  Scores: [0.4973836701470568, 0.8153007805272662]
Score: 0.6162  Scores: [0.5003494744174944, 0.7320634278404903]
INFO:__main__:Score: 0.6162  Scores: [0.5003494744174944, 0.7320634278404903]
DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-large",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_drop

Epoch: [1][0/646] Elapsed 0m 0s (remain 6m 6s) Loss: 2.1080(2.1080) Grad: inf  LR: 0.00000008  
Epoch: [1][20/646] Elapsed 0m 6s (remain 3m 14s) Loss: 1.6752(2.1798) Grad: 193463.1875  LR: 0.00000163  
Epoch: [1][40/646] Elapsed 0m 12s (remain 3m 2s) Loss: 1.0901(1.6213) Grad: 123770.1250  LR: 0.00000318  
Epoch: [1][60/646] Elapsed 0m 17s (remain 2m 50s) Loss: 0.7164(1.3291) Grad: 145292.7500  LR: 0.00000473  
Epoch: [1][80/646] Elapsed 0m 23s (remain 2m 43s) Loss: 0.7092(1.1686) Grad: 160711.5625  LR: 0.00000628  
Epoch: [1][100/646] Elapsed 0m 29s (remain 2m 39s) Loss: 0.9710(1.0741) Grad: 114785.9766  LR: 0.00000783  
EVAL: [0/125] Loss: 0.7825(0.7825) 
EVAL: [20/125] Loss: 0.6392(0.6229) 
EVAL: [40/125] Loss: 0.7755(0.6529) 
EVAL: [60/125] Loss: 0.6562(0.6398) 
EVAL: [80/125] Loss: 0.4666(0.6315) 
EVAL: [100/125] Loss: 0.8640(0.6249) 
EVAL: [120/125] Loss: 0.6690(0.6259) 
EVAL: [124/125] Loss: 0.6071(0.6239) 


Epoch 1 - Save Best Score: 0.6351 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6351 Model
Epoch 1 - avg_train_loss: 1.0741  avg_val_loss: 0.6239
INFO:__main__:Epoch 1 - avg_train_loss: 1.0741  avg_val_loss: 0.6239
Epoch 1 - Score: 0.6351  Scores: [0.56658158001388, 0.7037016232243923]
INFO:__main__:Epoch 1 - Score: 0.6351  Scores: [0.56658158001388, 0.7037016232243923]


Epoch: [1][120/646] Elapsed 1m 31s (remain 6m 37s) Loss: 0.4525(0.9948) Grad: 42761.0859  LR: 0.00000938  
Epoch: [1][140/646] Elapsed 1m 37s (remain 5m 48s) Loss: 1.0956(0.9326) Grad: 103605.1719  LR: 0.00001093  
Epoch: [1][160/646] Elapsed 1m 46s (remain 5m 22s) Loss: 0.7066(0.9050) Grad: 92688.1016  LR: 0.00001248  
Epoch: [1][180/646] Elapsed 1m 55s (remain 4m 55s) Loss: 0.5710(0.8698) Grad: 86883.4141  LR: 0.00001403  
Epoch: [1][200/646] Elapsed 2m 3s (remain 4m 32s) Loss: 0.4727(0.8435) Grad: 117472.2969  LR: 0.00001558  
EVAL: [0/125] Loss: 0.8265(0.8265) 
EVAL: [20/125] Loss: 0.4773(0.5874) 
EVAL: [40/125] Loss: 0.6038(0.5972) 
EVAL: [60/125] Loss: 0.5498(0.5863) 
EVAL: [80/125] Loss: 0.5696(0.5863) 
EVAL: [100/125] Loss: 0.6868(0.5803) 
EVAL: [120/125] Loss: 0.6530(0.5857) 
EVAL: [124/125] Loss: 0.6481(0.5854) 


Epoch 1 - Save Best Score: 0.5952 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5952 Model
Epoch 1 - avg_train_loss: 0.8435  avg_val_loss: 0.5854
INFO:__main__:Epoch 1 - avg_train_loss: 0.8435  avg_val_loss: 0.5854
Epoch 1 - Score: 0.5952  Scores: [0.562390416525826, 0.6280171233332159]
INFO:__main__:Epoch 1 - Score: 0.5952  Scores: [0.562390416525826, 0.6280171233332159]


Epoch: [1][220/646] Elapsed 3m 2s (remain 5m 50s) Loss: 0.4782(0.8173) Grad: 167201.4062  LR: 0.00001713  
Epoch: [1][240/646] Elapsed 3m 10s (remain 5m 19s) Loss: 0.7044(0.7977) Grad: 147053.7656  LR: 0.00001868  
Epoch: [1][260/646] Elapsed 3m 16s (remain 4m 50s) Loss: 0.8058(0.7889) Grad: 68846.8828  LR: 0.00002000  
Epoch: [1][280/646] Elapsed 3m 24s (remain 4m 25s) Loss: 0.7470(0.7744) Grad: 122069.7344  LR: 0.00002000  
Epoch: [1][300/646] Elapsed 3m 31s (remain 4m 2s) Loss: 0.5418(0.7665) Grad: 116427.5781  LR: 0.00001998  
EVAL: [0/125] Loss: 0.6779(0.6779) 
EVAL: [20/125] Loss: 0.5359(0.5464) 
EVAL: [40/125] Loss: 0.6516(0.5727) 
EVAL: [60/125] Loss: 0.6162(0.5586) 
EVAL: [80/125] Loss: 0.4488(0.5495) 
EVAL: [100/125] Loss: 0.6452(0.5428) 
EVAL: [120/125] Loss: 0.5459(0.5458) 
EVAL: [124/125] Loss: 0.5758(0.5445) 


Epoch 1 - Save Best Score: 0.5536 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5536 Model
Epoch 1 - avg_train_loss: 0.7665  avg_val_loss: 0.5445
INFO:__main__:Epoch 1 - avg_train_loss: 0.7665  avg_val_loss: 0.5445
Epoch 1 - Score: 0.5536  Scores: [0.46223975908121956, 0.6449377540902933]
INFO:__main__:Epoch 1 - Score: 0.5536  Scores: [0.46223975908121956, 0.6449377540902933]


Epoch: [1][320/646] Elapsed 4m 31s (remain 4m 34s) Loss: 0.6823(0.7573) Grad: 127526.7109  LR: 0.00001996  
Epoch: [1][340/646] Elapsed 4m 38s (remain 4m 9s) Loss: 0.7524(0.7501) Grad: 154723.3750  LR: 0.00001994  
Epoch: [1][360/646] Elapsed 4m 45s (remain 3m 45s) Loss: 0.5322(0.7417) Grad: 123067.2188  LR: 0.00001990  
Epoch: [1][380/646] Elapsed 4m 54s (remain 3m 24s) Loss: 0.7841(0.7363) Grad: 118042.4453  LR: 0.00001986  
Epoch: [1][400/646] Elapsed 5m 0s (remain 3m 3s) Loss: 0.7483(0.7302) Grad: 63484.7852  LR: 0.00001981  
EVAL: [0/125] Loss: 0.7469(0.7469) 
EVAL: [20/125] Loss: 0.5236(0.5819) 
EVAL: [40/125] Loss: 0.5924(0.5805) 
EVAL: [60/125] Loss: 0.5241(0.5697) 
EVAL: [80/125] Loss: 0.5197(0.5662) 
EVAL: [100/125] Loss: 0.6356(0.5609) 
EVAL: [120/125] Loss: 0.6215(0.5658) 
EVAL: [124/125] Loss: 0.7105(0.5660) 


Epoch 1 - avg_train_loss: 0.7302  avg_val_loss: 0.5660
INFO:__main__:Epoch 1 - avg_train_loss: 0.7302  avg_val_loss: 0.5660
Epoch 1 - Score: 0.5749  Scores: [0.5693053760065856, 0.5803947228989473]
INFO:__main__:Epoch 1 - Score: 0.5749  Scores: [0.5693053760065856, 0.5803947228989473]


Epoch: [1][420/646] Elapsed 5m 48s (remain 3m 6s) Loss: 0.7899(0.7247) Grad: 155580.3750  LR: 0.00001976  
Epoch: [1][440/646] Elapsed 5m 54s (remain 2m 44s) Loss: 0.4969(0.7153) Grad: 134591.8906  LR: 0.00001970  
Epoch: [1][460/646] Elapsed 6m 0s (remain 2m 24s) Loss: 0.6731(0.7075) Grad: 146384.7500  LR: 0.00001963  
Epoch: [1][480/646] Elapsed 6m 7s (remain 2m 5s) Loss: 0.6175(0.7003) Grad: 69417.5625  LR: 0.00001955  
Epoch: [1][500/646] Elapsed 6m 13s (remain 1m 48s) Loss: 0.6322(0.6946) Grad: 132441.4375  LR: 0.00001947  
EVAL: [0/125] Loss: 0.6990(0.6990) 
EVAL: [20/125] Loss: 0.5439(0.5241) 
EVAL: [40/125] Loss: 0.5786(0.5640) 
EVAL: [60/125] Loss: 0.6271(0.5529) 
EVAL: [80/125] Loss: 0.4436(0.5461) 
EVAL: [100/125] Loss: 0.6434(0.5342) 
EVAL: [120/125] Loss: 0.4621(0.5378) 


Epoch 1 - Save Best Score: 0.5490 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5490 Model


EVAL: [124/125] Loss: 0.5302(0.5358) 


Epoch 1 - avg_train_loss: 0.6946  avg_val_loss: 0.5358
INFO:__main__:Epoch 1 - avg_train_loss: 0.6946  avg_val_loss: 0.5358
Epoch 1 - Score: 0.5490  Scores: [0.5620987603269604, 0.5359295653220469]
INFO:__main__:Epoch 1 - Score: 0.5490  Scores: [0.5620987603269604, 0.5359295653220469]


Epoch: [1][520/646] Elapsed 7m 21s (remain 1m 45s) Loss: 0.6609(0.6899) Grad: 105990.1719  LR: 0.00001938  
Epoch: [1][540/646] Elapsed 7m 27s (remain 1m 26s) Loss: 0.6647(0.6868) Grad: 119590.8281  LR: 0.00001928  
Epoch: [1][560/646] Elapsed 7m 36s (remain 1m 9s) Loss: 0.3740(0.6819) Grad: 99826.7266  LR: 0.00001917  
Epoch: [1][580/646] Elapsed 7m 43s (remain 0m 51s) Loss: 0.5321(0.6759) Grad: 133609.3750  LR: 0.00001906  
Epoch: [1][600/646] Elapsed 7m 51s (remain 0m 35s) Loss: 0.4753(0.6696) Grad: 70129.9453  LR: 0.00001895  
EVAL: [0/125] Loss: 0.7575(0.7575) 
EVAL: [20/125] Loss: 0.5195(0.5227) 
EVAL: [40/125] Loss: 0.6176(0.5539) 
EVAL: [60/125] Loss: 0.5256(0.5362) 
EVAL: [80/125] Loss: 0.4003(0.5249) 
EVAL: [100/125] Loss: 0.6493(0.5162) 
EVAL: [120/125] Loss: 0.4863(0.5201) 
EVAL: [124/125] Loss: 0.5173(0.5185) 


Epoch 1 - Save Best Score: 0.5310 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5310 Model
Epoch 1 - avg_train_loss: 0.6696  avg_val_loss: 0.5185
INFO:__main__:Epoch 1 - avg_train_loss: 0.6696  avg_val_loss: 0.5185
Epoch 1 - Score: 0.5310  Scores: [0.4703469926556282, 0.5917286035440845]
INFO:__main__:Epoch 1 - Score: 0.5310  Scores: [0.4703469926556282, 0.5917286035440845]


Epoch: [1][620/646] Elapsed 8m 47s (remain 0m 21s) Loss: 0.4643(0.6645) Grad: 99005.4688  LR: 0.00001882  
Epoch: [1][640/646] Elapsed 8m 55s (remain 0m 4s) Loss: 0.6323(0.6610) Grad: 109600.8125  LR: 0.00001869  
Epoch: [1][645/646] Elapsed 8m 57s (remain 0m 0s) Loss: 0.6905(0.6601) Grad: 160868.4688  LR: 0.00001866  
EVAL: [0/125] Loss: 0.8776(0.8776) 
EVAL: [20/125] Loss: 0.5497(0.6693) 
EVAL: [40/125] Loss: 0.5979(0.6636) 
EVAL: [60/125] Loss: 0.5561(0.6510) 
EVAL: [80/125] Loss: 0.5543(0.6506) 
EVAL: [100/125] Loss: 0.7954(0.6434) 
EVAL: [120/125] Loss: 0.7329(0.6485) 
EVAL: [124/125] Loss: 0.7441(0.6492) 


Epoch 1 - avg_train_loss: 0.6601  avg_val_loss: 0.6492
INFO:__main__:Epoch 1 - avg_train_loss: 0.6601  avg_val_loss: 0.6492
Epoch 1 - Score: 0.6596  Scores: [0.6612137823692663, 0.6580241761089949]
INFO:__main__:Epoch 1 - Score: 0.6596  Scores: [0.6612137823692663, 0.6580241761089949]


Epoch: [2][0/646] Elapsed 0m 0s (remain 8m 37s) Loss: 0.7241(0.7241) Grad: inf  LR: 0.00001865  
Epoch: [2][20/646] Elapsed 0m 6s (remain 3m 22s) Loss: 0.3853(0.5683) Grad: 86933.4688  LR: 0.00001851  
Epoch: [2][40/646] Elapsed 0m 13s (remain 3m 12s) Loss: 0.6187(0.5295) Grad: 59128.3984  LR: 0.00001837  
Epoch: [2][60/646] Elapsed 0m 20s (remain 3m 12s) Loss: 0.3687(0.5364) Grad: 114346.3281  LR: 0.00001822  
Epoch: [2][80/646] Elapsed 0m 26s (remain 3m 3s) Loss: 0.4407(0.5309) Grad: 126783.5859  LR: 0.00001806  
Epoch: [2][100/646] Elapsed 0m 31s (remain 2m 51s) Loss: 0.3902(0.5349) Grad: 53069.6016  LR: 0.00001790  
EVAL: [0/125] Loss: 0.7323(0.7323) 
EVAL: [20/125] Loss: 0.5503(0.5308) 
EVAL: [40/125] Loss: 0.6235(0.5612) 
EVAL: [60/125] Loss: 0.6041(0.5522) 
EVAL: [80/125] Loss: 0.4094(0.5408) 
EVAL: [100/125] Loss: 0.6666(0.5330) 
EVAL: [120/125] Loss: 0.4887(0.5360) 
EVAL: [124/125] Loss: 0.4437(0.5332) 


Epoch 2 - avg_train_loss: 0.5349  avg_val_loss: 0.5332
INFO:__main__:Epoch 2 - avg_train_loss: 0.5349  avg_val_loss: 0.5332
Epoch 2 - Score: 0.5446  Scores: [0.5214734449109638, 0.5677881216602438]
INFO:__main__:Epoch 2 - Score: 0.5446  Scores: [0.5214734449109638, 0.5677881216602438]


Epoch: [2][120/646] Elapsed 1m 19s (remain 5m 45s) Loss: 0.4542(0.5361) Grad: 104150.0469  LR: 0.00001773  
Epoch: [2][140/646] Elapsed 1m 25s (remain 5m 6s) Loss: 0.3781(0.5379) Grad: 74740.6016  LR: 0.00001755  
Epoch: [2][160/646] Elapsed 1m 31s (remain 4m 35s) Loss: 0.8182(0.5341) Grad: 52409.4336  LR: 0.00001737  
Epoch: [2][180/646] Elapsed 1m 37s (remain 4m 9s) Loss: 0.5146(0.5300) Grad: 135663.2031  LR: 0.00001719  
Epoch: [2][200/646] Elapsed 1m 44s (remain 3m 50s) Loss: 0.4297(0.5279) Grad: 105933.8750  LR: 0.00001700  
EVAL: [0/125] Loss: 0.7483(0.7483) 
EVAL: [20/125] Loss: 0.4682(0.5187) 
EVAL: [40/125] Loss: 0.5604(0.5369) 
EVAL: [60/125] Loss: 0.4836(0.5275) 
EVAL: [80/125] Loss: 0.4003(0.5219) 
EVAL: [100/125] Loss: 0.5937(0.5134) 
EVAL: [120/125] Loss: 0.5688(0.5196) 


Epoch 2 - Save Best Score: 0.5298 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5298 Model


EVAL: [124/125] Loss: 0.5333(0.5192) 


Epoch 2 - avg_train_loss: 0.5279  avg_val_loss: 0.5192
INFO:__main__:Epoch 2 - avg_train_loss: 0.5279  avg_val_loss: 0.5192
Epoch 2 - Score: 0.5298  Scores: [0.5033452690643875, 0.5563323181468978]
INFO:__main__:Epoch 2 - Score: 0.5298  Scores: [0.5033452690643875, 0.5563323181468978]


Epoch: [2][220/646] Elapsed 2m 39s (remain 5m 6s) Loss: 0.3767(0.5173) Grad: 85903.6953  LR: 0.00001680  
Epoch: [2][240/646] Elapsed 2m 46s (remain 4m 40s) Loss: 0.3529(0.5143) Grad: 59201.2148  LR: 0.00001660  
Epoch: [2][260/646] Elapsed 2m 53s (remain 4m 15s) Loss: 0.9173(0.5162) Grad: 78822.2422  LR: 0.00001640  
Epoch: [2][280/646] Elapsed 2m 59s (remain 3m 53s) Loss: 0.3681(0.5151) Grad: 70902.6719  LR: 0.00001619  
Epoch: [2][300/646] Elapsed 3m 7s (remain 3m 34s) Loss: 0.5547(0.5131) Grad: 123154.5312  LR: 0.00001597  
EVAL: [0/125] Loss: 0.7346(0.7346) 
EVAL: [20/125] Loss: 0.5907(0.6327) 
EVAL: [40/125] Loss: 0.7156(0.6380) 
EVAL: [60/125] Loss: 0.5801(0.6250) 
EVAL: [80/125] Loss: 0.5684(0.6215) 
EVAL: [100/125] Loss: 0.7226(0.6139) 
EVAL: [120/125] Loss: 0.6625(0.6190) 
EVAL: [124/125] Loss: 0.7339(0.6198) 


Epoch 2 - avg_train_loss: 0.5131  avg_val_loss: 0.6198
INFO:__main__:Epoch 2 - avg_train_loss: 0.5131  avg_val_loss: 0.6198
Epoch 2 - Score: 0.6272  Scores: [0.6875094939723433, 0.5668246169428394]
INFO:__main__:Epoch 2 - Score: 0.6272  Scores: [0.6875094939723433, 0.5668246169428394]


Epoch: [2][320/646] Elapsed 3m 56s (remain 3m 59s) Loss: 0.3889(0.5136) Grad: 91284.3125  LR: 0.00001575  
Epoch: [2][340/646] Elapsed 4m 1s (remain 3m 36s) Loss: 0.4002(0.5127) Grad: 121972.3281  LR: 0.00001553  
Epoch: [2][360/646] Elapsed 4m 7s (remain 3m 15s) Loss: 0.4853(0.5128) Grad: 64779.9531  LR: 0.00001530  
Epoch: [2][380/646] Elapsed 4m 13s (remain 2m 56s) Loss: 0.3958(0.5108) Grad: 46249.1484  LR: 0.00001507  
Epoch: [2][400/646] Elapsed 4m 19s (remain 2m 38s) Loss: 0.3110(0.5069) Grad: 23226.6543  LR: 0.00001484  
EVAL: [0/125] Loss: 0.6849(0.6849) 
EVAL: [20/125] Loss: 0.4531(0.5039) 
EVAL: [40/125] Loss: 0.5082(0.5141) 
EVAL: [60/125] Loss: 0.4971(0.5048) 
EVAL: [80/125] Loss: 0.4338(0.5007) 
EVAL: [100/125] Loss: 0.5660(0.4918) 
EVAL: [120/125] Loss: 0.5499(0.4962) 
EVAL: [124/125] Loss: 0.5091(0.4957) 


Epoch 2 - Save Best Score: 0.5057 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.5057 Model
Epoch 2 - avg_train_loss: 0.5069  avg_val_loss: 0.4957
INFO:__main__:Epoch 2 - avg_train_loss: 0.5069  avg_val_loss: 0.4957
Epoch 2 - Score: 0.5057  Scores: [0.45741102513651516, 0.5540174157606219]
INFO:__main__:Epoch 2 - Score: 0.5057  Scores: [0.45741102513651516, 0.5540174157606219]


Epoch: [2][420/646] Elapsed 5m 22s (remain 2m 52s) Loss: 0.6624(0.5063) Grad: 129144.0938  LR: 0.00001460  
Epoch: [2][440/646] Elapsed 5m 30s (remain 2m 33s) Loss: 0.4666(0.5035) Grad: 97238.8047  LR: 0.00001436  
Epoch: [2][460/646] Elapsed 5m 37s (remain 2m 15s) Loss: 0.5702(0.5023) Grad: 116856.0781  LR: 0.00001412  
Epoch: [2][480/646] Elapsed 5m 44s (remain 1m 58s) Loss: 0.3815(0.4997) Grad: 118746.2891  LR: 0.00001387  
Epoch: [2][500/646] Elapsed 5m 52s (remain 1m 42s) Loss: 0.5595(0.4984) Grad: 65976.1719  LR: 0.00001362  
EVAL: [0/125] Loss: 0.5894(0.5894) 
EVAL: [20/125] Loss: 0.4305(0.4607) 
EVAL: [40/125] Loss: 0.5056(0.4757) 
EVAL: [60/125] Loss: 0.4873(0.4672) 
EVAL: [80/125] Loss: 0.3799(0.4637) 
EVAL: [100/125] Loss: 0.5543(0.4545) 
EVAL: [120/125] Loss: 0.5154(0.4591) 
EVAL: [124/125] Loss: 0.4841(0.4586) 


Epoch 2 - Save Best Score: 0.4675 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4675 Model
Epoch 2 - avg_train_loss: 0.4984  avg_val_loss: 0.4586
INFO:__main__:Epoch 2 - avg_train_loss: 0.4984  avg_val_loss: 0.4586
Epoch 2 - Score: 0.4675  Scores: [0.426549822113809, 0.5084444108404428]
INFO:__main__:Epoch 2 - Score: 0.4675  Scores: [0.426549822113809, 0.5084444108404428]


Epoch: [2][520/646] Elapsed 6m 53s (remain 1m 39s) Loss: 0.4708(0.4957) Grad: 156479.8594  LR: 0.00001336  
Epoch: [2][540/646] Elapsed 6m 59s (remain 1m 21s) Loss: 0.5076(0.4940) Grad: 146933.8281  LR: 0.00001311  
Epoch: [2][560/646] Elapsed 7m 5s (remain 1m 4s) Loss: 0.6254(0.4929) Grad: 46677.4062  LR: 0.00001285  
Epoch: [2][580/646] Elapsed 7m 12s (remain 0m 48s) Loss: 0.5413(0.4917) Grad: 40372.4219  LR: 0.00001259  
Epoch: [2][600/646] Elapsed 7m 18s (remain 0m 32s) Loss: 0.3642(0.4900) Grad: 77529.9453  LR: 0.00001233  
EVAL: [0/125] Loss: 0.7154(0.7154) 
EVAL: [20/125] Loss: 0.4963(0.5683) 
EVAL: [40/125] Loss: 0.5959(0.5616) 
EVAL: [60/125] Loss: 0.4872(0.5547) 
EVAL: [80/125] Loss: 0.5213(0.5547) 
EVAL: [100/125] Loss: 0.6492(0.5476) 
EVAL: [120/125] Loss: 0.6545(0.5525) 


Epoch 2 - avg_train_loss: 0.4900  avg_val_loss: 0.5529
INFO:__main__:Epoch 2 - avg_train_loss: 0.4900  avg_val_loss: 0.5529
Epoch 2 - Score: 0.5615  Scores: [0.5589174909802521, 0.5640841479611421]


EVAL: [124/125] Loss: 0.6058(0.5529) 


INFO:__main__:Epoch 2 - Score: 0.5615  Scores: [0.5589174909802521, 0.5640841479611421]


Epoch: [2][620/646] Elapsed 8m 7s (remain 0m 19s) Loss: 0.4450(0.4911) Grad: 28949.2695  LR: 0.00001207  
Epoch: [2][640/646] Elapsed 8m 15s (remain 0m 3s) Loss: 0.4481(0.4917) Grad: 42425.1406  LR: 0.00001180  
Epoch: [2][645/646] Elapsed 8m 17s (remain 0m 0s) Loss: 0.4845(0.4919) Grad: 94778.3203  LR: 0.00001173  
EVAL: [0/125] Loss: 0.7236(0.7236) 
EVAL: [20/125] Loss: 0.5180(0.4968) 
EVAL: [40/125] Loss: 0.6035(0.5357) 
EVAL: [60/125] Loss: 0.5451(0.5240) 
EVAL: [80/125] Loss: 0.4295(0.5154) 
EVAL: [100/125] Loss: 0.5965(0.5064) 
EVAL: [120/125] Loss: 0.4405(0.5110) 
EVAL: [124/125] Loss: 0.5280(0.5094) 


Epoch 2 - avg_train_loss: 0.4919  avg_val_loss: 0.5094
INFO:__main__:Epoch 2 - avg_train_loss: 0.4919  avg_val_loss: 0.5094
Epoch 2 - Score: 0.5214  Scores: [0.46088880730330717, 0.5818421302624864]
INFO:__main__:Epoch 2 - Score: 0.5214  Scores: [0.46088880730330717, 0.5818421302624864]


Epoch: [3][0/646] Elapsed 0m 0s (remain 7m 1s) Loss: 0.4947(0.4947) Grad: inf  LR: 0.00001172  
Epoch: [3][20/646] Elapsed 0m 7s (remain 3m 29s) Loss: 0.3749(0.3685) Grad: 30338.1016  LR: 0.00001145  
Epoch: [3][40/646] Elapsed 0m 12s (remain 3m 8s) Loss: 0.4937(0.4061) Grad: 137318.9844  LR: 0.00001119  
Epoch: [3][60/646] Elapsed 0m 19s (remain 3m 9s) Loss: 0.4666(0.3999) Grad: 135948.8125  LR: 0.00001092  
Epoch: [3][80/646] Elapsed 0m 26s (remain 3m 2s) Loss: 0.3582(0.3977) Grad: 125771.1875  LR: 0.00001065  
Epoch: [3][100/646] Elapsed 0m 31s (remain 2m 47s) Loss: 0.3558(0.4004) Grad: 141728.1562  LR: 0.00001038  
EVAL: [0/125] Loss: 0.6263(0.6263) 
EVAL: [20/125] Loss: 0.4983(0.4834) 
EVAL: [40/125] Loss: 0.5666(0.5159) 
EVAL: [60/125] Loss: 0.5893(0.5077) 
EVAL: [80/125] Loss: 0.3990(0.4988) 
EVAL: [100/125] Loss: 0.5699(0.4915) 
EVAL: [120/125] Loss: 0.4426(0.4936) 
EVAL: [124/125] Loss: 0.4734(0.4919) 


Epoch 3 - avg_train_loss: 0.4004  avg_val_loss: 0.4919
INFO:__main__:Epoch 3 - avg_train_loss: 0.4004  avg_val_loss: 0.4919
Epoch 3 - Score: 0.5009  Scores: [0.43539668252816316, 0.5663128264816114]
INFO:__main__:Epoch 3 - Score: 0.5009  Scores: [0.43539668252816316, 0.5663128264816114]


Epoch: [3][120/646] Elapsed 1m 19s (remain 5m 44s) Loss: 0.4414(0.3937) Grad: 74116.2734  LR: 0.00001011  
Epoch: [3][140/646] Elapsed 1m 25s (remain 5m 7s) Loss: 0.3374(0.3906) Grad: 129229.2969  LR: 0.00000984  
Epoch: [3][160/646] Elapsed 1m 31s (remain 4m 36s) Loss: 0.4005(0.3912) Grad: 115254.7656  LR: 0.00000957  
Epoch: [3][180/646] Elapsed 1m 37s (remain 4m 10s) Loss: 0.2294(0.3935) Grad: 118622.3906  LR: 0.00000930  
Epoch: [3][200/646] Elapsed 1m 44s (remain 3m 50s) Loss: 0.4342(0.4002) Grad: 69321.2188  LR: 0.00000903  
EVAL: [0/125] Loss: 0.6660(0.6660) 
EVAL: [20/125] Loss: 0.4471(0.4665) 
EVAL: [40/125] Loss: 0.5528(0.4925) 
EVAL: [60/125] Loss: 0.5057(0.4830) 
EVAL: [80/125] Loss: 0.3878(0.4775) 
EVAL: [100/125] Loss: 0.5799(0.4688) 
EVAL: [120/125] Loss: 0.4771(0.4725) 


Epoch 3 - avg_train_loss: 0.4002  avg_val_loss: 0.4715


EVAL: [124/125] Loss: 0.4781(0.4715) 


INFO:__main__:Epoch 3 - avg_train_loss: 0.4002  avg_val_loss: 0.4715
Epoch 3 - Score: 0.4810  Scores: [0.4246054873250497, 0.5373481781751728]
INFO:__main__:Epoch 3 - Score: 0.4810  Scores: [0.4246054873250497, 0.5373481781751728]


Epoch: [3][220/646] Elapsed 2m 33s (remain 4m 54s) Loss: 0.4359(0.3986) Grad: 77182.0156  LR: 0.00000876  
Epoch: [3][240/646] Elapsed 2m 38s (remain 4m 26s) Loss: 0.5404(0.3992) Grad: 106019.9844  LR: 0.00000849  
Epoch: [3][260/646] Elapsed 2m 45s (remain 4m 4s) Loss: 0.2250(0.4008) Grad: 91987.8672  LR: 0.00000823  
Epoch: [3][280/646] Elapsed 2m 51s (remain 3m 42s) Loss: 0.5654(0.4021) Grad: 83341.0000  LR: 0.00000796  
Epoch: [3][300/646] Elapsed 2m 56s (remain 3m 22s) Loss: 0.6686(0.4030) Grad: 63412.7188  LR: 0.00000770  
EVAL: [0/125] Loss: 0.7030(0.7030) 
EVAL: [20/125] Loss: 0.4338(0.4652) 
EVAL: [40/125] Loss: 0.5413(0.4930) 
EVAL: [60/125] Loss: 0.4891(0.4854) 
EVAL: [80/125] Loss: 0.3845(0.4804) 
EVAL: [100/125] Loss: 0.5537(0.4721) 
EVAL: [120/125] Loss: 0.4517(0.4769) 
EVAL: [124/125] Loss: 0.4976(0.4759) 


Epoch 3 - avg_train_loss: 0.4030  avg_val_loss: 0.4759
INFO:__main__:Epoch 3 - avg_train_loss: 0.4030  avg_val_loss: 0.4759
Epoch 3 - Score: 0.4865  Scores: [0.452783373135128, 0.5202164637006457]
INFO:__main__:Epoch 3 - Score: 0.4865  Scores: [0.452783373135128, 0.5202164637006457]


Epoch: [3][320/646] Elapsed 3m 44s (remain 3m 46s) Loss: 0.4013(0.4047) Grad: 112174.1016  LR: 0.00000744  
Epoch: [3][340/646] Elapsed 3m 49s (remain 3m 25s) Loss: 0.3214(0.4047) Grad: 29873.4434  LR: 0.00000718  
Epoch: [3][360/646] Elapsed 3m 56s (remain 3m 6s) Loss: 0.6462(0.4044) Grad: 73759.8672  LR: 0.00000692  
Epoch: [3][380/646] Elapsed 4m 2s (remain 2m 48s) Loss: 0.2625(0.4015) Grad: 38759.2930  LR: 0.00000666  
Epoch: [3][400/646] Elapsed 4m 9s (remain 2m 32s) Loss: 0.3282(0.4007) Grad: 115109.0078  LR: 0.00000641  
EVAL: [0/125] Loss: 0.6751(0.6751) 
EVAL: [20/125] Loss: 0.4363(0.4580) 
EVAL: [40/125] Loss: 0.5444(0.4838) 
EVAL: [60/125] Loss: 0.4745(0.4772) 
EVAL: [80/125] Loss: 0.3538(0.4704) 
EVAL: [100/125] Loss: 0.5489(0.4622) 
EVAL: [120/125] Loss: 0.4600(0.4666) 
EVAL: [124/125] Loss: 0.4634(0.4655) 


Epoch 3 - avg_train_loss: 0.4007  avg_val_loss: 0.4655
INFO:__main__:Epoch 3 - avg_train_loss: 0.4007  avg_val_loss: 0.4655
Epoch 3 - Score: 0.4755  Scores: [0.4282231358776429, 0.5227680035765199]
INFO:__main__:Epoch 3 - Score: 0.4755  Scores: [0.4282231358776429, 0.5227680035765199]


Epoch: [3][420/646] Elapsed 5m 0s (remain 2m 40s) Loss: 0.4190(0.4016) Grad: 78937.9844  LR: 0.00000616  
Epoch: [3][440/646] Elapsed 5m 9s (remain 2m 23s) Loss: 0.4949(0.4007) Grad: 87633.4453  LR: 0.00000591  
Epoch: [3][460/646] Elapsed 5m 19s (remain 2m 8s) Loss: 0.4239(0.3993) Grad: 101898.7344  LR: 0.00000566  
Epoch: [3][480/646] Elapsed 5m 28s (remain 1m 52s) Loss: 0.2067(0.3987) Grad: 114028.5469  LR: 0.00000542  
Epoch: [3][500/646] Elapsed 5m 38s (remain 1m 37s) Loss: 0.3189(0.3965) Grad: 86183.2578  LR: 0.00000518  
EVAL: [0/125] Loss: 0.6676(0.6676) 
EVAL: [20/125] Loss: 0.4327(0.4828) 
EVAL: [40/125] Loss: 0.5720(0.5035) 
EVAL: [60/125] Loss: 0.4783(0.4958) 
EVAL: [80/125] Loss: 0.4079(0.4909) 
EVAL: [100/125] Loss: 0.5710(0.4833) 
EVAL: [120/125] Loss: 0.4892(0.4890) 
EVAL: [124/125] Loss: 0.5239(0.4883) 


Epoch 3 - avg_train_loss: 0.3965  avg_val_loss: 0.4883
INFO:__main__:Epoch 3 - avg_train_loss: 0.3965  avg_val_loss: 0.4883
Epoch 3 - Score: 0.4971  Scores: [0.46652957670019707, 0.5276377618228231]
INFO:__main__:Epoch 3 - Score: 0.4971  Scores: [0.46652957670019707, 0.5276377618228231]


Epoch: [3][520/646] Elapsed 6m 28s (remain 1m 33s) Loss: 0.4136(0.3957) Grad: 85047.9141  LR: 0.00000495  
Epoch: [3][540/646] Elapsed 6m 33s (remain 1m 16s) Loss: 0.4598(0.3947) Grad: 31171.8984  LR: 0.00000472  
Epoch: [3][560/646] Elapsed 6m 39s (remain 1m 0s) Loss: 0.4278(0.3950) Grad: 21606.0000  LR: 0.00000449  
Epoch: [3][580/646] Elapsed 6m 46s (remain 0m 45s) Loss: 0.4292(0.3939) Grad: 73239.8516  LR: 0.00000427  
Epoch: [3][600/646] Elapsed 6m 52s (remain 0m 30s) Loss: 0.2884(0.3935) Grad: 40172.8594  LR: 0.00000405  
EVAL: [0/125] Loss: 0.6660(0.6660) 
EVAL: [20/125] Loss: 0.4610(0.4808) 
EVAL: [40/125] Loss: 0.5480(0.5042) 
EVAL: [60/125] Loss: 0.5149(0.4964) 
EVAL: [80/125] Loss: 0.3821(0.4897) 
EVAL: [100/125] Loss: 0.5730(0.4819) 
EVAL: [120/125] Loss: 0.4934(0.4863) 


Epoch 3 - avg_train_loss: 0.3935  avg_val_loss: 0.4854
INFO:__main__:Epoch 3 - avg_train_loss: 0.3935  avg_val_loss: 0.4854


EVAL: [124/125] Loss: 0.5072(0.4854) 


Epoch 3 - Score: 0.4948  Scores: [0.44078671357233046, 0.5488164693121674]
INFO:__main__:Epoch 3 - Score: 0.4948  Scores: [0.44078671357233046, 0.5488164693121674]


Epoch: [3][620/646] Elapsed 7m 40s (remain 0m 18s) Loss: 0.3283(0.3922) Grad: 47633.8516  LR: 0.00000383  
Epoch: [3][640/646] Elapsed 7m 46s (remain 0m 3s) Loss: 0.3379(0.3930) Grad: 122117.5703  LR: 0.00000362  
Epoch: [3][645/646] Elapsed 7m 48s (remain 0m 0s) Loss: 0.5273(0.3926) Grad: 69935.9922  LR: 0.00000357  
EVAL: [0/125] Loss: 0.6585(0.6585) 
EVAL: [20/125] Loss: 0.4352(0.4678) 
EVAL: [40/125] Loss: 0.5256(0.4867) 
EVAL: [60/125] Loss: 0.4825(0.4792) 
EVAL: [80/125] Loss: 0.3747(0.4748) 
EVAL: [100/125] Loss: 0.5571(0.4667) 
EVAL: [120/125] Loss: 0.4862(0.4719) 
EVAL: [124/125] Loss: 0.5081(0.4713) 


Epoch 3 - avg_train_loss: 0.3926  avg_val_loss: 0.4713
INFO:__main__:Epoch 3 - avg_train_loss: 0.3926  avg_val_loss: 0.4713
Epoch 3 - Score: 0.4806  Scores: [0.43984822137446, 0.5213880048852476]
INFO:__main__:Epoch 3 - Score: 0.4806  Scores: [0.43984822137446, 0.5213880048852476]


Epoch: [4][0/646] Elapsed 0m 0s (remain 9m 25s) Loss: 0.3291(0.3291) Grad: inf  LR: 0.00000356  
Epoch: [4][20/646] Elapsed 0m 7s (remain 3m 42s) Loss: 0.3143(0.3063) Grad: 152700.4375  LR: 0.00000336  
Epoch: [4][40/646] Elapsed 0m 13s (remain 3m 22s) Loss: 0.2752(0.3159) Grad: 65387.6172  LR: 0.00000316  
Epoch: [4][60/646] Elapsed 0m 19s (remain 3m 10s) Loss: 0.2845(0.3055) Grad: 79938.3438  LR: 0.00000296  
Epoch: [4][80/646] Elapsed 0m 25s (remain 2m 58s) Loss: 0.2227(0.3058) Grad: 73369.0469  LR: 0.00000277  
Epoch: [4][100/646] Elapsed 0m 30s (remain 2m 44s) Loss: 0.2459(0.3049) Grad: 65920.9219  LR: 0.00000259  
EVAL: [0/125] Loss: 0.6894(0.6894) 
EVAL: [20/125] Loss: 0.4571(0.4763) 
EVAL: [40/125] Loss: 0.5562(0.5043) 
EVAL: [60/125] Loss: 0.4992(0.4960) 
EVAL: [80/125] Loss: 0.3903(0.4901) 
EVAL: [100/125] Loss: 0.5514(0.4820) 
EVAL: [120/125] Loss: 0.4612(0.4871) 
EVAL: [124/125] Loss: 0.5239(0.4864) 


Epoch 4 - avg_train_loss: 0.3049  avg_val_loss: 0.4864
INFO:__main__:Epoch 4 - avg_train_loss: 0.3049  avg_val_loss: 0.4864
Epoch 4 - Score: 0.4965  Scores: [0.4451800804219707, 0.5478373449527479]
INFO:__main__:Epoch 4 - Score: 0.4965  Scores: [0.4451800804219707, 0.5478373449527479]


Epoch: [4][120/646] Elapsed 1m 18s (remain 5m 38s) Loss: 0.3790(0.3060) Grad: 88952.3125  LR: 0.00000241  
Epoch: [4][140/646] Elapsed 1m 23s (remain 5m 0s) Loss: 0.1386(0.3074) Grad: 43486.6562  LR: 0.00000224  
Epoch: [4][160/646] Elapsed 1m 29s (remain 4m 29s) Loss: 0.2009(0.3090) Grad: 79449.6953  LR: 0.00000207  
Epoch: [4][180/646] Elapsed 1m 34s (remain 4m 3s) Loss: 0.3315(0.3077) Grad: 45208.4336  LR: 0.00000191  
Epoch: [4][200/646] Elapsed 1m 41s (remain 3m 44s) Loss: 0.2698(0.3078) Grad: 120839.6641  LR: 0.00000175  
EVAL: [0/125] Loss: 0.7058(0.7058) 
EVAL: [20/125] Loss: 0.4729(0.4962) 
EVAL: [40/125] Loss: 0.5901(0.5245) 
EVAL: [60/125] Loss: 0.5084(0.5163) 
EVAL: [80/125] Loss: 0.4120(0.5105) 
EVAL: [100/125] Loss: 0.5783(0.5028) 
EVAL: [120/125] Loss: 0.4841(0.5082) 
EVAL: [124/125] Loss: 0.5600(0.5076) 


Epoch 4 - avg_train_loss: 0.3078  avg_val_loss: 0.5076
INFO:__main__:Epoch 4 - avg_train_loss: 0.3078  avg_val_loss: 0.5076
Epoch 4 - Score: 0.5178  Scores: [0.4644851178495131, 0.571154964241513]
INFO:__main__:Epoch 4 - Score: 0.5178  Scores: [0.4644851178495131, 0.571154964241513]


Epoch: [4][220/646] Elapsed 2m 30s (remain 4m 50s) Loss: 0.4881(0.3098) Grad: 53290.9648  LR: 0.00000160  
Epoch: [4][240/646] Elapsed 2m 37s (remain 4m 25s) Loss: 0.3060(0.3075) Grad: 109042.2266  LR: 0.00000146  
Epoch: [4][260/646] Elapsed 2m 42s (remain 4m 0s) Loss: 0.3233(0.3083) Grad: 118291.0078  LR: 0.00000132  
Epoch: [4][280/646] Elapsed 2m 49s (remain 3m 40s) Loss: 0.2837(0.3093) Grad: 112120.9688  LR: 0.00000119  
Epoch: [4][300/646] Elapsed 2m 55s (remain 3m 20s) Loss: 0.1581(0.3077) Grad: 21122.1992  LR: 0.00000107  
EVAL: [0/125] Loss: 0.6826(0.6826) 
EVAL: [20/125] Loss: 0.4530(0.4731) 
EVAL: [40/125] Loss: 0.5553(0.5008) 
EVAL: [60/125] Loss: 0.4959(0.4934) 
EVAL: [80/125] Loss: 0.3776(0.4867) 
EVAL: [100/125] Loss: 0.5581(0.4786) 
EVAL: [120/125] Loss: 0.4689(0.4837) 
EVAL: [124/125] Loss: 0.5191(0.4829) 


Epoch 4 - avg_train_loss: 0.3077  avg_val_loss: 0.4829
INFO:__main__:Epoch 4 - avg_train_loss: 0.3077  avg_val_loss: 0.4829
Epoch 4 - Score: 0.4930  Scores: [0.4430917852309073, 0.5428158075181004]
INFO:__main__:Epoch 4 - Score: 0.4930  Scores: [0.4430917852309073, 0.5428158075181004]


Epoch: [4][320/646] Elapsed 3m 43s (remain 3m 46s) Loss: 0.2634(0.3074) Grad: 51528.4375  LR: 0.00000095  
Epoch: [4][340/646] Elapsed 3m 49s (remain 3m 24s) Loss: 0.3438(0.3086) Grad: 44326.7227  LR: 0.00000084  
Epoch: [4][360/646] Elapsed 3m 55s (remain 3m 5s) Loss: 0.3111(0.3081) Grad: 55761.1250  LR: 0.00000073  
Epoch: [4][380/646] Elapsed 4m 1s (remain 2m 48s) Loss: 0.3355(0.3081) Grad: 27384.6191  LR: 0.00000063  
Epoch: [4][400/646] Elapsed 4m 7s (remain 2m 31s) Loss: 0.3559(0.3090) Grad: 53353.2656  LR: 0.00000054  
EVAL: [0/125] Loss: 0.6878(0.6878) 
EVAL: [20/125] Loss: 0.4515(0.4760) 
EVAL: [40/125] Loss: 0.5542(0.5016) 
EVAL: [60/125] Loss: 0.4887(0.4946) 
EVAL: [80/125] Loss: 0.3778(0.4881) 
EVAL: [100/125] Loss: 0.5641(0.4800) 
EVAL: [120/125] Loss: 0.4841(0.4851) 
EVAL: [124/125] Loss: 0.5245(0.4844) 


Epoch 4 - avg_train_loss: 0.3090  avg_val_loss: 0.4844
INFO:__main__:Epoch 4 - avg_train_loss: 0.3090  avg_val_loss: 0.4844
Epoch 4 - Score: 0.4944  Scores: [0.45153120050046164, 0.537277166606772]
INFO:__main__:Epoch 4 - Score: 0.4944  Scores: [0.45153120050046164, 0.537277166606772]


Epoch: [4][420/646] Elapsed 4m 57s (remain 2m 38s) Loss: 0.2924(0.3075) Grad: 28050.8262  LR: 0.00000046  
Epoch: [4][440/646] Elapsed 5m 3s (remain 2m 21s) Loss: 0.3668(0.3087) Grad: 44587.3477  LR: 0.00000038  
Epoch: [4][460/646] Elapsed 5m 9s (remain 2m 4s) Loss: 0.3211(0.3083) Grad: 93804.6172  LR: 0.00000031  
Epoch: [4][480/646] Elapsed 5m 15s (remain 1m 48s) Loss: 0.3702(0.3083) Grad: 115785.9922  LR: 0.00000025  
Epoch: [4][500/646] Elapsed 5m 21s (remain 1m 33s) Loss: 0.1917(0.3081) Grad: 63893.6484  LR: 0.00000019  
EVAL: [0/125] Loss: 0.6832(0.6832) 
EVAL: [20/125] Loss: 0.4545(0.4757) 
EVAL: [40/125] Loss: 0.5617(0.5042) 
EVAL: [60/125] Loss: 0.5035(0.4971) 
EVAL: [80/125] Loss: 0.3835(0.4903) 
EVAL: [100/125] Loss: 0.5601(0.4824) 
EVAL: [120/125] Loss: 0.4689(0.4874) 
EVAL: [124/125] Loss: 0.5215(0.4866) 


Epoch 4 - avg_train_loss: 0.3081  avg_val_loss: 0.4866
INFO:__main__:Epoch 4 - avg_train_loss: 0.3081  avg_val_loss: 0.4866
Epoch 4 - Score: 0.4966  Scores: [0.4466610506848508, 0.5464743034205515]
INFO:__main__:Epoch 4 - Score: 0.4966  Scores: [0.4466610506848508, 0.5464743034205515]


Epoch: [4][520/646] Elapsed 6m 10s (remain 1m 28s) Loss: 0.3342(0.3080) Grad: 101702.4219  LR: 0.00000014  
Epoch: [4][540/646] Elapsed 6m 15s (remain 1m 12s) Loss: 0.4697(0.3086) Grad: 56443.1289  LR: 0.00000010  
Epoch: [4][560/646] Elapsed 6m 22s (remain 0m 57s) Loss: 0.3211(0.3083) Grad: 70298.7656  LR: 0.00000007  
Epoch: [4][580/646] Elapsed 6m 28s (remain 0m 43s) Loss: 0.3411(0.3085) Grad: 117608.8438  LR: 0.00000004  
Epoch: [4][600/646] Elapsed 6m 33s (remain 0m 29s) Loss: 0.3547(0.3087) Grad: 91041.5312  LR: 0.00000002  
EVAL: [0/125] Loss: 0.6818(0.6818) 
EVAL: [20/125] Loss: 0.4492(0.4726) 
EVAL: [40/125] Loss: 0.5551(0.5001) 
EVAL: [60/125] Loss: 0.4951(0.4932) 
EVAL: [80/125] Loss: 0.3791(0.4867) 
EVAL: [100/125] Loss: 0.5561(0.4786) 
EVAL: [120/125] Loss: 0.4696(0.4837) 
EVAL: [124/125] Loss: 0.5173(0.4829) 


Epoch 4 - avg_train_loss: 0.3087  avg_val_loss: 0.4829
INFO:__main__:Epoch 4 - avg_train_loss: 0.3087  avg_val_loss: 0.4829
Epoch 4 - Score: 0.4929  Scores: [0.4465301614972554, 0.5392612660401601]
INFO:__main__:Epoch 4 - Score: 0.4929  Scores: [0.4465301614972554, 0.5392612660401601]


Epoch: [4][620/646] Elapsed 7m 23s (remain 0m 17s) Loss: 0.3337(0.3086) Grad: 79469.6719  LR: 0.00000001  
Epoch: [4][640/646] Elapsed 7m 29s (remain 0m 3s) Loss: 0.3102(0.3090) Grad: 25205.0254  LR: 0.00000000  
Epoch: [4][645/646] Elapsed 7m 30s (remain 0m 0s) Loss: 0.2419(0.3088) Grad: 100096.6328  LR: 0.00000000  
EVAL: [0/125] Loss: 0.6816(0.6816) 
EVAL: [20/125] Loss: 0.4495(0.4726) 
EVAL: [40/125] Loss: 0.5555(0.5002) 
EVAL: [60/125] Loss: 0.4958(0.4934) 
EVAL: [80/125] Loss: 0.3793(0.4868) 
EVAL: [100/125] Loss: 0.5561(0.4788) 
EVAL: [120/125] Loss: 0.4692(0.4839) 
EVAL: [124/125] Loss: 0.5174(0.4831) 


Epoch 4 - avg_train_loss: 0.3088  avg_val_loss: 0.4831
INFO:__main__:Epoch 4 - avg_train_loss: 0.3088  avg_val_loss: 0.4831
Epoch 4 - Score: 0.4931  Scores: [0.4463640539394489, 0.5397573649054535]
INFO:__main__:Epoch 4 - Score: 0.4931  Scores: [0.4463640539394489, 0.5397573649054535]
