In [1]:
# !pip install transformers==4.21.2
# !pip install tokenizers==0.12.1

# !pip install -q joblib scikit-learn scipy pandas

In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time

import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# CFG


In [3]:
DATA_P = '/root/autodl-tmp/fb3/inputs/'
OUTPUT_DIR = '/root/autodl-tmp/fb3/output/trained_tiny_model/lsg-roberta-large/'

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    # competition='FB3'
    # _wandb_kernel='nakama'
    
    path=f"{DATA_P}common-nlp-tokenizer/model_tokenizer/tiny/lsg-roberta-large/"
    model=path
    config_path=model+'config.pth'
    tokenizer = AutoTokenizer.from_pretrained(model)
    cfg_save_output = OUTPUT_DIR

    debug=False
    train=True
    trust_remote_code=True
    
    apex=True
    print_freq=20
    num_workers=4
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    max_len=1536
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
# # ====================================================
# # tokenizer
# # ====================================================
# tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# # tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# CFG.tokenizer = tokenizer

# Utils


In [6]:
# ====================================================
# Utils
# ====================================================

def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

##  Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/train.csv')
test = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/sample_submission.csv')


In [8]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [9]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
# lengths = []
# tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
# for text in tk0:
#     length = len(CFG.tokenizer(text, add_special_tokens=False)['input_ids'])
#     lengths.append(length)
# CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

max_len: 1536


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True, trust_remote_code=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config, trust_remote_code=True)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
    
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss
    
    
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [13]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        f"{CFG.cfg_save_output}_fold{fold}_best.pth")
                        # OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

            cfg_save_output = f"{OUTPUT_DIR}trained_tiny_model/lsg-electra-base/"
            
            
    # predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
    #                          map_location=torch.device('cpu'))['predictions']
    predictions = torch.load(f"{CFG.cfg_save_output}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [14]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    print('finised')

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGRobertaConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-roberta-large/",
  "adaptive": true,
  "architectures": [
    "LSGRobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_roberta.LSGRobertaModel",
    "AutoModelForCausalLM": "modeling_lsg_roberta.LSGRobertaForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_roberta.LSGRobertaForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_roberta.LSGRobertaForMultipleChoice",
    "AutoModelForQuestionAnswering": "modeling_lsg_roberta.LSGRobertaForQuestionAnswering",
    "AutoModelForSequenceClassification": "modeling_lsg_roberta.LSGRobertaForSequenceClassification",
    "AutoModelForTokenClassification": "modeling_lsg_roberta.LSGRo

Epoch: [1][0/366] Elapsed 0m 1s (remain 10m 30s) Loss: 2.6484(2.6484) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 22s (remain 6m 17s) Loss: 0.2559(1.0182) Grad: 72839.5312  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 42s (remain 5m 38s) Loss: 0.1860(0.6095) Grad: 137043.0156  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 2s (remain 5m 12s) Loss: 0.2269(0.4655) Grad: 102770.7031  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 22s (remain 4m 51s) Loss: 0.1589(0.3859) Grad: 58324.6406  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 41s (remain 4m 26s) Loss: 0.1973(0.3394) Grad: 112786.6719  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 2m 0s (remain 4m 4s) Loss: 0.1717(0.3072) Grad: 51925.4648  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 20s (remain 3m 44s) Loss: 0.1548(0.2837) Grad: 32417.6367  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 42s (remain 3m 27s) Loss: 0.1442(0.2664) Grad: 41136.9414  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 3m 3s (remain 3m 7s) L

Epoch 1 - avg_train_loss: 0.1887  avg_val_loss: 0.1071  time: 452s
Epoch 1 - Score: 0.4632  Scores: [0.5115188405499517, 0.4557977532004715, 0.4232776800288024, 0.462409202057009, 0.48106842611800643, 0.4453430347602914]
Epoch 1 - Save Best Score: 0.4632 Model


EVAL: [60/62] Elapsed 1m 19s (remain 0m 1s) Loss: 0.1008(0.1071) 
EVAL: [61/62] Elapsed 1m 20s (remain 0m 0s) Loss: 0.1022(0.1071) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 9m 16s) Loss: 0.1085(0.1085) Grad: 323820.5938  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 21s (remain 5m 56s) Loss: 0.1970(0.1726) Grad: 472716.5938  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 42s (remain 5m 37s) Loss: 0.0865(0.1433) Grad: 33924.3789  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 3s (remain 5m 15s) Loss: 0.1825(0.1299) Grad: 108181.2891  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 23s (remain 4m 55s) Loss: 0.1406(0.1217) Grad: 134916.7969  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 44s (remain 4m 34s) Loss: 0.1316(0.1183) Grad: 99629.2266  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 2m 6s (remain 4m 16s) Loss: 0.1149(0.1141) Grad: 69112.9531  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 27s (remain 3m 55s) Loss: 0.0729(0.1106) Grad: 61378.2227  LR: 0.00001466  
Epoch: [2][160/36

Epoch 2 - avg_train_loss: 0.1020  avg_val_loss: 0.1143  time: 453s
Epoch 2 - Score: 0.4789  Scores: [0.5152511361385136, 0.4746899850337709, 0.41382723528255944, 0.46422510384075977, 0.5045997067908694, 0.5009600319427913]


EVAL: [60/62] Elapsed 1m 19s (remain 0m 1s) Loss: 0.1105(0.1145) 
EVAL: [61/62] Elapsed 1m 20s (remain 0m 0s) Loss: 0.0464(0.1143) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 11m 14s) Loss: 0.1029(0.1029) Grad: 245852.7812  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 22s (remain 6m 15s) Loss: 0.0674(0.0925) Grad: 112748.8906  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 43s (remain 5m 48s) Loss: 0.0743(0.0886) Grad: 91753.9688  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 1m 2s (remain 5m 12s) Loss: 0.0644(0.0833) Grad: 49524.2070  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 22s (remain 4m 51s) Loss: 0.0544(0.0811) Grad: 50591.6680  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 42s (remain 4m 28s) Loss: 0.0598(0.0807) Grad: 65211.3359  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 2m 3s (remain 4m 9s) Loss: 0.2065(0.0819) Grad: 569860.7500  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 23s (remain 3m 48s) Loss: 0.1242(0.0815) Grad: 68175.4375  LR: 0.00000704  
Epoch: [3][160/366

Epoch 3 - avg_train_loss: 0.0780  avg_val_loss: 0.1051  time: 455s
Epoch 3 - Score: 0.4591  Scores: [0.48800976980298394, 0.449876876155688, 0.41473998080111657, 0.46435275598964876, 0.4797654701913428, 0.4576240322704085]
Epoch 3 - Save Best Score: 0.4591 Model


EVAL: [60/62] Elapsed 1m 19s (remain 0m 1s) Loss: 0.1006(0.1051) 
EVAL: [61/62] Elapsed 1m 20s (remain 0m 0s) Loss: 0.0861(0.1051) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 21s) Loss: 0.0673(0.0673) Grad: 120809.7031  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 23s (remain 6m 21s) Loss: 0.0495(0.0651) Grad: 93917.3594  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 43s (remain 5m 41s) Loss: 0.0425(0.0639) Grad: 76096.0781  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 4s (remain 5m 21s) Loss: 0.0350(0.0630) Grad: 30480.9355  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 25s (remain 5m 1s) Loss: 0.0420(0.0613) Grad: 30966.2383  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 44s (remain 4m 33s) Loss: 0.0959(0.0609) Grad: 86406.6328  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 2m 4s (remain 4m 13s) Loss: 0.0648(0.0606) Grad: 49174.4297  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 25s (remain 3m 52s) Loss: 0.0710(0.0608) Grad: 41804.5625  LR: 0.00000116  
Epoch: [4][160/366] E

Epoch 4 - avg_train_loss: 0.0601  avg_val_loss: 0.1062  time: 454s
Epoch 4 - Score: 0.4617  Scores: [0.4890047923137198, 0.45402126197800524, 0.41779939269316174, 0.4674607308666758, 0.4819860024291688, 0.4597733960511384]


EVAL: [60/62] Elapsed 1m 20s (remain 0m 1s) Loss: 0.1077(0.1063) 
EVAL: [61/62] Elapsed 1m 20s (remain 0m 0s) Loss: 0.0663(0.1062) 


Score: 0.4591  Scores: [0.48800976980298394, 0.449876876155688, 0.41473998080111657, 0.46435275598964876, 0.4797654701913428, 0.4576240322704085]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGRobertaConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-roberta-large/",
  "adaptive": true,
  "architectures": [
    "LSGRobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_roberta.LSGRobertaModel",
    "AutoModelForCausalLM": "modeling_lsg_roberta.LSGRobertaForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_roberta.LSGRobertaForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_roberta.LSGRobertaForMultipleChoice",
    "AutoModelForQuestionAnswering": "modeling_lsg_roberta.LSGRobertaForQuestionAnswering",
    "AutoModelForSeque

Epoch: [1][0/366] Elapsed 0m 2s (remain 12m 52s) Loss: 2.6034(2.6034) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 21s (remain 5m 54s) Loss: 0.1593(1.2056) Grad: 77986.7812  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 40s (remain 5m 18s) Loss: 0.1441(0.6957) Grad: 149316.4219  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 0s (remain 5m 3s) Loss: 0.1071(0.5175) Grad: 67985.0703  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 23s (remain 4m 52s) Loss: 0.2051(0.4453) Grad: 216523.3438  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 43s (remain 4m 32s) Loss: 0.1969(0.3900) Grad: 81942.7031  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 2m 4s (remain 4m 11s) Loss: 0.1523(0.3486) Grad: 90648.4375  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 23s (remain 3m 49s) Loss: 0.1366(0.3199) Grad: 60521.3477  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 43s (remain 3m 28s) Loss: 0.1491(0.2983) Grad: 46702.0586  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 3m 3s (remain 3m 7s) Lo

Epoch 1 - avg_train_loss: 0.2047  avg_val_loss: 0.1232  time: 452s
Epoch 1 - Score: 0.4983  Scores: [0.5467130909578175, 0.5013520968202666, 0.45282066050227476, 0.49961139011930483, 0.5141662413613545, 0.4749672983527204]


EVAL: [60/62] Elapsed 1m 22s (remain 0m 1s) Loss: 0.1014(0.1233) 
EVAL: [61/62] Elapsed 1m 22s (remain 0m 0s) Loss: 0.0458(0.1232) 


Epoch 1 - Save Best Score: 0.4983 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 8m 20s) Loss: 0.0612(0.0612) Grad: 107059.4922  LR: 0.00001707  
Epoch: [2][20/366] Elapsed 0m 23s (remain 6m 18s) Loss: 0.0671(0.1172) Grad: 48043.7969  LR: 0.00001676  
Epoch: [2][40/366] Elapsed 0m 42s (remain 5m 38s) Loss: 0.1113(0.1097) Grad: 55183.1367  LR: 0.00001644  
Epoch: [2][60/366] Elapsed 1m 5s (remain 5m 27s) Loss: 0.0683(0.1080) Grad: 48561.8125  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 26s (remain 5m 3s) Loss: 0.1022(0.1054) Grad: 42344.0625  LR: 0.00001576  
Epoch: [2][100/366] Elapsed 1m 46s (remain 4m 38s) Loss: 0.0948(0.1032) Grad: 59557.8516  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 2m 5s (remain 4m 14s) Loss: 0.2028(0.1044) Grad: 110316.5156  LR: 0.00001504  
Epoch: [2][140/366] Elapsed 2m 27s (remain 3m 54s) Loss: 0.0677(0.1047) Grad: 53880.8125  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 47s (remain 3m 33s) Loss: 0.0746(0.1040) Grad: 38056.3203  LR: 0.00001428  
Epoch: [2][180/366] Elapsed 3m 9s (remain 3m 

Epoch 2 - avg_train_loss: 0.0998  avg_val_loss: 0.1105  time: 458s
Epoch 2 - Score: 0.4714  Scores: [0.5161704733289567, 0.4589547357915421, 0.4328601476292719, 0.4794403192568423, 0.478749265539737, 0.462518629759681]
Epoch 2 - Save Best Score: 0.4714 Model


Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 56s) Loss: 0.0575(0.0575) Grad: inf  LR: 0.00001001  
Epoch: [3][20/366] Elapsed 0m 21s (remain 5m 57s) Loss: 0.0815(0.0809) Grad: 81214.8828  LR: 0.00000958  
Epoch: [3][40/366] Elapsed 0m 41s (remain 5m 29s) Loss: 0.0744(0.0785) Grad: 90250.4531  LR: 0.00000916  
Epoch: [3][60/366] Elapsed 1m 2s (remain 5m 11s) Loss: 0.0768(0.0792) Grad: 50576.1289  LR: 0.00000873  
Epoch: [3][80/366] Elapsed 1m 21s (remain 4m 47s) Loss: 0.0562(0.0795) Grad: 60141.0703  LR: 0.00000831  
Epoch: [3][100/366] Elapsed 1m 41s (remain 4m 26s) Loss: 0.0999(0.0811) Grad: 46836.8047  LR: 0.00000789  
Epoch: [3][120/366] Elapsed 2m 2s (remain 4m 8s) Loss: 0.0999(0.0815) Grad: 28930.6348  LR: 0.00000747  
Epoch: [3][140/366] Elapsed 2m 22s (remain 3m 48s) Loss: 0.1026(0.0815) Grad: 27975.7109  LR: 0.00000706  
Epoch: [3][160/366] Elapsed 2m 43s (remain 3m 28s) Loss: 0.0772(0.0810) Grad: 34808.2500  LR: 0.00000665  
Epoch: [3][180/366] Elapsed 3m 3s (remain 3m 7s) Loss:

Epoch 3 - avg_train_loss: 0.0802  avg_val_loss: 0.1095  time: 455s
Epoch 3 - Score: 0.4695  Scores: [0.5023983912980233, 0.4510255634138228, 0.4355078752192412, 0.47155232491980587, 0.4818898174888112, 0.4748614751008684]
Epoch 3 - Save Best Score: 0.4695 Model


EVAL: [60/62] Elapsed 1m 22s (remain 0m 1s) Loss: 0.1011(0.1095) 
EVAL: [61/62] Elapsed 1m 22s (remain 0m 0s) Loss: 0.0975(0.1095) 
Epoch: [4][0/366] Elapsed 0m 2s (remain 12m 38s) Loss: 0.0763(0.0763) Grad: inf  LR: 0.00000295  
Epoch: [4][20/366] Elapsed 0m 22s (remain 6m 6s) Loss: 0.0592(0.0708) Grad: 118672.0859  LR: 0.00000265  
Epoch: [4][40/366] Elapsed 0m 42s (remain 5m 35s) Loss: 0.0482(0.0681) Grad: 70350.7031  LR: 0.00000237  
Epoch: [4][60/366] Elapsed 1m 2s (remain 5m 14s) Loss: 0.0694(0.0650) Grad: 65947.5703  LR: 0.00000210  
Epoch: [4][80/366] Elapsed 1m 25s (remain 5m 1s) Loss: 0.0534(0.0644) Grad: 164223.2656  LR: 0.00000184  
Epoch: [4][100/366] Elapsed 1m 47s (remain 4m 41s) Loss: 0.0661(0.0642) Grad: 102581.3750  LR: 0.00000160  
Epoch: [4][120/366] Elapsed 2m 8s (remain 4m 20s) Loss: 0.0593(0.0639) Grad: 130870.6406  LR: 0.00000138  
Epoch: [4][140/366] Elapsed 2m 29s (remain 3m 59s) Loss: 0.0705(0.0634) Grad: 99549.3984  LR: 0.00000117  
Epoch: [4][160/366] Elaps

Epoch 4 - avg_train_loss: 0.0622  avg_val_loss: 0.1081  time: 456s


EVAL: [60/62] Elapsed 1m 22s (remain 0m 1s) Loss: 0.1021(0.1081) 
EVAL: [61/62] Elapsed 1m 22s (remain 0m 0s) Loss: 0.0989(0.1081) 


Epoch 4 - Score: 0.4663  Scores: [0.5021884360711852, 0.4555507010405653, 0.42721438551001023, 0.4652661169739467, 0.4822360288131827, 0.4652723727226996]
Epoch 4 - Save Best Score: 0.4663 Model
Score: 0.4663  Scores: [0.5021884360711852, 0.4555507010405653, 0.42721438551001023, 0.4652661169739467, 0.4822360288131827, 0.4652723727226996]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGRobertaConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-roberta-large/",
  "adaptive": true,
  "architectures": [
    "LSGRobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_roberta.LSGRobertaModel",
    "AutoModelForCausalLM": "modeling_lsg_roberta.LSGRobertaForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_roberta.LSGRobertaForMaskedLM",
    "A

Epoch: [1][0/366] Elapsed 0m 1s (remain 9m 32s) Loss: 2.8474(2.8474) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 22s (remain 6m 9s) Loss: 0.2772(1.0829) Grad: 153385.0156  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 44s (remain 5m 54s) Loss: 0.0938(0.6485) Grad: 17450.7480  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 7s (remain 5m 35s) Loss: 0.2656(0.4884) Grad: 110228.5312  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 28s (remain 5m 12s) Loss: 0.1582(0.4089) Grad: 38478.4062  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 49s (remain 4m 46s) Loss: 0.1849(0.3591) Grad: 30988.8574  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 2m 9s (remain 4m 21s) Loss: 0.1821(0.3233) Grad: 26948.2363  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 31s (remain 4m 1s) Loss: 0.1353(0.3012) Grad: 31192.8496  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 50s (remain 3m 37s) Loss: 0.1338(0.2854) Grad: 17400.1289  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 3m 11s (remain 3m 15s) Lo

Epoch 1 - avg_train_loss: 0.1992  avg_val_loss: 0.1147  time: 457s
Epoch 1 - Score: 0.4806  Scores: [0.5048764213380247, 0.48005302089817686, 0.4310334409137284, 0.48358612447330074, 0.5074383406335587, 0.47640331055001806]
Epoch 1 - Save Best Score: 0.4806 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 11m 32s) Loss: 0.0836(0.0836) Grad: inf  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 22s (remain 6m 13s) Loss: 0.1505(0.1280) Grad: 82463.6641  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 43s (remain 5m 45s) Loss: 0.0861(0.1129) Grad: 100359.4297  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 3s (remain 5m 18s) Loss: 0.1166(0.1082) Grad: 88511.6641  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 23s (remain 4m 52s) Loss: 0.0774(0.1052) Grad: 95481.3594  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 44s (remain 4m 33s) Loss: 0.0684(0.1024) Grad: 85932.0625  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 2m 4s (remain 4m 12s) Loss: 0.1382(0.1028) Grad: 102850.2812  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 25s (remain 3m 51s) Loss: 0.0835(0.1021) Grad: 160546.2656  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 45s (remain 3m 30s) Loss: 0.0928(0.1008) Grad: 38087.4570  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 3m 7s (remain 3m 11s)

Epoch 2 - avg_train_loss: 0.0999  avg_val_loss: 0.1160  time: 454s
Epoch 2 - Score: 0.4835  Scores: [0.5016467486645255, 0.4678615699832894, 0.43810574927779267, 0.49082275983285856, 0.5212211716638004, 0.48124613828360385]


Epoch: [3][0/366] Elapsed 0m 1s (remain 10m 29s) Loss: 0.0599(0.0599) Grad: 115961.7344  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 23s (remain 6m 23s) Loss: 0.0555(0.0798) Grad: 51371.7266  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 43s (remain 5m 41s) Loss: 0.0871(0.0801) Grad: 123604.2188  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 1m 2s (remain 5m 12s) Loss: 0.0763(0.0803) Grad: 116414.3438  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 23s (remain 4m 54s) Loss: 0.0966(0.0797) Grad: 102455.5938  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 45s (remain 4m 36s) Loss: 0.0660(0.0785) Grad: 46876.4648  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 2m 6s (remain 4m 15s) Loss: 0.0639(0.0790) Grad: 28410.2461  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 26s (remain 3m 54s) Loss: 0.0905(0.0799) Grad: 46145.7695  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 2m 49s (remain 3m 35s) Loss: 0.0720(0.0799) Grad: 37023.7031  LR: 0.00000664  
Epoch: [3][180/366] Elapsed 3m 8s (remain

Epoch 3 - avg_train_loss: 0.0784  avg_val_loss: 0.1106  time: 453s
Epoch 3 - Score: 0.4719  Scores: [0.49021717086416267, 0.4640692987093321, 0.4244595303300146, 0.48397979817210024, 0.49624336910996236, 0.4723036962719627]
Epoch 3 - Save Best Score: 0.4719 Model


Epoch: [4][0/366] Elapsed 0m 1s (remain 10m 19s) Loss: 0.0827(0.0827) Grad: 194514.1562  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 22s (remain 6m 7s) Loss: 0.0484(0.0732) Grad: 86514.9688  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 41s (remain 5m 31s) Loss: 0.0798(0.0717) Grad: 92546.0469  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 2s (remain 5m 14s) Loss: 0.0600(0.0716) Grad: 47755.4961  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 23s (remain 4m 55s) Loss: 0.0759(0.0710) Grad: 97034.2188  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 44s (remain 4m 35s) Loss: 0.0897(0.0710) Grad: 90438.4141  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 2m 5s (remain 4m 14s) Loss: 0.0562(0.0701) Grad: 66291.7500  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 25s (remain 3m 51s) Loss: 0.0686(0.0693) Grad: 38542.0898  LR: 0.00000116  
Epoch: [4][160/366] Elapsed 2m 44s (remain 3m 29s) Loss: 0.0472(0.0693) Grad: 80411.5547  LR: 0.00000097  
Epoch: [4][180/366] Elapsed 3m 3s (remain 3m 

Epoch 4 - avg_train_loss: 0.0682  avg_val_loss: 0.1093  time: 456s
Epoch 4 - Score: 0.4688  Scores: [0.4908077140426496, 0.4584616953273456, 0.42302174243291835, 0.4745555359199424, 0.49821622138214083, 0.4675034422030983]
Epoch 4 - Save Best Score: 0.4688 Model
Score: 0.4688  Scores: [0.4908077140426496, 0.4584616953273456, 0.42302174243291835, 0.4745555359199424, 0.49821622138214083, 0.4675034422030983]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGRobertaConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-roberta-large/",
  "adaptive": true,
  "architectures": [
    "LSGRobertaForMaskedLM"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_roberta.LSGRobertaModel",
    "AutoModelForCausalLM": "modeling_lsg_roberta.LSGRobertaForCausalLM",
    "AutoMo

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 42s) Loss: 2.4858(2.4858) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 21s (remain 5m 55s) Loss: 0.2982(0.9732) Grad: 79796.3984  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 42s (remain 5m 35s) Loss: 0.3802(0.6396) Grad: 105512.8359  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 1m 3s (remain 5m 18s) Loss: 0.1523(0.4986) Grad: 22185.8789  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 23s (remain 4m 53s) Loss: 0.1712(0.4133) Grad: 56381.2305  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 42s (remain 4m 29s) Loss: 0.1331(0.3633) Grad: 23590.2480  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 2m 3s (remain 4m 11s) Loss: 0.2231(0.3275) Grad: 40156.6797  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 23s (remain 3m 49s) Loss: 0.1549(0.3068) Grad: 186607.7969  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 43s (remain 3m 27s) Loss: 0.1490(0.2861) Grad: 46320.6758  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 3m 3s (remain 3m 7s) Lo

Epoch 1 - avg_train_loss: 0.2001  avg_val_loss: 0.1529  time: 454s
Epoch 1 - Score: 0.5581  Scores: [0.5273308714999403, 0.5475843815470336, 0.5215275948857212, 0.5280622762099592, 0.6169559107153975, 0.6070126791761387]
Epoch 1 - Save Best Score: 0.5581 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 9m 8s) Loss: 0.1033(0.1033) Grad: 250486.7969  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 22s (remain 6m 5s) Loss: 0.0776(0.1343) Grad: 102519.8906  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 43s (remain 5m 41s) Loss: 0.1099(0.1144) Grad: 183167.5312  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 1m 4s (remain 5m 23s) Loss: 0.0788(0.1077) Grad: 39579.7930  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 25s (remain 4m 59s) Loss: 0.0582(0.1034) Grad: 40317.3477  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 46s (remain 4m 38s) Loss: 0.0693(0.1026) Grad: 38740.3672  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 2m 6s (remain 4m 16s) Loss: 0.1026(0.1036) Grad: 52286.5391  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 26s (remain 3m 54s) Loss: 0.0786(0.1014) Grad: 35736.5195  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 47s (remain 3m 32s) Loss: 0.0787(0.1014) Grad: 165026.1562  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 3m 5s (remain 3m

Epoch 2 - avg_train_loss: 0.0987  avg_val_loss: 0.1092  time: 457s
Epoch 2 - Score: 0.4691  Scores: [0.5020485545096478, 0.4613233953088541, 0.4390017136544526, 0.4674648733048099, 0.48941377804783537, 0.45550117561165127]
Epoch 2 - Save Best Score: 0.4691 Model


Epoch: [3][0/366] Elapsed 0m 1s (remain 10m 47s) Loss: 0.1291(0.1291) Grad: inf  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 21s (remain 5m 56s) Loss: 0.1099(0.0834) Grad: 102910.4141  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 39s (remain 5m 13s) Loss: 0.0825(0.0780) Grad: 55956.2578  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 1m 1s (remain 5m 6s) Loss: 0.0773(0.0782) Grad: 84161.1484  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 22s (remain 4m 51s) Loss: 0.0814(0.0801) Grad: 134037.2656  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 45s (remain 4m 35s) Loss: 0.0746(0.0793) Grad: 97301.9219  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 2m 5s (remain 4m 13s) Loss: 0.0728(0.0795) Grad: 71563.6875  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 24s (remain 3m 51s) Loss: 0.0567(0.0784) Grad: 68603.6016  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 2m 44s (remain 3m 29s) Loss: 0.0863(0.0780) Grad: 160311.4219  LR: 0.00000664  
Epoch: [3][180/366] Elapsed 3m 2s (remain 3m 6s) L

Epoch 3 - avg_train_loss: 0.0776  avg_val_loss: 0.1078  time: 457s
Epoch 3 - Score: 0.4658  Scores: [0.49818176717137325, 0.45720928505874914, 0.4354261865872814, 0.45778207723611886, 0.48439806409800545, 0.4618206706460046]
Epoch 3 - Save Best Score: 0.4658 Model


Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 41s) Loss: 0.0548(0.0548) Grad: 116350.3203  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 20s (remain 5m 34s) Loss: 0.0945(0.0653) Grad: 185711.0781  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 41s (remain 5m 29s) Loss: 0.0746(0.0701) Grad: 72668.3203  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 1m 2s (remain 5m 13s) Loss: 0.0769(0.0710) Grad: 165328.3438  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 23s (remain 4m 55s) Loss: 0.0554(0.0697) Grad: 94074.9688  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 43s (remain 4m 32s) Loss: 0.0545(0.0688) Grad: 71667.5859  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 2m 2s (remain 4m 8s) Loss: 0.0760(0.0688) Grad: 72818.8281  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 21s (remain 3m 46s) Loss: 0.0496(0.0689) Grad: 53871.7266  LR: 0.00000116  
Epoch: [4][160/366] Elapsed 2m 42s (remain 3m 26s) Loss: 0.0701(0.0689) Grad: 120543.3125  LR: 0.00000097  
Epoch: [4][180/366] Elapsed 3m 3s (remain 3

Epoch 4 - avg_train_loss: 0.0684  avg_val_loss: 0.1070  time: 455s
Epoch 4 - Score: 0.4639  Scores: [0.494342279197483, 0.45934233850396916, 0.42832823860701014, 0.4597823294633263, 0.4857399590049584, 0.456092299660875]
Epoch 4 - Save Best Score: 0.4639 Model
Score: 0.4639  Scores: [0.494342279197483, 0.45934233850396916, 0.42832823860701014, 0.4597823294633263, 0.4857399590049584, 0.456092299660875]
Score: 0.4645  Scores: [0.49386353969364916, 0.4558229943426986, 0.423358743899167, 0.466020253549629, 0.4865422567772854, 0.46164768889322527]


finised
