In [1]:
# import os
# import gc
# import re
# import ast
# import sys
# import copy
# import json
# import time
# import math
# import string
# import pickle
# import random
# import joblib
# import itertools
# import warnings
# warnings.filterwarnings("ignore")

# import scipy as sp
# import numpy as np
# import pandas as pd
# from tqdm.auto import tqdm
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# import torch
# import torch.nn as nn
# from torch.nn import Parameter
# import torch.nn.functional as F
# from torch.optim import Adam, SGD, AdamW
# from torch.utils.data import DataLoader, Dataset


# import tokenizers
# import transformers
# from transformers import AutoTokenizer, AutoModel, AutoConfig
# from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# from transformers import DataCollatorWithPadding
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# import codecs
# from typing import Dict, List, Tuple
# from text_unidecode import unidecode

In [2]:
# !pip install transformers==4.21.2
# !pip install tokenizers==0.12.1

# !pip install -q joblib scikit-learn scipy 

In [3]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# CFG


In [4]:
DATA_P = '/root/autodl-tmp/fb3/inputs/'
OUTPUT_DIR = '/root/autodl-tmp/fb3/output/trained_tiny_model/lsg-electra-large/'

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    # competition='FB3'
    # _wandb_kernel='nakama'
    
    path=f"{DATA_P}common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-large/"
    model=path
    config_path=model+'config.pth'
    tokenizer = AutoTokenizer.from_pretrained(model)
    cfg_save_output = OUTPUT_DIR

    debug=False
    train=True
    trust_remote_code=True
    
    apex=True
    print_freq=20
    num_workers=4
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    max_len=1436
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0,1]

In [6]:
# # ====================================================
# # tokenizer
# # ====================================================
# tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# # tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# CFG.tokenizer = tokenizer

# Utils


In [7]:
# ====================================================
# Utils
# ====================================================

def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

##  Data Loading

In [8]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/train.csv')
test = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/sample_submission.csv')


In [9]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [10]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# Dataset

In [11]:
# ====================================================
# Define max_len
# ====================================================
# lengths = []
# tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
# for text in tk0:
#     length = len(CFG.tokenizer(text, add_special_tokens=False)['input_ids'])
#     lengths.append(length)
# CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1436


In [12]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [13]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True, trust_remote_code=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config, trust_remote_code=True)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
    
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss
    
    
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [14]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        f"{CFG.cfg_save_output}_fold{fold}_best.pth")
                        # OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

            cfg_save_output = f"{OUTPUT_DIR}trained_tiny_model/lsg-electra-base/"
            
            
    # predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
    #                          map_location=torch.device('cpu'))['predictions']
    predictions = torch.load(f"{CFG.cfg_save_output}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [15]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    print('finised')

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-large/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswering": "modeling_lsg_electra.LSGElectraForQuestionAnswering",
    "AutoModelForSequenceClassification": "modeling_lsg_electra.LSGElectraForSequence

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 44s) Loss: 2.4513(2.4513) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 19s (remain 5m 18s) Loss: 0.5995(1.7873) Grad: 79806.5859  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 36s (remain 4m 52s) Loss: 0.1643(1.0636) Grad: 36787.8828  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 53s (remain 4m 29s) Loss: 0.1423(0.7862) Grad: 28208.1914  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 11s (remain 4m 12s) Loss: 0.2602(0.6334) Grad: 32376.5176  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 30s (remain 3m 56s) Loss: 0.1779(0.5404) Grad: 32684.6250  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 48s (remain 3m 39s) Loss: 0.1588(0.4776) Grad: 46305.2383  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 5s (remain 3m 20s) Loss: 0.2052(0.4322) Grad: 65033.2578  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 22s (remain 3m 2s) Loss: 0.1065(0.3954) Grad: 24834.2070  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 40s (remain 2m 43s) Lo

Epoch 1 - avg_train_loss: 0.2486  avg_val_loss: 0.1296  time: 367s
Epoch 1 - Score: 0.5104  Scores: [0.5349587138136509, 0.4819101628412786, 0.44152836985017385, 0.49421544618776214, 0.5600548872522627, 0.5496831590807624]
Epoch 1 - Save Best Score: 0.5104 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1183(0.1295) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1636(0.1296) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 24s) Loss: 0.1124(0.1124) Grad: 86180.1406  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 17s (remain 4m 53s) Loss: 0.1522(0.1175) Grad: 82338.6641  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 35s (remain 4m 38s) Loss: 0.1291(0.1131) Grad: 108955.8203  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 53s (remain 4m 25s) Loss: 0.1281(0.1130) Grad: 75574.2031  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 10s (remain 4m 9s) Loss: 0.1165(0.1111) Grad: 223909.6719  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 29s (remain 3m 55s) Loss: 0.0927(0.1074) Grad: 76715.0938  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 48s (remain 3m 40s) Loss: 0.0899(0.1063) Grad: 168863.6406  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 5s (remain 3m 20s) Loss: 0.1053(0.1062) Grad: 38979.8789  LR: 0.00001466  
Epoch: [2][160/366

Epoch 2 - avg_train_loss: 0.1085  avg_val_loss: 0.1185  time: 367s
Epoch 2 - Score: 0.4880  Scores: [0.519461455331861, 0.47648281828373573, 0.44165038598407763, 0.4760569462750187, 0.5078580703258441, 0.5062380442096157]
Epoch 2 - Save Best Score: 0.4880 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1153(0.1184) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1627(0.1185) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 6m 30s) Loss: 0.0725(0.0725) Grad: 63312.1523  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 16s (remain 4m 38s) Loss: 0.1073(0.0905) Grad: 115395.7031  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 34s (remain 4m 37s) Loss: 0.1103(0.0881) Grad: 86188.0703  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 52s (remain 4m 21s) Loss: 0.0940(0.0877) Grad: 50945.1172  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 10s (remain 4m 8s) Loss: 0.0833(0.0901) Grad: 71948.5391  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 28s (remain 3m 51s) Loss: 0.1117(0.0893) Grad: 47878.9727  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 47s (remain 3m 37s) Loss: 0.1096(0.0913) Grad: 75674.9375  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 5s (remain 3m 19s) Loss: 0.1207(0.0917) Grad: 37546.1406  LR: 0.00000704  
Epoch: [3][160/366] 

Epoch 3 - avg_train_loss: 0.0897  avg_val_loss: 0.1145  time: 366s
Epoch 3 - Score: 0.4796  Scores: [0.5076432816266627, 0.4622656538197381, 0.4318123868901831, 0.5042788194050093, 0.4976369237731934, 0.47388284469728464]
Epoch 3 - Save Best Score: 0.4796 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1035(0.1145) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1109(0.1145) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 54s) Loss: 0.0674(0.0674) Grad: 71707.0156  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 18s (remain 4m 56s) Loss: 0.0543(0.0710) Grad: 85380.2188  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 36s (remain 4m 46s) Loss: 0.0517(0.0732) Grad: 91083.1641  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 51s (remain 4m 19s) Loss: 0.0666(0.0713) Grad: 48334.4336  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 9s (remain 4m 5s) Loss: 0.0704(0.0731) Grad: 23581.1797  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 27s (remain 3m 50s) Loss: 0.0604(0.0749) Grad: 33873.4648  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 44s (remain 3m 31s) Loss: 0.1019(0.0747) Grad: 82461.8516  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 5s (remain 3m 19s) Loss: 0.0770(0.0752) Grad: 23375.4043  LR: 0.00000116  
Epoch: [4][160/366] El

Epoch 4 - avg_train_loss: 0.0763  avg_val_loss: 0.1097  time: 366s
Epoch 4 - Score: 0.4693  Scores: [0.5023296752323001, 0.45780164733906664, 0.4296937637104543, 0.47494005695060215, 0.48556008093723096, 0.4654498016739922]
Epoch 4 - Save Best Score: 0.4693 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1053(0.1097) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1176(0.1097) 


Score: 0.4693  Scores: [0.5023296752323001, 0.45780164733906664, 0.4296937637104543, 0.47494005695060215, 0.48556008093723096, 0.4654498016739922]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-large/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswe

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 9s) Loss: 2.7642(2.7642) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 18s (remain 5m 3s) Loss: 0.4952(1.6326) Grad: 97076.4297  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 34s (remain 4m 34s) Loss: 0.3411(0.9571) Grad: 38215.4102  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 53s (remain 4m 25s) Loss: 0.0813(0.6983) Grad: 35848.8438  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 11s (remain 4m 10s) Loss: 0.1409(0.5700) Grad: 27394.1328  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 28s (remain 3m 51s) Loss: 0.0947(0.4856) Grad: 18912.0703  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 45s (remain 3m 34s) Loss: 0.1066(0.4312) Grad: 15152.8926  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 2s (remain 3m 15s) Loss: 0.1293(0.3888) Grad: 55415.4141  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 19s (remain 2m 58s) Loss: 0.0975(0.3582) Grad: 22064.1445  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 37s (remain 2m 41s) Los

Epoch 1 - avg_train_loss: 0.2307  avg_val_loss: 0.1472  time: 368s
Epoch 1 - Score: 0.5456  Scores: [0.5720681733222518, 0.5185588485050601, 0.49873729285286694, 0.5368863630050684, 0.555588433936822, 0.5916005829484906]
Epoch 1 - Save Best Score: 0.5456 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1433(0.1473) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0616(0.1472) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 19s) Loss: 0.0954(0.0954) Grad: 107952.7578  LR: 0.00001707  
Epoch: [2][20/366] Elapsed 0m 19s (remain 5m 16s) Loss: 0.0850(0.1209) Grad: 141110.9531  LR: 0.00001676  
Epoch: [2][40/366] Elapsed 0m 38s (remain 5m 3s) Loss: 0.0659(0.1119) Grad: 67878.3750  LR: 0.00001644  
Epoch: [2][60/366] Elapsed 0m 54s (remain 4m 34s) Loss: 0.1189(0.1109) Grad: 101492.7344  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 11s (remain 4m 11s) Loss: 0.0590(0.1095) Grad: 60558.0664  LR: 0.00001576  
Epoch: [2][100/366] Elapsed 1m 28s (remain 3m 52s) Loss: 0.1729(0.1093) Grad: 68205.8828  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 44s (remain 3m 32s) Loss: 0.0898(0.1086) Grad: 41637.3828  LR: 0.00001504  
Epoch: [2][140/366] Elapsed 2m 3s (remain 3m 17s) Loss: 0.1307(0.1088) Grad: 49901.5625  LR: 0.00001466  
Epoch: [2][160/366

Epoch 2 - avg_train_loss: 0.1059  avg_val_loss: 0.1180  time: 364s
Epoch 2 - Score: 0.4872  Scores: [0.5167625402189895, 0.4799722420159963, 0.4738867418944833, 0.4722022957167923, 0.49156186063502016, 0.48893408122735676]
Epoch 2 - Save Best Score: 0.4872 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1094(0.1180) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0504(0.1180) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 51s) Loss: 0.1658(0.1658) Grad: 231759.1250  LR: 0.00001001  
Epoch: [3][20/366] Elapsed 0m 18s (remain 5m 9s) Loss: 0.1345(0.1082) Grad: 47594.7422  LR: 0.00000958  
Epoch: [3][40/366] Elapsed 0m 37s (remain 5m 0s) Loss: 0.0875(0.1025) Grad: 29824.3145  LR: 0.00000916  
Epoch: [3][60/366] Elapsed 0m 55s (remain 4m 39s) Loss: 0.0600(0.0979) Grad: 39593.7227  LR: 0.00000873  
Epoch: [3][80/366] Elapsed 1m 12s (remain 4m 14s) Loss: 0.0942(0.0946) Grad: 52522.9180  LR: 0.00000831  
Epoch: [3][100/366] Elapsed 1m 30s (remain 3m 58s) Loss: 0.0642(0.0926) Grad: 37454.5000  LR: 0.00000789  
Epoch: [3][120/366] Elapsed 1m 48s (remain 3m 39s) Loss: 0.1203(0.0924) Grad: 42094.4297  LR: 0.00000747  
Epoch: [3][140/366] Elapsed 2m 5s (remain 3m 20s) Loss: 0.0577(0.0917) Grad: 49000.1914  LR: 0.00000706  
Epoch: [3][160/366] E

Epoch 3 - avg_train_loss: 0.0896  avg_val_loss: 0.1115  time: 367s
Epoch 3 - Score: 0.4732  Scores: [0.504623016864147, 0.46450889575043164, 0.4326800645162056, 0.4625047393048289, 0.4996598220624935, 0.4752077673634043]
Epoch 3 - Save Best Score: 0.4732 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0935(0.1115) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0547(0.1115) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 17s) Loss: 0.0548(0.0548) Grad: 68047.7578  LR: 0.00000295  
Epoch: [4][20/366] Elapsed 0m 17s (remain 4m 52s) Loss: 0.0771(0.0741) Grad: 125922.9062  LR: 0.00000265  
Epoch: [4][40/366] Elapsed 0m 34s (remain 4m 34s) Loss: 0.0654(0.0748) Grad: 85669.4844  LR: 0.00000237  
Epoch: [4][60/366] Elapsed 0m 52s (remain 4m 24s) Loss: 0.0577(0.0725) Grad: 64299.3711  LR: 0.00000210  
Epoch: [4][80/366] Elapsed 1m 10s (remain 4m 9s) Loss: 0.1205(0.0755) Grad: 240011.8125  LR: 0.00000184  
Epoch: [4][100/366] Elapsed 1m 27s (remain 3m 50s) Loss: 0.0888(0.0749) Grad: 89487.5547  LR: 0.00000160  
Epoch: [4][120/366] Elapsed 1m 46s (remain 3m 35s) Loss: 0.0720(0.0760) Grad: 57055.1289  LR: 0.00000138  
Epoch: [4][140/366] Elapsed 2m 4s (remain 3m 19s) Loss: 0.1346(0.0763) Grad: 45475.1289  LR: 0.00000117  
Epoch: [4][160/366]

Epoch 4 - avg_train_loss: 0.0760  avg_val_loss: 0.1096  time: 365s
Epoch 4 - Score: 0.4690  Scores: [0.5027246163272642, 0.4612897101908413, 0.43114448048741466, 0.4602014172701484, 0.48681748729697943, 0.4721201681796797]
Epoch 4 - Save Best Score: 0.4690 Model


EVAL: [60/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0918(0.1096) 
EVAL: [61/62] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0511(0.1096) 


Score: 0.4690  Scores: [0.5027246163272642, 0.4612897101908413, 0.43114448048741466, 0.4602014172701484, 0.48681748729697943, 0.4721201681796797]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-large/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswer

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 26s) Loss: 2.8395(2.8395) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 20s (remain 5m 33s) Loss: 0.5007(1.7862) Grad: 96998.1875  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 36s (remain 4m 52s) Loss: 0.2196(1.0347) Grad: 74666.5000  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 55s (remain 4m 36s) Loss: 0.1620(0.7602) Grad: 92496.9766  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 14s (remain 4m 22s) Loss: 0.2163(0.6122) Grad: 35981.8477  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 31s (remain 4m 1s) Loss: 0.1750(0.5299) Grad: 31413.1367  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 50s (remain 3m 42s) Loss: 0.1274(0.4672) Grad: 27762.1719  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 9s (remain 3m 26s) Loss: 0.2174(0.4316) Grad: 57715.2227  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 27s (remain 3m 7s) Loss: 0.2453(0.3994) Grad: 36023.0078  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 45s (remain 2m 49s) Los

Epoch 1 - avg_train_loss: 0.2516  avg_val_loss: 0.1412  time: 374s
Epoch 1 - Score: 0.5347  Scores: [0.5596173654635412, 0.5453937728541064, 0.4833118424070095, 0.5466326565167763, 0.5506849559216589, 0.5228384691252208]
Epoch 1 - Save Best Score: 0.5347 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 18s) Loss: 0.1876(0.1876) Grad: 68291.0078  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 18s (remain 4m 59s) Loss: 0.0952(0.1445) Grad: 144574.3594  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 34s (remain 4m 36s) Loss: 0.1386(0.1338) Grad: 86733.3750  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 53s (remain 4m 28s) Loss: 0.1295(0.1297) Grad: 168911.1094  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 10s (remain 4m 9s) Loss: 0.1269(0.1229) Grad: 42772.8398  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 29s (remain 3m 54s) Loss: 0.1042(0.1197) Grad: 46433.2969  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 49s (remain 3m 42s) Loss: 0.0815(0.1191) Grad: 37767.6172  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 8s (remain 3m 25s) Loss: 0.0939(0.1179) Grad: 22721.3672  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 25s (remain 3m 5s) Loss: 0.0586(0.1178) Grad: 16717.5293  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 2m 42s (remain 2m

Epoch 2 - avg_train_loss: 0.1186  avg_val_loss: 0.1182  time: 368s
Epoch 2 - Score: 0.4876  Scores: [0.5285747381424382, 0.4586857826095581, 0.44927772093089563, 0.4768182654544211, 0.5076891447676236, 0.5044568604645828]
Epoch 2 - Save Best Score: 0.4876 Model


EVAL: [60/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.0916(0.1181) 
EVAL: [61/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.1681(0.1182) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 24s) Loss: 0.0682(0.0682) Grad: 55142.5586  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 18s (remain 5m 5s) Loss: 0.1193(0.1091) Grad: 120659.8516  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 36s (remain 4m 53s) Loss: 0.0907(0.1050) Grad: 59500.4648  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 54s (remain 4m 31s) Loss: 0.0822(0.0981) Grad: 69169.2188  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 12s (remain 4m 15s) Loss: 0.0955(0.0991) Grad: 77648.3672  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 29s (remain 3m 55s) Loss: 0.1240(0.0992) Grad: 49594.3086  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 47s (remain 3m 37s) Loss: 0.0975(0.0987) Grad: 25150.6953  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 4s (remain 3m 18s) Loss: 0.0996(0.0998) Grad: 11402.2295  LR: 0.00000704  
Epoch: [3][160/366] 

Epoch 3 - avg_train_loss: 0.1049  avg_val_loss: 0.1275  time: 368s
Epoch 3 - Score: 0.5081  Scores: [0.5554730103192376, 0.48171402452590534, 0.4568944971075073, 0.5148937720525695, 0.5245083603008704, 0.5154145892954436]


EVAL: [60/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.1026(0.1274) 
EVAL: [61/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.1750(0.1275) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 57s) Loss: 0.3187(0.3187) Grad: 255063.0312  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 19s (remain 5m 17s) Loss: 0.1982(0.1576) Grad: 222948.1562  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 35s (remain 4m 44s) Loss: 0.1620(0.1687) Grad: 190435.0000  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 54s (remain 4m 30s) Loss: 0.1132(0.1683) Grad: 182523.7188  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 11s (remain 4m 12s) Loss: 0.1010(0.1637) Grad: 426032.3750  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 29s (remain 3m 55s) Loss: 0.1900(0.1584) Grad: 235388.4688  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 48s (remain 3m 39s) Loss: 0.1516(0.1565) Grad: 204293.9219  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 5s (remain 3m 20s) Loss: 0.1206(0.1531) Grad: 166901.8125  LR: 0.00000116  
Epoch: [4][1

Epoch 4 - avg_train_loss: 0.1224  avg_val_loss: 0.1254  time: 370s
Epoch 4 - Score: 0.5034  Scores: [0.5334298167996343, 0.48716845607938547, 0.4568225155169128, 0.5145697748457155, 0.5286454890541772, 0.49957805864431315]


EVAL: [60/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.1040(0.1253) 
EVAL: [61/62] Elapsed 0m 40s (remain 0m 0s) Loss: 0.1469(0.1254) 


Score: 0.4876  Scores: [0.5285747381424382, 0.4586857826095581, 0.44927772093089563, 0.4768182654544211, 0.5076891447676236, 0.5044568604645828]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/fb3/inputs/common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-large/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnsweri

Epoch: [1][0/366] Elapsed 0m 1s (remain 7m 50s) Loss: 2.2294(2.2294) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 19s (remain 5m 25s) Loss: 0.5637(1.5341) Grad: 105923.8594  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 37s (remain 4m 54s) Loss: 0.2811(0.9236) Grad: 134451.5781  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 55s (remain 4m 36s) Loss: 0.0963(0.6749) Grad: 236697.5312  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 13s (remain 4m 19s) Loss: 0.1580(0.5538) Grad: 83511.2891  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 31s (remain 4m 0s) Loss: 0.1215(0.4772) Grad: 27343.3066  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 50s (remain 3m 43s) Loss: 0.1353(0.4233) Grad: 33257.4297  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 7s (remain 3m 23s) Loss: 0.0689(0.3855) Grad: 25794.4141  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 26s (remain 3m 6s) Loss: 0.1949(0.3541) Grad: 60334.6406  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 44s (remain 2m 48s) 

Epoch 1 - avg_train_loss: 0.2329  avg_val_loss: 0.1511  time: 369s
Epoch 1 - Score: 0.5547  Scores: [0.5621144511125462, 0.6044834782191361, 0.5077777069180971, 0.5291444690479224, 0.5815466666877832, 0.5430111163364532]
Epoch 1 - Save Best Score: 0.5547 Model


EVAL: [60/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.1361(0.1512) 
EVAL: [61/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.1027(0.1511) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 16s) Loss: 0.2078(0.2078) Grad: 168178.8281  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 19s (remain 5m 19s) Loss: 0.1591(0.2061) Grad: 82085.7344  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 36s (remain 4m 53s) Loss: 0.0949(0.1700) Grad: 53622.6680  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 55s (remain 4m 36s) Loss: 0.0849(0.1506) Grad: 59730.2852  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 12s (remain 4m 15s) Loss: 0.0868(0.1407) Grad: 39151.3945  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 31s (remain 3m 59s) Loss: 0.0949(0.1311) Grad: 41121.2656  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 49s (remain 3m 42s) Loss: 0.0878(0.1269) Grad: 60896.4609  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 9s (remain 3m 27s) Loss: 0.1200(0.1235) Grad: 56608.1445  LR: 0.00001466  
Epoch: [2][160/366]

Epoch 2 - avg_train_loss: 0.1149  avg_val_loss: 0.1077  time: 375s
Epoch 2 - Score: 0.4650  Scores: [0.5080279603264571, 0.4577102854131042, 0.4311642386080733, 0.45505451575707284, 0.4720510808513231, 0.4662420986127352]
Epoch 2 - Save Best Score: 0.4650 Model


EVAL: [60/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0924(0.1078) 
EVAL: [61/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0580(0.1077) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 15s) Loss: 0.0491(0.0491) Grad: 64400.4727  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 18s (remain 5m 5s) Loss: 0.1174(0.0904) Grad: 157497.2500  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 36s (remain 4m 48s) Loss: 0.0952(0.0929) Grad: 94687.2266  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 54s (remain 4m 34s) Loss: 0.1047(0.0956) Grad: 132548.1719  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 14s (remain 4m 21s) Loss: 0.0698(0.0957) Grad: 72255.7031  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 32s (remain 4m 1s) Loss: 0.1048(0.0953) Grad: 112855.6641  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 48s (remain 3m 40s) Loss: 0.0960(0.0955) Grad: 160833.9062  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 6s (remain 3m 22s) Loss: 0.0677(0.0956) Grad: 102866.7969  LR: 0.00000704  
Epoch: [3][160/36

Epoch 3 - avg_train_loss: 0.0943  avg_val_loss: 0.1072  time: 370s
Epoch 3 - Score: 0.4640  Scores: [0.5049099666873114, 0.4570625408428381, 0.43012203172395974, 0.45455265847778326, 0.4748372709051395, 0.4623414961364473]
Epoch 3 - Save Best Score: 0.4640 Model


EVAL: [60/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0961(0.1073) 
EVAL: [61/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0485(0.1072) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 9m 54s) Loss: 0.1149(0.1149) Grad: 84437.1484  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 19s (remain 5m 25s) Loss: 0.0786(0.0997) Grad: 81536.1562  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 36s (remain 4m 45s) Loss: 0.1384(0.0920) Grad: 83626.6797  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 52s (remain 4m 24s) Loss: 0.1487(0.0957) Grad: 130681.0625  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 9s (remain 4m 5s) Loss: 0.0705(0.0938) Grad: 80767.1094  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 28s (remain 3m 50s) Loss: 0.1001(0.0947) Grad: 174092.4688  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 46s (remain 3m 35s) Loss: 0.0467(0.0922) Grad: 93905.0859  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 5s (remain 3m 19s) Loss: 0.0746(0.0919) Grad: 97398.2812  LR: 0.00000116  
Epoch: [4][160/366] 

Epoch 4 - avg_train_loss: 0.0911  avg_val_loss: 0.1066  time: 368s
Epoch 4 - Score: 0.4627  Scores: [0.5034586069070297, 0.45623937859390806, 0.42957749702383113, 0.4538769947421328, 0.47217927101030877, 0.4611238193540567]
Epoch 4 - Save Best Score: 0.4627 Model


EVAL: [60/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0966(0.1067) 
EVAL: [61/62] Elapsed 0m 41s (remain 0m 0s) Loss: 0.0481(0.1066) 


Score: 0.4627  Scores: [0.5034586069070297, 0.45623937859390806, 0.42957749702383113, 0.4538769947421328, 0.47217927101030877, 0.4611238193540567]
Score: 0.4723  Scores: [0.5093956882823811, 0.4585070734860993, 0.43500373791005825, 0.4665617359702086, 0.4882270975619826, 0.47609261792890234]


finised
