In [1]:
# import os
# import gc
# import re
# import ast
# import sys
# import copy
# import json
# import time
# import math
# import string
# import pickle
# import random
# import joblib
# import itertools
# import warnings
# warnings.filterwarnings("ignore")

# import scipy as sp
# import numpy as np
# import pandas as pd
# from tqdm.auto import tqdm
# from sklearn.metrics import mean_squared_error
# from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

# import torch
# import torch.nn as nn
# from torch.nn import Parameter
# import torch.nn.functional as F
# from torch.optim import Adam, SGD, AdamW
# from torch.utils.data import DataLoader, Dataset


# import tokenizers
# import transformers
# from transformers import AutoTokenizer, AutoModel, AutoConfig
# from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
# from transformers import DataCollatorWithPadding
# os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# import codecs
# from typing import Dict, List, Tuple
# from text_unidecode import unidecode

In [None]:
# !pip install transformers==4.21.2
# !pip install tokenizers==0.12.1

# !pip install -q joblib scikit-learn scipy 

In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset


import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Looking in indexes: https://repo.huaweicloud.com/repository/pypi/simple




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# CFG


In [3]:
DATA_P = '/root/autodl-tmp/fb3/inputs/'
OUTPUT_DIR = '/root/autodl-tmp/fb3/output/trained_tiny_model/lsg-electra-base/'

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=False
    # competition='FB3'
    # _wandb_kernel='nakama'
    
    path=f"{DATA_P}common-nlp-tokenizer/model_tokenizer/tiny/lsg-electra-base/"
    model=path
    config_path=model+'config.pth'
    tokenizer = AutoTokenizer.from_pretrained(model)
    cfg_save_output = OUTPUT_DIR

    debug=False
    train=True
    trust_remote_code=True
    
    apex=True
    print_freq=20
    num_workers=4
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    max_len=1436
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0,1]

In [5]:
# # ====================================================
# # tokenizer
# # ====================================================
# tokenizer = AutoTokenizer.from_pretrained(CFG.model)
# tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
# CFG.tokenizer = tokenizer

# Utils


In [6]:
# ====================================================
# Utils
# ====================================================

def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

##  Data Loading

In [7]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/train.csv')
test = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv(f'{DATA_P}feedback-prize-english-language-learning/sample_submission.csv')


In [8]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [9]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# Dataset

In [10]:
# ====================================================
# Define max_len
# ====================================================
# lengths = []
# tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
# for text in tk0:
#     length = len(CFG.tokenizer(text, add_special_tokens=False)['input_ids'])
#     lengths.append(length)
# CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

max_len: 1436


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [12]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True, trust_remote_code=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config, trust_remote_code=True)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
    
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss
    
    
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [13]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        f"{CFG.cfg_save_output}_fold{fold}_best.pth")
                        # OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

            cfg_save_output = f"{OUTPUT_DIR}trained_tiny_model/lsg-electra-base/"
            
            
    # predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
    #                          map_location=torch.device('cpu'))['predictions']
    predictions = torch.load(f"{CFG.cfg_save_output}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [14]:
CFG.model.split('/')[-2]

'lsg-electra-base'

In [15]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    print('finised')

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/lsg-electra-base/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswering": "modeling_lsg_electra.LSGElectraForQuestionAnswering",
    "AutoModelForSequenceClassification": "modeling_lsg_electra.LSGElectraForSequenceClassification",
    "AutoModelForTokenClassification"

Epoch: [1][0/366] Elapsed 0m 1s (remain 8m 37s) Loss: 2.4448(2.4448) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 7s (remain 2m 5s) Loss: 1.9463(2.1084) Grad: 213526.2656  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 13s (remain 1m 45s) Loss: 0.3212(1.4827) Grad: 127912.7344  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 19s (remain 1m 38s) Loss: 0.2921(1.0789) Grad: 112994.8750  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 0m 26s (remain 1m 32s) Loss: 0.2104(0.8637) Grad: 96628.5000  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 0m 32s (remain 1m 25s) Loss: 0.1454(0.7255) Grad: 125618.9844  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 0m 38s (remain 1m 18s) Loss: 0.2384(0.6324) Grad: 58255.9219  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 0m 45s (remain 1m 12s) Loss: 0.1064(0.5626) Grad: 49054.6484  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 0m 51s (remain 1m 5s) Loss: 0.1971(0.5096) Grad: 88765.9297  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 0m 57s (remain 0m 58s)

Epoch 1 - avg_train_loss: 0.2973  avg_val_loss: 0.1226  time: 130s
Epoch 1 - Score: 0.4974  Scores: [0.5166656761352973, 0.4777127241669594, 0.47244990940846804, 0.4984803535383854, 0.5319704195334692, 0.4872868815061927]
Epoch 1 - Save Best Score: 0.4974 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1134(0.1227) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0962(0.1226) 
Epoch: [2][0/366] Elapsed 0m 0s (remain 4m 24s) Loss: 0.1255(0.1255) Grad: 133688.5469  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 7s (remain 1m 59s) Loss: 0.1093(0.1068) Grad: 93737.2578  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 13s (remain 1m 48s) Loss: 0.1720(0.1177) Grad: 97595.2812  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 19s (remain 1m 37s) Loss: 0.0870(0.1197) Grad: 106552.6797  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 0m 25s (remain 1m 29s) Loss: 0.1268(0.1220) Grad: 177491.2344  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 0m 31s (remain 1m 23s) Loss: 0.1473(0.1209) Grad: 208036.2188  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 0m 38s (remain 1m 17s) Loss: 0.0775(0.1198) Grad: 38357.0156  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 0m 44s (remain 1m 11s) Loss: 0.1905(0.1193) Grad: 133978.7031  LR: 0.00001466  
Epoch: [2][160/

Epoch 2 - avg_train_loss: 0.1146  avg_val_loss: 0.1122  time: 131s
Epoch 2 - Score: 0.4749  Scores: [0.511003344931223, 0.46730272313591015, 0.42892431044116597, 0.47139263771343026, 0.5040661888428912, 0.46701069084870184]
Epoch 2 - Save Best Score: 0.4749 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0968(0.1123) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0716(0.1122) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 4m 51s) Loss: 0.0743(0.0743) Grad: 85734.0469  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 7s (remain 1m 57s) Loss: 0.1263(0.0960) Grad: 151135.2031  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 13s (remain 1m 46s) Loss: 0.1136(0.1026) Grad: 85873.7109  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 19s (remain 1m 38s) Loss: 0.2070(0.1043) Grad: 131517.5312  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 0m 26s (remain 1m 31s) Loss: 0.1741(0.1053) Grad: 143399.0625  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 0m 32s (remain 1m 24s) Loss: 0.0712(0.1044) Grad: 86517.3594  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 0m 38s (remain 1m 17s) Loss: 0.0918(0.1061) Grad: 148512.7500  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 0m 44s (remain 1m 10s) Loss: 0.0913(0.1063) Grad: 91839.0781  LR: 0.00000704  
Epoch: [3][160/3

Epoch 3 - avg_train_loss: 0.1031  avg_val_loss: 0.1092  time: 132s
Epoch 3 - Score: 0.4681  Scores: [0.5025967361055018, 0.46204271409425124, 0.42513572479434086, 0.46389033049285494, 0.4948173369103289, 0.4598998453924384]
Epoch 3 - Save Best Score: 0.4681 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1001(0.1092) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0743(0.1092) 
Epoch: [4][0/366] Elapsed 0m 0s (remain 4m 57s) Loss: 0.0822(0.0822) Grad: 86965.6719  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 7s (remain 2m 2s) Loss: 0.0919(0.1056) Grad: 222918.9688  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 13s (remain 1m 50s) Loss: 0.0885(0.1041) Grad: 141148.1250  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 20s (remain 1m 41s) Loss: 0.0871(0.1024) Grad: 67441.8984  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 0m 26s (remain 1m 34s) Loss: 0.0745(0.1020) Grad: 113434.6250  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 0m 32s (remain 1m 25s) Loss: 0.0694(0.1003) Grad: 62925.7070  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 0m 39s (remain 1m 19s) Loss: 0.0800(0.0983) Grad: 157641.1875  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 0m 45s (remain 1m 12s) Loss: 0.0868(0.0967) Grad: 132745.1875  LR: 0.00000116  
Epoch: [4][160/3

Epoch 4 - avg_train_loss: 0.0946  avg_val_loss: 0.1098  time: 131s
Epoch 4 - Score: 0.4696  Scores: [0.5064835694830998, 0.4665849365219889, 0.42551874960821384, 0.46488545180786944, 0.4940782801577294, 0.4598645192609039]


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0946(0.1099) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0592(0.1098) 


Score: 0.4681  Scores: [0.5025967361055018, 0.46204271409425124, 0.42513572479434086, 0.46389033049285494, 0.4948173369103289, 0.4598998453924384]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/lsg-electra-base/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswering": "modeling_lsg_electra.LSGElectraForQuestionAnsw

Epoch: [1][0/366] Elapsed 0m 0s (remain 4m 48s) Loss: 2.2501(2.2501) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 7s (remain 2m 6s) Loss: 1.3879(2.1837) Grad: 212690.9844  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 13s (remain 1m 45s) Loss: 0.2526(1.4402) Grad: 153389.8750  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 19s (remain 1m 38s) Loss: 0.1183(1.0451) Grad: 56462.6328  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 0m 25s (remain 1m 31s) Loss: 0.1894(0.8329) Grad: 288352.5625  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 0m 31s (remain 1m 23s) Loss: 0.1948(0.7084) Grad: 112787.1016  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 0m 37s (remain 1m 16s) Loss: 0.0865(0.6178) Grad: 58513.7227  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 0m 44s (remain 1m 10s) Loss: 0.0692(0.5510) Grad: 72905.6562  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 0m 50s (remain 1m 4s) Loss: 0.1182(0.5017) Grad: 90096.5391  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 0m 56s (remain 0m 58s)

Epoch 1 - avg_train_loss: 0.2988  avg_val_loss: 0.1304  time: 131s
Epoch 1 - Score: 0.5130  Scores: [0.55899716385439, 0.49153571409377617, 0.459647995662612, 0.5031437403149736, 0.5412977615601366, 0.5233889810335599]
Epoch 1 - Save Best Score: 0.5130 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1119(0.1305) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0552(0.1304) 
Epoch: [2][0/366] Elapsed 0m 0s (remain 5m 7s) Loss: 0.0915(0.0915) Grad: 96831.8906  LR: 0.00001707  
Epoch: [2][20/366] Elapsed 0m 6s (remain 1m 52s) Loss: 0.1981(0.1318) Grad: 284675.3750  LR: 0.00001676  
Epoch: [2][40/366] Elapsed 0m 13s (remain 1m 43s) Loss: 0.1599(0.1282) Grad: 58814.6523  LR: 0.00001644  
Epoch: [2][60/366] Elapsed 0m 18s (remain 1m 34s) Loss: 0.1287(0.1230) Grad: 97758.0703  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 0m 25s (remain 1m 29s) Loss: 0.1320(0.1219) Grad: 208734.0781  LR: 0.00001576  
Epoch: [2][100/366] Elapsed 0m 31s (remain 1m 22s) Loss: 0.0987(0.1196) Grad: 73660.2422  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 0m 37s (remain 1m 16s) Loss: 0.1122(0.1178) Grad: 86569.4609  LR: 0.00001504  
Epoch: [2][140/366] Elapsed 0m 44s (remain 1m 10s) Loss: 0.1278(0.1176) Grad: 141645.9844  LR: 0.00001466  
Epoch: [2][160/366

Epoch 2 - avg_train_loss: 0.1157  avg_val_loss: 0.1178  time: 132s
Epoch 2 - Score: 0.4869  Scores: [0.5200962473484135, 0.48398211077820025, 0.4460111846590913, 0.4744885872705223, 0.5109707032588287, 0.48599647081610525]
Epoch 2 - Save Best Score: 0.4869 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1118(0.1179) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0576(0.1178) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 4m 53s) Loss: 0.1151(0.1151) Grad: 110248.5625  LR: 0.00001001  
Epoch: [3][20/366] Elapsed 0m 6s (remain 1m 54s) Loss: 0.1212(0.1183) Grad: 118035.5156  LR: 0.00000958  
Epoch: [3][40/366] Elapsed 0m 13s (remain 1m 46s) Loss: 0.1091(0.1124) Grad: 202701.0781  LR: 0.00000916  
Epoch: [3][60/366] Elapsed 0m 19s (remain 1m 38s) Loss: 0.0761(0.1073) Grad: 110129.1484  LR: 0.00000873  
Epoch: [3][80/366] Elapsed 0m 26s (remain 1m 31s) Loss: 0.0899(0.1039) Grad: 171845.8125  LR: 0.00000831  
Epoch: [3][100/366] Elapsed 0m 32s (remain 1m 24s) Loss: 0.0987(0.1036) Grad: 143206.3750  LR: 0.00000789  
Epoch: [3][120/366] Elapsed 0m 38s (remain 1m 17s) Loss: 0.0697(0.1041) Grad: 116529.3672  LR: 0.00000747  
Epoch: [3][140/366] Elapsed 0m 44s (remain 1m 11s) Loss: 0.0707(0.1053) Grad: 63406.2031  LR: 0.00000706  
Epoch: [3][16

Epoch 3 - avg_train_loss: 0.1029  avg_val_loss: 0.1126  time: 131s
Epoch 3 - Score: 0.4757  Scores: [0.5093139902830198, 0.46944658411694795, 0.43352235939229966, 0.469144937097953, 0.5007311558349079, 0.4718798646166392]
Epoch 3 - Save Best Score: 0.4757 Model


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1085(0.1127) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0543(0.1126) 
Epoch: [4][0/366] Elapsed 0m 0s (remain 4m 34s) Loss: 0.0597(0.0597) Grad: 104740.5391  LR: 0.00000295  
Epoch: [4][20/366] Elapsed 0m 7s (remain 1m 58s) Loss: 0.0964(0.0912) Grad: 101348.9141  LR: 0.00000265  
Epoch: [4][40/366] Elapsed 0m 13s (remain 1m 44s) Loss: 0.1084(0.0925) Grad: 213679.0625  LR: 0.00000237  
Epoch: [4][60/366] Elapsed 0m 19s (remain 1m 37s) Loss: 0.1003(0.0936) Grad: 137094.1875  LR: 0.00000210  
Epoch: [4][80/366] Elapsed 0m 26s (remain 1m 33s) Loss: 0.0851(0.0952) Grad: 61194.8320  LR: 0.00000184  
Epoch: [4][100/366] Elapsed 0m 33s (remain 1m 26s) Loss: 0.0834(0.0960) Grad: 71062.8359  LR: 0.00000160  
Epoch: [4][120/366] Elapsed 0m 38s (remain 1m 18s) Loss: 0.0521(0.0947) Grad: 62318.8477  LR: 0.00000138  
Epoch: [4][140/366] Elapsed 0m 45s (remain 1m 12s) Loss: 0.1133(0.0955) Grad: 68697.4609  LR: 0.00000117  
Epoch: [4][160/3

Epoch 4 - avg_train_loss: 0.0959  avg_val_loss: 0.1131  time: 132s
Epoch 4 - Score: 0.4767  Scores: [0.5110707541429977, 0.47154084485214914, 0.4320079536237206, 0.4714024476256716, 0.5013375726240118, 0.47254639172551777]


EVAL: [60/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1112(0.1132) 
EVAL: [61/62] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0512(0.1131) 


Score: 0.4757  Scores: [0.5093139902830198, 0.46944658411694795, 0.43352235939229966, 0.469144937097953, 0.5007311558349079, 0.4718798646166392]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/lsg-electra-base/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswering": "modeling_lsg_electra.LSGElectraForQuestionAnswer

Epoch: [1][0/366] Elapsed 0m 0s (remain 4m 52s) Loss: 2.7859(2.7859) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 7s (remain 1m 55s) Loss: 1.9210(2.1938) Grad: 223487.8125  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 13s (remain 1m 46s) Loss: 0.3617(1.5445) Grad: 104310.2031  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 20s (remain 1m 42s) Loss: 0.1530(1.1192) Grad: 48284.1133  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 0m 26s (remain 1m 34s) Loss: 0.1333(0.8976) Grad: 33126.9688  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 0m 33s (remain 1m 27s) Loss: 0.2422(0.7586) Grad: 160463.2188  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 0m 39s (remain 1m 20s) Loss: 0.1587(0.6689) Grad: 51797.6758  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 0m 45s (remain 1m 12s) Loss: 0.1443(0.6031) Grad: 53282.5742  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 0m 51s (remain 1m 5s) Loss: 0.2455(0.5504) Grad: 48908.0273  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 0m 58s (remain 0m 59s)

Epoch 1 - avg_train_loss: 0.3202  avg_val_loss: 0.1305  time: 132s
Epoch 1 - Score: 0.5140  Scores: [0.5231948509319471, 0.4789838806558671, 0.4850892884652619, 0.504153810548069, 0.5466567238066969, 0.5459839602892322]
Epoch 1 - Save Best Score: 0.5140 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.1068(0.1302) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.2796(0.1305) 
Epoch: [2][0/366] Elapsed 0m 0s (remain 5m 51s) Loss: 0.1003(0.1003) Grad: 99603.9141  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 7s (remain 2m 1s) Loss: 0.1049(0.1111) Grad: 56966.7578  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 13s (remain 1m 48s) Loss: 0.1053(0.1139) Grad: 75339.7656  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 20s (remain 1m 40s) Loss: 0.1382(0.1151) Grad: 173223.5156  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 0m 25s (remain 1m 31s) Loss: 0.0992(0.1116) Grad: 114574.9297  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 0m 32s (remain 1m 25s) Loss: 0.1101(0.1123) Grad: 89592.1406  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 0m 39s (remain 1m 19s) Loss: 0.0803(0.1143) Grad: 80803.4688  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 0m 46s (remain 1m 13s) Loss: 0.2108(0.1164) Grad: 120474.0391  LR: 0.00001466  
Epoch: [2][160/366

Epoch 2 - avg_train_loss: 0.1146  avg_val_loss: 0.1209  time: 132s
Epoch 2 - Score: 0.4945  Scores: [0.5291488393341985, 0.4784461386015409, 0.4646683124384198, 0.5023636445398333, 0.4965168905174975, 0.49564576305334257]
Epoch 2 - Save Best Score: 0.4945 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0998(0.1209) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.1308(0.1209) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 5m 11s) Loss: 0.0763(0.0763) Grad: 45627.6250  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 7s (remain 1m 59s) Loss: 0.1329(0.1053) Grad: 243299.3125  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 13s (remain 1m 47s) Loss: 0.1435(0.1035) Grad: 218939.3281  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 19s (remain 1m 36s) Loss: 0.0604(0.1029) Grad: 84567.7812  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 0m 25s (remain 1m 31s) Loss: 0.0947(0.0999) Grad: 75751.9141  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 0m 31s (remain 1m 23s) Loss: 0.1100(0.0993) Grad: 69439.2422  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 0m 37s (remain 1m 16s) Loss: 0.1047(0.0995) Grad: 151694.9844  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 0m 44s (remain 1m 10s) Loss: 0.1320(0.1002) Grad: 190952.0156  LR: 0.00000704  
Epoch: [3][160/3

Epoch 3 - avg_train_loss: 0.1026  avg_val_loss: 0.1151  time: 132s
Epoch 3 - Score: 0.4815  Scores: [0.5139244879788463, 0.46487266872059346, 0.44085451688797833, 0.4821287288160072, 0.49506517299121566, 0.4922677365430486]
Epoch 3 - Save Best Score: 0.4815 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0925(0.1150) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.1801(0.1151) 
Epoch: [4][0/366] Elapsed 0m 0s (remain 4m 48s) Loss: 0.1298(0.1298) Grad: 117366.3594  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 6s (remain 1m 53s) Loss: 0.0843(0.0862) Grad: 130919.3281  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 13s (remain 1m 46s) Loss: 0.0798(0.0884) Grad: 59822.2188  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 20s (remain 1m 41s) Loss: 0.1300(0.0919) Grad: 123847.5625  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 0m 26s (remain 1m 33s) Loss: 0.0725(0.0942) Grad: 99769.5312  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 0m 33s (remain 1m 27s) Loss: 0.1316(0.0958) Grad: 107257.8906  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 0m 39s (remain 1m 20s) Loss: 0.0755(0.0969) Grad: 69643.3281  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 0m 46s (remain 1m 13s) Loss: 0.0752(0.0973) Grad: 120740.1406  LR: 0.00000116  
Epoch: [4][160/

Epoch 4 - avg_train_loss: 0.0951  avg_val_loss: 0.1141  time: 132s
Epoch 4 - Score: 0.4794  Scores: [0.5099867082801764, 0.46497887708457236, 0.4410143723262784, 0.4810482805557443, 0.49137899522601247, 0.48817451616749014]
Epoch 4 - Save Best Score: 0.4794 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0932(0.1140) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.1561(0.1141) 


Score: 0.4794  Scores: [0.5099867082801764, 0.46497887708457236, 0.4410143723262784, 0.4810482805557443, 0.49137899522601247, 0.48817451616749014]
Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
LSGElectraConfig {
  "_name_or_path": "/root/autodl-tmp/lsg-electra-base/",
  "adaptive": true,
  "architectures": [
    "LSGElectraForPreTraining"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoModel": "modeling_lsg_electra.LSGElectraModel",
    "AutoModelForCausalLM": "modeling_lsg_electra.LSGElectraForCausalLM",
    "AutoModelForMaskedLM": "modeling_lsg_electra.LSGElectraForMaskedLM",
    "AutoModelForMultipleChoice": "modeling_lsg_electra.LSGElectraForMultipleChoice",
    "AutoModelForPreTraining": "modeling_lsg_electra.LSGElectraForPreTraining",
    "AutoModelForQuestionAnswering": "modeling_lsg_electra.LSGElectraForQuestionAnsw

Epoch: [1][0/366] Elapsed 0m 0s (remain 4m 41s) Loss: 2.4840(2.4840) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 6s (remain 1m 54s) Loss: 1.7957(2.1319) Grad: 222775.3438  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 13s (remain 1m 48s) Loss: 0.5588(1.4103) Grad: 147241.7812  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 20s (remain 1m 44s) Loss: 0.2347(1.0128) Grad: 75411.7188  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 0m 27s (remain 1m 35s) Loss: 0.1257(0.8170) Grad: 81024.6562  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 0m 33s (remain 1m 28s) Loss: 0.1856(0.6865) Grad: 107211.5703  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 0m 39s (remain 1m 20s) Loss: 0.0760(0.5950) Grad: 56333.1172  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 0m 46s (remain 1m 13s) Loss: 0.2551(0.5367) Grad: 249208.0312  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 0m 52s (remain 1m 6s) Loss: 0.1249(0.4888) Grad: 55104.0664  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 0m 58s (remain 1m 0s)

Epoch 1 - avg_train_loss: 0.2932  avg_val_loss: 0.1220  time: 133s
Epoch 1 - Score: 0.4964  Scores: [0.526141610051238, 0.4681249307146075, 0.4816124163615339, 0.478038826008438, 0.5393247059276969, 0.48510072958342654]
Epoch 1 - Save Best Score: 0.4964 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.1046(0.1221) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0842(0.1220) 
Epoch: [2][0/366] Elapsed 0m 0s (remain 4m 22s) Loss: 0.1221(0.1221) Grad: 151452.2344  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 7s (remain 2m 3s) Loss: 0.1118(0.1164) Grad: 105299.7891  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 14s (remain 1m 52s) Loss: 0.1221(0.1165) Grad: 98966.9609  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 20s (remain 1m 42s) Loss: 0.1247(0.1149) Grad: 83631.2656  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 0m 26s (remain 1m 32s) Loss: 0.1108(0.1119) Grad: 121162.8594  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 0m 32s (remain 1m 25s) Loss: 0.1321(0.1109) Grad: 79939.7266  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 0m 38s (remain 1m 18s) Loss: 0.1807(0.1134) Grad: 281357.5000  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 0m 45s (remain 1m 12s) Loss: 0.1132(0.1134) Grad: 219319.6406  LR: 0.00001466  
Epoch: [2][160/3

Epoch 2 - avg_train_loss: 0.1158  avg_val_loss: 0.1133  time: 133s
Epoch 2 - Score: 0.4771  Scores: [0.5075631227246158, 0.4567200843366543, 0.4383141646699799, 0.4620412870332408, 0.5300047739445778, 0.46818674871812177]
Epoch 2 - Save Best Score: 0.4771 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0995(0.1134) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0533(0.1133) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 4m 39s) Loss: 0.0797(0.0797) Grad: 63733.4414  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 7s (remain 1m 59s) Loss: 0.0904(0.0958) Grad: 112811.1641  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 13s (remain 1m 47s) Loss: 0.1096(0.0979) Grad: 118858.2656  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 19s (remain 1m 39s) Loss: 0.1443(0.1016) Grad: 173066.6562  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 0m 26s (remain 1m 33s) Loss: 0.0580(0.1039) Grad: 88477.8828  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 0m 32s (remain 1m 26s) Loss: 0.0941(0.1049) Grad: 80804.2188  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 0m 39s (remain 1m 19s) Loss: 0.1000(0.1045) Grad: 107389.8438  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 0m 46s (remain 1m 13s) Loss: 0.1091(0.1042) Grad: 149540.7656  LR: 0.00000704  
Epoch: [3][160/

Epoch 3 - avg_train_loss: 0.1031  avg_val_loss: 0.1084  time: 133s
Epoch 3 - Score: 0.4668  Scores: [0.5034888864469952, 0.45447456478792947, 0.4325065592318034, 0.4550091262223107, 0.49270537132782116, 0.4626257003280509]
Epoch 3 - Save Best Score: 0.4668 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0937(0.1086) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0442(0.1084) 
Epoch: [4][0/366] Elapsed 0m 0s (remain 5m 32s) Loss: 0.0714(0.0714) Grad: 80821.5156  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 7s (remain 2m 9s) Loss: 0.1418(0.1026) Grad: 142979.8750  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 14s (remain 1m 51s) Loss: 0.2050(0.1047) Grad: 170244.7656  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 20s (remain 1m 40s) Loss: 0.0750(0.1015) Grad: 99660.9297  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 0m 26s (remain 1m 33s) Loss: 0.1386(0.1010) Grad: 110946.5000  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 0m 33s (remain 1m 27s) Loss: 0.0600(0.1001) Grad: 69135.7422  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 0m 39s (remain 1m 19s) Loss: 0.0811(0.1007) Grad: 71444.8047  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 0m 46s (remain 1m 13s) Loss: 0.0554(0.0991) Grad: 88400.7188  LR: 0.00000116  
Epoch: [4][160/366

Epoch 4 - avg_train_loss: 0.0962  avg_val_loss: 0.1082  time: 133s
Epoch 4 - Score: 0.4663  Scores: [0.5027185538033578, 0.4533606108598723, 0.4309384753262597, 0.4550886891511836, 0.4928231639087349, 0.4629594929363798]
Epoch 4 - Save Best Score: 0.4663 Model


EVAL: [60/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0912(0.1083) 
EVAL: [61/62] Elapsed 0m 14s (remain 0m 0s) Loss: 0.0479(0.1082) 


Score: 0.4663  Scores: [0.5027185538033578, 0.4533606108598723, 0.4309384753262597, 0.4550886891511836, 0.4928231639087349, 0.4629594929363798]
Score: 0.4724  Scores: [0.5061653231580762, 0.4624927400134135, 0.43269011127595514, 0.46738706941242064, 0.49494898435707757, 0.4708564904202183]


finised
