<a href="https://colab.research.google.com/github/yanzhen4/feedback-prize-english-language-learning/blob/master/DeBERTa-pseudoLabel_1000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/fb3-deberta-v3-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

Package: pip install wandb transformers tokenizers iterative-stratification

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = '/content/gdrive/MyDrive/Colab_Notebooks/Deberta/'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# Directory settings

In [1]:
from google.colab import drive

drive.mount('/content/gdrive/', force_remount=True)

Mounted at /content/gdrive/


# CFG

In [4]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="/content/gdrive/MyDrive/Colab_Notebooks/Deberta/microsoft_deberta-large"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=5
    encoder_lr=2e-6
    decoder_lr=2e-6
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [5]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Library

In [6]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

#os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip uninstall -y transformers')
#os.system('pip uninstall -y tokenizers')
#os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
#os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
#import tokenizers
import transformers
#print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

transformers.__version__: 4.24.0
env: TOKENIZERS_PARALLELISM=true


# Utils

In [7]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds): #loss
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores #score: single prediction result; scores: list of all results


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'): #print log 
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [8]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('/content/gdrive/MyDrive/Colab_Notebooks/Deberta/train.csv')
test = pd.read_csv('/content/gdrive/MyDrive/Colab_Notebooks/Deberta/test.csv')
submission = pd.read_csv('/content/gdrive/MyDrive/Colab_Notebooks/Deberta/sample_submission.csv')
old_train = pd.read_csv("/content/gdrive/MyDrive/Colab_Notebooks/Deberta/old_train.csv")

In [9]:
#Test
train = train.head(1000)
print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (1000, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


In [10]:
size = 200
old_train = old_train[["discourse_text"]].rename(columns={"discourse_text": "full_text"})
old_train = old_train[old_train["full_text"].str.len() >= 500].head(size).reset_index()
old_train = old_train.drop(columns = ['index'])
initial_values = np.zeros((len(old_train), 6))
old_train[CFG.target_cols] = initial_values
old_train

Unnamed: 0,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,According to an article by the Edgar Snyder Fi...,0.0,0.0,0.0,0.0,0.0,0.0
1,The most common of these distractions is a cel...,0.0,0.0,0.0,0.0,0.0,0.0
2,Reaction time is the measure of how quickly an...,0.0,0.0,0.0,0.0,0.0,0.0
3,"The affects can be physical, emotional, or psy...",0.0,0.0,0.0,0.0,0.0,0.0
4,"In conclusion, people shouldn't use cellphones...",0.0,0.0,0.0,0.0,0.0,0.0
5,It could lead to accidents and altercations. Y...,0.0,0.0,0.0,0.0,0.0,0.0
6,Many people happen to pass away due to a motor...,0.0,0.0,0.0,0.0,0.0,0.0
7,"In conclusion, Drivers shouldn't drive while u...",0.0,0.0,0.0,0.0,0.0,0.0
8,Other drivers take notice when drivers are on ...,0.0,0.0,0.0,0.0,0.0,0.0
9,Driving with cell phones should be banned beca...,0.0,0.0,0.0,0.0,0.0,0.0


# CV split

In [11]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    250
1    250
2    250
3    250
dtype: int64

In [12]:
len(train)

1000

In [13]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [14]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model) #get deberta tokenizer 
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

# Dataset

In [15]:
# ====================================================
# Define max_len
# Find max token length after tokenizing each text 
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")
CFG.max_len

  0%|          | 0/1000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (851 > 512). Running this sequence through the model will result in indexing errors
max_len: 4182
INFO:__main__:max_len: 4182


4182

In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer.encode_plus(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs #a list of tensors 


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() 
        #last_hidden_state is a list of n tensors, n is sequence length
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        #padding: [PAD] make same length, for [PAD] attention_mask = 0
        #last_hidden_state is output of the model
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True) 
            #output of every layer during inference
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6) #hidden_size is input size, 6 is output size 
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [18]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        #optimize
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs) #predict
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [20]:
#testing
# ====================================================
# train loop
# ====================================================
import math
def train_loop(folds, fold): 
     
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    # num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    # scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    pesudoLabel_data = old_train #dataframe
    pesudoLabel_data_len = len(pesudoLabel_data)
    num_sample = 5
    sample_size = int(pesudoLabel_data_len / num_sample)

    num_train_steps = int((len(train_folds) + math.ceil(sample_size * num_sample / 2)) / CFG.batch_size * CFG.epochs * num_sample + len(train_folds) 
                          / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf
    lr_scaler = 4
    # 5 Epoch with origin data
    for epoch in range(CFG.epochs):

        start_time = time.time()
         
        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    #unlabled_data = old data
    
    for i in range(1, 6):
        # new scheduler
        # optimizer_parameters = get_optimizer_params(model,
        #                                         encoder_lr=CFG.encoder_lr / lr_scaler, 
        #                                         decoder_lr=CFG.decoder_lr / lr_scaler,
        #                                         weight_decay=CFG.weight_decay)
        # optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr / lr_scaler, eps=CFG.eps, betas=CFG.betas)
        # lr_scaler = lr_scaler * lr_scaler
        
        # num_train_steps = int((len(train_folds) + i * sample_size) / CFG.batch_size * CFG.epochs)
        # scheduler = get_scheduler(CFG, optimizer, num_train_steps)

        pesudoLabel_data_use = pesudoLabel_data[:sample_size]
        pesudoLabel_data = pesudoLabel_data[sample_size:]
        #unlabled_data = unlabled_data - sampled_data
        pesudo_dataset = TrainDataset(CFG, pesudoLabel_data_use)
        pesudo_loader = DataLoader(pesudo_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
        preds = np.array([[]])
        start = end = time.time()
        i = 0
        for step, (inputs, labels) in enumerate(pesudo_loader):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            with torch.no_grad():
                y_preds = model(inputs) #predict
            if i != 0:
              preds = np.concatenate((preds, y_preds.to('cpu').numpy()), axis = 0)
            else:
              i = 1
              preds = y_preds.to('cpu').numpy()
            end = time.time()
        
        print(np.array(preds).shape, pesudoLabel_data_use.columns)
        pesudoLabel_data_use[CFG.target_cols] = preds
        print(train_folds.columns)
        print(pesudoLabel_data_use.columns)
        frames = [train_folds, pesudoLabel_data_use]
        train_folds = pd.concat(frames)
        train_dataset = TrainDataset(CFG, train_folds)
        train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)

        # 5 Epochs training with origin data + pseudo-labled data
        for epoch in range(CFG.epochs):

            
                
            start_time = time.time()

            #predict for all sampled_data, generate labled data
            #stack sampled_data on new data
            #add to train_loader
            
            # train
            avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

            # eval
            avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
            
            # scoring
            score, scores = get_score(valid_labels, predictions)

            elapsed = time.time() - start_time

            LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
            LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
            if CFG.wandb:
                wandb.log({f"[fold{fold}] epoch": epoch+1, 
                          f"[fold{fold}] avg_train_loss": avg_loss, 
                          f"[fold{fold}] avg_val_loss": avg_val_loss,
                          f"[fold{fold}] score": score})
            
            if best_score > score:
                best_score = score
                LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
                torch.save({'model': model.state_dict(),
                            'predictions': predictions},
                            OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")
            
    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [22]:
#testing
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
                break
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

DebertaConfig {
  "_name_or_path": "/content/gdrive/MyDrive/Colab_Notebooks/Deberta/microsoft_deberta-large",
  "architectures": [
    "DebertaModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "c2p",
    "p2c"
  ],
  "position_biased_input": false,
  "relative_attention": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 0,
  "vocab_size": 50265
}

INFO:__main__:DebertaConfig {
  "_name_or_path": "/content/gdrive/MyDrive/Colab_No

Epoch: [1][0/750] Elapsed 0m 4s (remain 57m 15s) Loss: 2.7481(2.7481) Grad: inf  LR: 0.00000200  
Epoch: [1][20/750] Elapsed 0m 10s (remain 5m 54s) Loss: 1.8999(2.1047) Grad: 382233.0312  LR: 0.00000200  
Epoch: [1][40/750] Elapsed 0m 15s (remain 4m 30s) Loss: 2.3224(1.8411) Grad: 456535.5000  LR: 0.00000200  
Epoch: [1][60/750] Elapsed 0m 21s (remain 3m 58s) Loss: 0.2473(1.4236) Grad: 405036.4062  LR: 0.00000200  
Epoch: [1][80/750] Elapsed 0m 26s (remain 3m 40s) Loss: 0.0941(1.1234) Grad: 234005.9844  LR: 0.00000200  
Epoch: [1][100/750] Elapsed 0m 32s (remain 3m 26s) Loss: 0.9347(0.9415) Grad: 579065.8125  LR: 0.00000200  
Epoch: [1][120/750] Elapsed 0m 38s (remain 3m 19s) Loss: 0.0800(0.8231) Grad: 230424.7031  LR: 0.00000200  
Epoch: [1][140/750] Elapsed 0m 44s (remain 3m 10s) Loss: 0.1007(0.7336) Grad: 250333.1719  LR: 0.00000200  
Epoch: [1][160/750] Elapsed 0m 50s (remain 3m 3s) Loss: 0.0546(0.6639) Grad: 176067.3750  LR: 0.00000200  
Epoch: [1][180/750] Elapsed 0m 55s (remain 

Epoch 1 - avg_train_loss: 0.2690  avg_val_loss: 0.1742  time: 231s
INFO:__main__:Epoch 1 - avg_train_loss: 0.2690  avg_val_loss: 0.1742  time: 231s
Epoch 1 - Score: 0.5986  Scores: [0.641151946503038, 0.5158336806909397, 0.562247368306175, 0.6336507957886203, 0.620643569047055, 0.6182538218846939]
INFO:__main__:Epoch 1 - Score: 0.5986  Scores: [0.641151946503038, 0.5158336806909397, 0.562247368306175, 0.6336507957886203, 0.620643569047055, 0.6182538218846939]
Epoch 1 - Save Best Score: 0.5986 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5986 Model


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.1423(0.1742) 
Epoch: [2][0/750] Elapsed 0m 0s (remain 5m 31s) Loss: 0.0951(0.0951) Grad: inf  LR: 0.00000200  
Epoch: [2][20/750] Elapsed 0m 5s (remain 3m 24s) Loss: 0.0512(0.1130) Grad: 159013.7344  LR: 0.00000200  
Epoch: [2][40/750] Elapsed 0m 12s (remain 3m 30s) Loss: 0.1020(0.1009) Grad: 265847.5000  LR: 0.00000200  
Epoch: [2][60/750] Elapsed 0m 18s (remain 3m 26s) Loss: 0.1301(0.0933) Grad: 257116.5312  LR: 0.00000199  
Epoch: [2][80/750] Elapsed 0m 23s (remain 3m 16s) Loss: 0.3389(0.0974) Grad: 569955.7500  LR: 0.00000199  
Epoch: [2][100/750] Elapsed 0m 29s (remain 3m 8s) Loss: 0.1435(0.0945) Grad: 319821.9375  LR: 0.00000199  
Epoch: [2][120/750] Elapsed 0m 34s (remain 3m 1s) Loss: 0.1142(0.0919) Grad: 311512.4062  LR: 0.00000199  
Epoch: [2][140/750] Elapsed 0m 40s (remain 2m 54s) Loss: 0.0650(0.1050) Grad: 187572.8281  LR: 0.00000199  
Epoch: [2][160/750] Elapsed 0m 45s (remain 2m 48s) Loss: 0.0420(0.1055) Grad: 183787.8

Epoch 2 - avg_train_loss: 0.1089  avg_val_loss: 0.1118  time: 227s
INFO:__main__:Epoch 2 - avg_train_loss: 0.1089  avg_val_loss: 0.1118  time: 227s
Epoch 2 - Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]
INFO:__main__:Epoch 2 - Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]
Epoch 2 - Save Best Score: 0.4744 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.4744 Model


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0760(0.1118) 
Epoch: [3][0/750] Elapsed 0m 0s (remain 6m 2s) Loss: 0.0488(0.0488) Grad: 339814.7500  LR: 0.00000198  
Epoch: [3][20/750] Elapsed 0m 5s (remain 3m 27s) Loss: 0.0592(0.0868) Grad: 213397.0938  LR: 0.00000198  
Epoch: [3][40/750] Elapsed 0m 11s (remain 3m 23s) Loss: 0.1139(0.1217) Grad: 265611.0625  LR: 0.00000198  
Epoch: [3][60/750] Elapsed 0m 17s (remain 3m 16s) Loss: 0.0404(0.1110) Grad: 153278.4219  LR: 0.00000198  
Epoch: [3][80/750] Elapsed 0m 23s (remain 3m 15s) Loss: 0.0869(0.1049) Grad: 258457.7031  LR: 0.00000198  
Epoch: [3][100/750] Elapsed 0m 29s (remain 3m 7s) Loss: 0.0803(0.0997) Grad: 223013.5312  LR: 0.00000198  
Epoch: [3][120/750] Elapsed 0m 34s (remain 3m 0s) Loss: 0.1515(0.1040) Grad: 323058.3438  LR: 0.00000198  
Epoch: [3][140/750] Elapsed 0m 40s (remain 2m 53s) Loss: 0.0459(0.1000) Grad: 163850.4375  LR: 0.00000198  
Epoch: [3][160/750] Elapsed 0m 45s (remain 2m 47s) Loss: 0.0543(0.0962) Grad: 1

Epoch 3 - avg_train_loss: 0.0956  avg_val_loss: 0.1165  time: 227s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0956  avg_val_loss: 0.1165  time: 227s
Epoch 3 - Score: 0.4845  Scores: [0.5145388761737608, 0.48340827745687803, 0.4690414656230287, 0.47847139433682306, 0.4941474856748964, 0.46733711356318514]
INFO:__main__:Epoch 3 - Score: 0.4845  Scores: [0.5145388761737608, 0.48340827745687803, 0.4690414656230287, 0.47847139433682306, 0.4941474856748964, 0.46733711356318514]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0677(0.1165) 
Epoch: [4][0/750] Elapsed 0m 0s (remain 6m 9s) Loss: 0.0284(0.0284) Grad: 311914.2812  LR: 0.00000196  
Epoch: [4][20/750] Elapsed 0m 5s (remain 3m 25s) Loss: 0.0790(0.0706) Grad: 253537.5781  LR: 0.00000196  
Epoch: [4][40/750] Elapsed 0m 11s (remain 3m 19s) Loss: 0.0679(0.0668) Grad: 186944.9375  LR: 0.00000196  
Epoch: [4][60/750] Elapsed 0m 17s (remain 3m 12s) Loss: 0.0250(0.0658) Grad: 129742.4453  LR: 0.00000196  
Epoch: [4][80/750] Elapsed 0m 22s (remain 3m 8s) Loss: 0.1086(0.0688) Grad: 345600.5312  LR: 0.00000196  
Epoch: [4][100/750] Elapsed 0m 28s (remain 3m 2s) Loss: 0.0364(0.0727) Grad: 153483.2500  LR: 0.00000196  
Epoch: [4][120/750] Elapsed 0m 33s (remain 2m 56s) Loss: 0.0182(0.0714) Grad: 113967.5703  LR: 0.00000196  
Epoch: [4][140/750] Elapsed 0m 39s (remain 2m 49s) Loss: 0.0264(0.0716) Grad: 125036.5781  LR: 0.00000196  
Epoch: [4][160/750] Elapsed 0m 44s (remain 2m 43s) Loss: 0.0376(0.0726) Grad: 1

Epoch 4 - avg_train_loss: 0.0893  avg_val_loss: 0.1187  time: 227s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0893  avg_val_loss: 0.1187  time: 227s
Epoch 4 - Score: 0.4894  Scores: [0.5211263130209435, 0.49627608450169625, 0.4322608255041249, 0.5241217778015762, 0.4946244019826858, 0.46807324052124055]
INFO:__main__:Epoch 4 - Score: 0.4894  Scores: [0.5211263130209435, 0.49627608450169625, 0.4322608255041249, 0.5241217778015762, 0.4946244019826858, 0.46807324052124055]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0838(0.1187) 
Epoch: [5][0/750] Elapsed 0m 0s (remain 5m 45s) Loss: 0.1034(0.1034) Grad: inf  LR: 0.00000193  
Epoch: [5][20/750] Elapsed 0m 5s (remain 3m 26s) Loss: 0.0863(0.0703) Grad: 249097.4688  LR: 0.00000193  
Epoch: [5][40/750] Elapsed 0m 11s (remain 3m 19s) Loss: 0.0151(0.0750) Grad: 89696.0703  LR: 0.00000193  
Epoch: [5][60/750] Elapsed 0m 16s (remain 3m 11s) Loss: 0.0244(0.0729) Grad: 160769.7500  LR: 0.00000193  
Epoch: [5][80/750] Elapsed 0m 23s (remain 3m 14s) Loss: 0.0942(0.0787) Grad: 542060.5625  LR: 0.00000193  
Epoch: [5][100/750] Elapsed 0m 28s (remain 3m 6s) Loss: 0.2108(0.0820) Grad: 416533.6250  LR: 0.00000193  
Epoch: [5][120/750] Elapsed 0m 34s (remain 2m 59s) Loss: 0.2816(0.0807) Grad: 562795.8750  LR: 0.00000192  
Epoch: [5][140/750] Elapsed 0m 41s (remain 2m 57s) Loss: 0.0487(0.0822) Grad: 204493.3125  LR: 0.00000192  
Epoch: [5][160/750] Elapsed 0m 46s (remain 2m 51s) Loss: 0.2706(0.0809) Grad: 490479.3

Epoch 5 - avg_train_loss: 0.0789  avg_val_loss: 0.1150  time: 227s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0789  avg_val_loss: 0.1150  time: 227s
Epoch 5 - Score: 0.4809  Scores: [0.49533342755907755, 0.48859671030723406, 0.43278750732394966, 0.49378452405505696, 0.5108711866744036, 0.46382926230809857]
INFO:__main__:Epoch 5 - Score: 0.4809  Scores: [0.49533342755907755, 0.48859671030723406, 0.43278750732394966, 0.49378452405505696, 0.5108711866744036, 0.46382926230809857]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0796(0.1150) 
(40, 6) Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold'], dtype='object')
Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Epoch: [1][0/790] Elapsed 0m 0s (remain 6m 15s) Loss: 0.0000(0.0000) Grad: 777.9039  LR: 0.00000189  
Epoch: [1][20/790] Elapsed 0m 5s (remain 3m 37s) Loss: 0.0040(0.0650) Grad: 47626.6172  LR: 0.00000189  
Epoch: [1][40/790] Elapsed 0m 11s (remain 3m 28s) Loss: 0.0131(0.0531) Grad: 93124.7734  LR: 0.00000189  
Epoch: [1][60/790] Elapsed 0m 16s (remain 3m 21s) Loss: 0.0147(0.0496) Grad: 83264.1172  LR: 0.00000189  
Epoch: [1][80/790] Elapsed 0m 22s (remain 3m 18s) Loss: 0.1001(0.0501) Grad: 331677.2812  LR: 0.00000189  
Epoch: [1][100/790] Elapsed 0m 28s (r

Epoch 1 - avg_train_loss: 0.0643  avg_val_loss: 0.1155  time: 238s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0643  avg_val_loss: 0.1155  time: 238s
Epoch 1 - Score: 0.4824  Scores: [0.5055299379672322, 0.4890132423002857, 0.42954788602164845, 0.4955284299631898, 0.49359445040153926, 0.48127658247419625]
INFO:__main__:Epoch 1 - Score: 0.4824  Scores: [0.5055299379672322, 0.4890132423002857, 0.42954788602164845, 0.4955284299631898, 0.49359445040153926, 0.48127658247419625]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0974(0.1155) 
Epoch: [2][0/790] Elapsed 0m 0s (remain 6m 25s) Loss: 0.0333(0.0333) Grad: 444096.3750  LR: 0.00000184  
Epoch: [2][20/790] Elapsed 0m 6s (remain 3m 54s) Loss: 0.0252(0.0460) Grad: 275286.2812  LR: 0.00000184  
Epoch: [2][40/790] Elapsed 0m 11s (remain 3m 35s) Loss: 0.0300(0.0507) Grad: 144618.4219  LR: 0.00000184  
Epoch: [2][60/790] Elapsed 0m 17s (remain 3m 26s) Loss: 0.0078(0.0526) Grad: 89117.1797  LR: 0.00000184  
Epoch: [2][80/790] Elapsed 0m 22s (remain 3m 19s) Loss: 0.0612(0.0515) Grad: 260215.7031  LR: 0.00000184  
Epoch: [2][100/790] Elapsed 0m 28s (remain 3m 12s) Loss: 0.0465(0.0509) Grad: 187924.7656  LR: 0.00000183  
Epoch: [2][120/790] Elapsed 0m 34s (remain 3m 10s) Loss: 0.0796(0.0515) Grad: 210047.9375  LR: 0.00000183  
Epoch: [2][140/790] Elapsed 0m 40s (remain 3m 4s) Loss: 0.0523(0.0499) Grad: 205354.4531  LR: 0.00000183  
Epoch: [2][160/790] Elapsed 0m 45s (remain 2m 58s) Loss: 0.2782(0.0508) Grad: 

Epoch 2 - avg_train_loss: 0.0600  avg_val_loss: 0.1183  time: 238s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0600  avg_val_loss: 0.1183  time: 238s
Epoch 2 - Score: 0.4887  Scores: [0.5044489082785619, 0.48967375602516483, 0.4543790238374466, 0.5022487331847074, 0.5048387574023848, 0.4768222589918503]
INFO:__main__:Epoch 2 - Score: 0.4887  Scores: [0.5044489082785619, 0.48967375602516483, 0.4543790238374466, 0.5022487331847074, 0.5048387574023848, 0.4768222589918503]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0959(0.1183) 
Epoch: [3][0/790] Elapsed 0m 0s (remain 6m 43s) Loss: 0.0329(0.0329) Grad: 388958.4375  LR: 0.00000178  
Epoch: [3][20/790] Elapsed 0m 5s (remain 3m 39s) Loss: 0.0399(0.0549) Grad: 479692.7500  LR: 0.00000178  
Epoch: [3][40/790] Elapsed 0m 11s (remain 3m 29s) Loss: 0.0155(0.0506) Grad: 214984.4062  LR: 0.00000178  
Epoch: [3][60/790] Elapsed 0m 16s (remain 3m 23s) Loss: 0.0075(0.0485) Grad: 139322.3125  LR: 0.00000178  
Epoch: [3][80/790] Elapsed 0m 22s (remain 3m 17s) Loss: 0.0924(0.0471) Grad: 597614.6875  LR: 0.00000178  
Epoch: [3][100/790] Elapsed 0m 28s (remain 3m 11s) Loss: 0.0319(0.0456) Grad: 158819.5625  LR: 0.00000178  
Epoch: [3][120/790] Elapsed 0m 33s (remain 3m 4s) Loss: 0.0801(0.0450) Grad: 258804.9219  LR: 0.00000177  
Epoch: [3][140/790] Elapsed 0m 39s (remain 3m 2s) Loss: 0.0275(0.0444) Grad: 140393.9062  LR: 0.00000177  
Epoch: [3][160/790] Elapsed 0m 45s (remain 2m 56s) Loss: 0.0405(0.0443) Grad: 

Epoch 3 - avg_train_loss: 0.0454  avg_val_loss: 0.1243  time: 238s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0454  avg_val_loss: 0.1243  time: 238s
Epoch 3 - Score: 0.5010  Scores: [0.5306761334044718, 0.5216499142211736, 0.4370716391308852, 0.5129925990498712, 0.5263395871723923, 0.4771789924614918]
INFO:__main__:Epoch 3 - Score: 0.5010  Scores: [0.5306761334044718, 0.5216499142211736, 0.4370716391308852, 0.5129925990498712, 0.5263395871723923, 0.4771789924614918]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0710(0.1243) 
Epoch: [4][0/790] Elapsed 0m 0s (remain 6m 37s) Loss: 0.0987(0.0987) Grad: 429921.4062  LR: 0.00000172  
Epoch: [4][20/790] Elapsed 0m 5s (remain 3m 39s) Loss: 0.0440(0.0425) Grad: 388471.5000  LR: 0.00000172  
Epoch: [4][40/790] Elapsed 0m 11s (remain 3m 31s) Loss: 0.0448(0.0396) Grad: 161091.8594  LR: 0.00000171  
Epoch: [4][60/790] Elapsed 0m 17s (remain 3m 24s) Loss: 0.0139(0.0378) Grad: 97065.8516  LR: 0.00000171  
Epoch: [4][80/790] Elapsed 0m 23s (remain 3m 22s) Loss: 0.0433(0.0370) Grad: 206411.9219  LR: 0.00000171  
Epoch: [4][100/790] Elapsed 0m 28s (remain 3m 16s) Loss: 0.0426(0.0346) Grad: 184975.8750  LR: 0.00000171  
Epoch: [4][120/790] Elapsed 0m 34s (remain 3m 9s) Loss: 0.0257(0.0341) Grad: 162448.2031  LR: 0.00000171  
Epoch: [4][140/790] Elapsed 0m 39s (remain 3m 2s) Loss: 0.0099(0.0335) Grad: 75112.6484  LR: 0.00000171  
Epoch: [4][160/790] Elapsed 0m 45s (remain 2m 56s) Loss: 0.0719(0.0329) Grad: 26

Epoch 4 - avg_train_loss: 0.0346  avg_val_loss: 0.1211  time: 238s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0346  avg_val_loss: 0.1211  time: 238s
Epoch 4 - Score: 0.4941  Scores: [0.5294398950830769, 0.49329629383425994, 0.43446605874976596, 0.5040222796128071, 0.5119832567323038, 0.49159624139041413]
INFO:__main__:Epoch 4 - Score: 0.4941  Scores: [0.5294398950830769, 0.49329629383425994, 0.43446605874976596, 0.5040222796128071, 0.5119832567323038, 0.49159624139041413]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0907(0.1211) 
Epoch: [5][0/790] Elapsed 0m 0s (remain 6m 26s) Loss: 0.0028(0.0028) Grad: 124702.6016  LR: 0.00000165  
Epoch: [5][20/790] Elapsed 0m 6s (remain 4m 1s) Loss: 0.0191(0.0225) Grad: 343755.5625  LR: 0.00000164  
Epoch: [5][40/790] Elapsed 0m 12s (remain 3m 41s) Loss: 0.0187(0.0211) Grad: 292866.7500  LR: 0.00000164  
Epoch: [5][60/790] Elapsed 0m 17s (remain 3m 30s) Loss: 0.0255(0.0230) Grad: 367315.5000  LR: 0.00000164  
Epoch: [5][80/790] Elapsed 0m 23s (remain 3m 21s) Loss: 0.0142(0.0240) Grad: 216306.5625  LR: 0.00000164  
Epoch: [5][100/790] Elapsed 0m 28s (remain 3m 15s) Loss: 0.0981(0.0243) Grad: 554746.6250  LR: 0.00000164  
Epoch: [5][120/790] Elapsed 0m 34s (remain 3m 8s) Loss: 0.0346(0.0239) Grad: 258660.0156  LR: 0.00000163  
Epoch: [5][140/790] Elapsed 0m 39s (remain 3m 2s) Loss: 0.0285(0.0245) Grad: 413021.0000  LR: 0.00000163  
Epoch: [5][160/790] Elapsed 0m 45s (remain 2m 56s) Loss: 0.0124(0.0272) Grad: 2

Epoch 5 - avg_train_loss: 0.0260  avg_val_loss: 0.1207  time: 238s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0260  avg_val_loss: 0.1207  time: 238s
Epoch 5 - Score: 0.4946  Scores: [0.5247316175828755, 0.5088788268005908, 0.443385208370467, 0.5116406533589808, 0.5103610904327999, 0.46837681507373363]
INFO:__main__:Epoch 5 - Score: 0.4946  Scores: [0.5247316175828755, 0.5088788268005908, 0.443385208370467, 0.5116406533589808, 0.5103610904327999, 0.46837681507373363]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0842(0.1207) 
(40, 6) Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold'], dtype='object')
Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Epoch: [1][0/830] Elapsed 0m 0s (remain 6m 49s) Loss: 0.0147(0.0147) Grad: 241334.1875  LR: 0.00000157  
Epoch: [1][20/830] Elapsed 0m 6s (remain 3m 51s) Loss: 0.0053(0.0137) Grad: 128185.0391  LR: 0.00000157  
Epoch: [1][40/830] Elapsed 0m 11s (remain 3m 41s) Loss: 0.0395(0.0159) Grad: 364608.5312  LR: 0.00000156  
Epoch: [1][60/830] Elapsed 0m 16s (remain 3m 33s) Loss: 0.0102(0.0172) Grad: 178865.3125  LR: 0.00000156  
Epoch: [1][80/830] Elapsed 0m 22s (remain 3m 28s) Loss: 0.0171(0.0171) Grad: 355670.4688  LR: 0.00000156  
Epoch: [1][100/830] Elapsed 0m 

Epoch 1 - avg_train_loss: 0.0192  avg_val_loss: 0.1269  time: 249s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0192  avg_val_loss: 0.1269  time: 249s
Epoch 1 - Score: 0.5061  Scores: [0.5508749649516262, 0.5347997549318488, 0.44172435832155965, 0.5206330164364796, 0.5141792996420594, 0.47425402594792826]
INFO:__main__:Epoch 1 - Score: 0.5061  Scores: [0.5508749649516262, 0.5347997549318488, 0.44172435832155965, 0.5206330164364796, 0.5141792996420594, 0.47425402594792826]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0620(0.1269) 
Epoch: [2][0/830] Elapsed 0m 0s (remain 7m 1s) Loss: 0.0306(0.0306) Grad: 306974.0625  LR: 0.00000148  
Epoch: [2][20/830] Elapsed 0m 5s (remain 3m 50s) Loss: 0.0612(0.0152) Grad: 464692.9688  LR: 0.00000148  
Epoch: [2][40/830] Elapsed 0m 11s (remain 3m 40s) Loss: 0.0004(0.0131) Grad: 35287.2891  LR: 0.00000147  
Epoch: [2][60/830] Elapsed 0m 17s (remain 3m 35s) Loss: 0.0061(0.0139) Grad: 214888.2188  LR: 0.00000147  
Epoch: [2][80/830] Elapsed 0m 22s (remain 3m 28s) Loss: 0.0070(0.0138) Grad: 183790.5625  LR: 0.00000147  
Epoch: [2][100/830] Elapsed 0m 27s (remain 3m 21s) Loss: 0.0235(0.0133) Grad: 403675.3438  LR: 0.00000147  
Epoch: [2][120/830] Elapsed 0m 33s (remain 3m 17s) Loss: 0.0093(0.0144) Grad: 239001.1094  LR: 0.00000147  
Epoch: [2][140/830] Elapsed 0m 39s (remain 3m 11s) Loss: 0.0065(0.0142) Grad: 184510.7344  LR: 0.00000146  
Epoch: [2][160/830] Elapsed 0m 44s (remain 3m 6s) Loss: 0.0212(0.0143) Grad: 3

Epoch 2 - avg_train_loss: 0.0145  avg_val_loss: 0.1244  time: 248s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0145  avg_val_loss: 0.1244  time: 248s
Epoch 2 - Score: 0.5015  Scores: [0.5405288055532975, 0.5122707434936085, 0.44103238142711404, 0.5106267202163524, 0.5228661339525466, 0.4819225544529323]
INFO:__main__:Epoch 2 - Score: 0.5015  Scores: [0.5405288055532975, 0.5122707434936085, 0.44103238142711404, 0.5106267202163524, 0.5228661339525466, 0.4819225544529323]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0791(0.1244) 
Epoch: [3][0/830] Elapsed 0m 0s (remain 6m 55s) Loss: 0.0041(0.0041) Grad: 120996.1406  LR: 0.00000138  
Epoch: [3][20/830] Elapsed 0m 6s (remain 4m 17s) Loss: 0.0053(0.0091) Grad: 163205.7500  LR: 0.00000138  
Epoch: [3][40/830] Elapsed 0m 12s (remain 3m 54s) Loss: 0.0021(0.0098) Grad: 116178.4375  LR: 0.00000138  
Epoch: [3][60/830] Elapsed 0m 17s (remain 3m 43s) Loss: 0.0020(0.0090) Grad: 84741.4375  LR: 0.00000138  
Epoch: [3][80/830] Elapsed 0m 23s (remain 3m 35s) Loss: 0.0083(0.0096) Grad: 198866.6250  LR: 0.00000138  
Epoch: [3][100/830] Elapsed 0m 28s (remain 3m 28s) Loss: 0.0089(0.0093) Grad: 217995.2500  LR: 0.00000137  
Epoch: [3][120/830] Elapsed 0m 34s (remain 3m 21s) Loss: 0.0032(0.0095) Grad: 116828.8438  LR: 0.00000137  
Epoch: [3][140/830] Elapsed 0m 39s (remain 3m 14s) Loss: 0.0057(0.0096) Grad: 160328.0156  LR: 0.00000137  
Epoch: [3][160/830] Elapsed 0m 45s (remain 3m 8s) Loss: 0.0083(0.0091) Grad: 

Epoch 3 - avg_train_loss: 0.0106  avg_val_loss: 0.1254  time: 249s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0106  avg_val_loss: 0.1254  time: 249s
Epoch 3 - Score: 0.5032  Scores: [0.5440513505449945, 0.5086684924989494, 0.4461054250000473, 0.5144297859872405, 0.5204570762846312, 0.4856065906960962]
INFO:__main__:Epoch 3 - Score: 0.5032  Scores: [0.5440513505449945, 0.5086684924989494, 0.4461054250000473, 0.5144297859872405, 0.5204570762846312, 0.4856065906960962]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0918(0.1254) 
Epoch: [4][0/830] Elapsed 0m 0s (remain 6m 47s) Loss: 0.0072(0.0072) Grad: 179467.5781  LR: 0.00000129  
Epoch: [4][20/830] Elapsed 0m 6s (remain 3m 59s) Loss: 0.0143(0.0061) Grad: 264512.6875  LR: 0.00000128  
Epoch: [4][40/830] Elapsed 0m 11s (remain 3m 45s) Loss: 0.0049(0.0067) Grad: 158950.5625  LR: 0.00000128  
Epoch: [4][60/830] Elapsed 0m 17s (remain 3m 37s) Loss: 0.0020(0.0071) Grad: 82554.3438  LR: 0.00000128  
Epoch: [4][80/830] Elapsed 0m 22s (remain 3m 30s) Loss: 0.0016(0.0070) Grad: 82596.6094  LR: 0.00000128  
Epoch: [4][100/830] Elapsed 0m 28s (remain 3m 23s) Loss: 0.0022(0.0064) Grad: 106168.0312  LR: 0.00000127  
Epoch: [4][120/830] Elapsed 0m 33s (remain 3m 17s) Loss: 0.0128(0.0071) Grad: 193454.2188  LR: 0.00000127  
Epoch: [4][140/830] Elapsed 0m 39s (remain 3m 11s) Loss: 0.0025(0.0079) Grad: 131843.6094  LR: 0.00000127  
Epoch: [4][160/830] Elapsed 0m 44s (remain 3m 6s) Loss: 0.0164(0.0078) Grad: 2

Epoch 4 - avg_train_loss: 0.0082  avg_val_loss: 0.1295  time: 249s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0082  avg_val_loss: 0.1295  time: 249s
Epoch 4 - Score: 0.5123  Scores: [0.5458458398234279, 0.5188138364242358, 0.449310656377602, 0.5248912749558405, 0.5247608029556297, 0.5103194569819957]
INFO:__main__:Epoch 4 - Score: 0.5123  Scores: [0.5458458398234279, 0.5188138364242358, 0.449310656377602, 0.5248912749558405, 0.5247608029556297, 0.5103194569819957]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0780(0.1295) 
Epoch: [5][0/830] Elapsed 0m 0s (remain 6m 55s) Loss: 0.0055(0.0055) Grad: 172255.5625  LR: 0.00000118  
Epoch: [5][20/830] Elapsed 0m 5s (remain 3m 50s) Loss: 0.0008(0.0068) Grad: 84917.8828  LR: 0.00000118  
Epoch: [5][40/830] Elapsed 0m 11s (remain 3m 39s) Loss: 0.0162(0.0061) Grad: 287006.2500  LR: 0.00000118  
Epoch: [5][60/830] Elapsed 0m 16s (remain 3m 32s) Loss: 0.0006(0.0059) Grad: 53052.1172  LR: 0.00000118  
Epoch: [5][80/830] Elapsed 0m 22s (remain 3m 26s) Loss: 0.0010(0.0053) Grad: 63884.1875  LR: 0.00000117  
Epoch: [5][100/830] Elapsed 0m 27s (remain 3m 21s) Loss: 0.0004(0.0051) Grad: 44869.6289  LR: 0.00000117  
Epoch: [5][120/830] Elapsed 0m 33s (remain 3m 15s) Loss: 0.0015(0.0056) Grad: 71140.9141  LR: 0.00000117  
Epoch: [5][140/830] Elapsed 0m 38s (remain 3m 10s) Loss: 0.0081(0.0056) Grad: 252484.3594  LR: 0.00000117  
Epoch: [5][160/830] Elapsed 0m 44s (remain 3m 4s) Loss: 0.0086(0.0056) Grad: 2310

Epoch 5 - avg_train_loss: 0.0064  avg_val_loss: 0.1276  time: 249s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0064  avg_val_loss: 0.1276  time: 249s
Epoch 5 - Score: 0.5077  Scores: [0.5405835698503, 0.50903030982796, 0.45089933905107754, 0.5320451149251493, 0.5240654117679651, 0.48930470742032456]
INFO:__main__:Epoch 5 - Score: 0.5077  Scores: [0.5405835698503, 0.50903030982796, 0.45089933905107754, 0.5320451149251493, 0.5240654117679651, 0.48930470742032456]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0918(0.1276) 
(40, 6) Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold'], dtype='object')
Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Epoch: [1][0/870] Elapsed 0m 0s (remain 7m 18s) Loss: 0.0035(0.0035) Grad: 154182.3750  LR: 0.00000108  
Epoch: [1][20/870] Elapsed 0m 6s (remain 4m 3s) Loss: 0.0012(0.0072) Grad: 83013.0469  LR: 0.00000108  
Epoch: [1][40/870] Elapsed 0m 11s (remain 3m 52s) Loss: 0.0136(0.0057) Grad: 183863.2344  LR: 0.00000108  
Epoch: [1][60/870] Elapsed 0m 17s (remain 3m 51s) Loss: 0.0008(0.0053) Grad: 63988.9648  LR: 0.00000107  
Epoch: [1][80/870] Elapsed 0m 23s (remain 3m 49s) Loss: 0.0019(0.0059) Grad: 95078.8984  LR: 0.00000107  
Epoch: [1][100/870] Elapsed 0m 29s 

Epoch 1 - avg_train_loss: 0.0067  avg_val_loss: 0.1256  time: 259s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0067  avg_val_loss: 0.1256  time: 259s
Epoch 1 - Score: 0.5045  Scores: [0.5416996852553712, 0.5152315925740998, 0.449879012841027, 0.5182643394648494, 0.5156217064648022, 0.4865996393209579]
INFO:__main__:Epoch 1 - Score: 0.5045  Scores: [0.5416996852553712, 0.5152315925740998, 0.449879012841027, 0.5182643394648494, 0.5156217064648022, 0.4865996393209579]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0871(0.1256) 
Epoch: [2][0/870] Elapsed 0m 0s (remain 7m 5s) Loss: 0.0020(0.0020) Grad: 117312.1562  LR: 0.00000097  
Epoch: [2][20/870] Elapsed 0m 5s (remain 4m 1s) Loss: 0.0057(0.0067) Grad: 190357.9219  LR: 0.00000097  
Epoch: [2][40/870] Elapsed 0m 11s (remain 3m 53s) Loss: 0.0091(0.0078) Grad: 351188.5312  LR: 0.00000097  
Epoch: [2][60/870] Elapsed 0m 17s (remain 3m 46s) Loss: 0.0069(0.0074) Grad: 257613.4375  LR: 0.00000096  
Epoch: [2][80/870] Elapsed 0m 22s (remain 3m 38s) Loss: 0.0053(0.0077) Grad: 216814.7344  LR: 0.00000096  
Epoch: [2][100/870] Elapsed 0m 27s (remain 3m 32s) Loss: 0.0012(0.0078) Grad: 70369.9219  LR: 0.00000096  
Epoch: [2][120/870] Elapsed 0m 33s (remain 3m 28s) Loss: 0.0154(0.0076) Grad: 352721.8438  LR: 0.00000096  
Epoch: [2][140/870] Elapsed 0m 39s (remain 3m 23s) Loss: 0.0111(0.0085) Grad: 256304.7656  LR: 0.00000095  
Epoch: [2][160/870] Elapsed 0m 44s (remain 3m 17s) Loss: 0.0011(0.0084) Grad: 6

Epoch 2 - avg_train_loss: 0.0074  avg_val_loss: 0.1309  time: 260s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0074  avg_val_loss: 0.1309  time: 260s
Epoch 2 - Score: 0.5149  Scores: [0.5676493168127109, 0.5226570933333974, 0.46255814454633354, 0.5250135811602556, 0.5241595172410125, 0.4873440593325937]
INFO:__main__:Epoch 2 - Score: 0.5149  Scores: [0.5676493168127109, 0.5226570933333974, 0.46255814454633354, 0.5250135811602556, 0.5241595172410125, 0.4873440593325937]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0767(0.1309) 
Epoch: [3][0/870] Elapsed 0m 0s (remain 7m 21s) Loss: 0.0004(0.0004) Grad: 29786.0195  LR: 0.00000086  
Epoch: [3][20/870] Elapsed 0m 5s (remain 4m 2s) Loss: 0.0017(0.0029) Grad: 83332.1875  LR: 0.00000086  
Epoch: [3][40/870] Elapsed 0m 11s (remain 3m 50s) Loss: 0.0021(0.0039) Grad: 107891.8828  LR: 0.00000086  
Epoch: [3][60/870] Elapsed 0m 16s (remain 3m 43s) Loss: 0.0035(0.0039) Grad: 152217.2031  LR: 0.00000086  
Epoch: [3][80/870] Elapsed 0m 22s (remain 3m 37s) Loss: 0.0016(0.0036) Grad: 112780.4375  LR: 0.00000085  
Epoch: [3][100/870] Elapsed 0m 27s (remain 3m 31s) Loss: 0.0020(0.0040) Grad: 102437.9297  LR: 0.00000085  
Epoch: [3][120/870] Elapsed 0m 33s (remain 3m 25s) Loss: 0.0012(0.0040) Grad: 81695.5547  LR: 0.00000085  
Epoch: [3][140/870] Elapsed 0m 38s (remain 3m 19s) Loss: 0.0051(0.0038) Grad: 184711.9531  LR: 0.00000085  
Epoch: [3][160/870] Elapsed 0m 44s (remain 3m 14s) Loss: 0.0099(0.0039) Grad: 26

Epoch 3 - avg_train_loss: 0.0038  avg_val_loss: 0.1276  time: 260s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0038  avg_val_loss: 0.1276  time: 260s
Epoch 3 - Score: 0.5079  Scores: [0.5414237515807696, 0.5221531679129477, 0.45038705710745847, 0.5216062271828302, 0.5257409569170493, 0.48623820453147953]
INFO:__main__:Epoch 3 - Score: 0.5079  Scores: [0.5414237515807696, 0.5221531679129477, 0.45038705710745847, 0.5216062271828302, 0.5257409569170493, 0.48623820453147953]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0892(0.1276) 
Epoch: [4][0/870] Elapsed 0m 0s (remain 7m 15s) Loss: 0.0022(0.0022) Grad: 119678.6562  LR: 0.00000076  
Epoch: [4][20/870] Elapsed 0m 6s (remain 4m 5s) Loss: 0.0017(0.0037) Grad: 93839.4609  LR: 0.00000075  
Epoch: [4][40/870] Elapsed 0m 12s (remain 4m 13s) Loss: 0.0027(0.0030) Grad: 112342.6641  LR: 0.00000075  
Epoch: [4][60/870] Elapsed 0m 18s (remain 3m 58s) Loss: 0.0004(0.0025) Grad: 42743.2188  LR: 0.00000075  
Epoch: [4][80/870] Elapsed 0m 23s (remain 3m 50s) Loss: 0.0002(0.0037) Grad: 31591.1367  LR: 0.00000075  
Epoch: [4][100/870] Elapsed 0m 29s (remain 3m 41s) Loss: 0.0009(0.0037) Grad: 53099.1523  LR: 0.00000074  
Epoch: [4][120/870] Elapsed 0m 34s (remain 3m 33s) Loss: 0.0028(0.0033) Grad: 119325.5391  LR: 0.00000074  
Epoch: [4][140/870] Elapsed 0m 40s (remain 3m 27s) Loss: 0.0010(0.0035) Grad: 82046.3281  LR: 0.00000074  
Epoch: [4][160/870] Elapsed 0m 45s (remain 3m 21s) Loss: 0.0007(0.0035) Grad: 6028

Epoch 4 - avg_train_loss: 0.0026  avg_val_loss: 0.1258  time: 260s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0026  avg_val_loss: 0.1258  time: 260s
Epoch 4 - Score: 0.5041  Scores: [0.5375375129190014, 0.5081259083778679, 0.4478713765583822, 0.5212334560678149, 0.5241735553694846, 0.4853737466576682]
INFO:__main__:Epoch 4 - Score: 0.5041  Scores: [0.5375375129190014, 0.5081259083778679, 0.4478713765583822, 0.5212334560678149, 0.5241735553694846, 0.4853737466576682]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0846(0.1258) 
Epoch: [5][0/870] Elapsed 0m 0s (remain 7m 15s) Loss: 0.0008(0.0008) Grad: 68584.8594  LR: 0.00000065  
Epoch: [5][20/870] Elapsed 0m 7s (remain 4m 45s) Loss: 0.0002(0.0020) Grad: 28033.0156  LR: 0.00000065  
Epoch: [5][40/870] Elapsed 0m 13s (remain 4m 27s) Loss: 0.0015(0.0020) Grad: 93670.9062  LR: 0.00000065  
Epoch: [5][60/870] Elapsed 0m 18s (remain 4m 7s) Loss: 0.0007(0.0017) Grad: 50740.9180  LR: 0.00000064  
Epoch: [5][80/870] Elapsed 0m 24s (remain 3m 57s) Loss: 0.0004(0.0015) Grad: 47135.3594  LR: 0.00000064  
Epoch: [5][100/870] Elapsed 0m 29s (remain 3m 47s) Loss: 0.0001(0.0013) Grad: 18340.6113  LR: 0.00000064  
Epoch: [5][120/870] Elapsed 0m 35s (remain 3m 39s) Loss: 0.0005(0.0013) Grad: 48082.6719  LR: 0.00000064  
Epoch: [5][140/870] Elapsed 0m 40s (remain 3m 31s) Loss: 0.0019(0.0013) Grad: 96416.2891  LR: 0.00000064  
Epoch: [5][160/870] Elapsed 0m 46s (remain 3m 24s) Loss: 0.0053(0.0013) Grad: 187162.

Epoch 5 - avg_train_loss: 0.0019  avg_val_loss: 0.1271  time: 259s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0019  avg_val_loss: 0.1271  time: 259s
Epoch 5 - Score: 0.5069  Scores: [0.5415299644681293, 0.5104307310000584, 0.4511421515327618, 0.5245109830272339, 0.5277811496239864, 0.48630310752536343]
INFO:__main__:Epoch 5 - Score: 0.5069  Scores: [0.5415299644681293, 0.5104307310000584, 0.4511421515327618, 0.5245109830272339, 0.5277811496239864, 0.48630310752536343]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0893(0.1271) 
(40, 6) Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold'], dtype='object')
Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Epoch: [1][0/910] Elapsed 0m 0s (remain 7m 42s) Loss: 0.0013(0.0013) Grad: 87811.2578  LR: 0.00000055  
Epoch: [1][20/910] Elapsed 0m 6s (remain 4m 47s) Loss: 0.0007(0.0019) Grad: 50958.0625  LR: 0.00000055  
Epoch: [1][40/910] Elapsed 0m 12s (remain 4m 22s) Loss: 0.0007(0.0029) Grad: 62810.7617  LR: 0.00000055  
Epoch: [1][60/910] Elapsed 0m 17s (remain 4m 8s) Loss: 0.0004(0.0023) Grad: 35490.8320  LR: 0.00000054  
Epoch: [1][80/910] Elapsed 0m 23s (remain 3m 59s) Loss: 0.0007(0.0020) Grad: 54789.0586  LR: 0.00000054  
Epoch: [1][100/910] Elapsed 0m 28s (r

Epoch 1 - avg_train_loss: 0.0014  avg_val_loss: 0.1274  time: 270s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0014  avg_val_loss: 0.1274  time: 270s
Epoch 1 - Score: 0.5074  Scores: [0.5420384559929684, 0.5138704330863841, 0.4505294083432854, 0.524291415824992, 0.5259609232019423, 0.4878848291923165]
INFO:__main__:Epoch 1 - Score: 0.5074  Scores: [0.5420384559929684, 0.5138704330863841, 0.4505294083432854, 0.524291415824992, 0.5259609232019423, 0.4878848291923165]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0860(0.1274) 
Epoch: [2][0/910] Elapsed 0m 0s (remain 7m 24s) Loss: 0.0001(0.0001) Grad: 22560.4336  LR: 0.00000045  
Epoch: [2][20/910] Elapsed 0m 5s (remain 4m 13s) Loss: 0.0002(0.0005) Grad: 39990.4805  LR: 0.00000045  
Epoch: [2][40/910] Elapsed 0m 11s (remain 4m 1s) Loss: 0.0004(0.0006) Grad: 47840.4531  LR: 0.00000045  
Epoch: [2][60/910] Elapsed 0m 17s (remain 3m 57s) Loss: 0.0003(0.0006) Grad: 32876.3594  LR: 0.00000045  
Epoch: [2][80/910] Elapsed 0m 22s (remain 3m 51s) Loss: 0.0006(0.0009) Grad: 62708.9844  LR: 0.00000044  
Epoch: [2][100/910] Elapsed 0m 28s (remain 3m 45s) Loss: 0.0002(0.0008) Grad: 26697.3770  LR: 0.00000044  
Epoch: [2][120/910] Elapsed 0m 33s (remain 3m 39s) Loss: 0.0004(0.0007) Grad: 41745.3672  LR: 0.00000044  
Epoch: [2][140/910] Elapsed 0m 39s (remain 3m 33s) Loss: 0.0002(0.0007) Grad: 23953.1191  LR: 0.00000044  
Epoch: [2][160/910] Elapsed 0m 45s (remain 3m 29s) Loss: 0.0022(0.0007) Grad: 131051.

Epoch 2 - avg_train_loss: 0.0009  avg_val_loss: 0.1277  time: 271s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0009  avg_val_loss: 0.1277  time: 271s
Epoch 2 - Score: 0.5081  Scores: [0.54608867919998, 0.5102387692945188, 0.45202881649346777, 0.5247050837830798, 0.5249763636699456, 0.4905076972501754]
INFO:__main__:Epoch 2 - Score: 0.5081  Scores: [0.54608867919998, 0.5102387692945188, 0.45202881649346777, 0.5247050837830798, 0.5249763636699456, 0.4905076972501754]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0843(0.1277) 
Epoch: [3][0/910] Elapsed 0m 0s (remain 7m 32s) Loss: 0.0005(0.0005) Grad: 46040.3125  LR: 0.00000036  
Epoch: [3][20/910] Elapsed 0m 5s (remain 4m 12s) Loss: 0.0002(0.0004) Grad: 34228.4062  LR: 0.00000036  
Epoch: [3][40/910] Elapsed 0m 11s (remain 4m 3s) Loss: 0.0007(0.0004) Grad: 73464.1719  LR: 0.00000036  
Epoch: [3][60/910] Elapsed 0m 16s (remain 3m 56s) Loss: 0.0002(0.0009) Grad: 24574.7871  LR: 0.00000035  
Epoch: [3][80/910] Elapsed 0m 22s (remain 3m 52s) Loss: 0.0001(0.0009) Grad: 21368.6406  LR: 0.00000035  
Epoch: [3][100/910] Elapsed 0m 28s (remain 3m 45s) Loss: 0.0002(0.0008) Grad: 39123.8594  LR: 0.00000035  
Epoch: [3][120/910] Elapsed 0m 33s (remain 3m 39s) Loss: 0.0002(0.0007) Grad: 32097.6992  LR: 0.00000035  
Epoch: [3][140/910] Elapsed 0m 39s (remain 3m 32s) Loss: 0.0007(0.0006) Grad: 73597.9453  LR: 0.00000035  
Epoch: [3][160/910] Elapsed 0m 44s (remain 3m 27s) Loss: 0.0000(0.0006) Grad: 5948.84

Epoch 3 - avg_train_loss: 0.0007  avg_val_loss: 0.1274  time: 270s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0007  avg_val_loss: 0.1274  time: 270s
Epoch 3 - Score: 0.5075  Scores: [0.546833676917089, 0.5123772051460811, 0.45126070479923225, 0.5222465266406981, 0.5251932741202482, 0.48717624189361297]
INFO:__main__:Epoch 3 - Score: 0.5075  Scores: [0.546833676917089, 0.5123772051460811, 0.45126070479923225, 0.5222465266406981, 0.5251932741202482, 0.48717624189361297]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0855(0.1274) 
Epoch: [4][0/910] Elapsed 0m 0s (remain 7m 37s) Loss: 0.0001(0.0001) Grad: 16260.3154  LR: 0.00000028  
Epoch: [4][20/910] Elapsed 0m 6s (remain 4m 15s) Loss: 0.0001(0.0002) Grad: 35658.7188  LR: 0.00000028  
Epoch: [4][40/910] Elapsed 0m 11s (remain 4m 4s) Loss: 0.0002(0.0002) Grad: 41280.6406  LR: 0.00000027  
Epoch: [4][60/910] Elapsed 0m 16s (remain 3m 56s) Loss: 0.0002(0.0006) Grad: 26864.6992  LR: 0.00000027  
Epoch: [4][80/910] Elapsed 0m 22s (remain 3m 49s) Loss: 0.0003(0.0005) Grad: 49586.3633  LR: 0.00000027  
Epoch: [4][100/910] Elapsed 0m 27s (remain 3m 43s) Loss: 0.0003(0.0004) Grad: 40512.7031  LR: 0.00000027  
Epoch: [4][120/910] Elapsed 0m 33s (remain 3m 40s) Loss: 0.0002(0.0005) Grad: 43296.9492  LR: 0.00000027  
Epoch: [4][140/910] Elapsed 0m 39s (remain 3m 34s) Loss: 0.0001(0.0008) Grad: 11539.4443  LR: 0.00000026  
Epoch: [4][160/910] Elapsed 0m 44s (remain 3m 28s) Loss: 0.0001(0.0007) Grad: 26953.1

Epoch 4 - avg_train_loss: 0.0006  avg_val_loss: 0.1279  time: 270s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0006  avg_val_loss: 0.1279  time: 270s
Epoch 4 - Score: 0.5084  Scores: [0.5468098012109956, 0.5134951764189142, 0.4522369496209764, 0.5246193387755153, 0.5261855164375867, 0.4870069684605603]
INFO:__main__:Epoch 4 - Score: 0.5084  Scores: [0.5468098012109956, 0.5134951764189142, 0.4522369496209764, 0.5246193387755153, 0.5261855164375867, 0.4870069684605603]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0854(0.1279) 
Epoch: [5][0/910] Elapsed 0m 0s (remain 7m 49s) Loss: 0.0002(0.0002) Grad: 31090.4297  LR: 0.00000020  
Epoch: [5][20/910] Elapsed 0m 5s (remain 4m 11s) Loss: 0.0001(0.0002) Grad: 26570.3281  LR: 0.00000020  
Epoch: [5][40/910] Elapsed 0m 11s (remain 4m 0s) Loss: 0.0002(0.0002) Grad: 24003.5176  LR: 0.00000020  
Epoch: [5][60/910] Elapsed 0m 16s (remain 3m 53s) Loss: 0.0000(0.0001) Grad: 5505.4473  LR: 0.00000020  
Epoch: [5][80/910] Elapsed 0m 22s (remain 3m 47s) Loss: 0.0002(0.0001) Grad: 42828.9023  LR: 0.00000020  
Epoch: [5][100/910] Elapsed 0m 27s (remain 3m 41s) Loss: 0.0001(0.0003) Grad: 29836.9160  LR: 0.00000020  
Epoch: [5][120/910] Elapsed 0m 33s (remain 3m 36s) Loss: 0.0001(0.0002) Grad: 19945.4453  LR: 0.00000019  
Epoch: [5][140/910] Elapsed 0m 38s (remain 3m 30s) Loss: 0.0001(0.0002) Grad: 26407.9277  LR: 0.00000019  
Epoch: [5][160/910] Elapsed 0m 44s (remain 3m 25s) Loss: 0.0000(0.0003) Grad: 8400.146

Epoch 5 - avg_train_loss: 0.0004  avg_val_loss: 0.1278  time: 270s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0004  avg_val_loss: 0.1278  time: 270s
Epoch 5 - Score: 0.5083  Scores: [0.5469370568510489, 0.5133744262266249, 0.4526195373043647, 0.5242260109150003, 0.5254554281601501, 0.48735972540309075]
INFO:__main__:Epoch 5 - Score: 0.5083  Scores: [0.5469370568510489, 0.5133744262266249, 0.4526195373043647, 0.5242260109150003, 0.5254554281601501, 0.48735972540309075]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0869(0.1278) 
(40, 6) Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Index(['text_id', 'full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold'], dtype='object')
Index(['full_text', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'], dtype='object')
Epoch: [1][0/950] Elapsed 0m 0s (remain 7m 54s) Loss: 0.0000(0.0000) Grad: 11589.6377  LR: 0.00000014  
Epoch: [1][20/950] Elapsed 0m 5s (remain 4m 23s) Loss: 0.0000(0.0001) Grad: 16388.7637  LR: 0.00000014  
Epoch: [1][40/950] Elapsed 0m 11s (remain 4m 11s) Loss: 0.0001(0.0002) Grad: 16258.2168  LR: 0.00000014  
Epoch: [1][60/950] Elapsed 0m 16s (remain 4m 4s) Loss: 0.0002(0.0001) Grad: 39569.8789  LR: 0.00000014  
Epoch: [1][80/950] Elapsed 0m 22s (remain 3m 58s) Loss: 0.0001(0.0002) Grad: 16611.9570  LR: 0.00000013  
Epoch: [1][100/950] Elapsed 0m 27s (r

Epoch 1 - avg_train_loss: 0.0004  avg_val_loss: 0.1275  time: 281s
INFO:__main__:Epoch 1 - avg_train_loss: 0.0004  avg_val_loss: 0.1275  time: 281s
Epoch 1 - Score: 0.5076  Scores: [0.5441275580099116, 0.5128041569721333, 0.4528085695799629, 0.5241331449514001, 0.5252269230423061, 0.48678123550670743]
INFO:__main__:Epoch 1 - Score: 0.5076  Scores: [0.5441275580099116, 0.5128041569721333, 0.4528085695799629, 0.5241331449514001, 0.5252269230423061, 0.48678123550670743]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0852(0.1275) 
Epoch: [2][0/950] Elapsed 0m 0s (remain 7m 51s) Loss: 0.0000(0.0000) Grad: 18776.9531  LR: 0.00000008  
Epoch: [2][20/950] Elapsed 0m 5s (remain 4m 22s) Loss: 0.0000(0.0002) Grad: 11325.8643  LR: 0.00000008  
Epoch: [2][40/950] Elapsed 0m 11s (remain 4m 13s) Loss: 0.0000(0.0001) Grad: 4861.7363  LR: 0.00000008  
Epoch: [2][60/950] Elapsed 0m 16s (remain 4m 5s) Loss: 0.0000(0.0002) Grad: 8296.9170  LR: 0.00000008  
Epoch: [2][80/950] Elapsed 0m 22s (remain 4m 3s) Loss: 0.0000(0.0004) Grad: 17876.0215  LR: 0.00000008  
Epoch: [2][100/950] Elapsed 0m 28s (remain 3m 56s) Loss: 0.0001(0.0003) Grad: 15713.5781  LR: 0.00000008  
Epoch: [2][120/950] Elapsed 0m 33s (remain 3m 52s) Loss: 0.0377(0.0006) Grad: 448744.5000  LR: 0.00000008  
Epoch: [2][140/950] Elapsed 0m 39s (remain 3m 45s) Loss: 0.0000(0.0005) Grad: 9783.7588  LR: 0.00000008  
Epoch: [2][160/950] Elapsed 0m 44s (remain 3m 40s) Loss: 0.0000(0.0005) Grad: 6908.3989 

Epoch 2 - avg_train_loss: 0.0003  avg_val_loss: 0.1278  time: 281s
INFO:__main__:Epoch 2 - avg_train_loss: 0.0003  avg_val_loss: 0.1278  time: 281s
Epoch 2 - Score: 0.5082  Scores: [0.5457086792832536, 0.5135553957203323, 0.452676352973826, 0.5243406856264506, 0.525693103999385, 0.4869976638502538]
INFO:__main__:Epoch 2 - Score: 0.5082  Scores: [0.5457086792832536, 0.5135553957203323, 0.452676352973826, 0.5243406856264506, 0.525693103999385, 0.4869976638502538]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0855(0.1278) 
Epoch: [3][0/950] Elapsed 0m 0s (remain 7m 42s) Loss: 0.0000(0.0000) Grad: 13873.0557  LR: 0.00000004  
Epoch: [3][20/950] Elapsed 0m 5s (remain 4m 23s) Loss: 0.0000(0.0003) Grad: 6200.0557  LR: 0.00000004  
Epoch: [3][40/950] Elapsed 0m 11s (remain 4m 13s) Loss: 0.0000(0.0002) Grad: 3269.6755  LR: 0.00000004  
Epoch: [3][60/950] Elapsed 0m 16s (remain 4m 6s) Loss: 0.0000(0.0001) Grad: 12746.7578  LR: 0.00000004  
Epoch: [3][80/950] Elapsed 0m 23s (remain 4m 9s) Loss: 0.0000(0.0001) Grad: 7666.3057  LR: 0.00000004  
Epoch: [3][100/950] Elapsed 0m 28s (remain 4m 1s) Loss: 0.0002(0.0004) Grad: 25384.6035  LR: 0.00000004  
Epoch: [3][120/950] Elapsed 0m 34s (remain 3m 54s) Loss: 0.0000(0.0003) Grad: 4596.6113  LR: 0.00000004  
Epoch: [3][140/950] Elapsed 0m 39s (remain 3m 47s) Loss: 0.0000(0.0003) Grad: 5784.0640  LR: 0.00000004  
Epoch: [3][160/950] Elapsed 0m 45s (remain 3m 45s) Loss: 0.0001(0.0003) Grad: 20475.6562  LR

Epoch 3 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 281s
INFO:__main__:Epoch 3 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 281s
Epoch 3 - Score: 0.5081  Scores: [0.545783683395385, 0.5135661860699281, 0.45244701871014714, 0.5241441451592121, 0.525618966052651, 0.4872545257944505]
INFO:__main__:Epoch 3 - Score: 0.5081  Scores: [0.545783683395385, 0.5135661860699281, 0.45244701871014714, 0.5241441451592121, 0.525618966052651, 0.4872545257944505]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0853(0.1277) 
Epoch: [4][0/950] Elapsed 0m 0s (remain 7m 58s) Loss: 0.0000(0.0000) Grad: 3135.6699  LR: 0.00000002  
Epoch: [4][20/950] Elapsed 0m 5s (remain 4m 24s) Loss: 0.0000(0.0001) Grad: 2968.6260  LR: 0.00000001  
Epoch: [4][40/950] Elapsed 0m 11s (remain 4m 14s) Loss: 0.0000(0.0001) Grad: 1044.6471  LR: 0.00000001  
Epoch: [4][60/950] Elapsed 0m 16s (remain 4m 7s) Loss: 0.0002(0.0001) Grad: 28665.4336  LR: 0.00000001  
Epoch: [4][80/950] Elapsed 0m 22s (remain 4m 0s) Loss: 0.0000(0.0002) Grad: 10046.6025  LR: 0.00000001  
Epoch: [4][100/950] Elapsed 0m 27s (remain 3m 53s) Loss: 0.0000(0.0003) Grad: 3579.3923  LR: 0.00000001  
Epoch: [4][120/950] Elapsed 0m 33s (remain 3m 48s) Loss: 0.0000(0.0003) Grad: 7352.0317  LR: 0.00000001  
Epoch: [4][140/950] Elapsed 0m 38s (remain 3m 43s) Loss: 0.0000(0.0002) Grad: 5639.7085  LR: 0.00000001  
Epoch: [4][160/950] Elapsed 0m 44s (remain 3m 38s) Loss: 0.0000(0.0002) Grad: 5572.9072  LR:

Epoch 4 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 281s
INFO:__main__:Epoch 4 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 281s
Epoch 4 - Score: 0.5081  Scores: [0.5455060628966164, 0.5135729220380986, 0.45249622172674825, 0.5241183565016886, 0.5256997901419759, 0.48718962437669183]
INFO:__main__:Epoch 4 - Score: 0.5081  Scores: [0.5455060628966164, 0.5135729220380986, 0.45249622172674825, 0.5241183565016886, 0.5256997901419759, 0.48718962437669183]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0854(0.1277) 
Epoch: [5][0/950] Elapsed 0m 0s (remain 8m 4s) Loss: 0.0000(0.0000) Grad: 1990.8517  LR: 0.00000000  
Epoch: [5][20/950] Elapsed 0m 6s (remain 4m 28s) Loss: 0.0000(0.0003) Grad: 6170.4355  LR: 0.00000000  
Epoch: [5][40/950] Elapsed 0m 11s (remain 4m 16s) Loss: 0.0000(0.0002) Grad: 3946.4648  LR: 0.00000000  
Epoch: [5][60/950] Elapsed 0m 17s (remain 4m 8s) Loss: 0.0000(0.0002) Grad: 5246.5679  LR: 0.00000000  
Epoch: [5][80/950] Elapsed 0m 22s (remain 4m 1s) Loss: 0.0000(0.0002) Grad: 4534.0371  LR: 0.00000000  
Epoch: [5][100/950] Elapsed 0m 28s (remain 3m 55s) Loss: 0.0025(0.0005) Grad: 89604.2422  LR: 0.00000000  
Epoch: [5][120/950] Elapsed 0m 33s (remain 3m 49s) Loss: 0.0001(0.0005) Grad: 15866.7822  LR: 0.00000000  
Epoch: [5][140/950] Elapsed 0m 39s (remain 3m 43s) Loss: 0.0002(0.0004) Grad: 24845.2949  LR: 0.00000000  
Epoch: [5][160/950] Elapsed 0m 44s (remain 3m 37s) Loss: 0.0000(0.0004) Grad: 1944.1212  LR:

Epoch 5 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 283s
INFO:__main__:Epoch 5 - avg_train_loss: 0.0003  avg_val_loss: 0.1277  time: 283s
Epoch 5 - Score: 0.5081  Scores: [0.5456146725600978, 0.5135118831744238, 0.45250204006176176, 0.524141167812546, 0.5256965106765492, 0.4871989665140227]
INFO:__main__:Epoch 5 - Score: 0.5081  Scores: [0.5456146725600978, 0.5135118831744238, 0.45250204006176176, 0.524141167812546, 0.5256965106765492, 0.4871989665140227]


EVAL: [124/125] Elapsed 0m 15s (remain 0m 0s) Loss: 0.0854(0.1277) 


Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]
INFO:__main__:Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]
Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]
INFO:__main__:Score: 0.4744  Scores: [0.4955053859707873, 0.48258737323618073, 0.43359696928816804, 0.4744697259056485, 0.4935083034115342, 0.4669559239043084]


0,1
[fold0] avg_train_loss,█▄▃▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] avg_val_loss,█▁▂▂▁▁▂▂▂▂▃▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃
[fold0] epoch,▁▃▅▆█▁▃▅▆█▁▃▅▆█▁▃▅▆█▁▃▅▆█▁▃▅▆█
[fold0] loss,▅█▂▂▃▄▃▁▂▃▂▂▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] lr,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁
[fold0] score,█▁▂▂▁▁▂▂▂▂▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃▃

0,1
[fold0] avg_train_loss,0.00029
[fold0] avg_val_loss,0.12772
[fold0] epoch,5.0
[fold0] loss,0.0
[fold0] lr,0.0
[fold0] score,0.50811


In [23]:
# if __name__ == '__main__':
    
#     def get_result(oof_df):
#         labels = oof_df[CFG.target_cols].values
#         preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
#         score, scores = get_score(labels, preds)
#         LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
#     if CFG.train:
#         oof_df = pd.DataFrame()
#         for fold in range(CFG.n_fold):
#             if fold in CFG.trn_fold:
#                 _oof_df = train_loop(train, fold)
#                 oof_df = pd.concat([oof_df, _oof_df])
#                 LOGGER.info(f"========== fold: {fold} result ==========")
#                 get_result(_oof_df)
#         oof_df = oof_df.reset_index(drop=True)
#         LOGGER.info(f"========== CV ==========")
#         get_result(oof_df)
#         oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
#     if CFG.wandb:
#         wandb.finish()