In [1]:
!nvidia-smi

Sun Nov 27 01:04:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   40C    P0    52W / 400W |      0MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    AUTHOR = "wanwan7123"

    NAME = "feedback3-Exp063-lomgformer-base"
    MODEL_PATH = "allenai/longformer-base-4096"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-english-learning"

    api_path = "kaggle_json/kaggle.json"

    apex=True
    seed = 42
    num_fold = 10
    trn_fold = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    batch_size = 16
    n_epochs = 4
    max_len = 1024
    target_list = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    lr_weight_decay = 0.98
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 20
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob
import math
import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

! pip install iterative-stratification
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

!pip install text-unidecode

[0mLooking in links: https://download.pytorch.org/whl/torch_stable.html
[0m

In [4]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # pip install
    ! pip install transformers==4.16.2
    ! pip install tokenizers==0.11.6
    ! pip install transformers[sentencepiece]

    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']

    # set dirs
    cfg.INPUT = 'input'
    cfg.EXP = cfg.NAME
    cfg.OUTPUT_EXP = cfg.NAME
    cfg.SUBMISSION = './'
    cfg.DATASET = '../input/'

    cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

    # make dirs
    for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [5]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_multilabelstratifiedkfold(train, target_col, n_splits, seed):
    kf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

In [6]:
def mcrmse(cfg, preds, df):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        all_score += score/len(cfg.target_list)
    return all_score

In [7]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [8]:
# =====================
# Dataset, Model
# =====================

def processing_features(df):
    df['text'] = df['full_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    return df

# dataset
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

In [9]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )
        
        self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.backbone = LongformerModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )
        self.pool = AttentionPooling(self.config.hidden_size)
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.backbone.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.backbone.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.backbone(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [10]:
# def get_optimizer_grouped_parameters(cfg, model):
#         param_optimizer = list(model.named_parameters())
#         no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
#         optimizer_parameters = [
#             {'params': [p for n, p in model.backbone.named_parameters() if not any(nd in n for nd in no_decay)],
#              'lr': cfg.encoder_lr, 'weight_decay': cfg.weight_decay},
#             {'params': [p for n, p in model.backbone.named_parameters() if any(nd in n for nd in no_decay)],
#              'lr': cfg.encoder_lr, 'weight_decay': 0.0},
#             {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
#              'lr': cfg.decoder_lr, 'weight_decay': 0.0}
#         ]
#         return optimizer_parameters
    
    
def get_optimizer_grouped_parameters(cfg, model):
    model_type = 'backbone'
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
             'lr': cfg.decoder_lr, 'weight_decay': 0.0},
    ]
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = cfg.encoder_lr
    for layer in layers:
        lr *= cfg.lr_weight_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": cfg.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return optimizer_grouped_parameters

# initialize layer
def reinit_bert(model):
    for layer in model.backbone.encoder.layer[-1:]:
        for module in layer.modules():
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)
    return model

def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

In [11]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=0.3, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [12]:
from torch.autograd.function import InplaceFunction
from torch.nn import Parameter
import torch.nn.init as init

class Mixout(InplaceFunction):
    @staticmethod
    def _make_noise(input):
        return input.new().resize_as_(input)

    @classmethod
    def forward(cls, ctx, input, target=None, p=0.0, training=False, inplace=False):
        if p < 0 or p > 1:
            raise ValueError("A mix probability of mixout has to be between 0 and 1," " but got {}".format(p))
        if target is not None and input.size() != target.size():
            raise ValueError(
                "A target tensor size must match with a input tensor size {},"
                " but got {}".format(input.size(), target.size())
            )
        ctx.p = p
        ctx.training = training

        if ctx.p == 0 or not ctx.training:
            return input

        if target is None:
            target = cls._make_noise(input)
            target.fill_(0)
        target = target.to(input.device)

        if inplace:
            ctx.mark_dirty(input)
            output = input
        else:
            output = input.clone()

        ctx.noise = cls._make_noise(input)
        if len(ctx.noise.size()) == 1:
            ctx.noise.bernoulli_(1 - ctx.p)
        else:
            ctx.noise[0].bernoulli_(1 - ctx.p)
            ctx.noise = ctx.noise[0].repeat(input.size()[0], 1)
        ctx.noise.expand_as(input)

        if ctx.p == 1:
            output = target
        else:
            output = ((1 - ctx.noise) * target + ctx.noise * output - ctx.p * target) / (1 - ctx.p)
        return output

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.p > 0 and ctx.training:
            return grad_output * ctx.noise, None, None, None, None
        else:
            return grad_output, None, None, None, None


def mixout(input, target=None, p=0.0, training=False, inplace=False):
    return Mixout.apply(input, target, p, training, inplace)


class MixLinear(torch.nn.Module):
    __constants__ = ["bias", "in_features", "out_features"]
    def __init__(self, in_features, out_features, bias=True, target=None, p=0.0):
        super(MixLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter("bias", None)
        self.reset_parameters()
        self.target = target
        self.p = p

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        return F.linear(input, mixout(self.weight, self.target, self.p, self.training), self.bias)

    def extra_repr(self):
        type = "drop" if self.target is None else "mix"
        return "{}={}, in_features={}, out_features={}, bias={}".format(
            type + "out", self.p, self.in_features, self.out_features, self.bias is not None
        )

def replace_mixout(model):
    for sup_module in model.modules():
        for name, module in sup_module.named_children():
            if isinstance(module, nn.Dropout):
                module.p = 0.0
            if isinstance(module, nn.Linear):
                target_state_dict = module.state_dict()
                bias = True if module.bias is not None else False
                new_module = MixLinear(
                    module.in_features, module.out_features, bias, target_state_dict["weight"], 0.2
                )
                new_module.load_state_dict(target_state_dict)
                setattr(sup_module, name, new_module)
    return model

In [13]:
def evaluating(cfg, valid_loader, model, valid_df, fold, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast():
                    loss, output = model(inputs, labels)
                
                output = output.detach().cpu().numpy()
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))
                pbar.set_postfix({
                    'val_loss': loss.item()
                })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    val_log = {
        'val_loss': val_loss,
        'mcrmse': score
    }
    display(val_log)

    if best_val_score > score:
        print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
        )
    
    return best_val_preds, best_val_score

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 6), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        # print(f'==========fold {fold}==========')
        # if (fold <= 7):
        #     valid_df = train.loc[cfg.folds==fold]
        #     valid_idx = list(valid_df.index)
        #     best_val_preds = np.load(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'))
        #     oof_pred[valid_idx] = best_val_preds.astype(np.float32)  
        # else:
        
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = reinit_bert(model)
        # model = replace_mixout(model)
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.decoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # enable FGM
        fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999

        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                     # FGM attack
                    fgm.attack()
                    with autocast(enabled=cfg.apex):
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()

                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )

                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        print(f'fold: {fold}, epoch: {epoch}, step: {step}')
                        best_val_preds, best_val_score = evaluating(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

            train_loss = sum(train_losses)/sum(train_nums)
            train_log = {
                'train_loss':train_loss
            }
            display(train_log)

            # evaluating(epoch)
            print(f'fold: {fold}, epoch: {epoch}, complete')
            best_val_preds, best_val_score = evaluating(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                best_val_preds,
                best_val_score,
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        fold_score.append(best_val_score)
        del model; gc.collect()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

In [14]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer, LongformerModel
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
%env TOKENIZERS_PARALLELISM=true
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))

train = processing_features(train)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
cfg.folds = get_multilabelstratifiedkfold(train, cfg.target_list, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))
score = training(cfg, train)

[0menv: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2


Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 0, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.6053128858356525, 'mcrmse': 3.174122073376692}

[31msave model weight[0m
fold: 0, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.303710376210225, 'mcrmse': 1.917007270280523}

[31msave model weight[0m
fold: 0, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.19274163284265172, 'mcrmse': 0.6285117321766821}

[31msave model weight[0m
fold: 0, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.14165021159002544, 'mcrmse': 0.5349497204619703}

[31msave model weight[0m
fold: 0, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12765411815375013, 'mcrmse': 0.5074727951038374}

[31msave model weight[0m
fold: 0, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11395624744922608, 'mcrmse': 0.4782447166760342}

[31msave model weight[0m
fold: 0, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11297602749541592, 'mcrmse': 0.4765933955283893}

[31msave model weight[0m
fold: 0, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11891643097028708, 'mcrmse': 0.4878617728239276}

fold: 0, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11761374695374228, 'mcrmse': 0.4861298769588631}

fold: 0, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13218195492501758, 'mcrmse': 0.5163760792551342}

{'train_loss': 0.5986469694836574}

fold: 0, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11724897685563168, 'mcrmse': 0.4852769759401841}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 0, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1068970331602999, 'mcrmse': 0.4628880076470583}

[31msave model weight[0m
fold: 0, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1338408789823732, 'mcrmse': 0.5198665955718604}

fold: 0, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11345583772110512, 'mcrmse': 0.4772804584053416}

fold: 0, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12616249145296834, 'mcrmse': 0.5021626295353805}

fold: 0, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10881809143306655, 'mcrmse': 0.46695124059325427}

fold: 0, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1089508070436585, 'mcrmse': 0.4668440747261374}

fold: 0, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12146976944583152, 'mcrmse': 0.49261920230641193}

fold: 0, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11601300526153097, 'mcrmse': 0.48228959175657465}

fold: 0, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10677039970065016, 'mcrmse': 0.46269772042954727}

[31msave model weight[0m
fold: 0, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12533606497375557, 'mcrmse': 0.5017100660764375}

{'train_loss': 0.11129580597308548}

fold: 0, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11979647285645575, 'mcrmse': 0.49031262307007034}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 0, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11130095088420927, 'mcrmse': 0.47301040339651657}

fold: 0, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10664473428293263, 'mcrmse': 0.46224294201098926}

[31msave model weight[0m
fold: 0, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10598279970228824, 'mcrmse': 0.4606318096902668}

[31msave model weight[0m
fold: 0, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10568876099555999, 'mcrmse': 0.4603142499184203}

[31msave model weight[0m
fold: 0, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11531837113068232, 'mcrmse': 0.48198191461605233}

fold: 0, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11516289725480482, 'mcrmse': 0.4804052316853033}

fold: 0, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11279542290646097, 'mcrmse': 0.47577122607552047}

fold: 0, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11274279166213082, 'mcrmse': 0.4753413402393968}

fold: 0, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10596786999641478, 'mcrmse': 0.4606627038123397}

fold: 0, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10751566130791784, 'mcrmse': 0.46398404468859794}

{'train_loss': 0.1019168150018562}

fold: 0, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10916230909507293, 'mcrmse': 0.46784714988620446}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 0, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10802432433571048, 'mcrmse': 0.46550504559715344}

fold: 0, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10652889940135009, 'mcrmse': 0.46204306323345046}

fold: 0, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10525414812595338, 'mcrmse': 0.4593057354379271}

[31msave model weight[0m
fold: 0, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10646663587111646, 'mcrmse': 0.46227943595495896}

fold: 0, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10389712891157936, 'mcrmse': 0.4562249656951618}

[31msave model weight[0m
fold: 0, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10340452117993094, 'mcrmse': 0.4551733698481003}

[31msave model weight[0m
fold: 0, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10387895181965645, 'mcrmse': 0.45635972628082555}

fold: 0, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10367156919615958, 'mcrmse': 0.45586461883270735}

fold: 0, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1037290845914265, 'mcrmse': 0.4559889211369639}

fold: 0, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10371069446244204, 'mcrmse': 0.4559294524291725}

{'train_loss': 0.09480195243588903}

fold: 0, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10370933895220842, 'mcrmse': 0.4559232648555094}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 1, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.285067916221326, 'mcrmse': 2.8680910187020356}

[31msave model weight[0m
fold: 1, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.1593953678979898, 'mcrmse': 1.7736388701030474}

[31msave model weight[0m
fold: 1, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.20371939916439982, 'mcrmse': 0.6496229450963716}

[31msave model weight[0m
fold: 1, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.14038276630441857, 'mcrmse': 0.5350564336283051}

[31msave model weight[0m
fold: 1, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13477401380115153, 'mcrmse': 0.5231830507838355}

[31msave model weight[0m
fold: 1, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1434100122784105, 'mcrmse': 0.540333137365012}

fold: 1, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12172641761391365, 'mcrmse': 0.49694507512999253}

[31msave model weight[0m
fold: 1, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12223821903204979, 'mcrmse': 0.4974668702214859}

fold: 1, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12286608325093604, 'mcrmse': 0.4993610494725683}

fold: 1, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13328780896980744, 'mcrmse': 0.5197364174115344}

{'train_loss': 0.5431611017747359}

fold: 1, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12567983239965366, 'mcrmse': 0.505134278828139}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 1, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13514694138942168, 'mcrmse': 0.5241764636495151}

fold: 1, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11879190760652733, 'mcrmse': 0.49072968459697364}

[31msave model weight[0m
fold: 1, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11987198138480906, 'mcrmse': 0.49296713995496944}

fold: 1, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11905805116800396, 'mcrmse': 0.4914837352519909}

fold: 1, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11775050378973832, 'mcrmse': 0.4885172739738645}

[31msave model weight[0m
fold: 1, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11513395346415317, 'mcrmse': 0.4829300230869591}

[31msave model weight[0m
fold: 1, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11813955260512164, 'mcrmse': 0.4891247671291672}

fold: 1, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11440442260497671, 'mcrmse': 0.48100919557167526}

[31msave model weight[0m
fold: 1, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11543910209175266, 'mcrmse': 0.4832897658616567}

fold: 1, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11566215244781636, 'mcrmse': 0.48368216778487805}

{'train_loss': 0.10886353464289145}

fold: 1, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11394977588635272, 'mcrmse': 0.4799914752535804}

[31msave model weight[0m


  0%|          | 0/220 [00:00<?, ?it/s]

fold: 1, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11452054485793003, 'mcrmse': 0.4812007283316975}

fold: 1, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11372121547341651, 'mcrmse': 0.4796316140377661}

[31msave model weight[0m
fold: 1, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11433963001231708, 'mcrmse': 0.4809237709464764}

fold: 1, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11617942242061391, 'mcrmse': 0.48464503809046733}

fold: 1, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11285617997121933, 'mcrmse': 0.47759050017970317}

[31msave model weight[0m
fold: 1, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11424888887673693, 'mcrmse': 0.48066500114159405}

fold: 1, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12057926033235267, 'mcrmse': 0.4942504607468865}

fold: 1, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11930512093826938, 'mcrmse': 0.49169516155365695}

fold: 1, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11388720583427897, 'mcrmse': 0.48004962292588826}

fold: 1, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1122176084105316, 'mcrmse': 0.47640256695372407}

[31msave model weight[0m


{'train_loss': 0.10159563493322242}

fold: 1, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11314205621438259, 'mcrmse': 0.4782506827260324}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 1, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11214873749200645, 'mcrmse': 0.4762125579269369}

[31msave model weight[0m
fold: 1, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11291994537462664, 'mcrmse': 0.47798628874021254}

fold: 1, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11202697884624876, 'mcrmse': 0.4759150551167045}

[31msave model weight[0m
fold: 1, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11216247514309481, 'mcrmse': 0.47628442314907}

fold: 1, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11143157983679905, 'mcrmse': 0.474643830179701}

[31msave model weight[0m
fold: 1, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11176410593721263, 'mcrmse': 0.47533141031730514}

fold: 1, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11151856980512818, 'mcrmse': 0.474841578723063}

fold: 1, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11165277004394385, 'mcrmse': 0.47515535056401537}

fold: 1, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11166741392191719, 'mcrmse': 0.47518504418772356}

fold: 1, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11166037579098016, 'mcrmse': 0.4751690800392702}

{'train_loss': 0.09572938027029687}

fold: 1, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11166229588753732, 'mcrmse': 0.47517491137474754}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 2, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.5703336363253384, 'mcrmse': 3.1435244447211796}

[31msave model weight[0m
fold: 2, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.4767992905033824, 'mcrmse': 2.0711238633330344}

[31msave model weight[0m
fold: 2, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.23130327291653285, 'mcrmse': 0.693330513322613}

[31msave model weight[0m
fold: 2, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.14016948114423192, 'mcrmse': 0.5326517215702058}

[31msave model weight[0m
fold: 2, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1324907165125508, 'mcrmse': 0.5174048627413533}

[31msave model weight[0m
fold: 2, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.14892919197716675, 'mcrmse': 0.5467191157688502}

fold: 2, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1211344356198445, 'mcrmse': 0.49454157412046784}

[31msave model weight[0m
fold: 2, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.116546767656608, 'mcrmse': 0.4846463047828223}

[31msave model weight[0m
fold: 2, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12595254937401207, 'mcrmse': 0.5032347585927158}

fold: 2, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1172369806777181, 'mcrmse': 0.48574529146200257}

{'train_loss': 0.6167551105672663}

fold: 2, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11418569865434067, 'mcrmse': 0.479248831414797}

[31msave model weight[0m


  0%|          | 0/220 [00:00<?, ?it/s]

fold: 2, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11255528636829322, 'mcrmse': 0.47603793919831744}

[31msave model weight[0m
fold: 2, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11368204325513767, 'mcrmse': 0.479077574465807}

fold: 2, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11133603242885731, 'mcrmse': 0.4736584976036605}

[31msave model weight[0m
fold: 2, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11268060976434546, 'mcrmse': 0.476693314614901}

fold: 2, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11009095060398512, 'mcrmse': 0.4707757416580504}

[31msave model weight[0m
fold: 2, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11040246002661908, 'mcrmse': 0.47115949742545127}

fold: 2, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1111994847236082, 'mcrmse': 0.4736229037929685}

fold: 2, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11355208102470774, 'mcrmse': 0.478493091823741}

fold: 2, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10989434998053724, 'mcrmse': 0.47068836660128005}

[31msave model weight[0m
fold: 2, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10818138642384269, 'mcrmse': 0.46672788194401477}

[31msave model weight[0m


{'train_loss': 0.10681314996697686}

fold: 2, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11020156598228323, 'mcrmse': 0.47074103395734096}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 2, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11380784171621514, 'mcrmse': 0.47852320466778797}

fold: 2, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10994463574017405, 'mcrmse': 0.4703387628531413}

fold: 2, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10960713248042499, 'mcrmse': 0.4699661161792208}

fold: 2, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11018857620942318, 'mcrmse': 0.4711972653868087}

fold: 2, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10735948625809091, 'mcrmse': 0.46492568424234687}

[31msave model weight[0m
fold: 2, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10770931918069225, 'mcrmse': 0.4656436205025367}

fold: 2, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10830294087414852, 'mcrmse': 0.4667110190890953}

fold: 2, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1075807497515093, 'mcrmse': 0.46537217608265075}

fold: 2, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10698699360460881, 'mcrmse': 0.46398394062719356}

[31msave model weight[0m
fold: 2, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10697000974889302, 'mcrmse': 0.46413204512023415}

{'train_loss': 0.10089675000106746}

fold: 2, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11242729412091662, 'mcrmse': 0.47568938234217045}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 2, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10870480352579176, 'mcrmse': 0.46779864970078}

fold: 2, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10745149662198923, 'mcrmse': 0.4652243014318776}

fold: 2, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10823219829736769, 'mcrmse': 0.46683314765014794}

fold: 2, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10718936374044175, 'mcrmse': 0.4645331463376667}

fold: 2, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10716422001266723, 'mcrmse': 0.46446426058197204}

fold: 2, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10909344435043042, 'mcrmse': 0.4684362395558536}

fold: 2, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10732647208758937, 'mcrmse': 0.46473040145266825}

fold: 2, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1071609022557888, 'mcrmse': 0.46438509503185577}

fold: 2, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10719706994645736, 'mcrmse': 0.4644599984579247}

fold: 2, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10720796250473813, 'mcrmse': 0.4644760318080926}

{'train_loss': 0.09700364140285687}

fold: 2, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10719211228058466, 'mcrmse': 0.46444179143451203}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 3, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.1290405186850703, 'mcrmse': 2.7094728370505403}

[31msave model weight[0m
fold: 3, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.0359422050778517, 'mcrmse': 1.6308700608368105}

[31msave model weight[0m
fold: 3, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1971393589625883, 'mcrmse': 0.635527555982612}

[31msave model weight[0m
fold: 3, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1489031410888028, 'mcrmse': 0.5456309988103026}

[31msave model weight[0m
fold: 3, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12369152278546482, 'mcrmse': 0.49871665182447444}

[31msave model weight[0m
fold: 3, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12240147321958982, 'mcrmse': 0.49545628423507276}

[31msave model weight[0m
fold: 3, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1188053443761128, 'mcrmse': 0.4885909849038232}

[31msave model weight[0m
fold: 3, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11994653646750829, 'mcrmse': 0.49053242960908355}

fold: 3, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13881777230735934, 'mcrmse': 0.5255520624485481}

fold: 3, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11257011024162288, 'mcrmse': 0.4750548149945791}

[31msave model weight[0m


{'train_loss': 0.5218222164972262}

fold: 3, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1193366439827263, 'mcrmse': 0.4891898984212242}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 3, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11235310461210168, 'mcrmse': 0.4742783010015992}

[31msave model weight[0m
fold: 3, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.107161638986729, 'mcrmse': 0.46315191627025865}

[31msave model weight[0m
fold: 3, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.108410896738167, 'mcrmse': 0.46563406802436347}

fold: 3, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10723881846498651, 'mcrmse': 0.46303578222789815}

[31msave model weight[0m
fold: 3, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10762296117785032, 'mcrmse': 0.4640345940071358}

fold: 3, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10591104912483479, 'mcrmse': 0.4604505342462738}

[31msave model weight[0m
fold: 3, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10583651023904991, 'mcrmse': 0.4601796916126264}

[31msave model weight[0m
fold: 3, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10608386726635496, 'mcrmse': 0.4608314858309556}

fold: 3, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10472246989264818, 'mcrmse': 0.4576431165207694}

[31msave model weight[0m
fold: 3, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1061780703113512, 'mcrmse': 0.46097936227374275}

{'train_loss': 0.10638352944092318}

fold: 3, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1070998765318595, 'mcrmse': 0.4630653818638022}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 3, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10543453946823964, 'mcrmse': 0.45930583869132485}

fold: 3, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1069403202713603, 'mcrmse': 0.4624524690650659}

fold: 3, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10495393637501066, 'mcrmse': 0.4583001254504989}

fold: 3, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10562268051954792, 'mcrmse': 0.4596837398760573}

fold: 3, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10404372365807024, 'mcrmse': 0.45628169939436075}

[31msave model weight[0m
fold: 3, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10683253521809492, 'mcrmse': 0.46243796128594566}

fold: 3, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10362626212027372, 'mcrmse': 0.45542591027230694}

[31msave model weight[0m
fold: 3, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10379367194059864, 'mcrmse': 0.4557868031573787}

fold: 3, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10480528200983696, 'mcrmse': 0.45798085826125}

fold: 3, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10534634130537662, 'mcrmse': 0.45932007664622115}

{'train_loss': 0.10139506753872742}

fold: 3, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10328333777234987, 'mcrmse': 0.4544920346399024}

[31msave model weight[0m


  0%|          | 0/220 [00:00<?, ?it/s]

fold: 3, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10515445905268345, 'mcrmse': 0.45874516542781685}

fold: 3, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1034589699086021, 'mcrmse': 0.45487984740956144}

fold: 3, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10325697392149044, 'mcrmse': 0.4543592262093633}

[31msave model weight[0m
fold: 3, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10301300124896456, 'mcrmse': 0.4539714698873951}

[31msave model weight[0m
fold: 3, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1037877043494788, 'mcrmse': 0.45563728611103876}

fold: 3, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10328543734977312, 'mcrmse': 0.45457725160495077}

fold: 3, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10343674019626949, 'mcrmse': 0.454918961717264}

fold: 3, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1032949517221402, 'mcrmse': 0.45462398313572217}

fold: 3, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10323941673311736, 'mcrmse': 0.4545010904992556}

fold: 3, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10326704264754225, 'mcrmse': 0.45456064171755073}

{'train_loss': 0.09722383279692043}

fold: 3, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10326244180922008, 'mcrmse': 0.45455027383079805}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 4, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.0755228045041605, 'mcrmse': 2.6677192025329877}

[31msave model weight[0m
fold: 4, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.9635509255597049, 'mcrmse': 1.5868295039904716}

[31msave model weight[0m
fold: 4, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.18634023252503037, 'mcrmse': 0.6191352315549178}

[31msave model weight[0m
fold: 4, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.18837972118726473, 'mcrmse': 0.6200268685431439}

fold: 4, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1723944407975887, 'mcrmse': 0.5892584627628396}

[31msave model weight[0m
fold: 4, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.15270799436532628, 'mcrmse': 0.5560771829304471}

[31msave model weight[0m
fold: 4, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12253860706258612, 'mcrmse': 0.49746708622142793}

[31msave model weight[0m
fold: 4, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1293598119255222, 'mcrmse': 0.5107287734827891}

fold: 4, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11881564368905924, 'mcrmse': 0.48895499923746194}

[31msave model weight[0m
fold: 4, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12093453296005269, 'mcrmse': 0.4947061501808949}

{'train_loss': 0.5032275134189562}

fold: 4, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.142241891845108, 'mcrmse': 0.5283244981735455}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 4, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12674424749658542, 'mcrmse': 0.5054944113195408}

fold: 4, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11533296792327291, 'mcrmse': 0.48193366203242016}

[31msave model weight[0m
fold: 4, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11549016488406359, 'mcrmse': 0.4823222378754828}

fold: 4, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11908873824207374, 'mcrmse': 0.4894908275552028}

fold: 4, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11412635826698654, 'mcrmse': 0.4790726285638666}

[31msave model weight[0m
fold: 4, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11094869867615077, 'mcrmse': 0.4721531124551057}

[31msave model weight[0m
fold: 4, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11443682514188235, 'mcrmse': 0.4804974314393688}

fold: 4, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11282953355089782, 'mcrmse': 0.47603670335436177}

fold: 4, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11808555335035105, 'mcrmse': 0.4868736145573776}

fold: 4, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10988089850034251, 'mcrmse': 0.4701193582746022}

[31msave model weight[0m


{'train_loss': 0.11116969466886738}

fold: 4, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11274744450207562, 'mcrmse': 0.47649085115453976}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 4, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11198083496154726, 'mcrmse': 0.4749477277787588}

fold: 4, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11055013802274109, 'mcrmse': 0.47162449645025906}

fold: 4, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11082014443395692, 'mcrmse': 0.47196377486575264}

fold: 4, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10955816372047605, 'mcrmse': 0.46943008446372936}

[31msave model weight[0m
fold: 4, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10961095005502482, 'mcrmse': 0.4695775296059139}

fold: 4, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11100253209357372, 'mcrmse': 0.4723781318290027}

fold: 4, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10920979532286944, 'mcrmse': 0.46883104076010623}

[31msave model weight[0m
fold: 4, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1081574782348045, 'mcrmse': 0.4662724860762348}

[31msave model weight[0m
fold: 4, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10893171377803969, 'mcrmse': 0.467943422395657}

fold: 4, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1075581476816436, 'mcrmse': 0.46497665766926843}

[31msave model weight[0m


{'train_loss': 0.10029303355311806}

fold: 4, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10769720350766121, 'mcrmse': 0.4652875960681543}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 4, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10819694935284613, 'mcrmse': 0.4664041344987748}

fold: 4, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10764730453033887, 'mcrmse': 0.46521737764295984}

fold: 4, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10726722409886777, 'mcrmse': 0.46425862308689697}

[31msave model weight[0m
fold: 4, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10736525468432995, 'mcrmse': 0.46450903706740376}

fold: 4, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10721076617155538, 'mcrmse': 0.46419119260358827}

[31msave model weight[0m
fold: 4, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10711420521787975, 'mcrmse': 0.4639817900640965}

[31msave model weight[0m
fold: 4, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10757661976701463, 'mcrmse': 0.4650218814459102}

fold: 4, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10744175640747065, 'mcrmse': 0.464712993633178}

fold: 4, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10729049523468212, 'mcrmse': 0.46438640719989543}

fold: 4, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10729342195993799, 'mcrmse': 0.46439726865398673}

{'train_loss': 0.09661333974112164}

fold: 4, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10729470673729391, 'mcrmse': 0.4643999434165966}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 5, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.1914039774014213, 'mcrmse': 2.773350726979113}

[31msave model weight[0m
fold: 5, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.0676570816722977, 'mcrmse': 1.6729401369755417}

[31msave model weight[0m
fold: 5, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.24398153052305627, 'mcrmse': 0.7062418615599947}

[31msave model weight[0m
fold: 5, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12994497496148813, 'mcrmse': 0.5116736998411894}

[31msave model weight[0m
fold: 5, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1255257717712456, 'mcrmse': 0.5020832278923175}

[31msave model weight[0m
fold: 5, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11758484281694798, 'mcrmse': 0.4860981900862225}

[31msave model weight[0m
fold: 5, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13479091966396098, 'mcrmse': 0.5185068814952547}

fold: 5, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11440852314919767, 'mcrmse': 0.47839359591476216}

[31msave model weight[0m
fold: 5, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11827788465773054, 'mcrmse': 0.48688298587168777}

fold: 5, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11470299423731806, 'mcrmse': 0.47971488992023187}

{'train_loss': 0.5362662293694236}

fold: 5, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1212769081182492, 'mcrmse': 0.4918806156856384}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 5, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11816779525993425, 'mcrmse': 0.4850691712880236}

fold: 5, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10682312551590488, 'mcrmse': 0.4622939519058452}

[31msave model weight[0m
fold: 5, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1061896408153007, 'mcrmse': 0.4611972676048499}

[31msave model weight[0m
fold: 5, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10875360564807492, 'mcrmse': 0.4665348596148097}

fold: 5, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1117088018780779, 'mcrmse': 0.47325829875989706}

fold: 5, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10591709186963717, 'mcrmse': 0.4607042832482643}

[31msave model weight[0m
fold: 5, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11286078252450889, 'mcrmse': 0.4751664337290502}

fold: 5, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1056571245155371, 'mcrmse': 0.4598255919121593}

[31msave model weight[0m
fold: 5, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10874115942460497, 'mcrmse': 0.4663549425369478}

fold: 5, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10618788160174095, 'mcrmse': 0.46125693713473304}

{'train_loss': 0.11167924253439361}

fold: 5, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10795502517077014, 'mcrmse': 0.46528330227071396}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 5, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10458782585837957, 'mcrmse': 0.45766162320799936}

[31msave model weight[0m
fold: 5, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10486723648388978, 'mcrmse': 0.4585004482403868}

fold: 5, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10355483940647692, 'mcrmse': 0.45523796473943323}

[31msave model weight[0m
fold: 5, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10321570307854801, 'mcrmse': 0.45465819698449333}

[31msave model weight[0m
fold: 5, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.103453641333391, 'mcrmse': 0.45515447710585594}

fold: 5, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10424634819978948, 'mcrmse': 0.45676785728676783}

fold: 5, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10346613146002641, 'mcrmse': 0.455243232708105}

fold: 5, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10514559768273703, 'mcrmse': 0.45912224928597617}

fold: 5, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10346312237822491, 'mcrmse': 0.4549986776966553}

fold: 5, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10359142554919128, 'mcrmse': 0.4552340870280831}

{'train_loss': 0.1000905297527259}

fold: 5, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11460441579599209, 'mcrmse': 0.479912518317263}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 5, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10286763592449295, 'mcrmse': 0.4537467437001185}

[31msave model weight[0m
fold: 5, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10315731876646467, 'mcrmse': 0.4546444084614021}

fold: 5, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10209188415952351, 'mcrmse': 0.45191278666962387}

[31msave model weight[0m
fold: 5, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10401344078276163, 'mcrmse': 0.4563897292356879}

fold: 5, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10214625901121008, 'mcrmse': 0.4521542466120775}

fold: 5, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10186126605247903, 'mcrmse': 0.4514837190272043}

[31msave model weight[0m
fold: 5, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10205481363379437, 'mcrmse': 0.4518990760114472}

fold: 5, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10218304931126593, 'mcrmse': 0.4522311811270385}

fold: 5, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10213989324277015, 'mcrmse': 0.45214369784375547}

fold: 5, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10210752635813125, 'mcrmse': 0.45206978949303894}

{'train_loss': 0.09631440737708048}

fold: 5, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10210397957688402, 'mcrmse': 0.4520636375710726}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 6, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.7398428368141583, 'mcrmse': 2.3266612326898137}

[31msave model weight[0m
fold: 6, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.713151850816234, 'mcrmse': 1.2973815258770787}

[31msave model weight[0m
fold: 6, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.21390543752314184, 'mcrmse': 0.6640530674275398}

[31msave model weight[0m
fold: 6, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.16194035070936394, 'mcrmse': 0.5745490237201129}

[31msave model weight[0m
fold: 6, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13038495015305326, 'mcrmse': 0.5135693716777717}

[31msave model weight[0m
fold: 6, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12583004412672405, 'mcrmse': 0.5038544356259391}

[31msave model weight[0m
fold: 6, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1287327535317072, 'mcrmse': 0.5090625528800885}

fold: 6, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12323398794740667, 'mcrmse': 0.49871695627951}

[31msave model weight[0m
fold: 6, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12335544717891136, 'mcrmse': 0.4987520371130301}

fold: 6, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13401154447775668, 'mcrmse': 0.5204952460197538}

{'train_loss': 0.43167772438715807}

fold: 6, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11955263107405294, 'mcrmse': 0.4909067062748719}

[31msave model weight[0m


  0%|          | 0/220 [00:00<?, ?it/s]

fold: 6, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12407694362542208, 'mcrmse': 0.49998948508638275}

fold: 6, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12179693937911402, 'mcrmse': 0.4954169969395701}

fold: 6, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12445629931166959, 'mcrmse': 0.5008836225780962}

fold: 6, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11510367462854557, 'mcrmse': 0.48124438413658427}

[31msave model weight[0m
fold: 6, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11711080227514058, 'mcrmse': 0.4856065751975438}

fold: 6, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11639525478377062, 'mcrmse': 0.48414870404994215}

fold: 6, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11226199759775415, 'mcrmse': 0.475220789665621}

[31msave model weight[0m
fold: 6, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11386172820235152, 'mcrmse': 0.4787694809327928}

fold: 6, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11282019317150116, 'mcrmse': 0.47640968671821304}

fold: 6, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11201672074015794, 'mcrmse': 0.4748237464696462}

[31msave model weight[0m


{'train_loss': 0.10525927421721545}

fold: 6, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11304979575107164, 'mcrmse': 0.47678689368312144}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 6, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11049551299542112, 'mcrmse': 0.47117219335126614}

[31msave model weight[0m
fold: 6, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11097349679988364, 'mcrmse': 0.4724981879961944}

fold: 6, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.115830152693307, 'mcrmse': 0.4831272644334335}

fold: 6, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11098418732547699, 'mcrmse': 0.4723774803441375}

fold: 6, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11322335228133384, 'mcrmse': 0.47733463743447985}

fold: 6, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10998768984433026, 'mcrmse': 0.47022463725642094}

[31msave model weight[0m
fold: 6, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11022170328194528, 'mcrmse': 0.47066031514664075}

fold: 6, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11009003369666426, 'mcrmse': 0.4705291379290222}

fold: 6, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11074216267489412, 'mcrmse': 0.47188804979201765}

fold: 6, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1113859241080406, 'mcrmse': 0.47342081133030195}

{'train_loss': 0.09741223041306843}

fold: 6, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11080332891181911, 'mcrmse': 0.47190581173051893}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 6, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11029676000213684, 'mcrmse': 0.4709607737442845}

fold: 6, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11111165819418095, 'mcrmse': 0.472812505262114}

fold: 6, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11025629463174459, 'mcrmse': 0.4708485761820724}

fold: 6, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11015949384940554, 'mcrmse': 0.4705936172191523}

fold: 6, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11095086908172769, 'mcrmse': 0.47231614900582525}

fold: 6, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10971451750801653, 'mcrmse': 0.4696377900663422}

[31msave model weight[0m
fold: 6, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1096816398108097, 'mcrmse': 0.4695233713446677}

[31msave model weight[0m
fold: 6, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10977266565003359, 'mcrmse': 0.4697162489325395}

fold: 6, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10975342872731216, 'mcrmse': 0.46965508046247895}

fold: 6, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10974784978591573, 'mcrmse': 0.46964793852858083}

{'train_loss': 0.09343749680979685}

fold: 6, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10975654789096559, 'mcrmse': 0.4696711592956183}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 7, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.4138791250145952, 'mcrmse': 2.990139524446988}

[31msave model weight[0m
fold: 7, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.1360578966872465, 'mcrmse': 1.75042199525448}

[31msave model weight[0m
fold: 7, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.2059706867198505, 'mcrmse': 0.6510775529840431}

[31msave model weight[0m
fold: 7, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.157196018015942, 'mcrmse': 0.5657477583222145}

[31msave model weight[0m
fold: 7, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11791064060481309, 'mcrmse': 0.4870372207947757}

[31msave model weight[0m
fold: 7, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12098428698451928, 'mcrmse': 0.4931289807350626}

fold: 7, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11350362321071307, 'mcrmse': 0.4775484594643602}

[31msave model weight[0m
fold: 7, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11517319494805982, 'mcrmse': 0.4802335264723224}

fold: 7, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12516730452132652, 'mcrmse': 0.5004390658926328}

fold: 7, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11528253822070558, 'mcrmse': 0.4811327969279928}

{'train_loss': 0.5893889458342032}

fold: 7, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11132863996660008, 'mcrmse': 0.4712830302132722}

[31msave model weight[0m


  0%|          | 0/220 [00:00<?, ?it/s]

fold: 7, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10551836051980552, 'mcrmse': 0.4596749824741144}

[31msave model weight[0m
fold: 7, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1010312945832072, 'mcrmse': 0.45019563495356}

[31msave model weight[0m
fold: 7, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10285208271363812, 'mcrmse': 0.45436891062077417}

fold: 7, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10399534135980679, 'mcrmse': 0.4565853432336097}

fold: 7, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10269031140124402, 'mcrmse': 0.4539308403515267}

fold: 7, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10301201247498203, 'mcrmse': 0.4550584911127775}

fold: 7, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10118135097234146, 'mcrmse': 0.4504935026331933}

fold: 7, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10168086754544006, 'mcrmse': 0.4515660433587497}

fold: 7, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10141005003086441, 'mcrmse': 0.4510841325903624}

fold: 7, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1030387342585932, 'mcrmse': 0.4550700527186334}

{'train_loss': 0.10739056677980856}

fold: 7, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10349333816019775, 'mcrmse': 0.4556215708230164}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 7, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10437601497944664, 'mcrmse': 0.4579991132966814}

fold: 7, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10030229084784417, 'mcrmse': 0.44850686738465456}

[31msave model weight[0m
fold: 7, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10414106894255903, 'mcrmse': 0.4573477625824263}

fold: 7, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10201457866927242, 'mcrmse': 0.45237925702570686}

fold: 7, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10079248783076206, 'mcrmse': 0.44959344799109124}

fold: 7, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10674038521773979, 'mcrmse': 0.46308603829106854}

fold: 7, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10462520738391926, 'mcrmse': 0.45855455753090063}

fold: 7, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1001537646669561, 'mcrmse': 0.4483550738689958}

[31msave model weight[0m
fold: 7, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10200468412674296, 'mcrmse': 0.4525293858503221}

fold: 7, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10213198017357561, 'mcrmse': 0.45280442067610793}

{'train_loss': 0.10161012130027468}

fold: 7, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10110721306499008, 'mcrmse': 0.45054687281086625}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 7, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10034201057899333, 'mcrmse': 0.4487661583325045}

fold: 7, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.09973695211093445, 'mcrmse': 0.4473848714094508}

[31msave model weight[0m
fold: 7, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.09930161020868575, 'mcrmse': 0.44628876827487823}

[31msave model weight[0m
fold: 7, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10008012930221875, 'mcrmse': 0.44820658844389416}

fold: 7, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10016359907129536, 'mcrmse': 0.44829347876618636}

fold: 7, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.09955890442404296, 'mcrmse': 0.44678336428413623}

fold: 7, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10014055366330135, 'mcrmse': 0.44817391577564913}

fold: 7, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10036761077392437, 'mcrmse': 0.44884294127060126}

fold: 7, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10014283176883103, 'mcrmse': 0.44833120590918873}

fold: 7, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1001634260882502, 'mcrmse': 0.44838227396314234}

{'train_loss': 0.09767134836451574}

fold: 7, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10020027881311944, 'mcrmse': 0.4484671875365849}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/220 [00:00<?, ?it/s]

fold: 8, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.656496733350827, 'mcrmse': 3.234550806760291}

[31msave model weight[0m
fold: 8, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.4471435363945144, 'mcrmse': 2.069963413340603}

[31msave model weight[0m
fold: 8, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.2518240175664882, 'mcrmse': 0.7257039593547093}

[31msave model weight[0m
fold: 8, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.15836482376927305, 'mcrmse': 0.5676457618176577}

[31msave model weight[0m
fold: 8, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13524246375883936, 'mcrmse': 0.5218951815286288}

[31msave model weight[0m
fold: 8, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13122174109491852, 'mcrmse': 0.5151873474895104}

[31msave model weight[0m
fold: 8, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13035781199441238, 'mcrmse': 0.5116189634080677}

[31msave model weight[0m
fold: 8, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11247850662988165, 'mcrmse': 0.47526842733206487}

[31msave model weight[0m
fold: 8, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11359945387410386, 'mcrmse': 0.4775970208674992}

fold: 8, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13778910170430722, 'mcrmse': 0.5261671111790386}

{'train_loss': 0.6252844971689311}

fold: 8, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11583571389431843, 'mcrmse': 0.48212306145909956}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 8, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1099487692117691, 'mcrmse': 0.4699269378045421}

[31msave model weight[0m
fold: 8, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11219124758944791, 'mcrmse': 0.47437837227951907}

fold: 8, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11592556833458678, 'mcrmse': 0.4821486492540801}

fold: 8, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11286553913903663, 'mcrmse': 0.475799323171424}

fold: 8, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1083572621426314, 'mcrmse': 0.4663353876319754}

[31msave model weight[0m
fold: 8, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10904604265147158, 'mcrmse': 0.46791152837038874}

fold: 8, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11702493221863457, 'mcrmse': 0.4844579067758955}

fold: 8, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10626865965326118, 'mcrmse': 0.46181223427561463}

[31msave model weight[0m
fold: 8, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10663260330858133, 'mcrmse': 0.4624150362627138}

fold: 8, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10980146073395639, 'mcrmse': 0.469122897709711}

{'train_loss': 0.10820179629054937}

fold: 8, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11321478962059826, 'mcrmse': 0.47688503326046683}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 8, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10699797417882763, 'mcrmse': 0.462962263738459}

fold: 8, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10715980753493126, 'mcrmse': 0.4635408264850809}

fold: 8, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1067522256766134, 'mcrmse': 0.4625317534592754}

fold: 8, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10650242211492471, 'mcrmse': 0.4621184895354109}

fold: 8, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10890666690781294, 'mcrmse': 0.4674061040037403}

fold: 8, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10641211888674275, 'mcrmse': 0.4618994139644515}

fold: 8, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10503426600066597, 'mcrmse': 0.4587574545053799}

[31msave model weight[0m
fold: 8, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10621423344782856, 'mcrmse': 0.46147036529441104}

fold: 8, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10424350754684195, 'mcrmse': 0.4572536005699154}

[31msave model weight[0m
fold: 8, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10457589435379219, 'mcrmse': 0.45790026098488534}

{'train_loss': 0.1029065061529929}

fold: 8, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10498526749556022, 'mcrmse': 0.45895685585031176}



  0%|          | 0/220 [00:00<?, ?it/s]

fold: 8, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10460237577519453, 'mcrmse': 0.4579687181970682}

fold: 8, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10523576499022486, 'mcrmse': 0.4593437152235654}

fold: 8, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10496856262693015, 'mcrmse': 0.4587873437530736}

fold: 8, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10442302616127312, 'mcrmse': 0.45741715578600123}

fold: 8, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10400481342964465, 'mcrmse': 0.45664214685943766}

[31msave model weight[0m
fold: 8, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1044363329553848, 'mcrmse': 0.45753840662893175}

fold: 8, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10466410015778774, 'mcrmse': 0.45805611696769577}

fold: 8, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10370993255959142, 'mcrmse': 0.45593167771653736}

[31msave model weight[0m
fold: 8, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1038119836955729, 'mcrmse': 0.4561460787846928}

fold: 8, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10378889613749122, 'mcrmse': 0.45610082861699613}

{'train_loss': 0.0992718555033207}

fold: 8, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10378616278433739, 'mcrmse': 0.45609332570980704}

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/219 [00:00<?, ?it/s]

fold: 9, epoch: 0, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 2.659390770659155, 'mcrmse': 3.2264762685320414}

[31msave model weight[0m
fold: 9, epoch: 0, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 1.629145430058849, 'mcrmse': 2.227703681426314}

[31msave model weight[0m
fold: 9, epoch: 0, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.2196259723634136, 'mcrmse': 0.6800458874231807}

[31msave model weight[0m
fold: 9, epoch: 0, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.17618428292323132, 'mcrmse': 0.6036205292300978}

[31msave model weight[0m
fold: 9, epoch: 0, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.14418420134758464, 'mcrmse': 0.5426319725478571}

[31msave model weight[0m
fold: 9, epoch: 0, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.13000950095604877, 'mcrmse': 0.5143056717551859}

[31msave model weight[0m
fold: 9, epoch: 0, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1611608780768453, 'mcrmse': 0.5706299657706834}

fold: 9, epoch: 0, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1375955422313846, 'mcrmse': 0.5289823284380502}

fold: 9, epoch: 0, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11823271731941068, 'mcrmse': 0.4884478152098296}

[31msave model weight[0m
fold: 9, epoch: 0, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12151289290311384, 'mcrmse': 0.4954493601485219}

{'train_loss': 0.6499495356254381}

fold: 9, epoch: 0, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11988229000446748, 'mcrmse': 0.4916971001692287}



  0%|          | 0/219 [00:00<?, ?it/s]

fold: 9, epoch: 1, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1145434885913012, 'mcrmse': 0.480315724470067}

[31msave model weight[0m
fold: 9, epoch: 1, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12071363962426478, 'mcrmse': 0.49305982735647}

fold: 9, epoch: 1, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11541115507787587, 'mcrmse': 0.4816916282602942}

fold: 9, epoch: 1, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11588389122364473, 'mcrmse': 0.4822152720128049}

fold: 9, epoch: 1, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11381004720318075, 'mcrmse': 0.4785330079985948}

[31msave model weight[0m
fold: 9, epoch: 1, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.12129105901231571, 'mcrmse': 0.49353318581994804}

fold: 9, epoch: 1, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11246949905643658, 'mcrmse': 0.4754756736427781}

[31msave model weight[0m
fold: 9, epoch: 1, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11284779073024283, 'mcrmse': 0.47638702194372795}

fold: 9, epoch: 1, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11445341426499035, 'mcrmse': 0.4791103800963363}

fold: 9, epoch: 1, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1111385986513021, 'mcrmse': 0.47207508964877265}

[31msave model weight[0m


{'train_loss': 0.10763096894439497}

fold: 9, epoch: 1, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11869042534001019, 'mcrmse': 0.48919538522026107}



  0%|          | 0/219 [00:00<?, ?it/s]

fold: 9, epoch: 2, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11225873292708884, 'mcrmse': 0.4747793825670382}

fold: 9, epoch: 2, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11081771385304783, 'mcrmse': 0.4714404624104388}

[31msave model weight[0m
fold: 9, epoch: 2, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11225065817030108, 'mcrmse': 0.47452063293301167}

fold: 9, epoch: 2, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11330973980378131, 'mcrmse': 0.47679232183567344}

fold: 9, epoch: 2, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11026878229209355, 'mcrmse': 0.4705329970035295}

[31msave model weight[0m
fold: 9, epoch: 2, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10968333847668706, 'mcrmse': 0.46927489208323525}

[31msave model weight[0m
fold: 9, epoch: 2, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.11088586781097919, 'mcrmse': 0.47194637567063363}

fold: 9, epoch: 2, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10929479906145408, 'mcrmse': 0.46819326244673143}

[31msave model weight[0m
fold: 9, epoch: 2, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10923247389039215, 'mcrmse': 0.4679447771582058}

[31msave model weight[0m
fold: 9, epoch: 2, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10955675660955663, 'mcrmse': 0.4682560403605366}

{'train_loss': 0.09921632559128003}

fold: 9, epoch: 2, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10958708396979741, 'mcrmse': 0.4686498768402193}



  0%|          | 0/219 [00:00<?, ?it/s]

fold: 9, epoch: 3, step: 20


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1096825797338875, 'mcrmse': 0.4687742357474518}

fold: 9, epoch: 3, step: 40


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.1087910668278227, 'mcrmse': 0.46692638865650954}

[31msave model weight[0m
fold: 9, epoch: 3, step: 60


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10884263351255534, 'mcrmse': 0.4669067313891127}

[31msave model weight[0m
fold: 9, epoch: 3, step: 80


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10879893935456568, 'mcrmse': 0.4669016707610142}

[31msave model weight[0m
fold: 9, epoch: 3, step: 100


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10866535774299077, 'mcrmse': 0.4665838862870264}

[31msave model weight[0m
fold: 9, epoch: 3, step: 120


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10875166405220421, 'mcrmse': 0.46679964954090025}

fold: 9, epoch: 3, step: 140


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10850178389524927, 'mcrmse': 0.4662245380468409}

[31msave model weight[0m
fold: 9, epoch: 3, step: 160


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10849531086123719, 'mcrmse': 0.46620479238088525}

[31msave model weight[0m
fold: 9, epoch: 3, step: 180


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10848051339996104, 'mcrmse': 0.46616665964047743}

[31msave model weight[0m
fold: 9, epoch: 3, step: 200


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10848296965871539, 'mcrmse': 0.46617514193821424}

{'train_loss': 0.09390472693870601}

fold: 9, epoch: 3, complete


  0%|          | 0/25 [00:00<?, ?it/s]

{'val_loss': 0.10848626129481258, 'mcrmse': 0.4661798710569352}

fold score： [0.4551733698481003, 0.474643830179701, 0.46398394062719356, 0.4539714698873951, 0.4639817900640965, 0.4514837190272043, 0.4695233713446677, 0.44628876827487823, 0.45593167771653736, 0.46616665964047743]
CV: 0.4603


In [15]:
!pip install kaggle

Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73031 sha256=c3427c0c5b9674f3965f04618e1e1147c8195d7358b96b01fae78c6840406308
  Stored in directory: /root/.cache/pip/wheels/ac/b2/c3/fa4706d469b5879105991d1c8be9a3c2ef329ba9fe2ce5085e
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.12
[0m

In [None]:
from kaggle.api.kaggle_api_extended import KaggleApi
dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

Starting upload for file tokenizer.tar


100%|██████████| 3.22M/3.22M [00:00<00:00, 4.77MB/s]


Upload successful: tokenizer.tar (3MB)
Starting upload for file modelconfig.pth


100%|██████████| 2.42k/2.42k [00:00<00:00, 4.27kB/s]


Upload successful: modelconfig.pth (2KB)
Starting upload for file model.tar


 82%|████████▏ | 4.54G/5.56G [00:43<00:08, 135MB/s] 