In [1]:
! nvidia-smi

Sun Aug 21 05:10:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import os

class Config:
    AUTHOR = "wanwan7123"

    NAME = "feedback-Exp041-essay-deberta-large"
    MODEL_PATH = "microsoft/deberta-large"
    DATASET_PATH = []

    COMPETITION = "feedback-prize-effectiveness"
    COLAB_PATH = "/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback" 
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    api_path = "/content/drive/MyDrive/kaggle.json"

    seed = 42
    num_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    batch_size = 4
    n_epochs = 5
    
    fc_dropout = 0.1
    weight_decay = 0.001
    beta = (0.9, 0.98)
    lr = 5e-6
    eval_steps = 499
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "fp16": True,
        "freezing": True,
        "optim8bit": True,
        "gradient_checkpoint": True
    }

    upload_from_colab = True

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import log_loss
!pip install torch==1.10

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils.rnn import pad_sequence

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch==1.10
  Downloading torch-1.10.0-cp37-cp37m-manylinux1_x86_64.whl (881.9 MB)
[K     |██████████████████████████████▎ | 834.1 MB 1.2 MB/s eta 0:00:42tcmalloc: large alloc 1147494400 bytes == 0x3a7e2000 @  0x7f3704a6f615 0x592b76 0x4df71e 0x59afff 0x515655 0x549576 0x593fce 0x548ae9 0x51566f 0x549576 0x593fce 0x548ae9 0x5127f1 0x598e3b 0x511f68 0x598e3b 0x511f68 0x598e3b 0x511f68 0x4bc98a 0x532e76 0x594b72 0x515600 0x549576 0x593fce 0x548ae9 0x5127f1 0x549576 0x593fce 0x5118f8 0x593dd7
[K     |████████████████████████████████| 881.9 MB 16 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu113
    Uninstalling torch-1.12.1+cu113:
      Successfully uninstalled torch-1.12.1+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour 

In [4]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install transformers==4.16.2
        ! pip install tokenizers==0.11.6
        ! pip install transformers[sentencepiece]

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [None]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train)
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_stratifiedkfold(train, target_col, n_splits, seed):
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupkfold(train, target_col, group_col, n_splits):
    kf = GroupKFold(n_splits=n_splits)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    return fold_series

def get_groupstratifiedkfold(train, target_col, group_col, n_splits, seed):
    kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    generator = kf.split(train, train[target_col], train[group_col])
    fold_series = []
    for fold, (idx_train, idx_valid) in enumerate(generator):
        fold_series.append(pd.Series(fold, index=idx_valid))
    fold_series = pd.concat(fold_series).sort_index()
    train['fold'] = fold_series
    return train, fold_series

In [None]:
# 文章のバグを治す
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [None]:
def flatten(_list):
    return list(itertools.chain.from_iterable(_list))

def even_split(input_ids):
    best_idx = None
    best_len = 100000
    for i in range(1, len(input_ids)):
        x_len = len(flatten(input_ids[:i]))
        y_len = len(flatten(input_ids[i:]))
        diff = abs(x_len - y_len)
        
        if best_len > diff:
            best_len = diff
            best_idx = i
    
    return best_idx

def preprocess_df(df, tokenizer, max_length=198, total_max_length:int=1024):
    df['discourse_text'] = df['discourse_text'].apply(lambda x : resolve_encodings_and_normalize(x))
    df["input_text"] = df["discourse_type"] + " " + df["discourse_text"]

    # one-hot型の準備
    label_ar = df['label'].values
    onehot_ar = np.eye(3)[label_ar] 
    df['Ineffective'] = onehot_ar[:, 0]
    df['Adequate'] = onehot_ar[:, 1]
    df['Effective'] = onehot_ar[:, 2]
    df['label_list'] = df[['Ineffective', 'Adequate', 'Effective']].values.tolist()


    gdf = df.groupby("essay_id")
    fold_df = df.groupby('essay_id')['fold'].apply(lambda x: list(x)[0])
    
    essay_inputs = df.groupby("essay_id")["input_text"].apply(list)
    essay_ids = essay_inputs.index.tolist()
    
    labels = gdf["label_list"].apply(list)
    discourse_ids = gdf["discourse_id"].apply(list)
    
    rows = []
    for i in tqdm(range(len(essay_inputs))):
        # まず全体をtokenizeして1024に収まっていれば、各テキストをtruncationしておく必要はない
        input_ids = tokenizer.batch_encode_plus(essay_inputs[i], max_length=total_max_length, truncation=True)["input_ids"]
        
        if len(flatten(input_ids)) > total_max_length:
            split_idx = even_split(input_ids)
            
            first = input_ids[:split_idx]
            first_seq_ids = [[seq_ids]*len(ids) for seq_ids, ids in enumerate(first)]

            second = input_ids[split_idx:]
            second_seq_ids = [[seq_ids]*len(ids) for seq_ids, ids in enumerate(second)]
            essay_id = essay_ids[i]
            
            rows.append({
                "essay_id":essay_ids[i],
                "group":1,
                "discourse_id": discourse_ids[i][:split_idx ],
                "label":labels[i][:split_idx],
                "input_ids":flatten(first),
                "seq_ids":flatten(first_seq_ids),
                "fold":fold_df[essay_id],
            })
        
            rows.append({
                "essay_id":essay_ids[i],
                "group":2,
                "discourse_id": discourse_ids[i][split_idx:],
                "label":labels[i][split_idx:],
                "input_ids":flatten(second),
                "seq_ids":flatten(second_seq_ids),
                "fold":fold_df[essay_id],
            })
        else:
            # もしかしたら一つのtextで1024超える場合もある？
            seq_ids = [[seq_ids]*len(ids) for seq_ids, ids in enumerate(input_ids)]
            input_ids = flatten(input_ids)
            essay_id = essay_ids[i]
            
            rows.append({
                "essay_id":essay_ids[i],
                "group":1,
                "discourse_id": discourse_ids[i],
                "label":labels[i],
                "input_ids":input_ids,
                "seq_ids":flatten(seq_ids),
                "fold":fold_df[essay_id],
            })
            
    return pd.DataFrame(rows)

In [None]:
class TrainDataset(Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.df = df
        self.tokenizer = tokenizer
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        inputs = {}
        
        input_ids = np.array(row["input_ids"])
        seq_ids = np.array(row["seq_ids"])
        label = np.array(row['label'])
        
        attention_mask = np.array([1 if id != self.tokenizer.pad_token_id else 0 for id in input_ids], dtype=np.int64)
        
        inputs = {
            "discourse_id":row["discourse_id"],
            "input_ids":input_ids,
            "attention_mask":attention_mask,
            "seq_ids":seq_ids,
            "label":label,
        }
        
        return inputs

# バッチごとにパディング操作を行う
class Collator:
    def __init__(self, tokenizer, input_cols, meta_cols=None):
        self.tokenizer = tokenizer
        self.input_cols = input_cols
        
        self.meta_cols = meta_cols if meta_cols is not None else []
        
        self.padding = True
        self.max_length: Optional[int] = None
        self.pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        first = features[0]
        
        input_features = []
        meta_features = {meta_col:[] for meta_col in self.meta_cols}
        
        for f in features:
            input_features.append({col:f[col] for col in self.input_cols})
            for meta_col in self.meta_cols:
                meta_features[meta_col].extend(f[meta_col])
            
        batch = self.tokenizer.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        
        if "label" in first:
            batch["labels"] = pad_sequence([torch.tensor(f["label"], dtype=torch.float) for f in features], batch_first=True, padding_value=-1)

        if "seq_ids" in first:
            batch["seq_ids"] = pad_sequence([torch.tensor(f["seq_ids"], dtype=torch.long) for f in features], batch_first=True, padding_value=-1)
            
        if self.meta_cols is not None:
            batch["meta"] = meta_features

        
        return batch

In [None]:
class SimpleHeader(nn.Module):
    def __init__(self, input_size:int, num_labels):
        super().__init__()
        self.fc = nn.Linear(input_size, num_labels)

        self.cls_dropouts = nn.Sequential(
            nn.Dropout(0.1),
            nn.Dropout(0.2),
            nn.Dropout(0.3),
            nn.Dropout(0.4),
            nn.Dropout(0.5),
        )

    def forward(self, x):
        output = torch.mean(
            torch.stack(
                [self.fc(self.cls_dropouts[i](x)) for i in range(5)],
                dim=0,
            ),
            dim=0,
        )
        return output

class LitModel(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": 0.1,
                "layer_norm_eps": 1e-7,
                "add_pooling_layer": False,
                "num_labels": 3,
            }
        )
        self.backbone = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )   
        self.hidden_size = self.config.hidden_size
        self.fc = nn.Linear(self.config.hidden_size, 3)
        self.header = SimpleHeader(input_size=self.config.hidden_size, num_labels=3)
        self._init_weights(self.header.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=0)
        )
        self._init_weights(self.attention)

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.backbone.gradient_checkpointing_enable()

    @property
    def device(self):
        return self.backbone.device
            
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, last_hidden_state):
        weights = self.attention(last_hidden_state)
        feature = torch.sum(weights * last_hidden_state, dim=0)
        return feature

    # トークンごとに平均取って潰す
    def sequence_mean(self, logits, batch):
        batch_seq_mean = []
        batch_size = batch["input_ids"].shape[0]
        seq_lens = []
        # バッチサイズごとに
        for i in range(batch_size):
            seq_mean = []
            # バッチの長さに応じて処理
            # iはバッチを表すので、そのバッチで何個文章がくっついてるかをjは表す
            for j in range(max(batch["seq_ids"][i])+1):
                # i, j成分（該当するdiscourse）を取り出す
                idx = batch["seq_ids"][i]==j
                idx = idx.nonzero().reshape(-1)
                # idxでtensorの抜き出しを行う
                seq_tensor = torch.index_select(logits[i], 0, idx)
                seq_tensor = self.feature(seq_tensor)
                seq_mean.append(seq_tensor)

            seq_lens.append(len(seq_mean))
            batch_seq_mean.append(torch.vstack(seq_mean))

        return batch_seq_mean, seq_lens
            
    def forward(self, input_dict, labels):
        # batch, len, hidden_size
        output = self.backbone(
            input_ids=input_dict["input_ids"],
            attention_mask=input_dict["attention_mask"]
        )["last_hidden_state"]
    
        # discourse, hidden_size
        output, seq_lens = self.sequence_mean(output, input_dict)
        # discourse, hidden_size
        output = torch.vstack(output)
        output = self.header(output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

In [None]:
# FGM
# https://www.kaggle.com/competitions/tweet-sentiment-extraction/discussion/143764#809408

class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1.0, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0:
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
            self.backup = {}

In [None]:
def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_all_df = pd.DataFrame()
    for fold in cfg.trn_fold:
        # dataset, dataloader
        train_df = train.loc[train['fold']!=fold]
        valid_df = train.loc[train['fold']==fold]

        # psuedo_dataの追加
        psuedo_df = pd.read_csv(f'/content/drive/MyDrive/DataAnalysis/competicion/competicion_feedback/wanwan7123/Input/team_psuedo/ver1/team_psuedo_sample_fold{fold}.csv')
        psuedo_df['discourse_id'] = psuedo_df['discourse_id'].apply(lambda x: eval(str(x)))
        psuedo_df['label'] = psuedo_df['label'].apply(lambda x: eval(str(x)))
        psuedo_df['input_ids'] = psuedo_df['input_ids'].apply(lambda x: eval(str(x)))
        psuedo_df['seq_ids'] = psuedo_df['seq_ids'].apply(lambda x: eval(str(x)))
        train_df = pd.concat([train_df, psuedo_df]).reset_index(drop=True)

        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(train_df, cfg.tokenizer)
        valid_dataset = TrainDataset(valid_df, cfg.tokenizer)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size, 
            shuffle=True,
            pin_memory=True,
            drop_last=True,
            collate_fn = Collator(cfg.tokenizer, input_cols = ["input_ids", "attention_mask"], meta_cols = ["discourse_id"])
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
            collate_fn = Collator(cfg.tokenizer, input_cols = ["input_ids", "attention_mask"], meta_cols = ["discourse_id"])
        )

        # model
        model = LitModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL+'config.pth')
        model = model.to(cfg.device)

        # optimizer, scheduler
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
                'weight_decay': cfg.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
                'weight_decay': 0.0
            }
        ]

        # optimizer
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=cfg.lr,
            betas=cfg.beta,
            weight_decay=cfg.weight_decay,
        )
        
        # scaler
        scaler = GradScaler()

        # enable FGM
        fgm = FGM(model)

        num_train_optimization_steps = int(
            len(train_loader) * cfg.n_epochs // cfg.gradient_accumulation_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * cfg.num_warmup_steps_rate)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )

        # model-training
        criterion = nn.CrossEntropyLoss()
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            train_losses = []
            train_nums = []
            model.train() 
            val_losses_batch = []
            ids = []
            # dataloader回して予測
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs) in enumerate(pbar):

                    meta = inputs.pop("meta", None)
                    inputs = {k:v.to(cfg.device) for k, v in inputs.items()}
                    labels = inputs['labels'][torch.where(inputs['labels'] != -1)].reshape(-1, 3)
                    optimizer.zero_grad()

                    with autocast():
                        loss, output = model(inputs, labels)
                    
                    ids.extend(meta["discourse_id"])
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                    # FGM attack
                    fgm.attack()
                    with autocast():
                        loss_adv, _ = model(inputs, labels)
                    scaler.scale(loss_adv).backward()
                    fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(), 
                            cfg.clip_grad_norm
                        )
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()

                    # evaluating
                    if ((step+1) % cfg.eval_steps == 0):
                        print('===========steps：', step, '==========')
                        val_preds = []
                        val_losses = []
                        val_nums = []
                        ids = []
                        model.eval()
                        with torch.no_grad():
                            with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                                for steps, (inputs) in enumerate(pbar):

                                    meta = inputs.pop("meta", None)
                                    inputs = {k:v.to(cfg.device) for k, v in inputs.items()}
                                    labels = inputs['labels'][torch.where(inputs['labels'] != -1)].reshape(-1, 3)

                                    with autocast():
                                        loss, output = model(inputs, labels)
                                    ids.extend(meta["discourse_id"])

                                    output = output.detach().cpu().numpy()
                                    val_preds.append(output)
                                    val_losses.append(loss.item() * len(labels))
                                    val_nums.append(len(labels))
                                    pbar.set_postfix({
                                        'val_loss': loss.item()
                                    })

                        val_preds = np.vstack(val_preds)
                        val_loss = sum(val_losses) / sum(val_nums)

                        val_log = {
                            'val_loss': val_loss
                        }
                        display(val_log)

                        if best_val_score > val_loss:
                            print("save model weight")
                            best_val_preds = val_preds
                            best_val_score = val_loss
                            torch.save(
                                model.state_dict(), 
                                os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
                            )
                        
                        model.train()

            train_loss = sum(train_losses)/sum(train_nums)
            train_log = {
                'train_loss':train_loss
            }
            display(train_log)

            # evaluating(per epoch)
            print(' ==========end epoch ==========')
            val_preds = []
            val_losses = []
            val_nums = []
            ids = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for steps, (inputs) in enumerate(pbar):

                        meta = inputs.pop("meta", None)
                        inputs = {k:v.to(cfg.device) for k, v in inputs.items()}
                        labels = inputs['labels'][torch.where(inputs['labels'] != -1)].reshape(-1, 3)

                        with autocast():
                            loss, output = model(inputs, labels)
                        ids.extend(meta["discourse_id"])

                        output = output.detach().cpu().numpy()
                        val_preds.append(output)
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.vstack(val_preds)
            val_loss = sum(val_losses) / sum(val_nums)

            val_log = {
                'val_loss': val_loss
            }
            display(val_log)

            if best_val_score > val_loss:
                print("save model weight")
                best_val_preds = val_preds
                best_val_score = val_loss
                torch.save(
                    model.state_dict(), 
                    os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
                )

        oof_df = pd.DataFrame(ids, columns=['discourse_id'])
        oof_df['Ineffective'] = best_val_preds[:, 0]
        oof_df['Adequate'] = best_val_preds[:, 1]
        oof_df['Effective'] = best_val_preds[:, 2]
        oof_df.to_csv(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.csv'))
        oof_all_df = pd.concat([oof_all_df, oof_df], axis=0)
        del model; gc.collect()

    oof_all_df.to_csv(os.path.join(cfg.EXP_PREDS, 'oof_pred.csv'))

    # =====================
    # scoring
    # =====================
    '''
    metric = nn.CrossEntropyLoss()
    score = metric(torch.from_numpy(oof_pred), torch.from_numpy(train['label'].values))
    print('CV:', score.to('cpu').detach().numpy())
    '''
    return oof_all_df

In [None]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
%env TOKENIZERS_PARALLELISM=true
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train_full.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(os.path.join(cfg.OUTPUT_EXP, 'tokenizer'))
train['label'] = train['discourse_effectiveness'].map({'Ineffective':0, 'Adequate':1, 'Effective':2})
train['type_label'] = train['discourse_type'] + '_' + train['discourse_effectiveness']
train, cfg.folds = get_groupstratifiedkfold(train, 'type_label', 'essay_id', cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

train = preprocess_df(train, cfg.tokenizer, max_length=198, total_max_length=1024)
display(train.head())
score = training(cfg, train)

if cfg.upload_from_colab and cfg.COLAB:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

This environment is Google Colab
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2


100%|██████████| 4191/4191 [00:06<00:00, 664.82it/s]


Unnamed: 0,essay_id,group,discourse_id,label,input_ids,seq_ids,fold
0,00066EA9880D,1,"[fe6dfbd53216, ca9e1b60c9fb, 6cf2157f4f19, d92...","[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[1, 32258, 16870, 1672, 1677, 32, 1931, 40054,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
1,000E6DE9E817,1,"[695d181861a1, cd97ee1cc0ad, 1b775274990b, 567...","[[0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, ...","[1, 46884, 38, 524, 7594, 136, 5, 714, 464, 14...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",0
2,0016926B079C,1,"[89304284cef1, 4f2e871a4908, a885c3aa214b, 953...","[[0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, ...","[1, 46884, 38, 206, 14, 521, 74, 1796, 31, 223...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1
3,00203C45FC55,1,"[a713d0f6dc68, 2fd9bb2bfedf, 0e5ecdf1516e, 499...","[[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[1, 32258, 85, 16, 358, 1294, 18, 3366, 7, 28,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
4,0029F4D19C3F,1,"[1082de1aa198, e425994b2124, bf086f9911f6, 29c...","[[0.0, 1.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, ...","[1, 32258, 38, 1317, 47, 32, 2811, 2992, 5, 13...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2


Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 51%|█████     | 498/979 [15:57<13:08,  1.64s/it, loss=0.708, lr=4.89e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.58][A
  0%|          | 1/220 [00:00<00:50,  4.35it/s, val_loss=0.58][A
  0%|          | 1/220 [00:00<00:50,  4.35it/s, val_loss=0.746][A
  1%|          | 2/220 [00:00<00:51,  4.23it/s, val_loss=0.746][A
  1%|          | 2/220 [00:00<00:51,  4.23it/s, val_loss=0.762][A
  1%|▏         | 3/220 [00:00<01:03,  3.41it/s, val_loss=0.762][A
  1%|▏         | 3/220 [00:01<01:03,  3.41it/s, val_loss=0.819][A
  2%|▏         | 4/220 [00:01<01:01,  3.49it/s, val_loss=0.819][A
  2%|▏         | 4/220 [00:01<01:01,  3.49it/s, val_loss=0.796][A
  2%|▏         | 5/220 [00:01<01:08,  3.12it/s, val_loss=0.796][A
  2%|▏         | 5/220 [00:01<01:08,  3.12it/s, val_loss=0.697][A
  3%|▎         | 6/220 [00:01<01:12,  2.96it/s, val_loss=0.697][A
  3%|▎         | 6/220 [00:02<01:12,  2.96it/s, val_loss=0.792][A
  3%|▎         | 7/220 [00:02<01:05,  3.24it/s, val_loss=0.792][A
  3%|▎         | 7/220 [00:0

{'val_loss': 0.6760728931475768}

save model weight


100%|██████████| 979/979 [32:00<00:00,  1.96s/it, loss=0.708, lr=4.89e-6]


{'train_loss': 0.7491365469851481}



100%|██████████| 220/220 [01:06<00:00,  3.33it/s, val_loss=0.0905]


{'val_loss': 0.6331481239484369}

save model weight


 51%|█████     | 498/979 [15:50<19:17,  2.41s/it, loss=0.768, lr=4e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.586][A
  0%|          | 1/220 [00:00<00:53,  4.11it/s, val_loss=0.586][A
  0%|          | 1/220 [00:00<00:53,  4.11it/s, val_loss=0.722][A
  1%|          | 2/220 [00:00<00:52,  4.15it/s, val_loss=0.722][A
  1%|          | 2/220 [00:00<00:52,  4.15it/s, val_loss=0.634][A
  1%|▏         | 3/220 [00:00<01:04,  3.38it/s, val_loss=0.634][A
  1%|▏         | 3/220 [00:01<01:04,  3.38it/s, val_loss=0.637][A
  2%|▏         | 4/220 [00:01<01:01,  3.53it/s, val_loss=0.637][A
  2%|▏         | 4/220 [00:01<01:01,  3.53it/s, val_loss=0.457][A
  2%|▏         | 5/220 [00:01<01:09,  3.08it/s, val_loss=0.457][A
  2%|▏         | 5/220 [00:01<01:09,  3.08it/s, val_loss=0.638][A
  3%|▎         | 6/220 [00:01<01:12,  2.95it/s, val_loss=0.638][A
  3%|▎         | 6/220 [00:02<01:12,  2.95it/s, val_loss=0.845][A
  3%|▎         | 7/220 [00:02<01:05,  3.24it/s, val_loss=0.845][A
  3%|▎         | 7/220 [00

{'val_loss': 0.6433225715742809}

100%|██████████| 979/979 [31:36<00:00,  1.94s/it, loss=0.768, lr=4e-6]


{'train_loss': 0.6422432230093834}



100%|██████████| 220/220 [01:06<00:00,  3.33it/s, val_loss=0.105]


{'val_loss': 0.6235884108351321}

save model weight


 51%|█████     | 498/979 [15:25<13:00,  1.62s/it, loss=0.445, lr=2.53e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.516][A
  0%|          | 1/220 [00:00<00:54,  3.99it/s, val_loss=0.516][A
  0%|          | 1/220 [00:00<00:54,  3.99it/s, val_loss=0.646][A
  1%|          | 2/220 [00:00<00:54,  4.03it/s, val_loss=0.646][A
  1%|          | 2/220 [00:00<00:54,  4.03it/s, val_loss=0.491][A
  1%|▏         | 3/220 [00:00<01:04,  3.34it/s, val_loss=0.491][A
  1%|▏         | 3/220 [00:01<01:04,  3.34it/s, val_loss=0.522][A
  2%|▏         | 4/220 [00:01<01:03,  3.40it/s, val_loss=0.522][A
  2%|▏         | 4/220 [00:01<01:03,  3.40it/s, val_loss=0.433][A
  2%|▏         | 5/220 [00:01<01:09,  3.08it/s, val_loss=0.433][A
  2%|▏         | 5/220 [00:01<01:09,  3.08it/s, val_loss=0.724][A
  3%|▎         | 6/220 [00:01<01:15,  2.85it/s, val_loss=0.724][A
  3%|▎         | 6/220 [00:02<01:15,  2.85it/s, val_loss=0.884][A
  3%|▎         | 7/220 [00:02<01:07,  3.17it/s, val_loss=0.884][A
  3%|▎         | 7/220 [00

{'val_loss': 0.6128174300448354}

save model weight


100%|██████████| 979/979 [31:42<00:00,  1.94s/it, loss=0.445, lr=2.53e-6]


{'train_loss': 0.5820154882464253}



100%|██████████| 220/220 [01:05<00:00,  3.35it/s, val_loss=0.111]


{'val_loss': 0.6035977469717381}

save model weight


 51%|█████     | 498/979 [15:17<16:19,  2.04s/it, loss=0.491, lr=1.04e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.581][A
  0%|          | 1/220 [00:00<00:49,  4.40it/s, val_loss=0.581][A
  0%|          | 1/220 [00:00<00:49,  4.40it/s, val_loss=0.635][A
  1%|          | 2/220 [00:00<00:50,  4.30it/s, val_loss=0.635][A
  1%|          | 2/220 [00:00<00:50,  4.30it/s, val_loss=0.542][A
  1%|▏         | 3/220 [00:00<01:03,  3.43it/s, val_loss=0.542][A
  1%|▏         | 3/220 [00:01<01:03,  3.43it/s, val_loss=0.526][A
  2%|▏         | 4/220 [00:01<01:00,  3.55it/s, val_loss=0.526][A
  2%|▏         | 4/220 [00:01<01:00,  3.55it/s, val_loss=0.446][A
  2%|▏         | 5/220 [00:01<01:08,  3.13it/s, val_loss=0.446][A
  2%|▏         | 5/220 [00:01<01:08,  3.13it/s, val_loss=0.654][A
  3%|▎         | 6/220 [00:01<01:11,  2.98it/s, val_loss=0.654][A
  3%|▎         | 6/220 [00:02<01:11,  2.98it/s, val_loss=0.94] [A
  3%|▎         | 7/220 [00:02<01:05,  3.26it/s, val_loss=0.94][A
  3%|▎         | 7/220 [00:

{'val_loss': 0.6017770652071046}

save model weight


100%|██████████| 979/979 [31:45<00:00,  1.95s/it, loss=0.491, lr=1.04e-6]


{'train_loss': 0.5383913122261149}



100%|██████████| 220/220 [01:06<00:00,  3.31it/s, val_loss=0.0784]


{'val_loss': 0.6026371781920484}



 51%|█████     | 498/979 [15:32<11:34,  1.44s/it, loss=0.761, lr=1.21e-7]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.614][A
  0%|          | 1/220 [00:00<00:49,  4.41it/s, val_loss=0.614][A
  0%|          | 1/220 [00:00<00:49,  4.41it/s, val_loss=0.627][A
  1%|          | 2/220 [00:00<00:51,  4.26it/s, val_loss=0.627][A
  1%|          | 2/220 [00:00<00:51,  4.26it/s, val_loss=0.506][A
  1%|▏         | 3/220 [00:00<01:03,  3.43it/s, val_loss=0.506][A
  1%|▏         | 3/220 [00:01<01:03,  3.43it/s, val_loss=0.504][A
  2%|▏         | 4/220 [00:01<01:00,  3.55it/s, val_loss=0.504][A
  2%|▏         | 4/220 [00:01<01:00,  3.55it/s, val_loss=0.403][A
  2%|▏         | 5/220 [00:01<01:07,  3.20it/s, val_loss=0.403][A
  2%|▏         | 5/220 [00:01<01:07,  3.20it/s, val_loss=0.699][A
  3%|▎         | 6/220 [00:01<01:11,  2.97it/s, val_loss=0.699][A
  3%|▎         | 6/220 [00:02<01:11,  2.97it/s, val_loss=0.964][A
  3%|▎         | 7/220 [00:02<01:05,  3.27it/s, val_loss=0.964][A
  3%|▎         | 7/220 [00

{'val_loss': 0.6040912886462588}

100%|██████████| 979/979 [31:45<00:00,  1.95s/it, loss=0.761, lr=1.21e-7]


{'train_loss': 0.5132046901132575}



100%|██████████| 220/220 [01:06<00:00,  3.33it/s, val_loss=0.071]


{'val_loss': 0.6030305925806836}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 50%|█████     | 498/988 [15:45<13:38,  1.67s/it, loss=0.767, lr=4.9e-6]




  0%|          | 0/218 [00:00<?, ?it/s][A
  0%|          | 0/218 [00:00<?, ?it/s, val_loss=0.687][A
  0%|          | 1/218 [00:00<01:27,  2.48it/s, val_loss=0.687][A
  0%|          | 1/218 [00:00<01:27,  2.48it/s, val_loss=0.703][A
  1%|          | 2/218 [00:00<01:08,  3.17it/s, val_loss=0.703][A
  1%|          | 2/218 [00:01<01:08,  3.17it/s, val_loss=1.05] [A
  1%|▏         | 3/218 [00:01<01:11,  3.00it/s, val_loss=1.05][A
  1%|▏         | 3/218 [00:01<01:11,  3.00it/s, val_loss=0.813][A
  2%|▏         | 4/218 [00:01<01:01,  3.46it/s, val_loss=0.813][A
  2%|▏         | 4/218 [00:01<01:01,  3.46it/s, val_loss=0.825][A
  2%|▏         | 5/218 [00:01<00:57,  3.68it/s, val_loss=0.825][A
  2%|▏         | 5/218 [00:01<00:57,  3.68it/s, val_loss=0.704][A
  3%|▎         | 6/218 [00:01<00:59,  3.54it/s, val_loss=0.704][A
  3%|▎         | 6/218 [00:02<00:59,  3.54it/s, val_loss=0.55] [A
  3%|▎         | 7/218 [00:02<01:05,  3.24it/s, val_loss=0.55][A
  3%|▎         | 7/218 [00:0

{'val_loss': 0.7687123475089199}

save model weight


100%|██████████| 988/988 [32:06<00:00,  1.95s/it, loss=0.767, lr=4.9e-6]


{'train_loss': 0.740562848216819}



100%|██████████| 218/218 [01:01<00:00,  3.55it/s, val_loss=0.681]


{'val_loss': 0.7091695120818744}

save model weight


 50%|█████     | 498/988 [15:27<13:59,  1.71s/it, loss=0.507, lr=4.01e-6]




  0%|          | 0/218 [00:00<?, ?it/s][A
  0%|          | 0/218 [00:00<?, ?it/s, val_loss=0.588][A
  0%|          | 1/218 [00:00<01:28,  2.46it/s, val_loss=0.588][A
  0%|          | 1/218 [00:00<01:28,  2.46it/s, val_loss=0.49] [A
  1%|          | 2/218 [00:00<01:07,  3.19it/s, val_loss=0.49][A
  1%|          | 2/218 [00:01<01:07,  3.19it/s, val_loss=0.814][A
  1%|▏         | 3/218 [00:01<01:11,  2.99it/s, val_loss=0.814][A
  1%|▏         | 3/218 [00:01<01:11,  2.99it/s, val_loss=0.572][A
  2%|▏         | 4/218 [00:01<01:00,  3.55it/s, val_loss=0.572][A
  2%|▏         | 4/218 [00:01<01:00,  3.55it/s, val_loss=0.731][A
  2%|▏         | 5/218 [00:01<00:56,  3.76it/s, val_loss=0.731][A
  2%|▏         | 5/218 [00:01<00:56,  3.76it/s, val_loss=0.565][A
  3%|▎         | 6/218 [00:01<01:00,  3.52it/s, val_loss=0.565][A
  3%|▎         | 6/218 [00:02<01:00,  3.52it/s, val_loss=0.494][A
  3%|▎         | 7/218 [00:02<01:05,  3.24it/s, val_loss=0.494][A
  3%|▎         | 7/218 [00:

{'val_loss': 0.6605667267873635}

save model weight


100%|██████████| 988/988 [32:10<00:00,  1.95s/it, loss=0.507, lr=4.01e-6]


{'train_loss': 0.6285262559760895}



100%|██████████| 218/218 [01:01<00:00,  3.57it/s, val_loss=0.982]


{'val_loss': 0.6631660893270411}



 50%|█████     | 498/988 [15:28<15:57,  1.95s/it, loss=0.516, lr=2.53e-6]




  0%|          | 0/218 [00:00<?, ?it/s][A
  0%|          | 0/218 [00:00<?, ?it/s, val_loss=0.65][A
  0%|          | 1/218 [00:00<01:27,  2.47it/s, val_loss=0.65][A
  0%|          | 1/218 [00:00<01:27,  2.47it/s, val_loss=0.435][A
  1%|          | 2/218 [00:00<01:07,  3.19it/s, val_loss=0.435][A
  1%|          | 2/218 [00:01<01:07,  3.19it/s, val_loss=0.88] [A
  1%|▏         | 3/218 [00:01<01:10,  3.03it/s, val_loss=0.88][A
  1%|▏         | 3/218 [00:01<01:10,  3.03it/s, val_loss=0.594][A
  2%|▏         | 4/218 [00:01<00:59,  3.57it/s, val_loss=0.594][A
  2%|▏         | 4/218 [00:01<00:59,  3.57it/s, val_loss=0.688][A
  2%|▏         | 5/218 [00:01<00:56,  3.79it/s, val_loss=0.688][A
  2%|▏         | 5/218 [00:01<00:56,  3.79it/s, val_loss=0.641][A
  3%|▎         | 6/218 [00:01<00:59,  3.58it/s, val_loss=0.641][A
  3%|▎         | 6/218 [00:02<00:59,  3.58it/s, val_loss=0.492][A
  3%|▎         | 7/218 [00:02<01:05,  3.23it/s, val_loss=0.492][A
  3%|▎         | 7/218 [00:02

{'val_loss': 0.656270076462189}

save model weight


100%|██████████| 988/988 [32:08<00:00,  1.95s/it, loss=0.516, lr=2.53e-6]


{'train_loss': 0.5629981067647957}



100%|██████████| 218/218 [01:01<00:00,  3.52it/s, val_loss=1.05]


{'val_loss': 0.677003660582007}



 50%|█████     | 498/988 [15:30<17:53,  2.19s/it, loss=0.467, lr=1.04e-6]




  0%|          | 0/218 [00:00<?, ?it/s][A
  0%|          | 0/218 [00:00<?, ?it/s, val_loss=0.683][A
  0%|          | 1/218 [00:00<01:28,  2.45it/s, val_loss=0.683][A
  0%|          | 1/218 [00:00<01:28,  2.45it/s, val_loss=0.396][A
  1%|          | 2/218 [00:00<01:09,  3.11it/s, val_loss=0.396][A
  1%|          | 2/218 [00:01<01:09,  3.11it/s, val_loss=0.903][A
  1%|▏         | 3/218 [00:01<01:13,  2.92it/s, val_loss=0.903][A
  1%|▏         | 3/218 [00:01<01:13,  2.92it/s, val_loss=0.577][A
  2%|▏         | 4/218 [00:01<01:01,  3.48it/s, val_loss=0.577][A
  2%|▏         | 4/218 [00:01<01:01,  3.48it/s, val_loss=0.669][A
  2%|▏         | 5/218 [00:01<00:57,  3.68it/s, val_loss=0.669][A
  2%|▏         | 5/218 [00:01<00:57,  3.68it/s, val_loss=0.603][A
  3%|▎         | 6/218 [00:01<01:00,  3.51it/s, val_loss=0.603][A
  3%|▎         | 6/218 [00:02<01:00,  3.51it/s, val_loss=0.446][A
  3%|▎         | 7/218 [00:02<01:05,  3.20it/s, val_loss=0.446][A
  3%|▎         | 7/218 [00

{'val_loss': 0.6519712227905186}

save model weight


100%|██████████| 988/988 [32:15<00:00,  1.96s/it, loss=0.467, lr=1.04e-6]


{'train_loss': 0.5091497667736224}



100%|██████████| 218/218 [01:02<00:00,  3.51it/s, val_loss=0.777]


{'val_loss': 0.643641846657029}

save model weight


 50%|█████     | 498/988 [15:33<14:58,  1.83s/it, loss=0.47, lr=1.23e-7] 




  0%|          | 0/218 [00:00<?, ?it/s][A
  0%|          | 0/218 [00:00<?, ?it/s, val_loss=0.706][A
  0%|          | 1/218 [00:00<01:28,  2.45it/s, val_loss=0.706][A
  0%|          | 1/218 [00:00<01:28,  2.45it/s, val_loss=0.385][A
  1%|          | 2/218 [00:00<01:08,  3.14it/s, val_loss=0.385][A
  1%|          | 2/218 [00:01<01:08,  3.14it/s, val_loss=0.927][A
  1%|▏         | 3/218 [00:01<01:14,  2.89it/s, val_loss=0.927][A
  1%|▏         | 3/218 [00:01<01:14,  2.89it/s, val_loss=0.604][A
  2%|▏         | 4/218 [00:01<01:01,  3.47it/s, val_loss=0.604][A
  2%|▏         | 4/218 [00:01<01:01,  3.47it/s, val_loss=0.637][A
  2%|▏         | 5/218 [00:01<00:59,  3.60it/s, val_loss=0.637][A
  2%|▏         | 5/218 [00:01<00:59,  3.60it/s, val_loss=0.658][A
  3%|▎         | 6/218 [00:01<01:01,  3.47it/s, val_loss=0.658][A
  3%|▎         | 6/218 [00:02<01:01,  3.47it/s, val_loss=0.431][A
  3%|▎         | 7/218 [00:02<01:08,  3.10it/s, val_loss=0.431][A
  3%|▎         | 7/218 [00

{'val_loss': 0.6572373303509034}

100%|██████████| 988/988 [32:08<00:00,  1.95s/it, loss=0.47, lr=1.23e-7]


{'train_loss': 0.47861652165802737}



100%|██████████| 218/218 [01:01<00:00,  3.53it/s, val_loss=0.859]


{'val_loss': 0.6551608099071864}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 51%|█████     | 498/972 [15:44<16:17,  2.06s/it, loss=0.87, lr=4.89e-6] 




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.719][A
  0%|          | 1/220 [00:00<01:06,  3.27it/s, val_loss=0.719][A
  0%|          | 1/220 [00:00<01:06,  3.27it/s, val_loss=0.461][A
  1%|          | 2/220 [00:00<01:01,  3.56it/s, val_loss=0.461][A
  1%|          | 2/220 [00:00<01:01,  3.56it/s, val_loss=0.811][A
  1%|▏         | 3/220 [00:00<01:02,  3.49it/s, val_loss=0.811][A
  1%|▏         | 3/220 [00:01<01:02,  3.49it/s, val_loss=0.743][A
  2%|▏         | 4/220 [00:01<00:54,  3.94it/s, val_loss=0.743][A
  2%|▏         | 4/220 [00:01<00:54,  3.94it/s, val_loss=0.592][A
  2%|▏         | 5/220 [00:01<01:07,  3.18it/s, val_loss=0.592][A
  2%|▏         | 5/220 [00:01<01:07,  3.18it/s, val_loss=0.566][A
  3%|▎         | 6/220 [00:01<01:02,  3.43it/s, val_loss=0.566][A
  3%|▎         | 6/220 [00:02<01:02,  3.43it/s, val_loss=0.756][A
  3%|▎         | 7/220 [00:02<01:03,  3.33it/s, val_loss=0.756][A
  3%|▎         | 7/220 [00

{'val_loss': 0.7131690948987475}

save model weight


100%|██████████| 972/972 [31:36<00:00,  1.95s/it, loss=0.87, lr=4.89e-6]


{'train_loss': 0.7419998481409165}



100%|██████████| 220/220 [01:01<00:00,  3.56it/s, val_loss=0.722]


{'val_loss': 0.6333102556298075}

save model weight


 51%|█████     | 498/972 [15:51<16:20,  2.07s/it, loss=0.559, lr=4e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.723][A
  0%|          | 1/220 [00:00<01:05,  3.36it/s, val_loss=0.723][A
  0%|          | 1/220 [00:00<01:05,  3.36it/s, val_loss=0.371][A
  1%|          | 2/220 [00:00<00:57,  3.78it/s, val_loss=0.371][A
  1%|          | 2/220 [00:00<00:57,  3.78it/s, val_loss=0.656][A
  1%|▏         | 3/220 [00:00<01:00,  3.60it/s, val_loss=0.656][A
  1%|▏         | 3/220 [00:01<01:00,  3.60it/s, val_loss=0.621][A
  2%|▏         | 4/220 [00:01<00:53,  4.07it/s, val_loss=0.621][A
  2%|▏         | 4/220 [00:01<00:53,  4.07it/s, val_loss=0.526][A
  2%|▏         | 5/220 [00:01<01:05,  3.31it/s, val_loss=0.526][A
  2%|▏         | 5/220 [00:01<01:05,  3.31it/s, val_loss=0.646][A
  3%|▎         | 6/220 [00:01<01:00,  3.53it/s, val_loss=0.646][A
  3%|▎         | 6/220 [00:01<01:00,  3.53it/s, val_loss=0.831][A
  3%|▎         | 7/220 [00:01<01:01,  3.45it/s, val_loss=0.831][A
  3%|▎         | 7/220 [00

{'val_loss': 0.6148029336755728}

save model weight


100%|██████████| 972/972 [31:45<00:00,  1.96s/it, loss=0.559, lr=4e-6]


{'train_loss': 0.6317455575861641}



100%|██████████| 220/220 [01:01<00:00,  3.58it/s, val_loss=0.751]


{'val_loss': 0.5990836136115127}

save model weight


 51%|█████     | 498/972 [15:51<15:22,  1.95s/it, loss=0.586, lr=2.52e-6]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.696][A
  0%|          | 1/220 [00:00<01:09,  3.15it/s, val_loss=0.696][A
  0%|          | 1/220 [00:00<01:09,  3.15it/s, val_loss=0.357][A
  1%|          | 2/220 [00:00<00:59,  3.64it/s, val_loss=0.357][A
  1%|          | 2/220 [00:00<00:59,  3.64it/s, val_loss=0.657][A
  1%|▏         | 3/220 [00:00<01:02,  3.45it/s, val_loss=0.657][A
  1%|▏         | 3/220 [00:01<01:02,  3.45it/s, val_loss=0.58] [A
  2%|▏         | 4/220 [00:01<00:55,  3.92it/s, val_loss=0.58][A
  2%|▏         | 4/220 [00:01<00:55,  3.92it/s, val_loss=0.575][A
  2%|▏         | 5/220 [00:01<01:06,  3.24it/s, val_loss=0.575][A
  2%|▏         | 5/220 [00:01<01:06,  3.24it/s, val_loss=0.657][A
  3%|▎         | 6/220 [00:01<01:02,  3.40it/s, val_loss=0.657][A
  3%|▎         | 6/220 [00:02<01:02,  3.40it/s, val_loss=0.673][A
  3%|▎         | 7/220 [00:02<01:03,  3.37it/s, val_loss=0.673][A
  3%|▎         | 7/220 [00:

{'val_loss': 0.6039340309356088}

100%|██████████| 972/972 [31:37<00:00,  1.95s/it, loss=0.586, lr=2.52e-6]


{'train_loss': 0.5703548238799305}



100%|██████████| 220/220 [01:01<00:00,  3.58it/s, val_loss=0.779]


{'val_loss': 0.5921410898307174}

save model weight


 51%|█████     | 498/972 [15:24<12:37,  1.60s/it, loss=0.48, lr=1.03e-6] 




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.766][A
  0%|          | 1/220 [00:00<01:08,  3.21it/s, val_loss=0.766][A
  0%|          | 1/220 [00:00<01:08,  3.21it/s, val_loss=0.348][A
  1%|          | 2/220 [00:00<00:58,  3.71it/s, val_loss=0.348][A
  1%|          | 2/220 [00:00<00:58,  3.71it/s, val_loss=0.659][A
  1%|▏         | 3/220 [00:00<01:00,  3.56it/s, val_loss=0.659][A
  1%|▏         | 3/220 [00:01<01:00,  3.56it/s, val_loss=0.605][A
  2%|▏         | 4/220 [00:01<00:54,  3.98it/s, val_loss=0.605][A
  2%|▏         | 4/220 [00:01<00:54,  3.98it/s, val_loss=0.546][A
  2%|▏         | 5/220 [00:01<01:06,  3.25it/s, val_loss=0.546][A
  2%|▏         | 5/220 [00:01<01:06,  3.25it/s, val_loss=0.611][A
  3%|▎         | 6/220 [00:01<01:01,  3.50it/s, val_loss=0.611][A
  3%|▎         | 6/220 [00:02<01:01,  3.50it/s, val_loss=0.731][A
  3%|▎         | 7/220 [00:02<01:03,  3.35it/s, val_loss=0.731][A
  3%|▎         | 7/220 [00

{'val_loss': 0.5933913057533605}

100%|██████████| 972/972 [31:38<00:00,  1.95s/it, loss=0.48, lr=1.03e-6]


{'train_loss': 0.5149236434713925}



100%|██████████| 220/220 [01:01<00:00,  3.60it/s, val_loss=0.865]


{'val_loss': 0.5943492608287447}



 51%|█████     | 498/972 [15:40<15:09,  1.92s/it, loss=0.223, lr=1.19e-7]




  0%|          | 0/220 [00:00<?, ?it/s][A
  0%|          | 0/220 [00:00<?, ?it/s, val_loss=0.786][A
  0%|          | 1/220 [00:00<01:06,  3.31it/s, val_loss=0.786][A
  0%|          | 1/220 [00:00<01:06,  3.31it/s, val_loss=0.358][A
  1%|          | 2/220 [00:00<00:57,  3.78it/s, val_loss=0.358][A
  1%|          | 2/220 [00:00<00:57,  3.78it/s, val_loss=0.679][A
  1%|▏         | 3/220 [00:00<01:00,  3.61it/s, val_loss=0.679][A
  1%|▏         | 3/220 [00:01<01:00,  3.61it/s, val_loss=0.564][A
  2%|▏         | 4/220 [00:01<00:53,  4.03it/s, val_loss=0.564][A
  2%|▏         | 4/220 [00:01<00:53,  4.03it/s, val_loss=0.505][A
  2%|▏         | 5/220 [00:01<01:06,  3.26it/s, val_loss=0.505][A
  2%|▏         | 5/220 [00:01<01:06,  3.26it/s, val_loss=0.576][A
  3%|▎         | 6/220 [00:01<01:01,  3.48it/s, val_loss=0.576][A
  3%|▎         | 6/220 [00:02<01:01,  3.48it/s, val_loss=0.711][A
  3%|▎         | 7/220 [00:02<01:03,  3.34it/s, val_loss=0.711][A
  3%|▎         | 7/220 [00

{'val_loss': 0.5984515917934446}

100%|██████████| 972/972 [31:29<00:00,  1.94s/it, loss=0.223, lr=1.19e-7]


{'train_loss': 0.4862157244331998}



100%|██████████| 220/220 [01:01<00:00,  3.60it/s, val_loss=0.854]


{'val_loss': 0.5985377777854722}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 51%|█████     | 498/977 [15:49<15:26,  1.93s/it, loss=0.685, lr=4.89e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.387][A
  0%|          | 1/219 [00:00<01:37,  2.23it/s, val_loss=0.387][A
  0%|          | 1/219 [00:00<01:37,  2.23it/s, val_loss=0.498][A
  1%|          | 2/219 [00:00<01:11,  3.02it/s, val_loss=0.498][A
  1%|          | 2/219 [00:00<01:11,  3.02it/s, val_loss=0.572][A
  1%|▏         | 3/219 [00:00<01:06,  3.22it/s, val_loss=0.572][A
  1%|▏         | 3/219 [00:01<01:06,  3.22it/s, val_loss=0.621][A
  2%|▏         | 4/219 [00:01<01:05,  3.30it/s, val_loss=0.621][A
  2%|▏         | 4/219 [00:01<01:05,  3.30it/s, val_loss=0.699][A
  2%|▏         | 5/219 [00:01<01:01,  3.46it/s, val_loss=0.699][A
  2%|▏         | 5/219 [00:01<01:01,  3.46it/s, val_loss=0.581][A
  3%|▎         | 6/219 [00:01<01:02,  3.40it/s, val_loss=0.581][A
  3%|▎         | 6/219 [00:02<01:02,  3.40it/s, val_loss=0.71] [A
  3%|▎         | 7/219 [00:02<01:04,  3.27it/s, val_loss=0.71][A
  3%|▎         | 7/219 [00:

{'val_loss': 0.6836670250590949}

save model weight


100%|██████████| 977/977 [31:50<00:00,  1.96s/it, loss=0.685, lr=4.89e-6]


{'train_loss': 0.742093698699068}



100%|██████████| 219/219 [01:01<00:00,  3.57it/s, val_loss=0.954]


{'val_loss': 0.6560595212344368}

save model weight


 51%|█████     | 498/977 [15:37<15:16,  1.91s/it, loss=1.26, lr=4e-6] 




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.357][A
  0%|          | 1/219 [00:00<01:33,  2.34it/s, val_loss=0.357][A
  0%|          | 1/219 [00:00<01:33,  2.34it/s, val_loss=0.68] [A
  1%|          | 2/219 [00:00<01:09,  3.12it/s, val_loss=0.68][A
  1%|          | 2/219 [00:00<01:09,  3.12it/s, val_loss=0.477][A
  1%|▏         | 3/219 [00:00<01:05,  3.30it/s, val_loss=0.477][A
  1%|▏         | 3/219 [00:01<01:05,  3.30it/s, val_loss=0.545][A
  2%|▏         | 4/219 [00:01<01:04,  3.35it/s, val_loss=0.545][A
  2%|▏         | 4/219 [00:01<01:04,  3.35it/s, val_loss=0.835][A
  2%|▏         | 5/219 [00:01<01:02,  3.41it/s, val_loss=0.835][A
  2%|▏         | 5/219 [00:01<01:02,  3.41it/s, val_loss=0.492][A
  3%|▎         | 6/219 [00:01<01:05,  3.26it/s, val_loss=0.492][A
  3%|▎         | 6/219 [00:02<01:05,  3.26it/s, val_loss=0.677][A
  3%|▎         | 7/219 [00:02<01:06,  3.19it/s, val_loss=0.677][A
  3%|▎         | 7/219 [00:

{'val_loss': 0.6448034606617142}

save model weight


100%|██████████| 977/977 [32:00<00:00,  1.97s/it, loss=1.26, lr=4e-6]


{'train_loss': 0.627691711836132}



100%|██████████| 219/219 [01:01<00:00,  3.54it/s, val_loss=0.886]


{'val_loss': 0.6299298923813653}

save model weight


 51%|█████     | 498/977 [15:42<13:44,  1.72s/it, loss=0.579, lr=2.52e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.42][A
  0%|          | 1/219 [00:00<01:32,  2.35it/s, val_loss=0.42][A
  0%|          | 1/219 [00:00<01:32,  2.35it/s, val_loss=0.708][A
  1%|          | 2/219 [00:00<01:09,  3.12it/s, val_loss=0.708][A
  1%|          | 2/219 [00:00<01:09,  3.12it/s, val_loss=0.46] [A
  1%|▏         | 3/219 [00:00<01:06,  3.23it/s, val_loss=0.46][A
  1%|▏         | 3/219 [00:01<01:06,  3.23it/s, val_loss=0.486][A
  2%|▏         | 4/219 [00:01<01:06,  3.25it/s, val_loss=0.486][A
  2%|▏         | 4/219 [00:01<01:06,  3.25it/s, val_loss=0.767][A
  2%|▏         | 5/219 [00:01<01:01,  3.47it/s, val_loss=0.767][A
  2%|▏         | 5/219 [00:01<01:01,  3.47it/s, val_loss=0.477][A
  3%|▎         | 6/219 [00:01<01:02,  3.40it/s, val_loss=0.477][A
  3%|▎         | 6/219 [00:02<01:02,  3.40it/s, val_loss=0.624][A
  3%|▎         | 7/219 [00:02<01:04,  3.28it/s, val_loss=0.624][A
  3%|▎         | 7/219 [00:02

{'val_loss': 0.6149686198267112}

save model weight


100%|██████████| 977/977 [32:09<00:00,  1.98s/it, loss=0.579, lr=2.52e-6]


{'train_loss': 0.5738842532611096}



100%|██████████| 219/219 [01:01<00:00,  3.54it/s, val_loss=0.902]


{'val_loss': 0.6104033085338422}

save model weight


 51%|█████     | 498/977 [15:35<15:20,  1.92s/it, loss=0.596, lr=1.04e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.351][A
  0%|          | 1/219 [00:00<01:32,  2.35it/s, val_loss=0.351][A
  0%|          | 1/219 [00:00<01:32,  2.35it/s, val_loss=0.628][A
  1%|          | 2/219 [00:00<01:09,  3.10it/s, val_loss=0.628][A
  1%|          | 2/219 [00:00<01:09,  3.10it/s, val_loss=0.648][A
  1%|▏         | 3/219 [00:00<01:06,  3.26it/s, val_loss=0.648][A
  1%|▏         | 3/219 [00:01<01:06,  3.26it/s, val_loss=0.534][A
  2%|▏         | 4/219 [00:01<01:05,  3.29it/s, val_loss=0.534][A
  2%|▏         | 4/219 [00:01<01:05,  3.29it/s, val_loss=0.822][A
  2%|▏         | 5/219 [00:01<01:01,  3.46it/s, val_loss=0.822][A
  2%|▏         | 5/219 [00:01<01:01,  3.46it/s, val_loss=0.495][A
  3%|▎         | 6/219 [00:01<01:02,  3.41it/s, val_loss=0.495][A
  3%|▎         | 6/219 [00:02<01:02,  3.41it/s, val_loss=0.574][A
  3%|▎         | 7/219 [00:02<01:04,  3.27it/s, val_loss=0.574][A
  3%|▎         | 7/219 [00

{'val_loss': 0.6365968097226433}

100%|██████████| 977/977 [31:50<00:00,  1.96s/it, loss=0.596, lr=1.04e-6]


{'train_loss': 0.5302646541660709}



100%|██████████| 219/219 [01:01<00:00,  3.55it/s, val_loss=0.976]


{'val_loss': 0.6122901493010366}



 51%|█████     | 498/977 [15:27<12:59,  1.63s/it, loss=0.495, lr=1.2e-7] 




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.384][A
  0%|          | 1/219 [00:00<01:38,  2.22it/s, val_loss=0.384][A
  0%|          | 1/219 [00:00<01:38,  2.22it/s, val_loss=0.7]  [A
  1%|          | 2/219 [00:00<01:14,  2.91it/s, val_loss=0.7][A
  1%|          | 2/219 [00:01<01:14,  2.91it/s, val_loss=0.479][A
  1%|▏         | 3/219 [00:01<01:08,  3.15it/s, val_loss=0.479][A
  1%|▏         | 3/219 [00:01<01:08,  3.15it/s, val_loss=0.459][A
  2%|▏         | 4/219 [00:01<01:06,  3.26it/s, val_loss=0.459][A
  2%|▏         | 4/219 [00:01<01:06,  3.26it/s, val_loss=0.751][A
  2%|▏         | 5/219 [00:01<01:02,  3.43it/s, val_loss=0.751][A
  2%|▏         | 5/219 [00:01<01:02,  3.43it/s, val_loss=0.422][A
  3%|▎         | 6/219 [00:01<01:04,  3.31it/s, val_loss=0.422][A
  3%|▎         | 6/219 [00:02<01:04,  3.31it/s, val_loss=0.611][A
  3%|▎         | 7/219 [00:02<01:05,  3.23it/s, val_loss=0.611][A
  3%|▎         | 7/219 [00:0

{'val_loss': 0.610746067239414}

100%|██████████| 977/977 [31:38<00:00,  1.94s/it, loss=0.495, lr=1.2e-7]


{'train_loss': 0.5048325823876214}



100%|██████████| 219/219 [01:01<00:00,  3.58it/s, val_loss=0.975]


{'val_loss': 0.6114572223512098}

Some weights of the model checkpoint at microsoft/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




 50%|████▉     | 498/1001 [15:50<18:04,  2.16s/it, loss=0.983, lr=4.9e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.777][A
  0%|          | 1/219 [00:00<00:38,  5.67it/s, val_loss=0.777][A
  0%|          | 1/219 [00:00<00:38,  5.67it/s, val_loss=1.03] [A
  1%|          | 2/219 [00:00<00:43,  4.96it/s, val_loss=1.03][A
  1%|          | 2/219 [00:00<00:43,  4.96it/s, val_loss=0.61][A
  1%|▏         | 3/219 [00:00<00:47,  4.56it/s, val_loss=0.61][A
  1%|▏         | 3/219 [00:00<00:47,  4.56it/s, val_loss=0.571][A
  2%|▏         | 4/219 [00:00<00:57,  3.74it/s, val_loss=0.571][A
  2%|▏         | 4/219 [00:01<00:57,  3.74it/s, val_loss=0.895][A
  2%|▏         | 5/219 [00:01<01:08,  3.12it/s, val_loss=0.895][A
  2%|▏         | 5/219 [00:01<01:08,  3.12it/s, val_loss=0.873][A
  3%|▎         | 6/219 [00:01<01:02,  3.39it/s, val_loss=0.873][A
  3%|▎         | 6/219 [00:01<01:02,  3.39it/s, val_loss=0.734][A
  3%|▎         | 7/219 [00:01<01:00,  3.51it/s, val_loss=0.734][A
  3%|▎         | 7/219 [00:02

{'val_loss': 0.7557708076579254}

save model weight


100%|█████████▉| 997/1001 [32:33<00:07,  1.93s/it, loss=0.983, lr=4.9e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.697][A
  0%|          | 1/219 [00:00<00:39,  5.58it/s, val_loss=0.697][A
  0%|          | 1/219 [00:00<00:39,  5.58it/s, val_loss=0.773][A
  1%|          | 2/219 [00:00<00:42,  5.12it/s, val_loss=0.773][A
  1%|          | 2/219 [00:00<00:42,  5.12it/s, val_loss=0.618][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.618][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.487][A
  2%|▏         | 4/219 [00:00<00:57,  3.73it/s, val_loss=0.487][A
  2%|▏         | 4/219 [00:01<00:57,  3.73it/s, val_loss=0.802][A
  2%|▏         | 5/219 [00:01<01:07,  3.17it/s, val_loss=0.802][A
  2%|▏         | 5/219 [00:01<01:07,  3.17it/s, val_loss=0.689][A
  3%|▎         | 6/219 [00:01<01:04,  3.31it/s, val_loss=0.689][A
  3%|▎         | 6/219 [00:01<01:04,  3.31it/s, val_loss=0.606][A
  3%|▎         | 7/219 [00:01<01:01,  3.46it/s, val_loss=0.606][A
  3%|▎         | 7/219 [00

{'val_loss': 0.6782179464273166}

save model weight


100%|██████████| 1001/1001 [33:50<00:00,  2.03s/it, loss=0.983, lr=4.9e-6]


{'train_loss': 0.7411061039176751}



100%|██████████| 219/219 [01:02<00:00,  3.52it/s, val_loss=0.452]


{'val_loss': 0.67832522434912}



 50%|████▉     | 498/1001 [15:37<19:36,  2.34s/it, loss=0.66, lr=4.02e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.815][A
  0%|          | 1/219 [00:00<00:38,  5.65it/s, val_loss=0.815][A
  0%|          | 1/219 [00:00<00:38,  5.65it/s, val_loss=0.674][A
  1%|          | 2/219 [00:00<00:43,  5.03it/s, val_loss=0.674][A
  1%|          | 2/219 [00:00<00:43,  5.03it/s, val_loss=0.653][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.653][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.458][A
  2%|▏         | 4/219 [00:00<00:57,  3.75it/s, val_loss=0.458][A
  2%|▏         | 4/219 [00:01<00:57,  3.75it/s, val_loss=0.986][A
  2%|▏         | 5/219 [00:01<01:06,  3.19it/s, val_loss=0.986][A
  2%|▏         | 5/219 [00:01<01:06,  3.19it/s, val_loss=0.702][A
  3%|▎         | 6/219 [00:01<01:01,  3.45it/s, val_loss=0.702][A
  3%|▎         | 6/219 [00:01<01:01,  3.45it/s, val_loss=0.607][A
  3%|▎         | 7/219 [00:01<00:59,  3.56it/s, val_loss=0.607][A
  3%|▎         | 7/219 [00

{'val_loss': 0.6894083042140192}

100%|█████████▉| 997/1001 [32:17<00:08,  2.19s/it, loss=0.66, lr=4.02e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.805][A
  0%|          | 1/219 [00:00<00:37,  5.76it/s, val_loss=0.805][A
  0%|          | 1/219 [00:00<00:37,  5.76it/s, val_loss=0.618][A
  1%|          | 2/219 [00:00<00:43,  5.03it/s, val_loss=0.618][A
  1%|          | 2/219 [00:00<00:43,  5.03it/s, val_loss=0.645][A
  1%|▏         | 3/219 [00:00<00:48,  4.46it/s, val_loss=0.645][A
  1%|▏         | 3/219 [00:01<00:48,  4.46it/s, val_loss=0.426][A
  2%|▏         | 4/219 [00:01<01:00,  3.57it/s, val_loss=0.426][A
  2%|▏         | 4/219 [00:01<01:00,  3.57it/s, val_loss=0.864][A
  2%|▏         | 5/219 [00:01<01:09,  3.08it/s, val_loss=0.864][A
  2%|▏         | 5/219 [00:01<01:09,  3.08it/s, val_loss=0.781][A
  3%|▎         | 6/219 [00:01<01:04,  3.29it/s, val_loss=0.781][A
  3%|▎         | 6/219 [00:01<01:04,  3.29it/s, val_loss=0.596][A
  3%|▎         | 7/219 [00:01<01:01,  3.46it/s, val_loss=0.596][A
  3%|▎         | 7/219 [00

{'val_loss': 0.657155149048147}

save model weight


100%|██████████| 1001/1001 [33:32<00:00,  2.01s/it, loss=0.66, lr=4.02e-6]


{'train_loss': 0.6252033801399711}



100%|██████████| 219/219 [01:02<00:00,  3.52it/s, val_loss=0.452]


{'val_loss': 0.6381813602249128}

save model weight


 50%|████▉     | 498/1001 [15:43<15:45,  1.88s/it, loss=0.427, lr=2.54e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.695][A
  0%|          | 1/219 [00:00<00:41,  5.20it/s, val_loss=0.695][A
  0%|          | 1/219 [00:00<00:41,  5.20it/s, val_loss=0.526][A
  1%|          | 2/219 [00:00<00:45,  4.74it/s, val_loss=0.526][A
  1%|          | 2/219 [00:00<00:45,  4.74it/s, val_loss=0.59] [A
  1%|▏         | 3/219 [00:00<00:48,  4.46it/s, val_loss=0.59][A
  1%|▏         | 3/219 [00:00<00:48,  4.46it/s, val_loss=0.377][A
  2%|▏         | 4/219 [00:00<00:58,  3.69it/s, val_loss=0.377][A
  2%|▏         | 4/219 [00:01<00:58,  3.69it/s, val_loss=0.647][A
  2%|▏         | 5/219 [00:01<01:08,  3.15it/s, val_loss=0.647][A
  2%|▏         | 5/219 [00:01<01:08,  3.15it/s, val_loss=0.613][A
  3%|▎         | 6/219 [00:01<01:02,  3.41it/s, val_loss=0.613][A
  3%|▎         | 6/219 [00:01<01:02,  3.41it/s, val_loss=0.611][A
  3%|▎         | 7/219 [00:01<01:02,  3.41it/s, val_loss=0.611][A
  3%|▎         | 7/219 [00:

{'val_loss': 0.6265627074565727}

save model weight


100%|█████████▉| 997/1001 [32:25<00:07,  1.95s/it, loss=0.427, lr=2.54e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.713][A
  0%|          | 1/219 [00:00<00:39,  5.47it/s, val_loss=0.713][A
  0%|          | 1/219 [00:00<00:39,  5.47it/s, val_loss=0.676][A
  1%|          | 2/219 [00:00<00:44,  4.86it/s, val_loss=0.676][A
  1%|          | 2/219 [00:00<00:44,  4.86it/s, val_loss=0.618][A
  1%|▏         | 3/219 [00:00<00:47,  4.51it/s, val_loss=0.618][A
  1%|▏         | 3/219 [00:00<00:47,  4.51it/s, val_loss=0.423][A
  2%|▏         | 4/219 [00:00<00:58,  3.70it/s, val_loss=0.423][A
  2%|▏         | 4/219 [00:01<00:58,  3.70it/s, val_loss=0.757][A
  2%|▏         | 5/219 [00:01<01:08,  3.12it/s, val_loss=0.757][A
  2%|▏         | 5/219 [00:01<01:08,  3.12it/s, val_loss=0.673][A
  3%|▎         | 6/219 [00:01<01:03,  3.37it/s, val_loss=0.673][A
  3%|▎         | 6/219 [00:01<01:03,  3.37it/s, val_loss=0.524][A
  3%|▎         | 7/219 [00:01<01:00,  3.50it/s, val_loss=0.524][A
  3%|▎         | 7/219 [00

{'val_loss': 0.6244957696830082}

save model weight


100%|██████████| 1001/1001 [33:42<00:00,  2.02s/it, loss=0.427, lr=2.54e-6]


{'train_loss': 0.5628450729036819}



100%|██████████| 219/219 [01:01<00:00,  3.54it/s, val_loss=0.477]


{'val_loss': 0.6235288999228954}

save model weight


 50%|████▉     | 498/1001 [15:46<13:35,  1.62s/it, loss=0.282, lr=1.05e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.725][A
  0%|          | 1/219 [00:00<00:41,  5.32it/s, val_loss=0.725][A
  0%|          | 1/219 [00:00<00:41,  5.32it/s, val_loss=0.636][A
  1%|          | 2/219 [00:00<00:43,  4.96it/s, val_loss=0.636][A
  1%|          | 2/219 [00:00<00:43,  4.96it/s, val_loss=0.63] [A
  1%|▏         | 3/219 [00:00<00:48,  4.48it/s, val_loss=0.63][A
  1%|▏         | 3/219 [00:01<00:48,  4.48it/s, val_loss=0.423][A
  2%|▏         | 4/219 [00:01<01:00,  3.53it/s, val_loss=0.423][A
  2%|▏         | 4/219 [00:01<01:00,  3.53it/s, val_loss=0.79] [A
  2%|▏         | 5/219 [00:01<01:10,  3.06it/s, val_loss=0.79][A
  2%|▏         | 5/219 [00:01<01:10,  3.06it/s, val_loss=0.708][A
  3%|▎         | 6/219 [00:01<01:04,  3.28it/s, val_loss=0.708][A
  3%|▎         | 6/219 [00:01<01:04,  3.28it/s, val_loss=0.528][A
  3%|▎         | 7/219 [00:01<01:02,  3.40it/s, val_loss=0.528][A
  3%|▎         | 7/219 [00:0

{'val_loss': 0.6265914776479741}

100%|█████████▉| 997/1001 [32:26<00:06,  1.58s/it, loss=0.282, lr=1.05e-6]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.674][A
  0%|          | 1/219 [00:00<00:38,  5.71it/s, val_loss=0.674][A
  0%|          | 1/219 [00:00<00:38,  5.71it/s, val_loss=0.624][A
  1%|          | 2/219 [00:00<00:42,  5.14it/s, val_loss=0.624][A
  1%|          | 2/219 [00:00<00:42,  5.14it/s, val_loss=0.627][A
  1%|▏         | 3/219 [00:00<00:46,  4.66it/s, val_loss=0.627][A
  1%|▏         | 3/219 [00:00<00:46,  4.66it/s, val_loss=0.443][A
  2%|▏         | 4/219 [00:00<00:57,  3.77it/s, val_loss=0.443][A
  2%|▏         | 4/219 [00:01<00:57,  3.77it/s, val_loss=0.689][A
  2%|▏         | 5/219 [00:01<01:07,  3.16it/s, val_loss=0.689][A
  2%|▏         | 5/219 [00:01<01:07,  3.16it/s, val_loss=0.679][A
  3%|▎         | 6/219 [00:01<01:02,  3.42it/s, val_loss=0.679][A
  3%|▎         | 6/219 [00:01<01:02,  3.42it/s, val_loss=0.55] [A
  3%|▎         | 7/219 [00:01<00:59,  3.54it/s, val_loss=0.55][A
  3%|▎         | 7/219 [00:

{'val_loss': 0.6122578247863971}

save model weight


100%|██████████| 1001/1001 [33:44<00:00,  2.02s/it, loss=0.282, lr=1.05e-6]


{'train_loss': 0.5070557399330649}



100%|██████████| 219/219 [01:02<00:00,  3.52it/s, val_loss=0.486]


{'val_loss': 0.6137917176774942}



 50%|████▉     | 498/1001 [15:48<15:55,  1.90s/it, loss=0.364, lr=1.26e-7]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.674][A
  0%|          | 1/219 [00:00<00:38,  5.66it/s, val_loss=0.674][A
  0%|          | 1/219 [00:00<00:38,  5.66it/s, val_loss=0.616][A
  1%|          | 2/219 [00:00<00:43,  4.97it/s, val_loss=0.616][A
  1%|          | 2/219 [00:00<00:43,  4.97it/s, val_loss=0.617][A
  1%|▏         | 3/219 [00:00<00:47,  4.54it/s, val_loss=0.617][A
  1%|▏         | 3/219 [00:00<00:47,  4.54it/s, val_loss=0.41] [A
  2%|▏         | 4/219 [00:00<00:57,  3.71it/s, val_loss=0.41][A
  2%|▏         | 4/219 [00:01<00:57,  3.71it/s, val_loss=0.649][A
  2%|▏         | 5/219 [00:01<01:09,  3.10it/s, val_loss=0.649][A
  2%|▏         | 5/219 [00:01<01:09,  3.10it/s, val_loss=0.65] [A
  3%|▎         | 6/219 [00:01<01:03,  3.36it/s, val_loss=0.65][A
  3%|▎         | 6/219 [00:01<01:03,  3.36it/s, val_loss=0.56][A
  3%|▎         | 7/219 [00:01<01:00,  3.50it/s, val_loss=0.56][A
  3%|▎         | 7/219 [00:02<

{'val_loss': 0.611240423214267}

save model weight


100%|█████████▉| 997/1001 [32:42<00:07,  1.87s/it, loss=0.364, lr=1.26e-7]




  0%|          | 0/219 [00:00<?, ?it/s][A
  0%|          | 0/219 [00:00<?, ?it/s, val_loss=0.676][A
  0%|          | 1/219 [00:00<00:38,  5.69it/s, val_loss=0.676][A
  0%|          | 1/219 [00:00<00:38,  5.69it/s, val_loss=0.635][A
  1%|          | 2/219 [00:00<00:42,  5.11it/s, val_loss=0.635][A
  1%|          | 2/219 [00:00<00:42,  5.11it/s, val_loss=0.621][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.621][A
  1%|▏         | 3/219 [00:00<00:47,  4.59it/s, val_loss=0.412][A
  2%|▏         | 4/219 [00:00<00:57,  3.74it/s, val_loss=0.412][A
  2%|▏         | 4/219 [00:01<00:57,  3.74it/s, val_loss=0.655][A
  2%|▏         | 5/219 [00:01<01:07,  3.16it/s, val_loss=0.655][A
  2%|▏         | 5/219 [00:01<01:07,  3.16it/s, val_loss=0.659][A
  3%|▎         | 6/219 [00:01<01:02,  3.40it/s, val_loss=0.659][A
  3%|▎         | 6/219 [00:01<01:02,  3.40it/s, val_loss=0.552][A
  3%|▎         | 7/219 [00:01<01:00,  3.49it/s, val_loss=0.552][A
  3%|▎         | 7/219 [00

{'val_loss': 0.6109003057465782}

save model weight


100%|██████████| 1001/1001 [33:56<00:00,  2.03s/it, loss=0.364, lr=1.26e-7]


{'train_loss': 0.47679371799965714}



100%|██████████| 219/219 [01:02<00:00,  3.52it/s, val_loss=0.49]


{'val_loss': 0.610898596915049}

save model weight
Starting upload for file model.tar


100%|██████████| 7.56G/7.56G [03:51<00:00, 35.1MB/s]


Upload successful: model.tar (8GB)
Starting upload for file fig.tar


100%|██████████| 10.0k/10.0k [00:01<00:00, 5.28kB/s]


Upload successful: fig.tar (10KB)
Starting upload for file preds.tar


100%|██████████| 2.89M/2.89M [00:01<00:00, 1.63MB/s]


Upload successful: preds.tar (3MB)
Starting upload for file tokenizer.tar


100%|██████████| 3.22M/3.22M [00:01<00:00, 1.75MB/s]


Upload successful: tokenizer.tar (3MB)
Starting upload for file modelconfig.pth


100%|██████████| 2.36k/2.36k [00:02<00:00, 1.19kB/s]


Upload successful: modelconfig.pth (2KB)


In [None]:
# =====================
# Main
# =====================

# setup
cfg = setup(Config)

if cfg.upload_from_colab and cfg.COLAB:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

This environment is Google Colab
Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.16.2
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 14.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 70.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 89.7 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=8

100%|██████████| 7.56G/7.56G [03:48<00:00, 35.5MB/s]


Upload successful: model.tar (8GB)
Starting upload for file fig.tar


100%|██████████| 10.0k/10.0k [00:01<00:00, 5.81kB/s]


Upload successful: fig.tar (10KB)
Starting upload for file preds.tar


100%|██████████| 2.89M/2.89M [00:01<00:00, 1.71MB/s]


Upload successful: preds.tar (3MB)
Starting upload for file tokenizer.tar


100%|██████████| 3.22M/3.22M [00:02<00:00, 1.50MB/s]


Upload successful: tokenizer.tar (3MB)
Starting upload for file modelconfig.pth


100%|██████████| 2.36k/2.36k [00:01<00:00, 1.43kB/s]


Upload successful: modelconfig.pth (2KB)
