This notebook implements a pipeline that covers everything from training on Google Colaboratory to uploading the results as a Kaggle Dataset.

Reference: https://www.kaggle.com/code/columbia2131/uspppm-roberta-base-colab-baseline-train/notebook

## Setting

When you run everything, the directory structure will look like this, for example.

```
Shareddrives  
├USPatent  
│ └Author Name
│    ├Notebook  
│    │   └this notebook
│    ├Input
│    │   ├train.csv
│    │   ├test.csv
│    │   └sample_submission.csv
│    ├Output
│    │   └Exp001-ColabTraining
│    │      ├preds
│    │      ├model
│    │      └fig
│    ├Dataset
│    └Submission
└kaggle.json
```

In [None]:
! nvidia-smi

Thu Apr 14 07:43:27 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import os

class Config:
    """Static experiment settings.

    The class object itself is passed to `setup`, which attaches the
    runtime attributes (device, directories, ...) directly onto it.
    """

    # ---- experiment identity ----
    AUTHOR = "wanwan7123"

    # NOTE(review): the name mentions "deberta-base" and "epoch10" while
    # MODEL_PATH / n_epochs below say deberta-v3-large and 7 epochs.
    # The string is used for output paths, so it is left untouched.
    NAME = "USP-Exp002-deberta-base-epoch10"
    MODEL_PATH = "microsoft/deberta-v3-large"
    # Extra Kaggle datasets ("owner/slug") to download during setup.
    DATASET_PATH = []

    # ---- locations ----
    COMPETITION = "us-patent-phrase-to-phrase-matching"
    COLAB_PATH = "/content/drive/MyDrive/DataAnalysis/competicion/competition_USPPPM"
    DRIVE_PATH = os.path.join(COLAB_PATH, AUTHOR)

    # Kaggle API token file (JSON with "username" and "key").
    api_path = "/content/drive/MyDrive/kaggle.json"

    # ---- cross-validation / data ----
    seed = 42
    num_fold = 5
    # Folds that are actually trained; here: every fold.
    trn_fold = list(range(num_fold))
    batch_size = 32
    n_epochs = 7
    max_len = 64

    # ---- optimisation ----
    fc_dropout = 0.1
    weight_decay = 2e-5
    beta = (0.9, 0.98)
    lr = 2e-5
    # Fraction of the total optimisation steps used for LR warm-up.
    num_warmup_steps_rate = 0.01
    # None disables gradient clipping.
    clip_grad_norm = None
    gradient_accumulation_steps = 1

    # Upload the experiment directory as a Kaggle dataset when run on Colab.
    upload_from_colab = True

In [None]:
# ========================================
# Library
# ========================================
import os
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold
)

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

In [None]:
def setup(cfg):
    cfg.COLAB = 'google.colab' in sys.modules
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if cfg.COLAB:
        print('This environment is Google Colab')

        # mount
        from google.colab import drive
        if not os.path.isdir('/content/drive'):
            drive.mount('/content/drive') 

        # pip install
        ! pip install transformers==4.16.2
        ! pip install tokenizers==0.11.6
        ! pip install transformers[sentencepiece]

        # use kaggle api (need kaggle token)
        f = open(cfg.api_path, 'r')
        json_data = json.load(f) 
        os.environ['KAGGLE_USERNAME'] = json_data['username']
        os.environ['KAGGLE_KEY'] = json_data['key']

        # set dirs
        cfg.DRIVE = cfg.DRIVE_PATH
        cfg.EXP = (cfg.NAME if cfg.NAME is not None 
            else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
        )
        cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
        cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Output')
        cfg.SUBMISSION = os.path.join(cfg.DRIVE, 'Submission')
        cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

        cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
        cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

        # make dirs
        for d in [cfg.INPUT, cfg.SUBMISSION, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
        
        if not os.path.isfile(os.path.join(cfg.INPUT, 'train.csv')):
            # load dataset
            ! pip install --upgrade --force-reinstall --no-deps kaggle
            ! kaggle competitions download -c $cfg.COMPETITION -p $cfg.INPUT
            filepath = os.path.join(cfg.INPUT,cfg.COMPETITION+'.zip')
            ! unzip -d $cfg.INPUT $filepath
            
        
        for path in cfg.DATASET_PATH:
            datasetpath = os.path.join(cfg.DATASET,  path.split('/')[1])
            if not os.path.exists(datasetpath):
                os.makedirs(datasetpath, exist_ok=True)
                ! kaggle datasets download $path -p $datasetpath
                filepath = os.path.join(datasetpath, path.split("/")[1]+'.zip')
                ! unzip -d $datasetpath $filepath

    else:
        print('This environment is Kaggle Kernel')

        # set dirs
        cfg.INPUT = f'../input/{cfg.COMPETITION}'
        cfg.EXP = cfg.NAME
        cfg.OUTPUT_EXP = cfg.NAME
        cfg.SUBMISSION = './'
        cfg.DATASET = '../input/'
        
        cfg.EXP_MODEL = os.path.join(cfg.EXP, 'model')
        cfg.EXP_FIG = os.path.join(cfg.EXP, 'fig')
        cfg.EXP_PREDS = os.path.join(cfg.EXP, 'preds')

        # make dirs
        for d in [cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
            os.makedirs(d, exist_ok=True)
    return cfg


def dataset_create_new(dataset_name, upload_dir):
    """Create a new Kaggle dataset from the contents of `upload_dir`.

    Writes ``dataset-metadata.json`` (id, licence, title) into `upload_dir`
    and then calls the Kaggle API to create the dataset, with
    sub-directories uploaded as tar archives.

    Args:
        dataset_name: used as both the dataset slug and its title.
        upload_dir: directory whose contents are uploaded.

    Raises:
        KeyError: if the ``KAGGLE_USERNAME`` environment variable is unset.
    """
    owner = os.environ["KAGGLE_USERNAME"]
    metadata = {
        'id': f'{owner}/{dataset_name}',
        'licenses': [{'name': 'CC0-1.0'}],
        'title': dataset_name,
    }
    metadata_file = os.path.join(upload_dir, 'dataset-metadata.json')
    with open(metadata_file, 'w') as fp:
        fp.write(json.dumps(metadata, indent=4))

    # KaggleApi is expected to be imported at notebook level before the call.
    client = KaggleApi()
    client.authenticate()
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [None]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Also exports the seed as ``PYTHONHASHSEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

# KFold
def get_kfold(train, n_splits, seed):
    """Assign every row of `train` to a fold with a shuffled `KFold`.

    Returns:
        pd.Series indexed by row position whose values are the fold number
        in which that row belongs to the validation split.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    per_fold = [
        pd.Series(fold_no, index=valid_positions)
        for fold_no, (_, valid_positions) in enumerate(splitter.split(train))
    ]
    return pd.concat(per_fold).sort_index()

def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign every row of `train` to a fold with a shuffled `StratifiedKFold`.

    The split is stratified on ``train[target_col]``.

    Returns:
        pd.Series indexed by row position whose values are the fold number
        in which that row belongs to the validation split.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    splits = splitter.split(train, train[target_col])
    per_fold = [
        pd.Series(fold_no, index=valid_positions)
        for fold_no, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()

def get_groupkfold(train, target_col, group_col, n_splits):
    """Assign every row of `train` to a fold with `GroupKFold`.

    ``train[target_col]`` is passed as the target and ``train[group_col]``
    as the groups of the split.

    Returns:
        pd.Series indexed by row position whose values are the fold number
        in which that row belongs to the validation split.
    """
    splitter = GroupKFold(n_splits=n_splits)
    splits = splitter.split(train, train[target_col], train[group_col])
    per_fold = [
        pd.Series(fold_no, index=valid_positions)
        for fold_no, (_, valid_positions) in enumerate(splits)
    ]
    return pd.concat(per_fold).sort_index()

In [None]:
# =====================
# Dataset, Model
# =====================
def processing_features(df):
    """Split the ``context`` code into two integer feature columns.

    A context such as ``"H04"`` becomes ``alp_context`` (the leading letter
    A-H encoded as 0-7) and ``num_context`` (the remaining characters parsed
    as an int). The columns are added to `df` in place.

    Args:
        df: DataFrame with a string ``context`` column.

    Returns:
        The same DataFrame, with ``alp_context`` and ``num_context`` added.

    Raises:
        ValueError: if a context starts with a letter outside A-H (the
            unmapped value cannot be cast to int) or its remainder is not
            an integer.
    """
    # All values are ints. The original mapped 'H' to the string '7', which
    # made the mapped column object-typed and relied on astype(int) to
    # repair it.
    section_ids = {
        'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7
    }
    df['alp_context'] = df['context'].map(lambda x: x[0])
    df['num_context'] = df['context'].map(lambda x: int(x[1:]))
    df['alp_context'] = df['alp_context'].map(section_ids).astype(int)
    return df


class TrainDataset(Dataset):
    """Map-style dataset yielding tokenised phrase pairs with context ids.

    Each item is ``(inputs, alps, nums, labels)``: a dict holding
    ``input_ids`` and ``attention_mask`` (long tensors), the two context ids
    (long scalars) and the score (half-precision scalar).
    """

    _COLUMNS = ('anchor', 'target', 'score', 'alp_context', 'num_context')

    def __init__(self, cfg, df):
        self.cfg = cfg
        # Keep each required column as a NumPy array under its own name.
        for column in self._COLUMNS:
            setattr(self, column, df[column].to_numpy())

    def __len__(self):
        return len(self.anchor)

    def __getitem__(self, index):
        encoded = self.prepare_input(
            self.cfg, self.anchor[index], self.target[index]
        )
        section_id = torch.tensor(self.alp_context[index], dtype=torch.long)
        number_id = torch.tensor(self.num_context[index], dtype=torch.long)
        score = torch.tensor(self.score[index], dtype=torch.half)
        return encoded, section_id, number_id, score

    @staticmethod
    def prepare_input(cfg, anchor_text, target_text):
        """Tokenise the (anchor, target) pair, padded/truncated to ``cfg.max_len``."""
        encoded = cfg.tokenizer(
            anchor_text,
            target_text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        # Only these two fields are handed to the model.
        return {
            field: torch.tensor(encoded[field], dtype=torch.long)
            for field in ('input_ids', 'attention_mask')
        }

def collatte(inputs, labels=None):
    """Trim padded batch tensors to the longest real sequence in the batch.

    The cut-off is the largest row sum of ``inputs["attention_mask"]``.

    Returns:
        ``(inputs, mask_len)`` when `labels` is None, otherwise
        ``(inputs, labels, mask_len)`` with `labels` trimmed along axis 1
        as well.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    trimmed = {
        name: inputs[name][:, :mask_len]
        for name in ("input_ids", "attention_mask")
    }
    if labels is None:
        return trimmed, mask_len
    return trimmed, labels[:, :mask_len], mask_len


class CustomModel(nn.Module):
    """Pretrained transformer plus two context embeddings and an MLP head.

    The hidden state of the first token is concatenated with the embeddings
    of the two context ids and passed through a three-layer SELU MLP that
    outputs one value per sample.

    Args:
        cfg: configuration providing ``MODEL_PATH``.
        num_alp: number of rows in the letter-context embedding table.
        emb_alp: width of the letter-context embedding.
        num_num: number of rows in the numeric-context embedding table.
        emb_num: width of the numeric-context embedding.
    """

    def __init__(self, cfg, num_alp=9, emb_alp=8, num_num=100, emb_num=8):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.MODEL_PATH, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(cfg.MODEL_PATH, config=self.config)

        self.embedding_alp = nn.Embedding(num_alp, emb_alp)
        self.embedding_num = nn.Embedding(num_num, emb_num)

        head_in = self.config.hidden_size + emb_alp + emb_num
        head_hidden = 1024
        head_layers = [
            nn.Linear(head_in, head_hidden),
            nn.SELU(),
            nn.Linear(head_hidden, head_hidden),
            nn.SELU(),
            nn.Linear(head_hidden, 1),
        ]
        self.linear1 = nn.Sequential(*head_layers)

    def forward(self, inputs, alps, nums):
        """Return a flat tensor with one prediction per sample."""
        hidden_states = self.backbone(**inputs)["last_hidden_state"]
        first_token = hidden_states[:, 0, :]
        features = torch.cat(
            (first_token, self.embedding_alp(alps), self.embedding_num(nums)),
            dim=1,
        )
        return self.linear1(features).flatten()

In [None]:
def training(cfg, train):
    """Fine-tune one model per fold and return the out-of-fold correlation.

    For every fold in ``cfg.trn_fold`` a fresh `CustomModel` is trained for
    ``cfg.n_epochs`` epochs with mixed precision, AdamW and a linear
    warm-up schedule. After each epoch the model is evaluated on the fold's
    validation rows; whenever the Pearson correlation improves, the weights
    are saved to ``cfg.EXP_MODEL/fold{fold}.pth`` and the predictions are
    remembered. The best predictions of every fold are written to
    ``cfg.EXP_PREDS`` and merged into one out-of-fold array.

    Args:
        cfg: configuration with hyper-parameters, ``folds`` (fold number per
            row), ``tokenizer``, ``device`` and the output directories.
        train: training DataFrame. Its index labels are used as positions
            into the out-of-fold array, so a default 0..n-1 index is
            assumed.

    Returns:
        Pearson correlation between the out-of-fold predictions and
        ``train['score']``. Rows of folds not listed in ``cfg.trn_fold``
        keep a prediction of 0 and still take part in this score.

    Raises:
        RuntimeError: if no epoch of a fold produced a usable validation
            score (e.g. the correlation was NaN every time), so there are
            no best predictions to store.
    """
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros(len(train), dtype=np.float32)
    accum_steps = cfg.gradient_accumulation_steps
    for fold in cfg.trn_fold:
        # dataset, dataloader
        train_df = train.loc[cfg.folds != fold]
        valid_df = train.loc[cfg.folds == fold]
        valid_idx = list(valid_df.index)

        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False
        )

        # model
        model = CustomModel(cfg)
        model = model.to(cfg.device)

        # optimizer, scheduler
        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': cfg.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=cfg.lr,
            betas=cfg.beta,
            weight_decay=cfg.weight_decay,
        )
        num_train_optimization_steps = int(
            len(train_loader) * cfg.n_epochs // accum_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * cfg.num_warmup_steps_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )

        # model-training
        criterion = nn.MSELoss()
        best_val_score = -1
        # Reset per fold so a fold without any improvement can never reuse
        # the predictions of the previous fold.
        best_val_preds = None
        # One scaler per fold: recreating it every epoch would throw away
        # the loss scale it has adapted so far.
        scaler = GradScaler()

        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            model.train()
            # Start every epoch with clean gradients; leftovers of an
            # incomplete accumulation window are dropped.
            optimizer.zero_grad()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, alps, nums, labels) in enumerate(pbar):
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    alps = alps.to(cfg.device)
                    nums = nums.to(cfg.device)
                    labels = labels.to(cfg.device)

                    with autocast():
                        output = model(inputs, alps, nums)
                        # Compute the loss in float32 instead of float16.
                        loss = criterion(output.float(), labels.float())
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_last_lr()[0]
                    })

                    if accum_steps > 1:
                        loss = loss / accum_steps

                    scaler.scale(loss).backward()
                    if (step + 1) % accum_steps == 0:
                        if cfg.clip_grad_norm is not None:
                            # Gradients must be unscaled before clipping,
                            # otherwise the threshold applies to the scaled
                            # values.
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(),
                                cfg.clip_grad_norm
                            )
                        scaler.step(optimizer)
                        scaler.update()
                        # Zero only after an optimizer step so gradients
                        # really accumulate across micro-batches.
                        optimizer.zero_grad()
                        scheduler.step()

            # evaluating
            val_preds = []
            val_losses = []
            val_nums = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for (inputs, alps, nums, labels) in pbar:
                        for k, v in inputs.items():
                            inputs[k] = v.to(cfg.device)
                        alps = alps.to(cfg.device)
                        nums = nums.to(cfg.device)
                        labels = labels.to(cfg.device)
                        with autocast():
                            output = model(inputs, alps, nums)
                            loss = criterion(output.float(), labels.float())

                        output = output.detach().cpu().numpy()
                        val_preds.append(output)
                        # Weight each batch loss by its size so the epoch
                        # average is exact for a smaller last batch.
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.concatenate(val_preds)
            val_loss = sum(val_losses) / sum(val_nums)
            corr_score = np.corrcoef(val_preds, valid_df['score'])[0, 1]

            val_log = {
                'val_loss': val_loss,
                'score': corr_score,
            }
            display(val_log)

            if best_val_score < corr_score:
                print("save model weight")
                best_val_preds = val_preds
                best_val_score = corr_score
                torch.save(
                    model.state_dict(),
                    os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
                )

        if best_val_preds is None:
            raise RuntimeError(
                f"fold {fold}: no epoch produced a valid validation score, "
                "so no checkpoint or predictions were saved"
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        # The optimizer references the model parameters, so drop it as well
        # before the next fold allocates a new model.
        del model, optimizer, scheduler, scaler
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)

    # =====================
    # scoring
    # =====================
    corr_score = np.corrcoef(oof_pred, train['score'])[0, 1]
    print('CV:', round(corr_score, 5))
    return corr_score

In [None]:
# =====================
# Main
# =====================
# Notebook driver: prepare the environment, load and preprocess the data,
# run cross-validated training and optionally upload the results to Kaggle.

# setup
# Must run first: on Colab it pip-installs the pinned transformers /
# tokenizers versions that are imported right below.
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup
import tokenizers
import sentencepiece
%env TOKENIZERS_PARALLELISM=true
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
# NOTE(review): `test` and `sub` are loaded but not used anywhere else in
# the visible notebook.
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submission.csv'))

# Add the integer-encoded context columns used by TrainDataset.
train = processing_features(train)
test = processing_features(test)

# The tokenizer and the fold assignment are attached to cfg because
# TrainDataset and training() read them from there.
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.folds = get_kfold(train, cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))
score = training(cfg, train)

# Upload the whole experiment directory as a Kaggle dataset (Colab only).
# KaggleApi is imported here, at notebook level, because
# dataset_create_new() looks the name up as a global.
# `cfg` is the Config class itself, so Config.EXP / Config.OUTPUT_EXP are
# the values set by setup().
if cfg.upload_from_colab and cfg.COLAB:
    from kaggle.api.kaggle_api_extended import KaggleApi
    dataset_create_new(dataset_name=Config.EXP, upload_dir=Config.OUTPUT_EXP)

This environment is Google Colab
env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8337448459683787, 'val_loss': 0.024878487028462418}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8407455950624875, 'val_loss': 0.02428628630205897}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8510851627076432, 'val_loss': 0.019812209666480014}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.850523964139147, 'val_loss': 0.020904679935504673}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8500210547739817, 'val_loss': 0.02188799567035488}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8530434159188292, 'val_loss': 0.01997603027710488}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8531787549353431, 'val_loss': 0.020291868898943525}

save model weight


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8309914748937688, 'val_loss': 0.022586114049938297}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8475988865550725, 'val_loss': 0.023152509344469774}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8543022729690164, 'val_loss': 0.019017198666000792}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8543290889509808, 'val_loss': 0.021461881908975577}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8555406227890224, 'val_loss': 0.020098843206182337}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8564584014558123, 'val_loss': 0.02054982171448189}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8578695861279443, 'val_loss': 0.019754003758546707}

save model weight


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.049974380786452034, 'val_loss': 0.06643586690761685}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.06262313361811374, 'val_loss': 0.06626118202684185}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.06249043190649667, 'val_loss': 0.06722043725094459}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.05789816003386566, 'val_loss': 0.06887657006996964}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.059005249696599746, 'val_loss': 0.07070517338878737}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.06199229072220737, 'val_loss': 0.06659290809963074}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.06445068112080661, 'val_loss': 0.06624723147394071}

save model weight


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8160535391327302, 'val_loss': 0.023420055725001165}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8475342380963484, 'val_loss': 0.02125764472809987}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8536448275584313, 'val_loss': 0.019419747449443345}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8543134108738988, 'val_loss': 0.019267982555834732}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8598003610903872, 'val_loss': 0.01908518938735344}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8618948957664833, 'val_loss': 0.01999653459682117}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8608314960284015, 'val_loss': 0.020016837610258187}

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8207791909741344, 'val_loss': 0.02416801302940724}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.840281202575648, 'val_loss': 0.0224232859567343}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8490247235394328, 'val_loss': 0.022053634620179435}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.853122522697485, 'val_loss': 0.020074295861917734}

save model weight


  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8504942297157406, 'val_loss': 0.02132357705168963}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8515458151091887, 'val_loss': 0.023074991957388956}



  0%|          | 0/911 [00:00<?, ?it/s]

  0%|          | 0/228 [00:00<?, ?it/s]

{'score': 0.8528549020909487, 'val_loss': 0.020799256071105113}

CV: 0.76613
Starting upload for file model.tar


100%|██████████| 8.12G/8.12G [07:35<00:00, 19.2MB/s]


Upload successful: model.tar (8GB)
Starting upload for file fig.tar


100%|██████████| 10.0k/10.0k [00:04<00:00, 2.14kB/s]


Upload successful: fig.tar (10KB)
Starting upload for file preds.tar


100%|██████████| 500k/500k [00:09<00:00, 53.7kB/s]


Upload successful: preds.tar (500KB)
