In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [2]:
# ====================================================
# Directory settings
# ====================================================
import os

INPUT_DIR = '../input/-liar'
OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='PPPM'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=10
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [4]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='PPPM-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/pppm-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.__version__: 1.11.0
Found existing installation: transformers 4.18.0
Uninstalling transformers-4.18.0:
  Successfully uninstalled transformers-4.18.0




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/pppm-pip-wheels
Processing /kaggle/input/pppm-pip-wheels/transformers-4.18.0-py3-none-any.whl
Processing /kaggle/input/pppm-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.12.1 transformers-4.18.0




Looking in links: ../input/pppm-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0
env: TOKENIZERS_PARALLELISM=true


In [6]:
# ====================================================
# Utils
# ====================================================
from sklearn.metrics import accuracy_score
def get_score(y_true, y_pred):
    score = accuracy_score(y_true, y_pred)
    return score


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [7]:
# ====================================================
# Data Loading
# ====================================================
def change(category):
    if category=='false' or category=='barely-true' or category=='pants-fire':
            return 0
    else:
        return 1

train = pd.read_table(INPUT_DIR+'/train.tsv',names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])

test=pd.read_table(INPUT_DIR+'/test.tsv',names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party",
                                            "barely-true", "false", "half-true", "mostly-true", "pants-fire", "venue"])
train['Label']=train['label'].apply(change)
test['Label']=test['label'].apply(change)
# print(f"train.shape: {train.shape}")
# print(f"test.shape: {test.shape}")
# display(train.head())
# display(test.head())
train.head()
train['statement']

0        Says the Annies List political group supports ...
1        When did the decline of coal start? It started...
2        Hillary Clinton agrees with John McCain "by vo...
3        Health care reform legislation is likely to ma...
4        The economic turnaround started at the end of ...
                               ...                        
10235    There are a larger number of shark attacks in ...
10236    Democrats have now become the party of the [At...
10237    Says an alternative to Social Security that op...
10238    On lifting the U.S. Cuban embargo and allowing...
10239    The Department of Veterans Affairs has a manua...
Name: statement, Length: 10240, dtype: object

In [8]:
train['text'] = train['statement']
test['text'] = test['statement']

train['score'] = train['Label']
test['score'] = test['Label']

display(train.head())
display(test.head())

Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue,Label,text,score
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0,Says the Annies List political group supports ...,0
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,1,When did the decline of coal start? It started...,1
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,1,"Hillary Clinton agrees with John McCain ""by vo...",1
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,0,Health care reform legislation is likely to ma...,0
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,1,The economic turnaround started at the end of ...,1


Unnamed: 0,id,label,statement,subject,speaker,job,state,party,barely-true,false,half-true,mostly-true,pants-fire,venue,Label,text,score
0,11972.json,true,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,Governor,Texas,republican,30,30,42,23,18,Radio interview,1,Building a wall on the U.S.-Mexico border will...,1
1,11685.json,false,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,State representative,Wisconsin,democrat,2,1,0,0,0,a news conference,0,Wisconsin is on pace to double the number of l...,0
2,11096.json,false,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,President-Elect,New York,republican,63,114,51,37,61,comments on ABC's This Week.,0,Says John McCain has done nothing to help the ...,0
3,5209.json,half-true,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,consultant,Oregon,republican,1,1,3,1,1,a radio show,1,Suzanne Bonamici supports a plan that will cut...,1
4,9524.json,pants-fire,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,,Wisconsin,democrat,5,7,2,2,7,a web video,0,When asked by a reporter whether hes at the ce...,0


In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
# ====================================================
# Define max_len
# ====================================================
lengths_dict = {}

for text_col in ['text']:
    lengths = []
    tk0 = tqdm(train[text_col].fillna("").values, total=len(train))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        lengths.append(length)
    lengths_dict[text_col] = lengths
    
CFG.max_len = max(lengths_dict['text']) + 4 # CLS + SEP + SEP + SEP
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/10240 [00:00<?, ?it/s]

max_len: 701


In [11]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=150,
                           padding="max_length",
                           return_offsets_mapping=False,
                          truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df['score'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

In [12]:
# ====================================================
# Model
# ====================================================
import torch.nn.functional as F
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        self.fc_dropout = nn.Dropout(0.4)
        self.fc = nn.Linear(150, self.cfg.target_size)
        self._init_weights(self.fc)
        self.conv2d_trigram=nn.Conv1d(in_channels=1024, out_channels=128, kernel_size=3)
        self.conv2d_bigram=nn.Conv1d(in_channels=1024, out_channels=128, kernel_size=2)
        self.pool=nn.MaxPool1d(3)
        self.conv2d_fourgram=nn.Conv1d(in_channels=1024,out_channels=128, kernel_size=4)
        self.flat=nn.Flatten(start_dim=1,end_dim=-1)
        self.convnet=nn.Sequential(
                     nn.Conv1d(in_channels=128,out_channels=64,kernel_size=3),
                     nn.MaxPool1d(3),
                     nn.ELU(),
                     nn.Conv1d(in_channels=64,out_channels=64,kernel_size=3),
                     nn.MaxPool1d(3),
                     nn.ELU())
        self.linearlayer1=nn.Linear(960,128)
        self.relu=nn.ReLU()
        self.linearlayer2=nn.Linear(128,self.cfg.target_size)
        
#         self.attention = nn.Sequential(
#             nn.Linear(self.config.hidden_size, 512),
#             nn.Tanh(),
#             nn.Linear(512, 1),
#             nn.Softmax(dim=1)
#         )
#         self._init_weights(self.attention)
        
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        # feature = torch.mean(last_hidden_states, 1)
#         weights = self.attention(last_hidden_states)
#         feature = torch.sum(weights * last_hidden_states, dim=1)
        
#         print("Shape: ",(weights*last_hidden_states).shape)
#         print("Features.shape= ",feature.shape)
        return last_hidden_states
   

    def forward(self, inputs):
        feature = self.feature(inputs)
        #increase the dimension of feature
        feature=feature.permute(0,2,1)
        conv_output_bigram=self.conv2d_bigram(feature)
        conv_pooled_bigram=self.pool(conv_output_bigram)
        conv_output_trigram=self.conv2d_trigram(feature)
        conv_pooled_trigram=self.pool(conv_output_trigram)
        conv_output_fourgram=self.conv2d_fourgram(feature)
        conv_pooled_fourgram=self.pool(conv_output_fourgram)
        concat_all_conv=torch.concat([  conv_pooled_bigram,  conv_pooled_trigram,  conv_pooled_fourgram],dim=2)
        pass_through_conv=self.flat(self.convnet(concat_all_conv))
        pass_through_first=self.relu(self.linearlayer1(self.fc_dropout(pass_through_conv)))
        pass_through_last=self.linearlayer2(self.fc_dropout(pass_through_first))
        
#         print("bigram.shape=",conv_output_bigram.shape)
#         print("concat_all_conv=",concat_all_conv.shape)
#         print("pass_through_conv=", pass_through_conv.shape)
        #kernel size=2
#         conv_output_bigram=self.conv2d_bigram(feature).squeeze(3)
#         pooled_output_bigram=F.max_pool1d(conv_output_bigram,conv_output_bigram.shape[2]).squeeze(2)
#         #kernel size=3
#         conv_output_trigram=self.conv2d_trigram(feature).squeeze(3)
#         pooled_output_trigram=F.max_pool1d(conv_output_trigram,conv_output_trigram.shape[2]).squeeze(2)
#         #kernel_size=4
#         conv_output_fourgram=self.conv2d_fourgram(feature).squeeze(3)
#         pooled_output_fourgram=F.max_pool1d(conv_output_fourgram,conv_output_fourgram.shape[2]).squeeze(2)
        
        
#         print("convolution_output_fourgram:",conv_output_fourgram.shape)
#         #pooling
#         print("pooled_output_fourgram:",pooled_output_fourgram.shape)
#         cat=torch.cat([pooled_output_bigram, pooled_output_trigram, pooled_output_fourgram],dim=1)
#         print("cat.shape=",cat.shape)
#         output =self.fc(self.fc_dropout(cat))
        return pass_through_last

In [13]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[epoch{epoch+1}] loss": losses.val,
                       f"[epoch{epoch+1}] lr": scheduler.get_lr()[0]})
        
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [14]:
# ====================================================
# train loop
# ====================================================
def train_loop():
    
    
    # ====================================================
    # loader
    # ====================================================
    train_folds = train
    valid_folds = test
    valid_labels = valid_folds['score'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        predictions = predictions > 0.5
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')
        if CFG.wandb:
            wandb.log( {
                       f"[epoch{epoch+1}] avg_train_loss": avg_loss, 
                       f"[epoch{epoch+1}] avg_val_loss": avg_val_loss,
                       f"[epoch{epoch+1}] score": score})
        
        
        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [15]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['score'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        _oof_df = train_loop()
        oof_df = pd.concat([oof_df, _oof_df])
        
        get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/640] Elapsed 0m 7s (remain 82m 5s) Loss: 0.6885(0.6885) Grad: 54765.3867  LR: 0.00002000  
Epoch: [1][100/640] Elapsed 1m 53s (remain 10m 5s) Loss: 0.7854(0.6787) Grad: 131121.5156  LR: 0.00001999  
Epoch: [1][200/640] Elapsed 3m 38s (remain 7m 57s) Loss: 0.6747(0.6841) Grad: 66368.2266  LR: 0.00001995  
Epoch: [1][300/640] Elapsed 5m 24s (remain 6m 5s) Loss: 0.6394(0.6822) Grad: 75223.0547  LR: 0.00001989  
Epoch: [1][400/640] Elapsed 7m 9s (remain 4m 16s) Loss: 0.6534(0.6810) Grad: 103180.1562  LR: 0.00001981  
Epoch: [1][500/640] Elapsed 8m 55s (remain 2m 28s) Loss: 0.6653(0.6773) Grad: 212480.3750  LR: 0.00001970  
Epoch: [1][600/640] Elapsed 10m 40s (remain 0m 41s) Loss: 0.6507(0.6740) Grad: 125382.8672  LR: 0.00001957  
Epoch: [1][639/640] Elapsed 11m 21s (remain 0m 0s) Loss: 0.6548(0.6729) Grad: 144661.0781  LR: 0.00001951  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 42s) Loss: 0.6749(0.6749) 


Epoch 1 - avg_train_loss: 0.6729  avg_val_loss: 0.6493  time: 707s
Epoch 1 - Score: 0.6330
Epoch 1 - Save Best Score: 0.6330 Model


EVAL: [79/80] Elapsed 0m 25s (remain 0m 0s) Loss: 0.6656(0.6493) 
Epoch: [2][0/640] Elapsed 0m 1s (remain 15m 1s) Loss: 0.6140(0.6140) Grad: 133357.5156  LR: 0.00001951  
Epoch: [2][100/640] Elapsed 1m 46s (remain 9m 30s) Loss: 0.5660(0.6444) Grad: 159900.6094  LR: 0.00001935  
Epoch: [2][200/640] Elapsed 3m 32s (remain 7m 43s) Loss: 0.6709(0.6363) Grad: 416814.9375  LR: 0.00001916  
Epoch: [2][300/640] Elapsed 5m 17s (remain 5m 58s) Loss: 0.7639(0.6311) Grad: 349639.4062  LR: 0.00001895  
Epoch: [2][400/640] Elapsed 7m 3s (remain 4m 12s) Loss: 0.5700(0.6296) Grad: 211818.5625  LR: 0.00001872  
Epoch: [2][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.6280(0.6289) Grad: 120750.9688  LR: 0.00001847  
Epoch: [2][600/640] Elapsed 10m 34s (remain 0m 41s) Loss: 0.6502(0.6302) Grad: 94035.9297  LR: 0.00001820  
Epoch: [2][639/640] Elapsed 11m 15s (remain 0m 0s) Loss: 0.6453(0.6304) Grad: 102508.1562  LR: 0.00001809  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 43s) Loss: 0.6454(0.6454) 


Epoch 2 - avg_train_loss: 0.6304  avg_val_loss: 0.6283  time: 701s
Epoch 2 - Score: 0.6504
Epoch 2 - Save Best Score: 0.6504 Model


EVAL: [79/80] Elapsed 0m 25s (remain 0m 0s) Loss: 0.3854(0.6283) 
Epoch: [3][0/640] Elapsed 0m 1s (remain 13m 49s) Loss: 0.6636(0.6636) Grad: 203971.0312  LR: 0.00001809  
Epoch: [3][100/640] Elapsed 1m 47s (remain 9m 32s) Loss: 0.6055(0.5333) Grad: 405567.7500  LR: 0.00001779  
Epoch: [3][200/640] Elapsed 3m 32s (remain 7m 44s) Loss: 0.3885(0.5268) Grad: 421069.2812  LR: 0.00001747  
Epoch: [3][300/640] Elapsed 5m 18s (remain 5m 58s) Loss: 0.7878(0.5272) Grad: 241820.8281  LR: 0.00001714  
Epoch: [3][400/640] Elapsed 7m 3s (remain 4m 12s) Loss: 0.4068(0.5264) Grad: 179143.1562  LR: 0.00001678  
Epoch: [3][500/640] Elapsed 8m 49s (remain 2m 26s) Loss: 0.6129(0.5292) Grad: 199187.4531  LR: 0.00001642  
Epoch: [3][600/640] Elapsed 10m 34s (remain 0m 41s) Loss: 0.4083(0.5274) Grad: 251875.5781  LR: 0.00001603  
Epoch: [3][639/640] Elapsed 11m 15s (remain 0m 0s) Loss: 0.5072(0.5291) Grad: 96306.4219  LR: 0.00001588  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 42s) Loss: 0.4620(0.4620) 


Epoch 3 - avg_train_loss: 0.5291  avg_val_loss: 0.6296  time: 701s
Epoch 3 - Score: 0.6630
Epoch 3 - Save Best Score: 0.6630 Model


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 0.4376(0.6296) 
Epoch: [4][0/640] Elapsed 0m 1s (remain 14m 12s) Loss: 0.4563(0.4563) Grad: 363522.2188  LR: 0.00001587  
Epoch: [4][100/640] Elapsed 1m 46s (remain 9m 30s) Loss: 0.3569(0.3926) Grad: 561737.8125  LR: 0.00001547  
Epoch: [4][200/640] Elapsed 3m 32s (remain 7m 43s) Loss: 0.2305(0.3765) Grad: 216442.8281  LR: 0.00001505  
Epoch: [4][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.3436(0.3604) Grad: 631768.2500  LR: 0.00001462  
Epoch: [4][400/640] Elapsed 7m 2s (remain 4m 11s) Loss: 0.4956(0.3706) Grad: 254354.5469  LR: 0.00001418  
Epoch: [4][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.1790(0.3790) Grad: 181143.1250  LR: 0.00001373  
Epoch: [4][600/640] Elapsed 10m 33s (remain 0m 41s) Loss: 0.3150(0.3874) Grad: 199066.5000  LR: 0.00001327  
Epoch: [4][639/640] Elapsed 11m 14s (remain 0m 0s) Loss: 0.4056(0.3861) Grad: 180552.8125  LR: 0.00001309  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 43s) Loss: 0.5050(0.5050) 


Epoch 4 - avg_train_loss: 0.3861  avg_val_loss: 0.7876  time: 700s
Epoch 4 - Score: 0.6622


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 0.1335(0.7876) 
Epoch: [5][0/640] Elapsed 0m 1s (remain 13m 33s) Loss: 0.4952(0.4952) Grad: 805134.9375  LR: 0.00001309  
Epoch: [5][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.4980(0.2663) Grad: 420189.6250  LR: 0.00001262  
Epoch: [5][200/640] Elapsed 3m 32s (remain 7m 43s) Loss: 0.0302(0.2388) Grad: 16235.2207  LR: 0.00001214  
Epoch: [5][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.4516(0.2610) Grad: 291067.5938  LR: 0.00001166  
Epoch: [5][400/640] Elapsed 7m 2s (remain 4m 12s) Loss: 0.5711(0.2647) Grad: 487942.9688  LR: 0.00001117  
Epoch: [5][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.0637(0.2607) Grad: 91005.3672  LR: 0.00001068  
Epoch: [5][600/640] Elapsed 10m 33s (remain 0m 41s) Loss: 0.2565(0.2596) Grad: 383843.4375  LR: 0.00001019  
Epoch: [5][639/640] Elapsed 11m 14s (remain 0m 0s) Loss: 0.1359(0.2606) Grad: 470471.1250  LR: 0.00001000  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 40s) Loss: 0.8039(0.8039) 


Epoch 5 - avg_train_loss: 0.2606  avg_val_loss: 0.9835  time: 700s
Epoch 5 - Score: 0.6732
Epoch 5 - Save Best Score: 0.6732 Model


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 0.2956(0.9835) 
Epoch: [6][0/640] Elapsed 0m 1s (remain 13m 27s) Loss: 0.3951(0.3951) Grad: 256276.2656  LR: 0.00001000  
Epoch: [6][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.2610(0.1298) Grad: nan  LR: 0.00000950  
Epoch: [6][200/640] Elapsed 3m 32s (remain 7m 43s) Loss: 0.4481(0.1577) Grad: 91516.5703  LR: 0.00000901  
Epoch: [6][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.0009(0.1713) Grad: 97.3571  LR: 0.00000853  
Epoch: [6][400/640] Elapsed 7m 2s (remain 4m 11s) Loss: 0.5421(0.2051) Grad: 103079.2109  LR: 0.00000804  
Epoch: [6][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.1715(0.2130) Grad: 61389.0078  LR: 0.00000757  
Epoch: [6][600/640] Elapsed 10m 33s (remain 0m 41s) Loss: 0.3527(0.2073) Grad: 223318.7031  LR: 0.00000709  
Epoch: [6][639/640] Elapsed 11m 14s (remain 0m 0s) Loss: 0.0653(0.2088) Grad: 53010.5547  LR: 0.00000691  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 39s) Loss: 0.5324(0.5324) 


Epoch 6 - avg_train_loss: 0.2088  avg_val_loss: 1.4656  time: 700s
Epoch 6 - Score: 0.6425


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 0.6811(1.4656) 
Epoch: [7][0/640] Elapsed 0m 1s (remain 13m 36s) Loss: 0.1491(0.1491) Grad: 1491244.5000  LR: 0.00000691  
Epoch: [7][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.0065(0.1343) Grad: 10355.4160  LR: 0.00000644  
Epoch: [7][200/640] Elapsed 3m 32s (remain 7m 43s) Loss: 0.0006(0.1338) Grad: 198.3382  LR: 0.00000599  
Epoch: [7][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.0012(0.1533) Grad: 160.1034  LR: 0.00000554  
Epoch: [7][400/640] Elapsed 7m 3s (remain 4m 12s) Loss: 0.0018(0.1543) Grad: 865.5806  LR: 0.00000511  
Epoch: [7][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.2558(0.1615) Grad: 91332.1406  LR: 0.00000469  
Epoch: [7][600/640] Elapsed 10m 34s (remain 0m 41s) Loss: 0.0044(0.1660) Grad: 18739.7734  LR: 0.00000428  
Epoch: [7][639/640] Elapsed 11m 15s (remain 0m 0s) Loss: 0.0009(0.1658) Grad: 2062.9458  LR: 0.00000412  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 40s) Loss: 0.8538(0.8538) 


Epoch 7 - avg_train_loss: 0.1658  avg_val_loss: 2.1049  time: 700s
Epoch 7 - Score: 0.6630


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 2.0490(2.1049) 
Epoch: [8][0/640] Elapsed 0m 1s (remain 13m 18s) Loss: 0.2359(0.2359) Grad: 151639.7188  LR: 0.00000412  
Epoch: [8][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.0009(0.0929) Grad: 9062.3564  LR: 0.00000373  
Epoch: [8][200/640] Elapsed 3m 31s (remain 7m 42s) Loss: 0.0001(0.1020) Grad: 54.6351  LR: 0.00000335  
Epoch: [8][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.0002(0.0985) Grad: 70.9988  LR: 0.00000300  
Epoch: [8][400/640] Elapsed 7m 2s (remain 4m 11s) Loss: 0.0022(0.0998) Grad: 1436.4733  LR: 0.00000265  
Epoch: [8][500/640] Elapsed 8m 47s (remain 2m 26s) Loss: 0.0004(0.1017) Grad: 42.9265  LR: 0.00000233  
Epoch: [8][600/640] Elapsed 10m 33s (remain 0m 41s) Loss: 0.0040(0.0996) Grad: 476.5901  LR: 0.00000202  
Epoch: [8][639/640] Elapsed 11m 14s (remain 0m 0s) Loss: 0.0001(0.0989) Grad: 177.0195  LR: 0.00000191  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 41s) Loss: 1.2500(1.2500) 


Epoch 8 - avg_train_loss: 0.0989  avg_val_loss: 2.6719  time: 699s
Epoch 8 - Score: 0.6701


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 2.0655(2.6719) 
Epoch: [9][0/640] Elapsed 0m 1s (remain 13m 33s) Loss: 0.0001(0.0001) Grad: 316.8648  LR: 0.00000191  
Epoch: [9][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.0008(0.0733) Grad: 3968.6702  LR: 0.00000163  
Epoch: [9][200/640] Elapsed 3m 31s (remain 7m 42s) Loss: 0.0002(0.0735) Grad: 23.2188  LR: 0.00000137  
Epoch: [9][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.4258(0.0714) Grad: 172486.5781  LR: 0.00000113  
Epoch: [9][400/640] Elapsed 7m 3s (remain 4m 12s) Loss: 0.0004(0.0701) Grad: 50.5563  LR: 0.00000092  
Epoch: [9][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.0003(0.0677) Grad: 39.0408  LR: 0.00000072  
Epoch: [9][600/640] Elapsed 10m 33s (remain 0m 41s) Loss: 0.0002(0.0653) Grad: 22.6635  LR: 0.00000055  
Epoch: [9][639/640] Elapsed 11m 14s (remain 0m 0s) Loss: 0.6464(0.0658) Grad: 673259.0000  LR: 0.00000049  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 40s) Loss: 0.9942(0.9942) 


Epoch 9 - avg_train_loss: 0.0658  avg_val_loss: 2.7856  time: 700s
Epoch 9 - Score: 0.6598


EVAL: [79/80] Elapsed 0m 24s (remain 0m 0s) Loss: 2.8570(2.7856) 
Epoch: [10][0/640] Elapsed 0m 1s (remain 13m 12s) Loss: 0.0048(0.0048) Grad: 39459.5898  LR: 0.00000049  
Epoch: [10][100/640] Elapsed 1m 46s (remain 9m 28s) Loss: 0.0002(0.0605) Grad: 228.9574  LR: 0.00000035  
Epoch: [10][200/640] Elapsed 3m 31s (remain 7m 42s) Loss: 0.0015(0.0528) Grad: 381.6662  LR: 0.00000023  
Epoch: [10][300/640] Elapsed 5m 17s (remain 5m 57s) Loss: 0.0004(0.0539) Grad: 345.2900  LR: 0.00000014  
Epoch: [10][400/640] Elapsed 7m 2s (remain 4m 11s) Loss: 0.0003(0.0492) Grad: 935.2029  LR: 0.00000007  
Epoch: [10][500/640] Elapsed 8m 48s (remain 2m 26s) Loss: 0.0011(0.0498) Grad: 11153.9570  LR: 0.00000002  
Epoch: [10][600/640] Elapsed 10m 35s (remain 0m 41s) Loss: 0.1203(0.0498) Grad: 1069581.8750  LR: 0.00000000  
Epoch: [10][639/640] Elapsed 11m 17s (remain 0m 0s) Loss: 0.0002(0.0501) Grad: 57.7584  LR: 0.00000000  
EVAL: [0/80] Elapsed 0m 0s (remain 0m 49s) Loss: 1.0029(1.0029) 


Epoch 10 - avg_train_loss: 0.0501  avg_val_loss: 2.8252  time: 703s
Epoch 10 - Score: 0.6614


EVAL: [79/80] Elapsed 0m 25s (remain 0m 0s) Loss: 2.8873(2.8252) 


Score: 0.6732
Score: 0.6732


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[epoch10] avg_train_loss,▁
[epoch10] avg_val_loss,▁
[epoch10] loss,▁▂▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅▁▁▁▁▁▁▁▁▁▁▁▁
[epoch10] lr,██▇▇▇▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
[epoch10] score,▁
[epoch1] avg_train_loss,▁
[epoch1] avg_val_loss,▁
[epoch1] loss,▅▄▅▆▅▄▃▃▄▅▆▅▅█▅▄▅▄▄▅▅▅▄▆▄▅▆▃▄▃▁▄▃▅▆▄▄▄▇▄
[epoch1] lr,███████████▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▂▂▂▂▁
[epoch1] score,▁

0,1
[epoch10] avg_train_loss,0.05006
[epoch10] avg_val_loss,2.82517
[epoch10] loss,0.00018
[epoch10] lr,0.0
[epoch10] score,0.6614
[epoch1] avg_train_loss,0.67295
[epoch1] avg_val_loss,0.64928
[epoch1] loss,0.65479
[epoch1] lr,2e-05
[epoch1] score,0.63299
