# 計画
1. deberta-v3-largeで特別な前処理もせずに実装
2. 前処理(AugmentationやHTMLタグ削除など)
3. 

In [None]:
!nvidia-smi

Tue Sep  6 07:36:33 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Mount Google Drive at /content/drive (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# !pip

In [None]:
!pip install -qqq --upgrade wandb
!pip install -qqq transformers
!pip install -qqq sentencepiece
!pip install -qqq colorama
!pip install googletrans==4.0.0-rc1

[K     |████████████████████████████████| 1.8 MB 7.9 MB/s 
[K     |████████████████████████████████| 158 kB 72.1 MB/s 
[K     |████████████████████████████████| 181 kB 67.1 MB/s 
[K     |████████████████████████████████| 63 kB 1.8 MB/s 
[K     |████████████████████████████████| 157 kB 78.1 MB/s 
[K     |████████████████████████████████| 157 kB 85.5 MB/s 
[K     |████████████████████████████████| 157 kB 83.3 MB/s 
[K     |████████████████████████████████| 157 kB 95.6 MB/s 
[K     |████████████████████████████████| 157 kB 87.5 MB/s 
[K     |████████████████████████████████| 157 kB 86.5 MB/s 
[K     |████████████████████████████████| 157 kB 87.5 MB/s 
[K     |████████████████████████████████| 156 kB 87.2 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 4.7 MB 6.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 40.0 MB/s 
[K     |████████████████████████████████| 120 kB 94.4 MB/s 
[K     |███████████

# IMPORT

In [None]:
import os
import gc
import copy
import time
import random
import string
import joblib
import re

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

from googletrans import Translator

# Utils
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# CONFIG

In [None]:
# Central experiment configuration; the cells below read their settings from here.
CONFIG = {
    "output_name": "exp_002",   # experiment id (used for the W&B group and run names)
    "debug": False,             # True -> the main cell trains on 100 rows for 1 epoch
    "seed": 2022,
    "model_name": "microsoft/deberta-v3-large",
    "epochs": 3,
    "n_fold": 4,
    "train_batch_size": 2,
    "valid_batch_size": 8,
    "max_length": 512,          # tokenizer truncation length
    "learning_rate": 1e-5,
    "scheduler": "CosineAnnealingLR",
    "min_lr": 1e-6,             # passed to the scheduler as eta_min
    "T_max": 500,               # period of the learning rate in the scheduler
    "weight_decay": 1e-6,
    "n_accumulate": 4,          # gradient accumulation steps
    "num_classes": 2,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "dropout": 0.1,
    "pooling": "mean pooling",
}

# The tokenizer is kept in CONFIG so the dataset and collator cells can share it.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
CONFIG["group"] = f"{CONFIG['output_name']}"

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Seed Setting

In [None]:
def set_seed(seed=42):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and current CUDA device),
    forces deterministic cuDNN behaviour and exports ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # cuDNN needs both flags: use deterministic kernels and disable the auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # Fixed hash seed, stored as a string because environment values must be str.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(CONFIG['seed'])

<img src="https://i.imgur.com/gb6B4ig.png" width="400" alt="Weights & Biases" />

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;"> Weights & Biases (W&B) is a set of machine learning tools that helps you build better models faster. <strong>Kaggle competitions require fast-paced model development and evaluation</strong>. There are a lot of components: exploring the training data, training different models, combining trained models in different combinations (ensembling), and so on.</span>

> <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">⏳ Lots of components = Lots of places to go wrong = Lots of time spent debugging</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">W&B can be useful for Kaggle competitions with its lightweight and interoperable tools:</span>

* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Quickly track experiments,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Version and iterate on datasets, <br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Evaluate model performance,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Reproduce models,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Visualize results and spot regressions,<br></span>
* <span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">Share findings with colleagues.</span>

<span style="color: #000508; font-family: Segoe UI; font-size: 1.2em; font-weight: 300;">To learn more about Weights and Biases check out this <strong><a href="https://www.kaggle.com/ayuraj/experiment-tracking-with-weights-and-biases">kernel</a></strong>.</span>

In [None]:
import wandb

# SECURITY: never hard-code the W&B API key in the notebook. The key is read from the
# WANDB_API_KEY environment variable; any key that was previously committed in this
# cell must be considered leaked and should be revoked in the W&B account settings.
try:
    wandb.login(key=os.environ["WANDB_API_KEY"])
    anony = None
except Exception as login_error:
    # Missing variable or rejected key: record the anonymous mode in `anony`
    # instead of aborting the notebook.
    anony = "must"
    print(f'W&B login failed: {login_error!r}')
    print('If you want to use your W&B account, set the WANDB_API_KEY environment variable to your W&B access token. \nGet your W&B access token from here: https://wandb.ai/authorize')

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Data Loading

In [None]:
# Load the training table from the mounted Drive folder and preview the first rows.
# NOTE(review): absolute Colab path — the Drive mount cell must have run first.
df = pd.read_csv("/content/drive/MyDrive/Competitions/SIGNATE/MUFG/input/train.csv")
df.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0
3,train_00003,1001-2000,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1
4,train_00004,1001-2000,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1


In [None]:
# Sanity check: tokenize the first html_content value and list the fields the tokenizer returns.
text = df["html_content"].values[0]
encoded = CONFIG["tokenizer"](text)
print(encoded.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


# Preprocessing

## New Column

In [None]:
# Combine category1 and category2 into a single string feature.
# NOTE(review): the two strings are concatenated without a separator
# (e.g. "art" + "mixed media" -> "artmixed media") — confirm this is intended.

df["cat1×2"] = df["category1"] + df["category2"]

In [None]:
df

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,cat1×2
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1,artmixed media
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0,foodrestaurants
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0,artperformance art
3,train_00003,1001-2000,US,30,art,mixed media,"<div class=""contents""><div><div class=""templat...",1,artmixed media
4,train_00004,1001-2000,US,29,film & video,webseries,"<div class=""contents""><div><p>The story of the...",1,film & videowebseries
...,...,...,...,...,...,...,...,...,...
9786,train_09786,1-1000,US,15,music,electronic music,"<div class=""contents""><div><p>So the story beh...",0,musicelectronic music
9787,train_09787,3001-4000,CA,30,fashion,ready-to-wear,"<div class=""contents""><div><h1 class=""page-anc...",0,fashionready-to-wear
9788,train_09788,100000+,GB,30,technology,software,"<div class=""contents""><div><p>We don't think a...",0,technologysoftware
9789,train_09789,79001-80000,US,35,technology,gadgets,"<div class=""contents""><div><a href=""http://dum...",1,technologygadgets



## Processing html_content

In [None]:
df["html_content"].values[0]

'<div class="contents"><div><p><a href="http://dummy.com">http://dummy.com<p>In its first year, The Shillito\'s Elves Display won an international \ndesign award for Shillito\'s department store.\xa0 The elves display is arts\n and crafts at its finest.\xa0 The mixed media exhibit displays the talents\n of local fine arts graduates, and the display, while "folksy", is as \ntechnologically advanced as Disney World\'s famous "It\'s a Small World" \nride. </p><p>The Shillito\'s Elves attracted close to 100,000 people each\n year.\xa0 It was one of the most beloved Christmas traditions in \nCincinnati.\xa0 For many in the Cincinnati area, it is a fond childhood \nholiday memory and one that they would love to share with their own \nfamilies.\xa0 In the next 40 days, we are asking for your help to make the \nentire display viewable again for the first time in 25 years.\xa0 In order \nto make this happen, we must meet our financial goal.\xa0 </p><p>Your money will be used in the following wa

In [None]:
# delete htmltag

def tag_delete(_s):
    """Strip HTML tags and stray characters from a raw ``html_content`` string.

    Args:
        _s: Raw HTML text.

    Returns:
        The text with every ``<...>`` tag replaced by a single space and with
        newlines, backslashes and non-breaking spaces (U+00A0) removed.
    """
    # Replace each tag with a space so the words around it stay separated.
    _s = re.sub(r"<.*?>", " ", _s)
    # The previous non-raw pattern "\n|\\|xa0" matched a newline or the literal text
    # "|xa0", so backslashes and U+00A0 were never removed. A raw character class
    # covers the three characters that were intended.
    # NOTE(review): these characters are deleted, not replaced by a space, so words
    # separated only by a newline or U+00A0 are joined — confirm this is acceptable.
    _s = re.sub(r"[\n\\\xa0]", "", _s)
    return _s

df["html_content"] = df["html_content"].map(tag_delete)

In [None]:
df["html_content"].values[0]

'    http://dummy.com In its first year, The Shillito\'s Elves Display won an international design award for Shillito\'s department store.\xa0 The elves display is arts and crafts at its finest.\xa0 The mixed media exhibit displays the talents of local fine arts graduates, and the display, while "folksy", is as technologically advanced as Disney World\'s famous "It\'s a Small World" ride.   The Shillito\'s Elves attracted close to 100,000 people each year.\xa0 It was one of the most beloved Christmas traditions in Cincinnati.\xa0 For many in the Cincinnati area, it is a fond childhood holiday memory and one that they would love to share with their own families.\xa0 In the next 40 days, we are asking for your help to make the entire display viewable again for the first time in 25 years.\xa0 In order to make this happen, we must meet our financial goal.\xa0   Your money will be used in the following ways:   Repair broken animated elves (75 motors need repair)  Replace faded clothing  Fix

## Data Augmentation

# Cross Validation

In [None]:
def create_folds(df, num_splits, seed=2022):
    """Assign a stratified cross-validation fold id to every row of ``df``.

    Args:
        df: DataFrame with a ``state`` column used as the stratification label.
            A ``kfold`` column is added to it in place.
        num_splits: Number of folds.
        seed: Random state of the shuffled ``StratifiedKFold`` (defaults to the
            value that was previously hard-coded).

    Returns:
        The same DataFrame, with ``kfold`` holding the validation fold of each row.
    """
    fold_ids = np.full(len(df), -1, dtype=int)

    splitter = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=seed)
    for fold_id, (_, valid_positions) in enumerate(splitter.split(df, df['state'])):
        # split() yields positional indices; writing them into a positional array
        # (rather than through label-based df.loc) stays correct for any index.
        fold_ids[valid_positions] = fold_id

    df["kfold"] = fold_ids
    return df

df = create_folds(df, num_splits=CONFIG["n_fold"])

In [None]:
df.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,cat1×2,kfold
0,train_00000,20001-21000,US,45,art,mixed media,"http://dummy.com In its first year, The Sh...",1,artmixed media,0
1,train_00001,19001-20000,US,59,food,restaurants,Cultural Pretzel Sports Bar is a place wher...,0,foodrestaurants,1
2,train_00002,2001-3000,US,38,art,performance art,I want to perform this piece guerilla style...,0,artperformance art,1
3,train_00003,1001-2000,US,30,art,mixed media,"Canyon de Chelley, Dine' (Navajo) R...",1,artmixed media,0
4,train_00004,1001-2000,US,29,film & video,webseries,"The story of the show, both on and off scre...",1,film & videowebseries,0


# Create Weights

In [None]:
# Per-class weights for labels 0 and 1, computed with sklearn's "balanced" mode over df["state"].
# `weights` is read as a global by criterion().
weights = compute_class_weight(class_weight="balanced", classes=[0, 1], y=df["state"])
weights

array([0.98481191, 1.0156639 ])

# Dataset Class

In [None]:
class Data(Dataset):
    """Dataset that tokenizes the ``text`` column on the fly.

    Each item is a dict with ``input_ids``, ``attention_mask`` (unpadded, truncated
    to ``max_length``) and ``target`` (the value of the ``label`` column).
    """

    def __init__(self, df, tokenizer, max_length):
        """Keep the frame, its text/label arrays and the tokenizer settings."""
        self.df = df
        self.text = df["text"].values
        self.target = df['label'].values
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx``; padding is left to the batch collator."""
        encoded = self.tokenizer(
            self.text[idx],
            padding=False,
            max_length=self.max_length,
            truncation=True,
        )
        sample = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
        sample['target'] = self.target[idx]
        return sample

In [None]:
# Batch collator built on the shared tokenizer; Data.__getitem__ tokenizes with
# padding=False, so padding is expected to happen here, per batch.
collate_fn = DataCollatorWithPadding(tokenizer=CONFIG['tokenizer'])

In [None]:
# Smoke test of the input pipeline: join the feature columns (NaN -> 'NAN', everything
# cast to str) with the tokenizer's separator token and wrap the result in a Data set.
# NOTE(review): `df_sample` and `data` do not appear to be used by the later cells
# (the main cell rebuilds `text`/`label` on `df` itself) — confirm before removing.
df_sample = pd.DataFrame()
TEXT_COLUMNS = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content']
df_sample['text'] = df[TEXT_COLUMNS[0]].fillna('NAN').astype(str).str.cat(df[TEXT_COLUMNS[1:]].fillna('NAN').astype(str), sep=CONFIG["tokenizer"].sep_token)
df_sample['label'] = df['state']
data = Data(df_sample, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])

# Model Class

In [None]:
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        The mask is broadcast to the shape of ``last_hidden_state``; the divisor
        is clamped to 1e-9 so a fully masked row yields zeros instead of NaN.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_counts = mask.sum(1).clamp(min=1e-9)
        return (last_hidden_state * mask).sum(1) / token_counts

In [None]:
class Model(nn.Module):
    """Pretrained transformer encoder with mean pooling and a linear classification head."""

    def __init__(self, model_name, dropout, num_classes=None):
        """Build the encoder and the head.

        Args:
            model_name: Name or path passed to ``AutoModel.from_pretrained``.
            dropout: Dropout probability applied to the pooled embedding.
            num_classes: Number of output logits. Defaults to
                ``CONFIG["num_classes"]`` so existing two-argument calls keep working.
        """
        super(Model, self).__init__()
        if num_classes is None:
            num_classes = CONFIG["num_classes"]
        self.model = AutoModel.from_pretrained(model_name)
        # Reuse the config already attached to the loaded encoder instead of
        # loading it a second time with AutoConfig.
        self.config = self.model.config
        self.drop = nn.Dropout(p=dropout)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, num_classes)

    def forward(self, ids, mask):
        """Return the raw logits for token ids ``ids`` and attention mask ``mask``."""
        out = self.model(input_ids=ids, attention_mask=mask,
                         output_hidden_states=False)
        pooled = self.pooler(out.last_hidden_state, mask)
        pooled = self.drop(pooled)
        return self.fc(pooled)

# Loss Function

In [None]:
# def criterion(outputs, targets):
#     return nn.BCELoss()(outputs, targets)

In [None]:
def criterion(outputs, labels, device):
    """Class-weighted cross-entropy between logits ``outputs`` and integer ``labels``.

    The class weights come from the notebook-global ``weights`` array and are
    moved to ``device`` on every call.
    """
    class_weights = torch.Tensor(weights).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)
    return loss_fn(outputs, labels)

# Training Function

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^-x), evaluated with NumPy (scalars or arrays)."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [None]:
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    """Train ``model`` for one pass over ``dataloader`` with gradient accumulation.

    Args:
        model: Network called as ``model(ids, mask)`` and returning logits.
        optimizer: Optimizer stepped once per accumulation window.
        scheduler: LR scheduler stepped after every optimizer step, or ``None``.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and ``target``.
        device: Device the batch tensors are moved to.
        epoch: Current epoch number (not used inside this function).

    Returns:
        The sample-weighted mean training loss of the epoch (unscaled), or NaN
        when the dataloader is empty.
    """
    model.train()

    accumulation_steps = CONFIG['n_accumulate']
    num_batches = len(dataloader)
    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader no longer raises UnboundLocalError.
    epoch_loss = float("nan")

    bar = tqdm(enumerate(dataloader), total=num_batches)
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)
        loss = criterion(outputs, targets, device)

        # Only the gradient is scaled for accumulation. The reported loss stays
        # unscaled; previously it was divided by n_accumulate, which made the logged
        # train loss n_accumulate times smaller than the validation loss scale.
        (loss / accumulation_steps).backward()

        # Also step on the final batch: otherwise the gradients of a trailing partial
        # window are never applied and leak into the first window of the next epoch.
        is_last_batch = (step + 1) == num_batches
        if (step + 1) % accumulation_steps == 0 or is_last_batch:
            optimizer.step()

            # zero the parameter gradients
            optimizer.zero_grad()

            if scheduler is not None:
                scheduler.step()

        running_loss += loss.item() * batch_size
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

    gc.collect()

    return epoch_loss

# Validation Function

In [None]:
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch, optimizer=None):
    """Evaluate ``model`` on ``dataloader`` and compute loss and macro F1.

    Args:
        model: Network called as ``model(ids, mask)`` and returning logits.
        dataloader: Yields dicts with ``input_ids``, ``attention_mask`` and ``target``.
        device: Device the batch tensors are moved to.
        epoch: Epoch number shown in the progress bar.
        optimizer: Optional optimizer whose current learning rate is shown in the
            progress bar. When omitted, a notebook-global ``optimizer`` is used if
            one exists (the previous behaviour); otherwise the LR is not shown.

    Returns:
        ``(epoch_loss, f1_macro)``: the sample-weighted mean validation loss and the
        macro-averaged F1 of the argmax predictions.
    """
    predictions = []
    references = []
    model.eval()

    # The function used to read the global `optimizer` unconditionally, which raised
    # NameError whenever it was called outside the training cell.
    if optimizer is None:
        optimizer = globals().get("optimizer")

    dataset_size = 0
    running_loss = 0.0
    # Defined up front so an empty dataloader does not hit an unbound name.
    epoch_loss = float("nan")

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)

        batch_size = ids.size(0)

        outputs = model(ids, mask)

        loss = criterion(outputs, targets, device)

        # Extend flat lists directly instead of appending per-batch lists and
        # flattening them with sum(list, []), which is quadratic.
        predictions.extend(torch.argmax(outputs, dim=1).cpu().tolist())
        references.extend(targets.cpu().tolist())

        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size

        epoch_loss = running_loss / dataset_size

        postfix = {"Epoch": epoch, "Valid_Loss": epoch_loss}
        if optimizer is not None:
            postfix["LR"] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    # f1_score expects (y_true, y_pred); the arguments used to be swapped.
    f1_macro = f1_score(references, predictions, average="macro")

    gc.collect()

    return epoch_loss, f1_macro

# Run Training

In [None]:
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    """Train and validate ``model`` for ``num_epochs`` and keep the best checkpoint.

    After every epoch the metrics are logged to W&B; whenever the validation loss
    improves, the weights are saved to ``Loss-Fold-{fold}.bin`` in the experiment's
    model directory. The best weights are loaded back before returning.

    Relies on the notebook globals ``train_loader``, ``valid_loader`` and ``run``
    (the active W&B run) that the main cell defines.

    Args:
        model: Model to train; modified in place.
        optimizer: Optimizer handed to ``train_one_epoch``.
        scheduler: LR scheduler handed to ``train_one_epoch`` (may be ``None``).
        device: Device used for training and validation batches.
        num_epochs: Number of epochs to run.
        fold: Fold id, used in the checkpoint file name.

    Returns:
        ``(model, history, f1_macro, best_epoch_f1)`` where ``f1_macro`` is the F1 of
        the epoch with the best validation loss (NaN if the loss never improved) and
        ``best_epoch_f1`` is the highest F1 seen in any epoch.
    """
    # To automatically log gradients
    wandb.watch(model, log_freq=100)

    if torch.cuda.is_available():
        print("[INFO] Using GPU: {}\n".format(torch.cuda.get_device_name()))

    # makedirs also creates missing parent folders and tolerates an existing directory.
    model_dir = f"/content/drive/MyDrive/Competitions/SIGNATE/MUFG/model/{CONFIG['output_name']}"
    os.makedirs(model_dir, exist_ok=True)
    checkpoint_path = f"{model_dir}/Loss-Fold-{fold}.bin"

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    best_epoch_f1 = -np.inf
    # Bound up front: if the validation loss never improves (e.g. it is NaN) the
    # return statement used to fail with UnboundLocalError.
    f1_macro = float("nan")
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler,
                                           dataloader=train_loader,
                                           device=device, epoch=epoch)

        val_epoch_loss, f1 = valid_one_epoch(model, valid_loader, device=device,
                                             epoch=epoch)

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)

        # Log the metrics in one call so they share the same W&B step.
        wandb.log({"Train Loss": train_epoch_loss,
                   "Valid Loss": val_epoch_loss,
                   "F1-macro": f1})

        # Keep the weights of the epoch with the lowest validation loss.
        if val_epoch_loss <= best_epoch_loss:
            print(f"{b_}Validation Loss Improved ({best_epoch_loss} ---> {val_epoch_loss})")
            best_epoch_loss = val_epoch_loss
            f1_macro = f1
            run.summary["Best Loss"] = best_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())

            torch.save(model.state_dict(), checkpoint_path)
            print(f"Model Saved{sr_}")
        if f1 >= best_epoch_f1:
            best_epoch_f1 = f1
        print(f"Epoch {epoch} f1_score: ", f1)

    end = time.time()
    time_elapsed = end - start
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(
        time_elapsed // 3600, (time_elapsed % 3600) // 60, (time_elapsed % 3600) % 60))
    print("Best Loss: {:.4f}".format(best_epoch_loss))
    print("Best F1: {:.4f}".format(best_epoch_f1))

    # load best model weights
    model.load_state_dict(best_model_wts)

    return model, history, f1_macro, best_epoch_f1

In [None]:
def fetch_scheduler(optimizer):
    """Build the LR scheduler selected by ``CONFIG['scheduler']``.

    Args:
        optimizer: Optimizer the scheduler is attached to.

    Returns:
        A ``CosineAnnealingLR`` or ``CosineAnnealingWarmRestarts`` instance, or
        ``None`` when ``CONFIG['scheduler']`` is ``None``.

    Raises:
        ValueError: If ``CONFIG['scheduler']`` names an unsupported scheduler
            (previously this fell through to an UnboundLocalError).
    """
    name = CONFIG['scheduler']
    if name is None:
        return None

    if name == 'CosineAnnealingLR':
        return lr_scheduler.CosineAnnealingLR(optimizer, T_max=CONFIG['T_max'],
                                              eta_min=CONFIG['min_lr'])

    if name == 'CosineAnnealingWarmRestarts':
        # NOTE(review): the CONFIG dict defined in this notebook has no 'T_0' entry,
        # so selecting this scheduler raises KeyError until one is added.
        return lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=CONFIG['T_0'],
                                                        eta_min=CONFIG['min_lr'])

    raise ValueError(f"Unsupported scheduler: {name!r}")

# Main

In [None]:
# Main cross-validation loop: one W&B run, one freshly initialised model and one
# checkpoint per fold, followed by the averaged F1 scores.
if CONFIG["debug"]:
    # Copy the slice so the column assignments below do not write into a view of `df`.
    df = df.head(100).copy()
    CONFIG["epochs"] = 1

# Model input: all feature columns joined with the tokenizer's separator token.
TEXT_COLUMNS = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content', "cat1×2"]
df['text'] = df[TEXT_COLUMNS[0]].fillna('NAN').astype(str).str.cat(df[TEXT_COLUMNS[1:]].fillna('NAN').astype(str), sep=CONFIG["tokenizer"].sep_token)
df['label'] = df['state']

f1_average = []       # F1 at the best-loss epoch of each fold
best_f1_average = []  # highest F1 of any epoch of each fold
for fold in range(CONFIG["n_fold"]):
    print(f"{y_}====== Fold: {fold} ======{sr_}")
    run = wandb.init(project='SIGNATE_MUFG',
                     config=CONFIG,
                     job_type='Train',
                     group=CONFIG['group'],
                     tags=[CONFIG['model_name'], f'{CONFIG["output_name"]}'],
                     name=f'{CONFIG["output_name"]}-fold-{fold}',
                     anonymous='must')

    # The folds come from the `kfold` column; query() already returns new frames,
    # so no full-frame copy is needed first.
    df_train = df.query("kfold != @fold")
    df_valid = df.query("kfold == @fold")
    train_dataset = Data(df_train, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    valid_dataset = Data(df_valid, tokenizer=CONFIG['tokenizer'], max_length=CONFIG['max_length'])
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'], collate_fn=collate_fn,
                              num_workers=2, shuffle=True, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'], collate_fn=collate_fn,
                              num_workers=2, shuffle=False, pin_memory=True)

    model = Model(CONFIG['model_name'], CONFIG["dropout"])
    model.to(CONFIG['device'])

    # Define Optimizer and Scheduler
    optimizer = AdamW(model.parameters(), lr=CONFIG['learning_rate'], weight_decay=CONFIG['weight_decay'])
    scheduler = fetch_scheduler(optimizer)

    model, history, best_f1_fold, best_epoch_f1 = run_training(model, optimizer, scheduler,
                                                               device=CONFIG['device'],
                                                               num_epochs=CONFIG['epochs'],
                                                               fold=fold)

    print("F1-macro: ", best_f1_fold)
    f1_average.append(best_f1_fold)
    best_f1_average.append(best_epoch_f1)
    if fold == CONFIG["n_fold"] - 1:
        # The CV summary is attached to the last fold's run before it is closed.
        f1_cv = sum(f1_average) / CONFIG["n_fold"]
        best_f1_cv = sum(best_f1_average) / CONFIG["n_fold"]
        run.summary["CV"] = f1_cv
        run.summary["BestF1CV"] = best_f1_cv
    run.finish()

    # The optimizer and scheduler reference this fold's parameters and optimizer
    # state, so they must be released too; otherwise the old model stays on the GPU
    # while the next fold's model is being loaded.
    del model, history, train_loader, valid_loader, optimizer, scheduler
    _ = gc.collect()
    torch.cuda.empty_cache()
    print()

print(f"CV: {f1_cv:5f}")
print(f"Best f1 CV: {best_f1_cv:5f}")

[34m[1mwandb[0m: Currently logged in as: [33myanagikk[0m ([33mynu_uec[0m). Use [1m`wandb login --relogin`[0m to force relogin




Downloading pytorch_model.bin:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 3671/3671 [40:16<00:00,  1.52it/s]
100%|██████████| 306/306 [03:44<00:00,  1.37it/s, Epoch=1, LR=9.4e-6, Valid_Loss=0.481]


[34mValidation Loss Improved (inf ---> 0.4814109087379929)
Model Saved[0m
Epoch 1 f1_score:  0.765881699092296


100%|██████████| 3671/3671 [39:52<00:00,  1.53it/s]
100%|██████████| 306/306 [03:44<00:00,  1.36it/s, Epoch=2, LR=7.77e-6, Valid_Loss=0.419]


[34mValidation Loss Improved (0.4814109087379929 ---> 0.41865908322868006)
Model Saved[0m
Epoch 2 f1_score:  0.8001300615814484


100%|██████████| 3671/3671 [39:54<00:00,  1.53it/s]
100%|██████████| 306/306 [03:44<00:00,  1.36it/s, Epoch=3, LR=5.53e-6, Valid_Loss=0.469]


Epoch 3 f1_score:  0.7942082397406756
Training complete in 2h 12m 6s
Best Loss: 0.4187
Best F1: 0.8001
F1-macro:  0.8001300615814484


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
F1-macro,▁█▇
Train Loss,█▄▁
Valid Loss,█▁▇

0,1
Best Loss,0.41866
F1-macro,0.79421
Train Loss,0.06142
Valid Loss,0.46873





Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 3671/3671 [39:01<00:00,  1.57it/s]
100%|██████████| 306/306 [03:40<00:00,  1.39it/s, Epoch=1, LR=9.4e-6, Valid_Loss=0.473]


[34mValidation Loss Improved (inf ---> 0.4727446259869859)
Model Saved[0m
Epoch 1 f1_score:  0.7653264762830441


100%|██████████| 3671/3671 [38:59<00:00,  1.57it/s]
100%|██████████| 306/306 [03:40<00:00,  1.39it/s, Epoch=2, LR=7.77e-6, Valid_Loss=0.406]


[34mValidation Loss Improved (0.4727446259869859 ---> 0.40562753251516354)
Model Saved[0m
Epoch 2 f1_score:  0.8070724377841789


100%|██████████| 3671/3671 [38:51<00:00,  1.57it/s]
100%|██████████| 306/306 [03:40<00:00,  1.39it/s, Epoch=3, LR=5.53e-6, Valid_Loss=0.435]


Epoch 3 f1_score:  0.8214802254626175
Training complete in 2h 8m 41s
Best Loss: 0.4056
Best F1: 0.8215
F1-macro:  0.8070724377841789


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
F1-macro,▁▆█
Train Loss,█▄▁
Valid Loss,█▁▄

0,1
Best Loss,0.40563
F1-macro,0.82148
Train Loss,0.06647
Valid Loss,0.43502





Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 3671/3671 [39:00<00:00,  1.57it/s]
100%|██████████| 306/306 [03:39<00:00,  1.39it/s, Epoch=1, LR=9.4e-6, Valid_Loss=0.474]


[34mValidation Loss Improved (inf ---> 0.4735124083905438)
Model Saved[0m
Epoch 1 f1_score:  0.7689667501972599


100%|██████████| 3671/3671 [38:59<00:00,  1.57it/s]
100%|██████████| 306/306 [03:39<00:00,  1.39it/s, Epoch=2, LR=7.77e-6, Valid_Loss=0.501]


Epoch 2 f1_score:  0.7696578374544477


100%|██████████| 3671/3671 [39:04<00:00,  1.57it/s]
100%|██████████| 306/306 [03:39<00:00,  1.39it/s, Epoch=3, LR=5.53e-6, Valid_Loss=0.428]


[34mValidation Loss Improved (0.4735124083905438 ---> 0.42845963847403434)
Model Saved[0m
Epoch 3 f1_score:  0.8131487889273357
Training complete in 2h 8m 48s
Best Loss: 0.4285
Best F1: 0.8131
F1-macro:  0.8131487889273357


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
F1-macro,▁▁█
Train Loss,█▄▁
Valid Loss,▅█▁

0,1
Best Loss,0.42846
F1-macro,0.81315
Train Loss,0.06819
Valid Loss,0.42846





Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[INFO] Using GPU: Tesla P100-PCIE-16GB



100%|██████████| 3672/3672 [39:13<00:00,  1.56it/s]
100%|██████████| 306/306 [03:41<00:00,  1.38it/s, Epoch=1, LR=9.42e-6, Valid_Loss=0.475]


[34mValidation Loss Improved (inf ---> 0.474619233564401)
Model Saved[0m
Epoch 1 f1_score:  0.7569196415395214


100%|██████████| 3672/3672 [39:04<00:00,  1.57it/s]
100%|██████████| 306/306 [03:42<00:00,  1.38it/s, Epoch=2, LR=7.81e-6, Valid_Loss=0.403]


[34mValidation Loss Improved (0.474619233564401 ---> 0.4033868635704335)
Model Saved[0m
Epoch 2 f1_score:  0.817716976795769


100%|██████████| 3672/3672 [39:29<00:00,  1.55it/s]
100%|██████████| 306/306 [03:44<00:00,  1.36it/s, Epoch=3, LR=5.61e-6, Valid_Loss=0.411]


Epoch 3 f1_score:  0.8045208356730682
Training complete in 2h 9m 25s
Best Loss: 0.4034
Best F1: 0.8177
F1-macro:  0.817716976795769


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
F1-macro,▁█▆
Train Loss,█▄▁
Valid Loss,█▁▂

0,1
Best Loss,0.40339
BestF1CV,0.81312
CV,0.80952
F1-macro,0.80452
Train Loss,0.07226
Valid Loss,0.41051



CV: 0.809517
Best f1 CV: 0.813119


In [None]:
# URL of the W&B group page for this experiment.
# NOTE(review): the entity ("ynu_uec") and project are hard-coded in the URL.
url = f"https://wandb.ai/ynu_uec/SIGNATE_MUFG/groups/{CONFIG['group']}/"

# This is just to display the W&B run page in this interactive session.
from IPython import display

# Create an IFrame with an explicit width and height (previous size kept for reference).
# iF = display.IFrame(url, width=1080, height=720)
iF = display.IFrame(url, width=1500, height=600)

# Last expression of the cell: renders the embedded page.
iF