In [1]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer
import transformers as tf

In [2]:
# seeds
# Single seed handed to seed_everything() and to the StratifiedKFold split in
# make_folded_df(), so both RNG state and fold assignment share one value.
SEED = 42
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy, and PyTorch (CPU, plus all CUDA devices when CUDA is
    available).

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The CPU-side generators all accept the seed as their only argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    # Seed the current CUDA device, then every visible CUDA device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Seed all generators once, before any tokenizer, model or data loader exists.
seed_everything(SEED)

if torch.cuda.is_available():
    # Prefer reproducible cuDNN behaviour over auto-tuned speed (see the
    # PyTorch reproducibility notes for what these two flags control).
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Report which GPU the run is using.
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))

Device: NVIDIA GeForce RTX 3060


In [3]:
# config
# Directory holding the input CSV files.
data_dir = os.path.join("../Input/")
# First CUDA device when available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_FILE = os.path.join(data_dir, "train.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
# Prefix of the checkpoint paths written by trainer().
MODELS_DIR = "../models/"
# Checkpoint name given to AutoTokenizer / AutoModel.
# NOTE(review): the name contains "/" and is interpolated verbatim into the
# checkpoint and result file paths, so those paths gain a sub-directory.
MODEL_NAME = 'microsoft/deberta-base'
# Batch sizes of the training and validation DataLoaders built in trainer().
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 128
# Number of logits produced by Classifier.
NUM_CLASSES = 4
# Epochs trained per fold.
EPOCHS = 25
# Number of StratifiedKFold splits (and therefore of trainer() calls).
NUM_SPLITS = 5
# Default epsilon of LabelSmoothingCrossEntropy.
SMOOTHING = 0.03
# max_length handed to the tokenizer; inputs are padded/truncated to it.
MAX_LENGTH = 175
# Dropout probability applied before the classification head.
DROPOUT = 0.15

In [4]:
import re

def cleaning(texts):
    """Return a new list with HTML tags removed from every text.

    Args:
        texts: iterable of strings.

    Returns:
        list of strings, one per input, in the same order.
    """
    # Remove HTML tags from each entry.
    return [remove_tag(raw_text) for raw_text in texts]

def remove_tag(x):
    """Return ``x`` with every ``<...>`` span removed.

    A ``<`` that is never closed by ``>`` is left untouched.
    """
    return re.sub(r"<[^>]*?>", '', x)

In [5]:
# dataset
def make_folded_df(csv_file, num_splits=5):
    """Load the training CSV and assign every row to a stratified fold.

    The ``jobflag`` column is shifted by -1 and renamed to ``labels``. A
    ``kfold`` column is added that holds, for each row, the index of the fold
    in which that row belongs to the validation split.

    Args:
        csv_file: path of a CSV file that contains a ``jobflag`` column.
        num_splits: number of stratified folds.

    Returns:
        pandas.DataFrame with the ``labels`` and ``kfold`` columns described
        above; ``kfold`` keeps its float dtype (it is initialised with NaN).
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df["kfold"] = np.nan
    df = df.rename(columns={'jobflag': 'labels'})
    label = df["labels"].tolist()

    # Resolve the column by name: a fixed position only works when the CSV
    # happens to have exactly three columns.
    kfold_col = df.columns.get_loc("kfold")

    skfold = StratifiedKFold(num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # valid_indexes are row positions, so assign the whole fold at once.
        df.iloc[valid_indexes, kfold_col] = fold
    return df

def make_dataset(df, tokenizer, device):
    """Tokenise the ``description`` column and expose the result as tensors.

    Args:
        df: DataFrame with at least ``description`` and ``labels`` columns.
        tokenizer: callable tokenizer applied to each description.
        device: device passed to ``set_format`` for the returned tensors.

    Returns:
        ``nlp.Dataset`` formatted as torch tensors and restricted to the
        ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``labels``
        columns.
    """
    def _encode(example):
        # Pad/truncate every description to exactly MAX_LENGTH tokens.
        return tokenizer(
            example["description"],
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )

    tokenized = nlp.Dataset.from_pandas(df).map(_encode)
    model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
    tokenized.set_format(type='torch', columns=model_columns, device=device)
    return tokenized

In [6]:
# model
# class Classifier(nn.Module):
#     def __init__(self, model_name, num_classes=4):
#         super().__init__()

#         self.bert = AutoModel.from_pretrained(model_name)
#         self.dropout = nn.Dropout(0.1)
#         self.linear = nn.Linear(768, num_classes)
#         nn.init.normal_(self.linear.weight, std=0.02)
#         nn.init.zeros_(self.linear.bias)
#         self.cnn = nn.Conv1d(768, 4, kernel_size=2, padding=1)
#         self.pooling = nn.MaxPool1d(3, stride=2)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         output = self.bert(
#             input_ids = input_ids,
#             attention_mask = attention_mask,
#             token_type_ids = token_type_ids)
#         output, _ = output['last_hidden_state'].max(1)
# #         output = output["last_hidden_state"][:, 0, :]
#         output = self.dropout(output)
#         SIZE = output.size()[0]
        
        
        
# #         output = torch.reshape(output, (SIZE, 768, 1))
# #         output = self.cnn(output)
# #         #print("post cnn")
# #         #print(output.size())
# #         output = torch.max(output, 2)
# #         output = output[0]
        
# #         output = torch.reshape(output, (SIZE, 4))

#         output = self.linear(output)
#         return output

In [7]:
# model
class Classifier(nn.Module):
    """Pretrained transformer encoder with a dropout + linear classification head.

    Args:
        model_name: checkpoint name or path given to ``AutoModel.from_pretrained``.
        num_classes: number of logits produced by the head.
    """

    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(DROPOUT)
        # Size the head from the encoder's config rather than a literal 768,
        # so checkpoints with a different hidden width also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classification logits for a batch.

        The hidden state at sequence position 0 of the encoder's
        ``last_hidden_state`` is passed through dropout and the linear head.
        """
        output = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids)
        # Use the first token's representation as the sequence summary.
        output = output["last_hidden_state"][:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output

In [8]:
# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch.

    Args:
        dataloader: yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.
        model: called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: stepped once per batch.
        scheduler: stepped once per batch, right after the optimizer.
        device: unused; kept so the signature matches existing callers.
        epoch: zero-based epoch index, only used for the progress-bar label.

    Returns:
        Tuple ``(train_loss, train_acc, train_f1)``: mean loss per batch,
        fraction of correctly predicted samples (a plain float), and macro F1
        over all samples seen in the epoch.
    """
    model.train()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by key; positional unpacking of batch.values()
        # silently mixes up the tensors if the key order ever changes.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        labels = batch["labels"]
        del batch

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, token_type_ids)
        del input_ids, attention_mask, token_type_ids
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted label = arg-max logit
        del outputs

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        del loss
        # Accumulate as a Python int so no tensor outlives the batch.
        total_corrects += torch.sum(preds == labels).item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()
        del labels, preds

        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on one pass over ``dataloader`` without gradients.

    Args:
        dataloader: yields dict batches with the keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.
        model: called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: unused; kept so the signature matches existing callers.
        epoch: zero-based epoch index, only used for the progress-bar label.

    Returns:
        Tuple ``(valid_loss, valid_acc, valid_f1)``: mean loss per batch,
        fraction of correctly predicted samples (a plain float), and macro F1
        over all evaluated samples.
    """
    model.eval()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by key; positional unpacking of batch.values()
            # silently mixes up the tensors if the key order ever changes.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            token_type_ids = batch["token_type_ids"]
            labels = batch["labels"]
            del batch

            outputs = model(input_ids, attention_mask, token_type_ids)
            del input_ids, attention_mask, token_type_ids
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)  # predicted label = arg-max logit
            del outputs

            total_loss += loss.item()
            del loss
            # Accumulate as a Python int so no tensor outlives the batch.
            total_corrects += torch.sum(preds == labels).item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()
            del labels, preds

            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)

    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1

In [9]:
def _save_metric_plot(train_values, valid_values, epoch, path):
    """Draw one Train-vs-Valid line plot for epochs 1..epoch+1 and save it to ``path``."""
    metric_df = pd.DataFrame({"Train": train_values,
                              "Valid": valid_values},
                             index=range(1, epoch+2))

    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Draw on a dedicated figure so nothing depends on (or leaks into) the
    # implicit current pyplot figure, and always release it afterwards.
    fig, ax = plt.subplots()
    try:
        sns.lineplot(data=metric_df, ax=ax)
        fig.savefig(path, dpi=300)
    finally:
        plt.close(fig)


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Save the loss, accuracy and F1 learning curves of one fold as PNG files.

    Args:
        train_losses, train_accs, train_f1s: per-epoch training metrics.
        valid_losses, valid_accs, valid_f1s: per-epoch validation metrics.
        epoch: zero-based index of the latest epoch; each metric list is
            expected to hold ``epoch + 1`` values.
        fold: fold identifier embedded in the output file names.

    The files are written below ``../figures/`` and overwritten on each call.
    """
    _save_metric_plot(train_losses, valid_losses, epoch,
                      f"../figures/loss_plot_fold={fold}.png")
    _save_metric_plot(train_accs, valid_accs, epoch,
                      f"../figures/acc_plot_fold={fold}.png")
    _save_metric_plot(train_f1s, valid_f1s, epoch,
                      f"../figures/f1_plot_fold={fold}.png")

In [10]:
def linear_combination(x, y, epsilon):
    """Blend ``x`` and ``y``: weight ``1 - epsilon`` on ``x``, ``epsilon`` on ``y``."""
    x_weight = 1 - epsilon
    return x_weight * x + epsilon * y

def reduce_loss(loss, reduction='mean'):
    """Reduce a loss tensor.

    ``'mean'`` returns ``loss.mean()``, ``'sum'`` returns ``loss.sum()``, and
    any other value returns ``loss`` unchanged.
    """
    if reduction == 'mean':
        return loss.mean()
    if reduction == 'sum':
        return loss.sum()
    return loss

class LabelSmoothingCrossEntropy(nn.Module):
    """Cross-entropy loss blended with a uniform-over-classes term.

    Args:
        epsilon: weight of the uniform term; ``1 - epsilon`` goes to the
            ordinary negative log-likelihood of the target class.
        reduction: reduction applied to both terms (``'mean'``, ``'sum'``, or
            anything else for no reduction).
    """

    def __init__(self, epsilon=SMOOTHING, reduction='mean'):
        super().__init__()
        self.reduction = reduction
        self.epsilon = epsilon

    def forward(self, preds, target):
        """Return the smoothed loss for logits ``preds`` and class indices ``target``."""
        num_classes = preds.size(-1)
        log_probs = F.log_softmax(preds, dim=-1)

        # Negative log-probability summed over all classes, then averaged
        # over the class count: the "uniform target" part of the loss.
        uniform_term = reduce_loss(-log_probs.sum(dim=-1), self.reduction) / num_classes
        # Ordinary negative log-likelihood of the true class.
        target_term = F.nll_loss(log_probs, target, reduction=self.reduction)

        return linear_combination(target_term, uniform_term, self.epsilon)

In [11]:
def trainer(fold, df):
    """Train and validate one cross-validation fold and return its best macro F1.

    Rows whose ``kfold`` equals ``fold`` form the validation split; all other
    rows form the training split. Only the classification head is trainable
    for the first epochs; the whole model is unfrozen afterwards. The model
    state is saved whenever the validation macro F1 improves, and the learning
    curves are re-plotted after every epoch.

    Args:
        fold: index of the fold used for validation.
        df: DataFrame with ``kfold``, ``labels`` and ``description`` columns.

    Returns:
        Best validation macro F1 reached during the run.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    # Freeze everything, then re-enable only the last child module (the head).
    for param in model.parameters():
        param.requires_grad = False

    last_layer = list(model.children())[-1]
    print(f'except last layer: {last_layer}')
    for param in last_layer.parameters():
        param.requires_grad = True

    # The optimizer receives every parameter up front so the encoder weights
    # start updating as soon as they are unfrozen below.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # gamma=1.0 keeps the learning rate constant; the scheduler exists only
    # because train_fn() expects one.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=10000000, gamma=1.0)

    criterion = LabelSmoothingCrossEntropy()

    # MODEL_NAME may contain "/" (e.g. "microsoft/deberta-base"), which puts
    # the checkpoint in a sub-directory; create it so torch.save cannot fail
    # on a missing directory.
    checkpoint_path = MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"
    checkpoint_dir = os.path.dirname(checkpoint_path)
    if checkpoint_dir:
        os.makedirs(checkpoint_dir, exist_ok=True)

    # Epoch (zero-based) at which the whole model becomes trainable.
    unfreeze_epoch = 5

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    for epoch in range(EPOCHS):

        if epoch == unfreeze_epoch:
            for param in model.parameters():
                param.requires_grad = True

        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        # Running bests; only best_f1 drives checkpointing and the return value.
        best_loss = min(best_loss, valid_loss)
        best_acc = max(best_acc, valid_acc)
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            torch.save(model.state_dict(), checkpoint_path)
        print("\n")

    return best_f1


In [12]:
# training
# Train one model per fold and collect each fold's best validation macro F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "="*80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

# Cross-validation score: mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# MODEL_NAME may contain "/", which places the report in a sub-directory of
# ../result/; create it so the write cannot fail on a missing directory.
result_path = f"../result/{MODEL_NAME}_result.txt"
result_dir = os.path.dirname(result_path)
if result_dir:
    os.makedirs(result_dir, exist_ok=True)

lines = "".join(f"fold={i}: {score}\n" for i, score in enumerate(f1_scores))
lines += f"CV    : {cv}"
with open(result_path, mode='w') as f:
    f.write(lines)



  0%|          | 0/1212 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


except last layer: Linear(in_features=768, out_features=4, bias=True)




  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.421755035718282  Acc: 0.05921052631578947  f1: 0.027950310559006212  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.4059468905131023  Acc: 0.33223684210526316  f1: 0.12469135802469135  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3922196229298909  Acc: 0.33223684210526316  f1: 0.12469135802469135  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.380162278811137  Acc: 0.33223684210526316  f1: 0.12469135802469135  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3693779706954956  Acc: 0.33223684210526316  f1: 0.12469135802469135  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7625863154729208  Acc: 0.7631578947368421  f1: 0.6885741128460772  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7036093274752299  Acc: 0.7796052631578947  f1: 0.6271942347302701  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7030110557874044  Acc: 0.7697368421052632  f1: 0.7256216982347636  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6748752395311991  Acc: 0.8026315789473685  f1: 0.7915182884748101  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7652474443117777  Acc: 0.7861842105263158  f1: 0.7522985944083809  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7802255153656006  Acc: 0.7894736842105263  f1: 0.7596273580977821  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7897480328877767  Acc: 0.7796052631578947  f1: 0.7438461135847815  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7759602268536886  Acc: 0.7993421052631579  f1: 0.7714491038326712  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7698160807291666  Acc: 0.8256578947368421  f1: 0.8090295864567433  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8332583705584208  Acc: 0.7927631578947368  f1: 0.7244248633357545  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8107264439264933  Acc: 0.7993421052631579  f1: 0.7727586009378327  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7940769195556641  Acc: 0.8125  f1: 0.7941637336954512  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8283825119336446  Acc: 0.7927631578947368  f1: 0.7490230600893439  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8519873023033142  Acc: 0.7796052631578947  f1: 0.7299916093838951  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8433240652084351  Acc: 0.7697368421052632  f1: 0.7365089302589303  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8690004547437032  Acc: 0.7730263157894737  f1: 0.7172240089690087  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8293167153994242  Acc: 0.7763157894736842  f1: 0.7496413380562195  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0624146461486816  Acc: 0.7302631578947368  f1: 0.6316281830439804  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7363789478937784  Acc: 0.8026315789473685  f1: 0.7789250455730546  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8188033898671468  Acc: 0.7697368421052632  f1: 0.7488838375007286  

<fold=0> best score: 0.8090295864567433



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


except last layer: Linear(in_features=768, out_features=4, bias=True)




  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.4096683263778687  Acc: 0.3333333333333333  f1: 0.125  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.394184947013855  Acc: 0.3333333333333333  f1: 0.125  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3800989389419556  Acc: 0.3333333333333333  f1: 0.125  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3675087292989094  Acc: 0.3333333333333333  f1: 0.125  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.356032133102417  Acc: 0.3333333333333333  f1: 0.125  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8472404082616171  Acc: 0.6897689768976898  f1: 0.5283438155136269  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.709072470664978  Acc: 0.7227722772277227  f1: 0.6306689626631625  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7365599671999613  Acc: 0.7227722772277227  f1: 0.6795710205746925  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8512751261393229  Acc: 0.7458745874587459  f1: 0.6853800259373015  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.950474480787913  Acc: 0.7326732673267327  f1: 0.6583525881071893  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9152575929959615  Acc: 0.7557755775577558  f1: 0.6904581901489117  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9604883193969727  Acc: 0.735973597359736  f1: 0.6793181837322644  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9841475089391073  Acc: 0.735973597359736  f1: 0.6803272742134894  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0756358702977498  Acc: 0.7425742574257426  f1: 0.6629762259075199  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9851118326187134  Acc: 0.7458745874587459  f1: 0.711367795500077  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0193790197372437  Acc: 0.7392739273927392  f1: 0.680553038540534  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9928177992502848  Acc: 0.7557755775577558  f1: 0.6924718277231101  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0110860466957092  Acc: 0.7491749174917491  f1: 0.6847646102770062  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0445324977238972  Acc: 0.7656765676567657  f1: 0.7239956226723947  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0375951528549194  Acc: 0.735973597359736  f1: 0.6841591560168909  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0298089186350505  Acc: 0.7557755775577558  f1: 0.695852534562212  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0397347609202068  Acc: 0.7557755775577558  f1: 0.7039145685953803  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0217921733856201  Acc: 0.759075907590759  f1: 0.6945402579696236  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0192635854085286  Acc: 0.759075907590759  f1: 0.6960701158940454  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0384162664413452  Acc: 0.7293729372937293  f1: 0.6688279957389904  

<fold=1> best score: 0.7239956226723947



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


except last layer: Linear(in_features=768, out_features=4, bias=True)




  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2983087301254272  Acc: 0.30033003300330036  f1: 0.11548223350253808  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2909509738286336  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2847631772359211  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2793201605478923  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.275234619776408  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8549795349438986  Acc: 0.6996699669966997  f1: 0.5331390248113428  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.662304162979126  Acc: 0.7887788778877888  f1: 0.71006264797792  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6906889875729879  Acc: 0.7557755775577558  f1: 0.6214163048574166  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6895545522371928  Acc: 0.7854785478547854  f1: 0.7015732040789965  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7232275009155273  Acc: 0.7953795379537953  f1: 0.7181281702426328  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8044611612955729  Acc: 0.7788778877887789  f1: 0.7002301251887418  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.815599262714386  Acc: 0.7821782178217822  f1: 0.6822107330897975  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8480101029078165  Acc: 0.7953795379537953  f1: 0.7113711347570411  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8508577346801758  Acc: 0.7887788778877888  f1: 0.6914300202839756  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9121517340342203  Acc: 0.7557755775577558  f1: 0.6642751463287584  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.883186399936676  Acc: 0.7755775577557755  f1: 0.671667133556857  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8848959803581238  Acc: 0.7788778877887789  f1: 0.6746655478177217  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8566420873006185  Acc: 0.7854785478547854  f1: 0.6806117166163381  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8801807363828024  Acc: 0.7986798679867987  f1: 0.6904221963312621  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9380548199017843  Acc: 0.7722772277227723  f1: 0.6700906196932367  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9253263473510742  Acc: 0.7788778877887789  f1: 0.6739556469650116  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9227205912272135  Acc: 0.7722772277227723  f1: 0.679295537081978  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9156429568926493  Acc: 0.759075907590759  f1: 0.6715785209736336  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.122807780901591  Acc: 0.7425742574257426  f1: 0.6412380122057542  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9275244474411011  Acc: 0.7788778877887789  f1: 0.6929450757575757  

<fold=2> best score: 0.7181281702426328



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


except last layer: Linear(in_features=768, out_features=4, bias=True)


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.305566708246867  Acc: 0.30033003300330036  f1: 0.11548223350253808  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2986867427825928  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2923349936803181  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2866891622543335  Acc: 0.30363036303630364  f1: 0.12157661977834613  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2819782495498657  Acc: 0.32673267326732675  f1: 0.16291211232560204  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9142185648282369  Acc: 0.6633663366336634  f1: 0.5008333361283563  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6870709260304769  Acc: 0.7557755775577558  f1: 0.6317325571994502  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6666847666104635  Acc: 0.7755775577557755  f1: 0.7275828729281768  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7014937400817871  Acc: 0.7953795379537953  f1: 0.7429850005178379  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.908808171749115  Acc: 0.6996699669966997  f1: 0.6060158152243489  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7921321392059326  Acc: 0.759075907590759  f1: 0.702594020098779  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8389293750127157  Acc: 0.7722772277227723  f1: 0.7247844727526871  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8849142591158549  Acc: 0.759075907590759  f1: 0.7282927800169179  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9084794521331787  Acc: 0.7821782178217822  f1: 0.7476446890713935  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9359033107757568  Acc: 0.7623762376237624  f1: 0.7001439395523583  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8699391881624857  Acc: 0.7920792079207921  f1: 0.7426184937385467  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8477563858032227  Acc: 0.7986798679867987  f1: 0.7539896441658162  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9429567456245422  Acc: 0.7557755775577558  f1: 0.6857841038513307  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8558801611264547  Acc: 0.7722772277227723  f1: 0.719577461997144  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8322210907936096  Acc: 0.801980198019802  f1: 0.7771677231498662  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8689191142717997  Acc: 0.7755775577557755  f1: 0.7245731143202526  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0104211171468098  Acc: 0.7755775577557755  f1: 0.7276485615118434  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.899309237798055  Acc: 0.759075907590759  f1: 0.7162437787818499  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9828693270683289  Acc: 0.7854785478547854  f1: 0.759109873081001  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0439409414927165  Acc: 0.7392739273927392  f1: 0.6976043165947011  

<fold=3> best score: 0.7771677231498662



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


except last layer: Linear(in_features=768, out_features=4, bias=True)




  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3494792779286702  Acc: 0.30033003300330036  f1: 0.11548223350253808  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3393363952636719  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3302042881647747  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3222131331761677  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.315281629562378  Acc: 0.30033003300330036  f1: 0.11548223350253808  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2874527772267659  Acc: 0.31353135313531355  f1: 0.13951609069734278  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9277047316233317  Acc: 0.6270627062706271  f1: 0.4818817135765349  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7387879689534506  Acc: 0.7623762376237624  f1: 0.5882231220079638  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6855067412058512  Acc: 0.7524752475247525  f1: 0.6855832551967671  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7457071940104166  Acc: 0.759075907590759  f1: 0.704724234615869  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9902512629826864  Acc: 0.7062706270627063  f1: 0.62516367298976  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8429780205090841  Acc: 0.7623762376237624  f1: 0.7083256162253624  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8008528550465902  Acc: 0.7656765676567657  f1: 0.7102819302341254  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9369064966837565  Acc: 0.7458745874587459  f1: 0.68094290872172  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9521599014600118  Acc: 0.7557755775577558  f1: 0.7194528456692968  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9533926248550415  Acc: 0.768976897689769  f1: 0.7033786691052818  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9645955562591553  Acc: 0.7656765676567657  f1: 0.7042442810457517  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9258459607760111  Acc: 0.7425742574257426  f1: 0.6870844911403783  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9470388690630595  Acc: 0.7557755775577558  f1: 0.6875180375180376  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9331601063410441  Acc: 0.7788778877887789  f1: 0.7001277829054091  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9802619020144144  Acc: 0.7623762376237624  f1: 0.7137069950385166  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9140208959579468  Acc: 0.768976897689769  f1: 0.7237749058253689  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8953603704770406  Acc: 0.7854785478547854  f1: 0.7407920900450591  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0834070444107056  Acc: 0.7392739273927392  f1: 0.6956077944450036  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1005584001541138  Acc: 0.7557755775577558  f1: 0.7127912562584355  

<fold=4> best score: 0.7407920900450591

CV: 0.7538226385133392


<Figure size 432x288 with 0 Axes>

In [13]:
import requests as rt

# Send a message to LINE Notify once training has finished.
# The access token is a secret, so it is read from the LINE_NOTIFY_TOKEN
# environment variable instead of being stored in the notebook.
# NOTE(review): the token that used to be hardcoded in this cell has been
# exposed and should be revoked and re-issued.
token = os.environ.get("LINE_NOTIFY_TOKEN")
line = "https://notify-api.line.me/api/notify"
if token:
    head = {"Authorization": "Bearer " + token}
    # `lines` is not defined in this cell; presumably it is built by an
    # earlier cell -- confirm it exists on a fresh Restart & Run All.
    mes = {"message": lines}
    # A timeout keeps a stalled request from hanging the notebook forever.
    response = rt.post(line, headers=head, data=mes, timeout=10)
    print(response)
else:
    # The notification is best-effort: skip it rather than abort the run.
    print("LINE_NOTIFY_TOKEN is not set; skipping LINE notification.")

<Response [200]>

In [None]:
# inference
# Load the best checkpoint of every fold so their outputs can be averaged.
models = []
for fold in range(NUM_SPLITS):
    model = Classifier(MODEL_NAME)
    # map_location lets a checkpoint saved on a GPU be loaded when DEVICE
    # falls back to the CPU.
    state_dict = torch.load(
        MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Placeholder label column; presumably make_dataset requires one -- confirm.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader), desc="<Test>")

    for batch in progress:
        # NOTE(review): this unpacking relies on the order in which the batch
        # dict yields its values; confirm it against make_dataset.
        attention_mask, input_ids, labels, token_type_ids = batch.values()

        # Average the raw outputs of all fold models (simple ensemble).
        outputs = [
            model(input_ids, attention_mask, token_type_ids) for model in models
        ]
        outputs = sum(outputs) / len(outputs)

        # softmax is monotonic, so the argmax of the averaged outputs is
        # already the predicted class; no softmax / list round-trip needed.
        final_output.extend(outputs.argmax(dim=1).cpu().tolist())

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
submit["labels"] = final_output
# Training labels were shifted down by one (jobflag - 1); shift them back.
submit["labels"] = submit["labels"] + 1

# Make sure the target directory exists before writing the submission.
os.makedirs("../output", exist_ok=True)
try:
    # `cv` only exists when the training cell was run in this kernel.
    cv_tag = str(cv).replace(".", "")[:10]
except NameError:
    submit_path = "../output/submission.csv"
else:
    submit_path = "../output/submission_cv{}.csv".format(cv_tag)
submit.to_csv(submit_path, index=False, header=False)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]