In [1]:
!nvidia-smi

Thu Oct 13 06:24:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   68C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Imports

In [2]:
%%capture
!pip install transformers
!pip install datasets
!pip install torchmetrics
!pip install sentencepiece
!pip install nlpaug
# !pip install pytorch-lightning

In [3]:
import gc
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive
import re
from tqdm.auto import tqdm
import glob

import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import F1Score
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

# Augmentation
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

def set_seed(seed=0):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic algorithm selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects subprocesses started afterwards; the hash seed of the running
    # interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU (manual_seed only seeds the current
    # device) and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

set_seed()

import warnings
warnings.simplefilter('ignore')



[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
drive.mount('/content/drive', force_remount=False)

Mounted at /content/drive


In [5]:
%cd /content/drive/MyDrive/signate_student_cup_2022

/content/drive/MyDrive/signate_student_cup_2022


# Main part

In [6]:
# Matches any HTML/XML tag such as "<li>" or "</p>".
_TAG_PATTERN = re.compile(r"<[^>]*?>")

# Ordered (compiled pattern, replacement) rules applied to every line after tag removal.
# The order matters: " &amp; " must be handled before the bare "&amp;", escaped
# style tags before the bare "&lt;" / "&gt;", and the literal "\u202f" / "\xa0"
# escape sequences before the remaining backslashes are dropped.
_CLEANING_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r' &amp; ', ' and '),
        (r'&amp;', ''),
        (r'&lt;.*?style.*?&gt;', ''),
        (r'&lt;', ' less than '),
        (r'&gt;', ' more than '),
        (r'\\u202f', ''),
        (r'\\xa0', ''),
        (r'\\', ''),
        # One rule covers both http:// and https:// URLs.
        (r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', ''),
    )
)


def text_cleaning(batch):
    """Clean a batch of raw job descriptions.

    Each description is split on "</li>"; every piece has its tags removed,
    HTML entities / escape sequences / URLs rewritten by ``_CLEANING_RULES``,
    is stripped and terminated with a period. Pieces that end up empty (or a
    lone ".") are dropped and the rest are joined with single spaces.

    Args:
        batch: Mapping with a "description" entry holding an iterable of texts.
            Non-string entries (e.g. None for a missing value) are treated as
            empty text.

    Returns:
        ``{"clean_description": [...]}`` with one cleaned string per input text.
    """
    clean_texts = []
    for text in batch["description"]:
        if not isinstance(text, str):
            # Missing values would otherwise fail on .split().
            text = ""
        clean_lines = []
        for line in text.split("</li>"):
            clean_line = remove_tag(line)
            for pattern, replacement in _CLEANING_RULES:
                clean_line = pattern.sub(replacement, clean_line)
            clean_line = clean_line.strip()
            if clean_line in ('', '.'):
                # Nothing left besides (at most) the terminating period.
                continue
            if not clean_line.endswith('.'):
                clean_line += '.'
            clean_lines.append(clean_line)
        clean_texts.append(' '.join(clean_lines))
    return {"clean_description": clean_texts}


def remove_tag(x):
    """Return ``x`` with every "<...>" tag removed."""
    return _TAG_PATTERN.sub('', x)

In [7]:
# Imported under an alias: a plain `from datasets import Dataset` would rebind the
# notebook-level name `Dataset` (torch.utils.data.Dataset, imported at the top) and
# silently change the base class of any torch dataset defined afterwards.
from datasets import Dataset as HFDataset


def get_train_data(train_path):
    """Load the training CSV as a Hugging Face dataset with cleaned text.

    Args:
        train_path: Path of the training CSV; it must contain the "description"
            and "jobflag" columns.

    Returns:
        A ``datasets.Dataset`` whose "jobflag" is shifted down by one and which
        carries the extra "clean_description" column produced by ``text_cleaning``.
    """
    train_df = pd.read_csv(train_path)
    train_df["jobflag"] -= 1
    train_ds = HFDataset.from_pandas(train_df)
    # batch_size=None hands the whole table to text_cleaning in a single call.
    return train_ds.map(text_cleaning, batched=True, batch_size=None)


def get_test_data(test_path):
    """Load the test CSV as a Hugging Face dataset with cleaned text.

    Args:
        test_path: Path of the test CSV; it must contain a "description" column.

    Returns:
        A ``datasets.Dataset`` with the extra "clean_description" column.
    """
    test_df = pd.read_csv(test_path)
    test_ds = HFDataset.from_pandas(test_df)
    return test_ds.map(text_cleaning, batched=True, batch_size=None)

In [8]:
# The base class is spelled out in full: a later cell runs `from datasets import Dataset`,
# which rebinds the bare name `Dataset` to the Hugging Face class.
class Job_CLS_Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset that tokenizes job descriptions on access.

    Args:
        config: Settings object; ``config.model_name`` selects the tokenizer and
            ``config.max_token_len`` (if present) is the default sequence length.
        descriptions: Sequence of texts.
        labels: Optional sequence of integer class ids aligned with ``descriptions``.
        max_token_len: Padded/truncated token length. ``None`` (default) falls back to
            ``config.max_token_len`` and then to 128, so the configured length is
            honoured by callers that do not pass it explicitly.
        mode: ``"Train"`` enables random on-the-fly text augmentation.
        aug_data: When True, rows of "data/augmented_data.csv" are appended
            (its "jobflag" column is shifted down by one first).

    Raises:
        ValueError: If ``aug_data`` is True but no labels were given.
    """

    # Probability with which each augmenter is applied to a sample in "Train" mode.
    AUG_PROB = 0.05
    # Augmenters are tried in this fixed order.
    AUG_ORDER = ("synonym_replace", "random_insert", "random_substitute", "random_swap", "random_delete")

    def __init__(self, config, descriptions, labels=None, max_token_len: int=None, mode=None, aug_data=False):
        self.config = config
        # Copies: appending augmented rows must never mutate the caller's sequences.
        self.descriptions = list(descriptions)
        self.labels = list(labels) if labels is not None else None

        if aug_data:
            if self.labels is None:
                raise ValueError("aug_data=True requires labels")
            print(len(self.descriptions))
            print(len(self.labels))
            aug_df = pd.read_csv("data/augmented_data.csv",  index_col=0)
            self.descriptions += list(aug_df["description"])
            self.labels += list(aug_df["jobflag"] - 1)
            del aug_df
            print(len(self.descriptions))
            print(len(self.labels))

        self.tokenizer = AutoTokenizer.from_pretrained(config.model_name)
        if max_token_len is None:
            max_token_len = getattr(config, "max_token_len", 128)
        self.max_token_len = max_token_len

        if mode=="Train":
            # Fall back to CPU so the augmenters can still be built without a GPU.
            aug_device = "cuda" if torch.cuda.is_available() else "cpu"
            augs = {}
            augs["synonym_replace"] = naw.SynonymAug(aug_src='wordnet')
            augs["random_insert"] = naw.ContextualWordEmbsAug(model_path="distilbert-base-uncased",
                                            device=aug_device, action="insert", aug_max=1)
            augs["random_substitute"] = naw.ContextualWordEmbsAug(model_path="distilbert-base-uncased",
                                            device=aug_device, action="substitute", aug_p=0.5)
            augs["random_swap"] = naw.RandomWordAug(action="swap")
            augs["random_delete"] = naw.RandomWordAug()
            self.augs = augs

        self.mode = mode

    def __len__(self):
        """Number of samples (including appended augmented rows)."""
        return len(self.descriptions)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask[, label])`` for one sample.

        ``input_ids`` and ``attention_mask`` are flattened 1-D tensors; the label
        (int64 scalar tensor) is only included when labels were provided.
        """
        description = self.descriptions[index]
        if self.mode=="Train":
            for aug_name in self.AUG_ORDER:
                if np.random.rand() < self.AUG_PROB:
                    augmented = self.augs[aug_name].augment(description)
                    # augment() returns a list in some nlpaug versions and a plain
                    # string in others; blindly indexing [0] on a string would keep
                    # only its first character.
                    if isinstance(augmented, (list, tuple)):
                        if augmented:
                            description = augmented[0]
                    else:
                        description = augmented

        encoded = self.tokenizer.encode_plus(description,
                                                add_special_tokens=True,
                                                return_tensors='pt',
                                                truncation=True,
                                                padding='max_length',
                                                max_length=self.max_token_len,
                                                return_attention_mask = True)
        if self.labels is not None:
            label = torch.tensor(self.labels[index], dtype=torch.int64)
            return encoded.input_ids.flatten(), encoded.attention_mask.flatten(), label
        else:
            return encoded.input_ids.flatten(), encoded.attention_mask.flatten()

class Job_CLS_Model(nn.Module):
    """Pretrained transformer encoder with a two-layer multi-sample-dropout head.

    The first-token hidden state of the encoder goes through ``hidden``
    (hidden_size -> hidden_size/2), ReLU and ``classifier``
    (hidden_size/2 -> ``config.n_labels``). Both linear layers are evaluated once
    per dropout in ``logits_dropouts`` and the results are averaged.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        backbone_cfg = AutoConfig.from_pretrained(config.model_name, output_hidden_states=True)
        self.pretrained_model = AutoModel.from_pretrained(config.model_name, config=backbone_cfg)

        width = self.pretrained_model.config.hidden_size
        half_width = int(width/2)

        self.hidden = torch.nn.Linear(width, half_width)
        torch.nn.init.xavier_uniform_(self.hidden.weight)

        self.classifier = torch.nn.Linear(half_width, config.n_labels)
        torch.nn.init.xavier_uniform_(self.classifier.weight)

        self.loss_func = nn.CrossEntropyLoss()
        # Kept as a registered attribute; forward() does not use it.
        self.f1_func = F1Score(num_classes=config.n_labels, average="macro")
        self.logits_dropouts = nn.ModuleList([nn.Dropout(p=0.1) for _ in range(config.num_msd)])

    def _msd_average(self, layer, features):
        """Apply ``layer`` once per dropout module and average the outputs."""
        samples = [layer(drop(features)) for drop in self.logits_dropouts]
        return sum(samples) / self.config.num_msd

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``logits``, or ``(logits, loss)`` when ``labels`` is given."""
        encoder_out = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence.
        first_token = encoder_out["last_hidden_state"][:, 0, :]

        hidden_act = F.relu(self._msd_average(self.hidden, first_token))
        logits = self._msd_average(self.classifier, hidden_act)

        if labels is None:
            return logits
        # Cross-entropy only; no F1 is computed inside the model.
        return logits, self.loss_func(logits, labels)

In [9]:
def debug(ds):
    """Print every cleaned description of ``ds`` grouped under jobflag 0..3."""
    frame = pd.DataFrame(ds).sort_values("jobflag").reset_index(drop=True)
    for flag in range(4):
        print(f'# ===== jobflag={flag} ===== #\n')
        of_this_flag = frame.loc[frame["jobflag"] == flag, "clean_description"]
        for text in of_this_flag:
            print("    ", text, "\n")

In [10]:
def _per_class_f1(labels, preds, n_labels):
    """Return the one-vs-rest F1 of every class id in ``range(n_labels)`` as floats.

    A class without any true or predicted sample scores 0.0 instead of NaN.
    """
    scores = f1_score(labels, preds, labels=list(range(n_labels)), average=None, zero_division=0)
    return [float(s) for s in scores]


def training(config, train_ds):
    """Train one model per stratified fold and collect out-of-fold predictions.

    For every fold the model is trained for ``config.n_epochs`` epochs with mixed
    precision and gradient accumulation; after each epoch it is evaluated on the
    validation split, and the weights with the best macro F1 are saved to
    ``config.model_save_path/fold{fold}.pth``.

    Args:
        config: Settings object (model, optimisation and path attributes).
        train_ds: Dataset exposing ``num_rows``, ``select`` and the
            "clean_description" / "jobflag" columns.

    Returns:
        Tuple ``(score, oof_pred, fold_num)``: the macro F1 of the out-of-fold
        predictions, the ``(n_samples, config.n_labels)`` float32 probabilities of
        each sample from the best epoch of its fold, and the 1-based fold number
        in which each sample was validated.
    """
    n_labels = config.n_labels
    accumulation_steps = max(1, int(config.gradient_accumulation_steps))

    folds = StratifiedKFold(n_splits=config.num_fold)
    splits = folds.split(np.zeros(train_ds.num_rows), train_ds["jobflag"])

    oof_pred = np.zeros((len(train_ds), n_labels), dtype=np.float32)
    fold_num = np.zeros(len(train_ds), dtype=np.int32)

    for fold, (train_idxs, val_idxs) in enumerate(splits):

        print(f'\n# ==== start fold{fold+1} ==== #\n')
        train_fold_ds = train_ds.select(train_idxs)
        valid_fold_ds = train_ds.select(val_idxs)

        # debug(valid_fold_ds)

        train_dataset = Job_CLS_Dataset(config, train_fold_ds["clean_description"], train_fold_ds["jobflag"], aug_data=True)# mode="Train"
        valid_dataset = Job_CLS_Dataset(config, valid_fold_ds["clean_description"], valid_fold_ds["jobflag"])

        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=config.batch_size,
            shuffle=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=config.batch_size,
            shuffle=False,
            drop_last=False
        )

        best_val_preds = None
        best_val_f1_score = -1

        model = Job_CLS_Model(config)
        model.to(config.device)

        # Biases and LayerNorm parameters are excluded from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': config.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=config.lr,
            betas=config.beta,
            weight_decay=config.weight_decay,
        )

        num_train_optimization_steps = int(
            len(train_loader) * config.n_epochs // accumulation_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * config.num_warmup_steps_rate)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )
        # One scaler per fold so the learned loss scale carries over between epochs.
        scaler = GradScaler()

        for epoch in range(config.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch+1} ============== #")
            model.train()
            train_preds = []
            train_labels = []
            # Start every epoch from clean gradients; an incomplete accumulation
            # group left over from the previous epoch is discarded.
            optimizer.zero_grad()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (input_ids, attention_mask, labels) in enumerate(pbar):

                    input_ids = input_ids.to(config.device)
                    attention_mask = attention_mask.to(config.device)
                    labels = labels.to(config.device)

                    with autocast():
                        output, loss = model(input_ids, attention_mask, labels)
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_last_lr()[0]
                    })

                    # Dividing keeps the accumulated gradient an average over the group.
                    scaler.scale(loss / accumulation_steps).backward()

                    if (step+1) % accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()
                        # Gradients are cleared only after an optimizer step; clearing
                        # them on every micro-batch would throw away the accumulation.
                        optimizer.zero_grad()

                    output = output.softmax(axis=1).detach().cpu().numpy()
                    train_preds.append(output)
                    train_labels.append(labels.detach().cpu().numpy())

            train_preds = np.concatenate(train_preds)
            train_labels = np.concatenate(train_labels)
            train_pred_labels = np.argmax(train_preds, axis=1)
            train_f1_score = f1_score(train_labels, train_pred_labels, average='macro')
            print(f'Training score: {train_f1_score}')
            print(f'    Each F1: {_per_class_f1(train_labels, train_pred_labels, n_labels)}')

            # evaluating
            val_preds = []
            val_labels = []
            val_losses = []
            val_nums = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for (input_ids, attention_mask, labels) in pbar:

                        input_ids = input_ids.to(config.device)
                        attention_mask = attention_mask.to(config.device)
                        labels = labels.to(config.device)
                        with autocast():
                            output, loss = model(input_ids, attention_mask, labels)
                        output = output.softmax(axis=1).detach().cpu().numpy()
                        val_preds.append(output)
                        val_labels.append(labels.detach().cpu().numpy())
                        # Weighted by batch size so the last, smaller batch is not over-counted.
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.concatenate(val_preds)
            val_labels = np.concatenate(val_labels)
            val_pred_labels = np.argmax(val_preds, axis=1)
            val_loss = sum(val_losses) / sum(val_nums)
            val_f1_score = f1_score(val_labels, val_pred_labels, average='macro')

            val_log = {
                'val_loss': val_loss,
                'val_f1_score': val_f1_score,
            }
            display(val_log)
            print(f'    Each F1: {_per_class_f1(val_labels, val_pred_labels, n_labels)}')

            if best_val_f1_score < val_f1_score:
                print("\n [ save model weight ] \n")
                best_val_preds = val_preds
                best_val_f1_score = val_f1_score
                torch.save(
                    model.state_dict(),
                    os.path.join(config.model_save_path, f"fold{fold}.pth")
                )

        oof_pred[val_idxs] = best_val_preds.astype(np.float32)
        fold_num[val_idxs] = fold+1
        # The optimizer also references the parameters; drop everything so the GPU
        # memory is really released before the next fold builds its model.
        del model, optimizer, scheduler, scaler
        gc.collect()
        torch.cuda.empty_cache()

    # scoring
    score = f1_score(train_ds['jobflag'], np.argmax(oof_pred, axis=1), average='macro')
    print('CV:', round(score, 5))
    return score, oof_pred, fold_num

In [11]:
def inferring(config, test_ds):
    """Average the test-set class probabilities of every saved fold checkpoint.

    Args:
        config: Settings object; reads ``model_save_path``, ``preds_save_path``,
            ``batch_size``, ``device`` and ``n_labels``, and stores the discovered
            checkpoint paths in ``config.model_weights``.
        test_ds: Dataset exposing the "clean_description" column.

    Returns:
        ``(n_samples, n_labels)`` float32 array of averaged softmax probabilities,
        also saved to ``config.preds_save_path/sub_pred.npy``.

    Raises:
        FileNotFoundError: If no ``fold*.pth`` checkpoint exists; otherwise an
            all-zero prediction would be returned without any warning.
    """
    config.model_weights = sorted(glob.glob(os.path.join(config.model_save_path, 'fold*.pth')))
    if not config.model_weights:
        raise FileNotFoundError(f"no fold*.pth checkpoint found in {config.model_save_path}")

    sub_pred = np.zeros((len(test_ds), getattr(config, "n_labels", 4)), dtype=np.float32)

    # The data does not depend on the checkpoint: build dataset and loader once.
    test_dataset = Job_CLS_Dataset(config,  test_ds["clean_description"])
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        drop_last=False
    )

    for model_weight in config.model_weights:
        model = Job_CLS_Model(config)
        # map_location lets checkpoints written on a GPU be restored on any device.
        model.load_state_dict(torch.load(model_weight, map_location=config.device))
        model.to(config.device)

        model.eval()
        fold_pred = []
        with torch.no_grad():
            with tqdm(test_loader, total=len(test_loader)) as pbar:
                for (input_ids, attention_mask) in pbar:

                    input_ids = input_ids.to(config.device)
                    attention_mask = attention_mask.to(config.device)
                    with autocast():
                        output = model(input_ids, attention_mask)
                    output = output.softmax(axis=1).detach().cpu().numpy()
                    fold_pred.append(output)
        fold_pred = np.concatenate(fold_pred)
        sub_pred += fold_pred / len(config.model_weights)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(config.preds_save_path, 'sub_pred.npy'), sub_pred)
    return sub_pred

In [12]:
class Config:
    """Static experiment settings, read as class attributes."""

    # private
    # _colab_path = "/content/drive/MyDrive/signate_student_cup_2022"
    _exp_num = '003'

    # Training parameters
    seed = 0
    model_name = "microsoft/deberta-v2-xlarge"
    # Other checkpoint names listed as candidates for model_name:
    #   microsoft/deberta-v3-base, microsoft/deberta-base, roberta-base,
    #   roberta-large, roberta-large-mnli, xlnet-large-cased, albert-xxlarge-v2,
    #   microsoft/deberta-large, microsoft/deberta-v3-large,
    #   microsoft/deberta-v2-xlarge, funnel-transformer/large,
    #   funnel-transformer/medium,
    #   microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext,
    #   bert-base-uncased, albert-base-v2, albert-large-v2,
    #   google/electra-large-discriminator, google/electra-base-discriminator,
    #   facebook/bart-large-mnli, facebook/bart-large, facebook/bart-base
    num_fold = 8
    batch_size = 2
    n_epochs = 10
    max_token_len = 256
    lr = 2e-5

    weight_decay = 2e-5
    beta = (0.9, 0.98)
    num_warmup_steps_rate = 0.001
    gradient_accumulation_steps = 2
    num_eval = 1

    n_labels = 4
    num_msd = 8

    # Output location.
    # NOTE(review): a model_name containing "/" makes this a nested path
    # ("Exp003_microsoft/deberta-v2-xlarge_...") — confirm that is intended.
    save_folder_name = '_'.join([
        f'Exp{_exp_num}',
        f'{model_name}',
        f'fold{num_fold}',
        f'epoch{n_epochs}',
        f'tokenlen{max_token_len}',
        f'lr{lr}',
    ])
    
def setup(config):
    print("### Configration Setup...")
    config.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # mount
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive') 

    config.output_path = './outputs'
    config.experiment_path = os.path.join(config.output_path, config.save_folder_name)
    print(f'    experiment_path  >> {config.experiment_path}')
    config.model_save_path = os.path.join(config.experiment_path, 'model')
    print(f'    model_save_path >> {config.model_save_path}')
    config.figure_save_path = os.path.join(config.experiment_path, 'figure')
    print(f'    figure_save_path >> {config.figure_save_path}')
    config.preds_save_path = os.path.join(config.experiment_path, 'preds')
    print(f'    preds_save_path >> {config.preds_save_path}')
    
    for d in [config.output_path, config.experiment_path, config.model_save_path, config.figure_save_path, config.preds_save_path]:
        os.makedirs(d, exist_ok=True)

    print("### Setup Complete. \n")
    return config

In [13]:
def main():
    """Run the whole experiment: setup, k-fold training, inference, submission.

    Returns:
        Tuple ``(oof_pred, fold_num)`` as produced by ``training``, kept for the
        error analysis cells.
    """

    # config
    config = setup(Config)
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    submit_path = './data/submit_sample.csv'

    train_ds = get_train_data(train_path)
    test_ds = get_test_data(test_path)

    score, oof_pred, fold_num = training(config, train_ds)

    sub_pred = inferring(config, test_ds)
    sub = pd.read_csv(submit_path, header=None)
    # Predictions are zero-based; the submission uses the 1-based jobflag.
    sub[1] = np.argmax(sub_pred, axis=1).astype(int) + 1

    def fix_leak(sub, train_path, test_path):
        """Overwrite predictions of test rows whose description also occurs in train."""
        print("===== fix_leak =====")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        # One label per description (first occurrence). A plain left merge would add
        # a row for every duplicate in train, shifting the positions so that they
        # no longer line up with the rows of `sub`.
        known_flags = (
            train_df.drop_duplicates(subset="description")
            .set_index("description")["jobflag"]
        )
        leaked = test_df["description"].map(known_flags).dropna()
        for i, flag in leaked.items():
            print(f'Fix index{i}: {sub.loc[i,1]}')
            sub.loc[i, 1] = int(flag)
            print(f'To {sub.loc[i,1]}')
        return sub
    sub = fix_leak(sub, train_path, test_path)

    # Submission file
    sub.to_csv(os.path.join(config.preds_save_path, f'Exp{config._exp_num}_CV{int(score*(10**10))}_submission.csv'), index=False, header=False)

    return oof_pred, fold_num# debug

In [None]:
# Run the whole pipeline (setup, k-fold training, test inference, submission file) and keep
# the out-of-fold predictions and fold assignment for the analysis cells of the Debug section.
oof_pred, fold_num = main()

### Configration Setup...
    experiment_path  >> ./outputs/Exp003_microsoft/deberta-v2-xlarge_fold8_epoch10_tokenlen256_lr2e-05
    model_save_path >> ./outputs/Exp003_microsoft/deberta-v2-xlarge_fold8_epoch10_tokenlen256_lr2e-05/model
    figure_save_path >> ./outputs/Exp003_microsoft/deberta-v2-xlarge_fold8_epoch10_tokenlen256_lr2e-05/figure
    preds_save_path >> ./outputs/Exp003_microsoft/deberta-v2-xlarge_fold8_epoch10_tokenlen256_lr2e-05/preds
### Setup Complete. 



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


# ==== start fold1 ==== #

1326
1326
1574
1574


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/633 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.45M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/1.78G [00:00<?, ?B/s]

# Debug

In [None]:
# main() keeps its dataset in a local variable, so rebuild the cleaned training set here
# for the analysis cells below.
train_path = './data/train.csv'
train_ds = get_train_data(train_path)

In [None]:
from torch.nn.functional import cross_entropy
def calculate_loss(logits, labels):
    """Return ``(pred_labels, loss)`` for a batch of class scores.

    ``pred_labels`` is the argmax over the last axis of ``logits``; ``loss`` is the
    unreduced (one value per sample) cross-entropy of ``logits`` against ``labels``.
    """
    per_sample_loss = cross_entropy(logits, labels, reduction="none")
    return np.argmax(logits, axis=-1), per_sample_loss

# oof_pred holds softmax probabilities, not logits. Passing them straight to
# cross_entropy would apply softmax a second time and distort every loss value.
# Log-probabilities are valid logits: softmax(log p) == p, so the result is the
# true negative log-likelihood. The clamp avoids log(0) for probabilities that
# underflowed to zero.
oof_log_probs = torch.log(torch.tensor(oof_pred).clamp_min(1e-12))
pred_labels, loss = calculate_loss(oof_log_probs, torch.tensor(train_ds["jobflag"]))

In [None]:
# Per-sample analysis table: out-of-fold prediction, its loss and the fold number
# recorded for the row by training().
df = pd.DataFrame(train_ds)[["id", "clean_description", "jobflag"]]
df["pred"] = pred_labels
df["loss"] = loss
df["fold"] = fold_num
# Ascending loss: lowest-loss rows first, highest-loss rows last.
df = df.sort_values("loss").reset_index(drop=True)
df

In [None]:
# Label distribution of the first 750 rows of df.
df.head(750)["jobflag"].value_counts()

In [None]:
# Label distribution of the last 750 rows of df.
df.tail(750)["jobflag"].value_counts()

In [None]:
# Display the analysis table.
df

In [None]:
# Short class names, indexed by the zero-based "pred" value in the cells below.
job_flags = ['DS', 'ML', 'SE', 'CO']

In [None]:
# Translation pipeline (English 'en_XX' -> Japanese 'ja_XX') built from the
# 'facebook/mbart-large-50-one-to-many-mmt' checkpoint; the cells below use it to print
# a translation next to every description.
from transformers import pipeline
# NOTE(review): `tokenizer` is not referenced by any cell visible below — confirm it is still needed.
tokenizer = AutoTokenizer.from_pretrained('facebook/mbart-large-50-one-to-many-mmt')
mbart_translator = pipeline('translation',
                            model='facebook/mbart-large-50-one-to-many-mmt',
                            src_lang='en_XX', tgt_lang='ja_XX', device=0)

In [None]:
# DS (jobflag == 0): first 20 and last 30 rows of the class in df's current order.
ds_rows = df[df["jobflag"]==0]
for banner, subset in (
    ("\n=========== Head 20 ===========\n", ds_rows.head(20)),
    ("\n=========== Tail 30 ===========\n", ds_rows.tail(30)),
):
    print(banner)
    temp_df = subset.reset_index(drop=True)
    for i in temp_df.index:
        ex = temp_df.loc[i, :]
        pred = ex["pred"]
        print(f'pred={pred}({job_flags[pred]}) >> loss={ex["loss"]}')
        translated = mbart_translator(ex["clean_description"])
        print(translated[0]["translation_text"])
        print(ex["clean_description"], "\n")
        # (A back-translation preview via augs["bt_en_de"] used to be printed here; it is disabled.)

In [None]:
# ML (jobflag == 1): every row of the class in df's current order.
temp_df = df[df["jobflag"]==1].reset_index(drop=True)
for i in temp_df.index:
    ex = temp_df.loc[i, :]
    pred = ex["pred"]
    print(f'pred={pred}({job_flags[pred]}) >> loss={ex["loss"]}')
    translated = mbart_translator(ex["clean_description"])
    print(translated[0]["translation_text"])
    print(ex["clean_description"], "\n")
    # (A back-translation preview via augs["bt_en_de"] used to be printed here; it is disabled.)

In [None]:
# SE (jobflag == 2): first 20 and last 30 rows of the class in df's current order.
se_rows = df[df["jobflag"]==2]
for banner, subset in (
    ("\n=========== Head 20 ===========\n", se_rows.head(20)),
    ("\n=========== Tail 30 ===========\n", se_rows.tail(30)),
):
    print(banner)
    temp_df = subset.reset_index(drop=True)
    for i in temp_df.index:
        ex = temp_df.loc[i, :]
        pred = ex["pred"]
        print(f'pred={pred}({job_flags[pred]}) >> loss={ex["loss"]}')
        translated = mbart_translator(ex["clean_description"])
        print(translated[0]["translation_text"])
        print(ex["clean_description"], "\n")
        # (A back-translation preview via augs["bt_en_de"] used to be printed here; it is disabled.)

In [None]:
# CO (jobflag == 3): first 20 and last 30 rows of the class in df's current order.
co_rows = df[df["jobflag"]==3]
for banner, subset in (
    ("\n=========== Head 20 ===========\n", co_rows.head(20)),
    ("\n=========== Tail 30 ===========\n", co_rows.tail(30)),
):
    print(banner)
    temp_df = subset.reset_index(drop=True)
    for i in temp_df.index:
        ex = temp_df.loc[i, :]
        pred = ex["pred"]
        print(f'pred={pred}({job_flags[pred]}) >> loss={ex["loss"]}')
        translated = mbart_translator(ex["clean_description"])
        print(translated[0]["translation_text"])
        print(ex["clean_description"], "\n")
        # (A back-translation preview via augs["bt_en_de"] used to be printed here; it is disabled.)