In [1]:
!pip3 install datasets transformers -q
!pip3 install wandb --upgrade -q

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm
from sklearn.preprocessing import MultiLabelBinarizer

from datasets import load_dataset
import random
from sklearn import metrics, model_selection, preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
def seed_everything(seed=73):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

seed_everything(1234)

**Dataset**

In [4]:
train = load_dataset('eurlex', 'eurlex57k', split='train').to_pandas()
# test = load_dataset('eurlex', 'eurlex57k', split='test').to_pandas()
val = load_dataset('eurlex', 'eurlex57k', split='validation').to_pandas()
# print(train.shape, test.shape, val.shape)

Reusing dataset eurlex (/root/.cache/huggingface/datasets/eurlex/eurlex57k/1.1.0/d2fdeaa4fcb5f41394d2ed0317c8541d7f9be85d2d601b9fa586c8b461bc3a34)
Reusing dataset eurlex (/root/.cache/huggingface/datasets/eurlex/eurlex57k/1.1.0/d2fdeaa4fcb5f41394d2ed0317c8541d7f9be85d2d601b9fa586c8b461bc3a34)


In [5]:
train['text'] = train['title'] + " " + train['text']
val['text'] = val['title'] + " " + val['text']
# test['text'] = test['title'] + " " + test['text']

In [6]:
train.head()

Unnamed: 0,celex_id,title,text,eurovoc_concepts
0,32014R0727,Commission Implementing Regulation (EU) No 727...,Commission Implementing Regulation (EU) No 727...,"[1402, 2771, 3191, 5055, 519, 5969, 5971]"
1,31975R2481,Regulation (EEC) No 2481/75 of the Council of ...,Regulation (EEC) No 2481/75 of the Council of ...,"[2319, 2713, 2938, 693]"
2,32010D0008,"2010/8/EU, Euratom: Commission Decision of 22 ...","2010/8/EU, Euratom: Commission Decision of 22 ...","[3560, 365, 4256, 4261, 4353, 4585]"
3,31982D0211,82/211/EEC: Commission Decision of 17 March 19...,82/211/EEC: Commission Decision of 17 March 19...,"[1091, 3842, 3874, 4110, 4381, 5287]"
4,31996D0084,"96/84/Euratom, ECSC, EC: Commission Decision o...","96/84/Euratom, ECSC, EC: Commission Decision o...","[1026, 1048, 2300, 3653, 4271, 4390]"


In [7]:
n_labels = 7000

In [8]:
train = train[:15000]
# val = val[:100]
# print(' '.join(train.iloc[0]["text"].split()[:512]))

In [9]:
def to_int(s):
    try:
        return int(s)

    except ValueError:
        # print(s)
        return n_labels

def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df))):
        temp = [0]*n_labels
        # print(i, df.iloc[i]["eurovoc_concepts"])
        label_indices = list(map(to_int, df.iloc[i]["eurovoc_concepts"]))
        for index in label_indices:
            if index < n_labels:
                temp[index] = 1
        one_hot_encoding.append(temp)
    return pd.DataFrame(one_hot_encoding)

In [10]:
train_ohe_labels = one_hot_encoder(train)
val_ohe_labels = one_hot_encoder(val)
# test_ohe_labels = one_hot_encoder(test)

  0%|          | 0/15000 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

In [11]:
train = pd.concat([train, train_ohe_labels], axis=1)
val = pd.concat([val, val_ohe_labels], axis=1)
# test = pd.concat([test, test_ohe_labels], axis=1)

In [12]:
train.head()

Unnamed: 0,celex_id,title,text,eurovoc_concepts,0,1,2,3,4,5,...,6990,6991,6992,6993,6994,6995,6996,6997,6998,6999
0,32014R0727,Commission Implementing Regulation (EU) No 727...,Commission Implementing Regulation (EU) No 727...,"[1402, 2771, 3191, 5055, 519, 5969, 5971]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31975R2481,Regulation (EEC) No 2481/75 of the Council of ...,Regulation (EEC) No 2481/75 of the Council of ...,"[2319, 2713, 2938, 693]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,32010D0008,"2010/8/EU, Euratom: Commission Decision of 22 ...","2010/8/EU, Euratom: Commission Decision of 22 ...","[3560, 365, 4256, 4261, 4353, 4585]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31982D0211,82/211/EEC: Commission Decision of 17 March 19...,82/211/EEC: Commission Decision of 17 March 19...,"[1091, 3842, 3874, 4110, 4381, 5287]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,31996D0084,"96/84/Euratom, ECSC, EC: Commission Decision o...","96/84/Euratom, ECSC, EC: Commission Decision o...","[1026, 1048, 2300, 3653, 4271, 4390]",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
print(train_ohe_labels.head())

   0     1     2     3     4     5     6     7     8     9     ...  6990  \
0     0     0     0     0     0     0     0     0     0     0  ...     0   
1     0     0     0     0     0     0     0     0     0     0  ...     0   
2     0     0     0     0     0     0     0     0     0     0  ...     0   
3     0     0     0     0     0     0     0     0     0     0  ...     0   
4     0     0     0     0     0     0     0     0     0     0  ...     0   

   6991  6992  6993  6994  6995  6996  6997  6998  6999  
0     0     0     0     0     0     0     0     0     0  
1     0     0     0     0     0     0     0     0     0  
2     0     0     0     0     0     0     0     0     0  
3     0     0     0     0     0     0     0     0     0  
4     0     0     0     0     0     0     0     0     0  

[5 rows x 7000 columns]


In [14]:
class Dataset:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                        None,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=True,
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

**Train part:**

In [15]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33marrehova[0m (use `wandb login --relogin` to force relogin)


True

In [16]:
sweep_config = {
    'method': 'random', #grid, random, bayesian
    'metric': {
      'name': 'RP@K',
      'goal': 'maximize'   
    },
    'parameters': {

        'learning_rate': {
            'values': [3e-4]
        },
        'batch_size': {
            'values': [64]
        },
        'epochs':{'value': 10},
        'dropout':{
            'values': [0.4]
        },
        'tokenizer_max_len': {'value': 40},
    }
}

sweep_defaults = {
    'learning_rate': 3e-4,
    'batch_size': 64,
    'epochs': 10,
    'dropout': 0.3,
    'tokenizer_max_len': 40
}

sweep_id = wandb.sweep(sweep_config, project='bhaavnaye')

Create sweep with ID: 1oypluid
Sweep URL: https://wandb.ai/arrehova/bhaavnaye/sweeps/1oypluid


In [17]:
class Classifier(nn.Module):
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"]
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

In [18]:
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)

def build_dataset(tokenizer_max_len):
    train_dataset = Dataset(train.text.tolist(), train[range(n_labels)].values.tolist(), tokenizer, tokenizer_max_len)
    val_dataset = Dataset(val.text.tolist(), val[range(n_labels)].values.tolist(), tokenizer, tokenizer_max_len)
    
    return train_dataset, val_dataset 

def build_dataloader(train_dataset, val_dataset, batch_size):
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    return train_data_loader, val_data_loader

def ret_model(n_train_steps, do_prob):
  model = Classifier(n_train_steps, n_labels, do_prob, bert_model=bert_model)
  return model

In [19]:
bert_model = transformers.SqueezeBertModel.from_pretrained("squeezebert/squeezebert-uncased")

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
def ret_optimizer(model):
    '''
    Taken from Abhishek Thakur's Tez library example: 
    https://github.com/abhishekkrthakur/tez/blob/main/examples/text_classification/binary.py
    '''
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    opt = AdamW(optimizer_parameters, lr=wandb.config.learning_rate)
    return opt

def ret_scheduler(optimizer, num_train_steps):
    sch = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    return sch

def loss_fn(outputs, labels):
    if labels is None:
        return None
    return nn.BCEWithLogitsLoss()(outputs, labels.float())

def rp_k_metric(preds, labels, k=5):
    idx = np.argpartition(preds, kth=-k, axis=-1)[:,-k:]
    div_k = np.clip(np.sum(labels, axis=1), 0, k)
    N = labels.shape[0]

    return np.sum(np.sum(np.take_along_axis(labels, idx, axis=-1), axis=-1) / div_k) / N


def test_rp_k():
    labels = np.array([
        [0,0,1,0,1],
        [0,1,1,1,0],
        [1,0,0,0,1]               
    ])

    preds = np.array([
        [0.1,0.1,0.4,0,0.4],
        [0.1,0.2,0.2,0.2,0.3],
        [1,0,0,0,0]               
    ])

    # must be 2/3
    return rp_k_metric(preds, labels, k=2)


def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    # pr_micro, rec_micro, _ = metrics.precision_recall_curve(labels.ravel(), preds.ravel())
    
    # precision = dict()
    # recall = dict()
    # average_precision = dict()
    # for i in range(n_labels):
    #     precision[i], recall[i], _ = metrics.precision_recall_curve(labels[:, i], preds[:, i])
    #     average_precision[i] = metrics.average_precision_score(labels[:, i], preds[:, i])

    # A "micro-average": quantifying score on all classes jointly
    # precision["micro"], recall["micro"], _ = metrics.precision_recall_curve(labels.ravel(), preds.ravel())

    precision_micro = metrics.average_precision_score(labels, preds, average="micro")
    # precision_macro = metrics.average_precision_score(labels, preds, average="macro")
    rp_k = rp_k_metric(labels, preds)

    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    
    f1_score_micro = metrics.f1_score(labels, np.array(preds) > 0.2, average='micro')
    f1_score_macro = metrics.f1_score(labels, np.array(preds) > 0.2, average='macro')
    
    return {
        "precision_micro": precision_micro,
        "RP@K": rp_k,
        "auc": auc_micro,
        # "precision_macro": precision_macro,
        "f1_score_micro": f1_score_micro,
        "f1_score_macro": f1_score_macro
    }

# metrics.precision_score(y_true, y_pred, average='micro')
# metrics.precision_score(y_true, y_pred, average='macro')
# metrics.precision_score(y_true, y_pred, average='weighted')
# metrics.precision_score(y_true, y_pred, average='samples')

In [21]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''

    train_loss = 0.0
    model.train()
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["labels"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        scheduler.step()
    return train_loss
    

def eval_fn(data_loader, model, device):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''
    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            loss = loss_fn(outputs, targets)
            eval_loss += loss.item()
            fin_targets.extend(targets)
            fin_outputs.extend(torch.sigmoid(outputs))
    return eval_loss, fin_outputs, fin_targets

In [24]:
def trainer(config=None):
    with wandb.init(config=config):
        config = wandb.config

        train_dataset, val_dataset = build_dataset(config.tokenizer_max_len)
        train_data_loader, val_data_loader = build_dataloader(train_dataset, val_dataset, config.batch_size)
        print("Length of Train Dataloader: ", len(train_data_loader))
        print("Length of Valid Dataloader: ", len(val_data_loader))

        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        n_train_steps = int(len(train_dataset) / config.batch_size * 10)

        model = ret_model(n_train_steps, config.dropout)
        optimizer = ret_optimizer(model)
        scheduler = ret_scheduler(optimizer, n_train_steps)
        model.to(device)
        model = nn.DataParallel(model)
        wandb.watch(model)
        
        n_epochs = config.epochs

        best_val_loss = 100
        for epoch in tqdm(range(n_epochs)):
            train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
            eval_loss, preds, labels = eval_fn(val_data_loader, model, device)
          
            print(log_metrics(preds, labels))

            # check first 5 labels:
            print(np.argpartition(torch.stack(preds).cpu().detach().numpy(), kth=-5, axis=-1)[:20,-5:])

            avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(val_data_loader)
            wandb.log({
                "epoch": epoch + 1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss
            })
            print("Average Train loss: ", avg_train_loss)
            print("Average Valid loss: ", avg_val_loss)

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model.state_dict(), "./best_model.pt")  
                print("Model saved as current val_loss is: ", best_val_loss)    

In [25]:
wandb.agent(sweep_id, function=trainer, count=6)

[34m[1mwandb[0m: Agent Starting Run: 74rh0fwg with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	tokenizer_max_len: 40


Length of Train Dataloader:  235
Length of Valid Dataloader:  94




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02679531985289183, 'RP@K': 0.015822588602701824, 'auc': 0.8999757304828533, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1309 2635 1605  693 1118]
 [1309 2635  693 1605 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1118  693 1605]
 [1309 2635 1605  693 1118]
 [1309 2635 1118  693 1605]
 [1309 2635 1605  693 1118]
 [1309 2635 1605  693 1118]
 [1309 2635 1118  693 1605]
 [1309 2635 1118  693 1605]
 [1309 2635 1118  693 1605]
 [1309 2635 1605  693 1118]
 [1309 2635  693 1605 1118]
 [1309 2635  693 1605 1118]
 [1309 2635 1605  693 1118]]
Average Train loss:  0.03431540387979847
Average Valid loss:  0.00575653083642271
Model saved as current val_loss is:  0.00575653083642271


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02674982736782259, 'RP@K': 0.014474278767903645, 'auc': 0.927969328437524, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]
 [1118 1605  693 3568 1309]]
Average Train loss:  0.005421286490448612
Average Valid loss:  0.004828671454154747
Model saved as current val_loss is:  0.004828671454154747


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.026943898447849814, 'RP@K': 0.015190849304199219, 'auc': 0.9369066334241969, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]
 [1605 1118 2635 1309 3568]]
Average Train loss:  0.004858801799251678
Average Valid loss:  0.004584183148603807
Model saved as current val_loss is:  0.004584183148603807


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02701398470945751, 'RP@K': 0.015409483591715494, 'auc': 0.9406972967708987, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 3568 1309]]
Average Train loss:  0.004675346927994743
Average Valid loss:  0.004482973475960341
Model saved as current val_loss is:  0.004482973475960341


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.026931875886562998, 'RP@K': 0.015426770528157552, 'auc': 0.9427346015450074, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[ 693 1605 1118 3568 1309]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 3568 1309]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]
 [ 693 1605 1118 1309 3568]]
Average Train loss:  0.004586232689387621
Average Valid loss:  0.004434479638300044
Model saved as current val_loss is:  0.004434479638300044


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.027045943776166398, 'RP@K': 0.015660741170247395, 'auc': 0.9439974091683014, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]
 [ 693 1605   20 1309 3568]]
Average Train loss:  0.0045364169661510495
Average Valid loss:  0.00440492328910276
Model saved as current val_loss is:  0.00440492328910276


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02719144420308897, 'RP@K': 0.015389516194661458, 'auc': 0.944788732868995, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]
 [1118 2635 3568 1309  693]]
Average Train loss:  0.004511702940502065
Average Valid loss:  0.004386581044564856
Model saved as current val_loss is:  0.004386581044564856


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02714221587486281, 'RP@K': 0.015201904296875, 'auc': 0.9452407584807766, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]
 [1605 1118  693 1309 3568]]
Average Train loss:  0.004489618706259322
Average Valid loss:  0.004374802783825137
Model saved as current val_loss is:  0.004374802783825137


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.02711693885476615, 'RP@K': 0.015572872161865235, 'auc': 0.945485347440288, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 3568 1309]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 3568 1309]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]
 [ 693 1118 1605 1309 3568]]
Average Train loss:  0.004476575480100322
Average Valid loss:  0.004368975570306499
Model saved as current val_loss is:  0.004368975570306499


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.027125558924491505, 'RP@K': 0.015580820719401041, 'auc': 0.9455789805121941, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]
 [1118  693 1605 3568 1309]]
Average Train loss:  0.004469440461314739
Average Valid loss:  0.004365778520723131
Model saved as current val_loss is:  0.004365778520723131



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▃▄▅▆▆▇█
train_loss,█▁▁▁▁▁▁▁▁▁
val_loss,█▃▂▂▁▁▁▁▁▁

0,1
epoch,10.0
train_loss,0.00447
val_loss,0.00437


[34m[1mwandb[0m: Agent Starting Run: jzmcshz4 with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	dropout: 0.4
[34m[1mwandb[0m: 	epochs: 10
[34m[1mwandb[0m: 	learning_rate: 0.0003
[34m[1mwandb[0m: 	tokenizer_max_len: 40


Length of Train Dataloader:  235
Length of Valid Dataloader:  94




  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.027727033106269255, 'RP@K': 0.01600224812825521, 'auc': 0.9006248201160478, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]
 [2635  693 1118 3568 1605]]
Average Train loss:  0.034500290956744485
Average Valid loss:  0.005776268073377457
Model saved as current val_loss is:  0.005776268073377457


  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'precision_micro': 0.027246394558867137, 'RP@K': 0.016286556243896484, 'auc': 0.9277184586247099, 'f1_score_micro': 0.0, 'f1_score_macro': 0.0}
[[ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]
 [ 693 1309 3568 1118 1605]]
Average Train loss:  0.005431102755221915
Average Valid loss:  0.004836258587447253
Model saved as current val_loss is:  0.004836258587447253


  0%|          | 0/235 [00:00<?, ?it/s]

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.
