In [81]:
%pip install datasets transformers -q

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m


In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.notebook import tqdm

from datasets import load_dataset
import random
from sklearn import metrics, model_selection, preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.preprocessing import MultiLabelBinarizer

In [83]:
#!g1.1
def seed_everything(seed=73):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # some cudnn methods can be random even after fixing the seed unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

seed_everything(1234)

In [84]:
dataset = load_dataset('eurlex', 'eurlex57k')
data = dataset.data

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1968.0, style=ProgressStyle(description…

Couldn't find file locally at eurlex/eurlex.py, or remotely at https://raw.githubusercontent.com/huggingface/datasets/1.5.0/datasets/eurlex/eurlex.py.
The file was picked from the master branch on github instead at https://raw.githubusercontent.com/huggingface/datasets/master/datasets/eurlex/eurlex.py.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=955.0, style=ProgressStyle(description_…

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=50289403.0, style=ProgressStyle(descrip…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…



Downloading and preparing dataset eurlex/eurlex57k (download: 47.96 MiB, generated: 201.79 MiB, post-processed: Unknown size, total: 249.75 MiB) to /tmp/xdg_cache/huggingface/datasets/eurlex/eurlex57k/1.1.0/d2fdeaa4fcb5f41394d2ed0317c8541d7f9be85d2d601b9fa586c8b461bc3a34...

Dataset eurlex downloaded and prepared to /tmp/xdg_cache/huggingface/datasets/eurlex/eurlex57k/1.1.0/d2fdeaa4fcb5f41394d2ed0317c8541d7f9be85d2d601b9fa586c8b461bc3a34. Subsequent calls will reuse this data.


In [85]:
train, valid, test = data["train"].to_pandas(), data["validation"].to_pandas(), data["test"].to_pandas()

In [86]:
print(train.shape, valid.shape, test.shape) 

(45000, 4) (6000, 4) (6000, 4)


In [87]:
train['text'] = train['title'] + " " + train['text']
valid['text'] = valid['title'] + " " + valid['text']
test['text'] = test['title'] + " " + test['text']

In [88]:
train.head()

Unnamed: 0,celex_id,eurovoc_concepts,text,title
0,32014R0727,"[1402, 2771, 3191, 5055, 519, 5969, 5971]",Commission Implementing Regulation (EU) No 727...,Commission Implementing Regulation (EU) No 727...
1,31975R2481,"[2319, 2713, 2938, 693]",Regulation (EEC) No 2481/75 of the Council of ...,Regulation (EEC) No 2481/75 of the Council of ...
2,32010D0008,"[3560, 365, 4256, 4261, 4353, 4585]","2010/8/EU, Euratom: Commission Decision of 22 ...","2010/8/EU, Euratom: Commission Decision of 22 ..."
3,31982D0211,"[1091, 3842, 3874, 4110, 4381, 5287]",82/211/EEC: Commission Decision of 17 March 19...,82/211/EEC: Commission Decision of 17 March 19...
4,31996D0084,"[1026, 1048, 2300, 3653, 4271, 4390]","96/84/Euratom, ECSC, EC: Commission Decision o...","96/84/Euratom, ECSC, EC: Commission Decision o..."


In [89]:
concepts = pd.read_json('dictionary.json').T['label']
mapping = concepts.to_dict()

In [90]:
indexing = dict(zip(concepts.values, range(len(concepts))))

In [91]:
def ohe(dataset):
    one_hot = MultiLabelBinarizer()
    labels = dataset['eurovoc_concepts']
    ohe = pd.DataFrame(one_hot.fit_transform(labels).T, index=one_hot.classes_)
    ohe = pd.merge(concepts, ohe, left_index=True, right_index=True)
    ohe.index = ohe['label']
    ohe = ohe.drop('label', axis=1)
    ohe = ohe.T
    ohe.columns = list(map(lambda x: indexing[x], ohe.columns.values))
    return ohe

In [92]:
ohe_train = ohe(train)
ohe_val = ohe(valid)
ohe_test = ohe(test)
all_documents = pd.concat([ohe_train, ohe_val, ohe_test]).fillna(0)
ohe_train = all_documents[:45000]
ohe_val = all_documents[45000:51000]
ohe_test = all_documents[51000:]

In [93]:
train = pd.concat([train, ohe_train], axis=1)
valid = pd.concat([valid, ohe_val], axis=1)
test = pd.concat([test, ohe_test], axis=1)

In [94]:
del(ohe_train)
del(ohe_test)
del(ohe_val)

In [95]:
n_labels = len(train.columns) - 4

In [96]:
train.head()

Unnamed: 0,celex_id,eurovoc_concepts,text,title,0,2,3,5,6,11,...,7182,7185,7186,7188,7190,7191,7192,7194,7196,7200
0,32014R0727,"[1402, 2771, 3191, 5055, 519, 5969, 5971]",Commission Implementing Regulation (EU) No 727...,Commission Implementing Regulation (EU) No 727...,0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0
1,31975R2481,"[2319, 2713, 2938, 693]",Regulation (EEC) No 2481/75 of the Council of ...,Regulation (EEC) No 2481/75 of the Council of ...,0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0
2,32010D0008,"[3560, 365, 4256, 4261, 4353, 4585]","2010/8/EU, Euratom: Commission Decision of 22 ...","2010/8/EU, Euratom: Commission Decision of 22 ...",0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0
3,31982D0211,"[1091, 3842, 3874, 4110, 4381, 5287]",82/211/EEC: Commission Decision of 17 March 19...,82/211/EEC: Commission Decision of 17 March 19...,0,0.0,0,0,0,1,...,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0
4,31996D0084,"[1026, 1048, 2300, 3653, 4271, 4390]","96/84/Euratom, ECSC, EC: Commission Decision o...","96/84/Euratom, ECSC, EC: Commission Decision o...",0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0


In [97]:
# def inspect_category_wise_data(label, n=5):
#     samples = train[train[label] == 1].sample(n)
#     sentiment = mapping[label]
    
#     print(f"{n} samples from {sentiment} sentiment: \n")
#     for text in samples["text"]:
#         print(text, end='\n\n')

In [98]:
#!g1.1
class Dataset:
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                        None,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=True,
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [108]:
#!g1.1
class Classifier(nn.Module):
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"]
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

In [105]:
#!g1.1
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)

def build_dataset(tokenizer_max_len):
    train_dataset = Dataset(train.text.tolist(), train[train.columns[4:]].values.tolist(), tokenizer, tokenizer_max_len)
    valid_dataset = Dataset(valid.text.tolist(), valid[train.columns[4:]].values.tolist(), tokenizer, tokenizer_max_len)
    test_dataset = Dataset(test.text.tolist(), test[train.columns[4:]].values.tolist(), tokenizer, tokenizer_max_len)
    
    return train_dataset, valid_dataset, test_dataset

def build_dataloader(train_dataset, valid_dataset, batch_size):
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    return train_data_loader, valid_data_loader

def ret_model(n_train_steps, do_prob):
    model = Classifier(n_train_steps, n_labels, do_prob, bert_model=bert_model)
    return model

In [101]:
#!g1.1
bert_model = transformers.SqueezeBertModel.from_pretrained("squeezebert/squeezebert-uncased")


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=103473649.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [102]:
#!g1.1
def loss_fn(outputs, labels):
    if labels is None:
        return None
    return nn.BCEWithLogitsLoss()(outputs, labels.float())

def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    '''
    auc_micro_list = []
    for i in range(n_labels):
      current_pred = preds.T[i]
      current_label = labels.T[i]
      fpr_micro, tpr_micro, _ = metrics.roc_curve(current_label.T, current_pred.T)
      auc_micro = metrics.auc(fpr_micro, tpr_micro)
      auc_micro_list.append(auc_micro)
    
    return {"auc": np.array(auc_micro).mean()}
    '''

    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    return {"auc_micro": auc_micro}

In [103]:
#!g1.1
def train_fn(data_loader, model, optimizer, device, scheduler):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''

    train_loss = 0.0
    model.train()
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["labels"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_fn(outputs, targets)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        scheduler.step()
    return train_loss
    

def eval_fn(data_loader, model, device):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''
    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            loss = loss_fn(outputs, targets)
            eval_loss += loss.item()
            fin_targets.extend(targets)
            fin_outputs.extend(torch.sigmoid(outputs))
    return eval_loss, fin_outputs, fin_targets

In [109]:
#!g1.1

train_dataset, valid_dataset, test_dataset = build_dataset(40)
train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, 32)
print("Length of Train Dataloader: ", len(train_data_loader))
print("Length of Valid Dataloader: ", len(valid_data_loader))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

n_train_steps = int(len(train_dataset) / 32 * 10)

model = ret_model(n_train_steps, 0.3)
param_optimizer = list(model.named_parameters())
no_decay = ["bias", "LayerNorm.bias"]
optimizer_parameters = [
    {
        "params": [
            p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.001,
    },
    {
        "params": [
            p for n, p in param_optimizer if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
optimizer = AdamW(optimizer_parameters, lr=0.01)
scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=n_train_steps)
model.to(device)
model = nn.DataParallel(model)
# wandb.watch(model)
 
n_epochs = 10

best_val_loss = 100
for epoch in tqdm(range(n_epochs)):
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)

    auc_score = log_metrics(preds, labels)["auc_micro"]
    print("AUC score: ", auc_score)
    avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(valid_data_loader)

    print("Average Train loss: ", avg_train_loss)
    print("Average Valid loss: ", avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "./best_model.pt")  
        print("Model saved as current val_loss is: ", best_val_loss) 

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=1407.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=188.0), HTML(value='')))

  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)
  "labels": torch.tensor(label, dtype=torch.long)


Length of Train Dataloader:  1407
Length of Valid Dataloader:  188


AUC score:  0.8976910176256891
Average Train loss:  0.009143233316134352
Average Valid loss:  0.007353893993917773
Model saved as current val_loss is:  0.007353893993917773


AUC score:  0.8975165802478345
Average Train loss:  0.007440268303656205
Average Valid loss:  0.007362741464253595


AUC score:  0.9082086648947896
Average Train loss:  0.007362262574213146
Average Valid loss:  0.007237187594155523
Model saved as current val_loss is:  0.007237187594155523


AUC score:  0.910643427621018
Average Train loss:  0.007315286357080208
Average Valid loss:  0.00720484910948955
Model saved as current val_loss is:  0.00720484910948955


AUC score:  0.9120061300871479
Average Train loss:  0.007282131585850517
Average Valid loss:  0.0071891529158629635
Model saved as current val_loss is:  0.0071891529158629635


AUC score:  0.9137353278192017
Average Train loss:  0.007253700343053986
Average Valid loss:  0.007150948334960861


In [None]:
#!g1.1
model

In [None]:
#!g1.1
print('test')

In [None]:
#!g1.1
_, test_data_loader = build_dataloader(train_dataset, test_dataset, 32)
eval_loss, preds, labels = eval_fn(test_data_loader, model, device)

In [None]:
#!g1.1
