#Multilabel multiclass classification

(src: https://github.com/DhavalTaunk08/NLP_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)

Using the Jigsaw toxic data from Kaggle (https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)

The data contains the text, and the labels using one hot encoding

You are provided with a large number of Wikipedia comments which have been labeled by human raters for toxic behavior. The types of toxicity are:

* toxic
* severe_toxic
* obscene
* threat
* insult
* identity_hate

You must create a model which predicts a probability of each type of toxicity for each comment.



In [1]:
! pip install transformers



In [1]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# # Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
print(device)

cuda


In [4]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [5]:
data = pd.read_csv('final_dataset.tsv', sep='\t')

In [6]:
data.columns

Index(['PMID', 'Title/Abstract', 'MeshTerms', 'SemanticTypes'], dtype='object')

In [7]:
data.head()

Unnamed: 0,PMID,Title/Abstract,MeshTerms,SemanticTypes
0,34694464,limit use dithionit quench determin topolog me...,D005456;D008565,T130;T192;T116;T123
1,34694463,isol post traumat astereognosi case base revie...,D006225;D013236,T039;T023
2,34694462,loss chromatin remodel ddm caus segreg distort...,D017360;D029681,T116;T002
3,34694461,identif novel genom wide pleiotrop associ oral...,D058685,T028;T045
4,34694460,rare frequent lethal complic pulmonari vein is...,D001281;D017115;D004937;D011667,T023;T047;T046;T190;T061


In [8]:
data.drop(['PMID'], inplace=True, axis=1)
data.drop(['MeshTerms'], inplace=True, axis=1)
data.head()

Unnamed: 0,Title/Abstract,SemanticTypes
0,limit use dithionit quench determin topolog me...,T130;T192;T116;T123
1,isol post traumat astereognosi case base revie...,T039;T023
2,loss chromatin remodel ddm caus segreg distort...,T116;T002
3,identif novel genom wide pleiotrop associ oral...,T028;T045
4,rare frequent lethal complic pulmonari vein is...,T023;T047;T046;T190;T061


In [9]:
df_onehot = pd.concat([data.drop('SemanticTypes', 1), data['SemanticTypes'].str.get_dummies(sep=";")], 1)
df_onehot.head()

Unnamed: 0,Title/Abstract,T001,T002,T004,T005,T007,T008,T010,T011,T012,...,T190,T191,T192,T194,T195,T196,T197,T200,T201,T204
0,limit use dithionit quench determin topolog me...,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,isol post traumat astereognosi case base revie...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,loss chromatin remodel ddm caus segreg distort...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,identif novel genom wide pleiotrop associ oral...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,rare frequent lethal complic pulmonari vein is...,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
new_df = pd.DataFrame()
new_df['text'] = df_onehot['Title/Abstract']
new_df['labels'] = df_onehot.iloc[:, 1:].values.tolist()

In [11]:
new_df.head()

Unnamed: 0,text,labels
0,limit use dithionit quench determin topolog me...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,isol post traumat astereognosi case base revie...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,loss chromatin remodel ddm caus segreg distort...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,identif novel genom wide pleiotrop associ oral...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,rare frequent lethal complic pulmonari vein is...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
sample_df = new_df
sample_df

Unnamed: 0,text,labels
0,limit use dithionit quench determin topolog me...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,isol post traumat astereognosi case base revie...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,loss chromatin remodel ddm caus segreg distort...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,identif novel genom wide pleiotrop associ oral...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,rare frequent lethal complic pulmonari vein is...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
2875799,radioact level relat geolog substrat dynam nat...,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2875800,effect novel endoscop report system voic recog...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2875801,neurotrop sar cov neurolog diseas central nerv...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2875802,offlin tdcs modul prefront cortic subcort cere...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading: 100%|████████████████████████████| 213k/213k [00:00<00:00, 504kB/s]
Downloading: 100%|█████████████████████████| 1.11k/1.11k [00:00<00:00, 1.75MB/s]


In [46]:
tokenized_lengths = []
for index, row in sample_df.iterrows():
    txt = row['text']
    ln = len(tokenizer.encode(txt))
    if(ln>512):
        print(ln)

530
537
808
596
551
542
1095
1001
753
777
531
783
562
524
625
559
605
524
600


KeyboardInterrupt: 

In [None]:
# numpy array
data = np.array(tokenized_lengths)
  
# creating series
s = pd.Series(data)
s.describe()

In [8]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
# Defining some key variables that will be used later on in the training
MAX_LEN = tokenizer.model_max_length
EPOCHS = 20
LEARNING_RATE = 2e-05


In [71]:
train_size = 0.8
train_data=sample_df.sample(frac=train_size,random_state=200)
test_data=sample_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)
train_data.to_csv('train.csv')
test_data.to_csv('test.csv')


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (2875804, 2)
TRAIN Dataset: (2300643, 2)
TEST Dataset: (575161, 2)


In [10]:
train_data = pd.read_csv('train.csv')

In [11]:
train_data

Unnamed: 0.1,Unnamed: 0,text,labels
0,0,discoveri imidazoleisoindol deriv potent ido i...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,1,phytotox polymetal mine wast southern tuscani ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,mir promot tumor growth inhibit fbxw nsclc mir...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,construct autophagi relat prognost risk signat...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,calibr blood pressur measur jackson heart stud...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
2300638,2300638,brain gaba chang primari hypothyroid patient l...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2300639,2300639,non structur protein ns variant dengu virus cl...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2300640,2300640,ion transport gene famili physiolog target nat...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2300641,2300641,research evalu method enterpris independ innov...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [89]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained("bert-base-uncased")
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 124)

    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [90]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [91]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [123]:
def train(epoch):
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
            torch.save(model.state_dict(), 'bert.mod')
        loss.backward()
        optimizer.step()

In [93]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [94]:
# ONLY NEEDED ON FIRST RUN
#torch.save(model.state_dict(), 'bert.mod')

In [95]:
def validate(target, outputs):
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
    val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))
    
    print(f"Hamming Score = {val_hamming_score}")
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")
    print(f"Hamming Loss = {val_hamming_loss}")
    print()

In [132]:
TRAIN_BATCH_SIZE = 64
VALID_BATCH_SIZE = 64
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8,
                'pin_memory':False
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 8,
                'pin_memory':False
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [133]:

model.load_state_dict(torch.load('bert.mod'))
for epoch in range(EPOCHS):
   
    train(epoch)
    ###
    outputs, targets = validation(testing_loader)
    final_outputs = np.array(outputs) >=0.5
    validate(targets, final_outputs)
    ###
    
    

OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB (GPU 0; 44.56 GiB total capacity; 43.02 GiB already allocated; 22.50 MiB free; 43.40 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
outputs, targets = validation(testing_loader)



In [None]:
final_outputs = np.array(outputs) >=0.5

In [None]:
for ii in range(150):
    ind_tar = [i for i, x in enumerate(targets[ii]) if x]
    print(ind_tar)
    ind_pred = [i for i, x in enumerate(final_outputs[ii]) if x]
    print(ind_pred)
    print()

In [None]:
validate(targets, final_outputs)