# BERT

Imports

In [1]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel
import re
import numpy as np

# Model Code


In [2]:

"""BERT-based model for multi-label classification."""

import re

import torch
import torch.nn as nn

from transformers import BertModel, BertTokenizer

__author__ = "Upal Bhattacharya"
__license__ = ""
__copyright__ = ""
__version__ = "1.0"
__email__ = "upal.bhattacharya@gmail.com"


class BertMultiLabel(nn.Module):

    """BERT-based model for multi-label classification.

    Each input text is tokenized and encoded with a pretrained BERT model;
    the hidden state of the first ([CLS]) token is fed to one independent
    single-output linear head per label, followed by a sigmoid.

    Parameters
    ----------
    labels : list of str
        Label names. The output columns follow this order.
    device : str or torch.device
        Device the tokenized inputs are moved to in ``forward``. The module's
        own weights are not moved by this argument; call ``.to(device)``.
    hidden_size : int, default 768
        Input width of every label head. It is applied to the [CLS] hidden
        state, so it has to equal the encoder's hidden size.
    max_length : int, default 512
        ``max_length`` passed to the tokenizer (with ``truncation=True``).
    bert_model_name : str, default "bert-base-uncased"
        Name given to ``from_pretrained`` for both the model and tokenizer.
    """

    def __init__(self, labels, device, hidden_size=768, max_length=512,
                 bert_model_name="bert-base-uncased"):
        super(BertMultiLabel, self).__init__()
        self.hidden_size = hidden_size
        self.device = device
        self.max_length = max_length
        # One ModuleDict key per label, same order as `labels`.
        self.labels = self._head_names(labels)
        self.bert_model_name = bert_model_name
        self.bert_model = BertModel.from_pretrained(self.bert_model_name)
        # Keeping the tokenizer here makes the model better behaved
        # as opposed to using it in the DataLoader
        self.bert_tokenizer = BertTokenizer.from_pretrained(
                                                    self.bert_model_name)

        self.prediction = nn.ModuleDict({
            k: nn.Linear(in_features=self.hidden_size,
                         out_features=1,
                         bias=True,)
            for k in self.labels})

    @staticmethod
    def _head_names(labels):
        """Return one unique ``nn.ModuleDict`` key per label, in order.

        Labels are reduced to their ASCII letters. Reducing alone can make
        two labels identical (they would then silently share a single head)
        or leave an empty name (rejected by ``nn.ModuleDict``), so such names
        get a ``_<position>`` suffix. Names that are already unique are
        returned exactly as the plain reduction produces them.
        """
        names = []
        taken = set()
        for position, label in enumerate(labels):
            name = re.sub(r'[^A-Za-z]', '', label)
            if not name or name in taken:
                # Reduced names never contain "_" or digits, so a suffixed
                # name cannot clash with any other entry.
                name = f"{name or 'label'}_{position}"
            taken.add(name)
            names.append(name)
        return names

    def process(self, x):
        """Tokenize a batch of texts into padded, truncated PyTorch tensors."""
        tokenized = self.bert_tokenizer(x,
                                        truncation=True,
                                        padding="longest",
                                        max_length=self.max_length,
                                        return_tensors='pt')
        return tokenized

    def forward(self, x):
        """Return per-label probabilities for a batch of raw texts.

        Parameters
        ----------
        x : str or list of str
            Raw text(s), passed unchanged to the tokenizer.

        Returns
        -------
        torch.Tensor
            Sigmoid outputs with one row per text and one column per label.
        """
        tokenized = self.process(x).to(self.device)

        encoding = self.bert_model(**tokenized)
        # Retaining only the [CLS] token
        cls = encoding.last_hidden_state[:, 0, :]

        if not self.labels:
            # torch.cat rejects an empty list; return a zero-column result.
            return cls.new_empty((cls.shape[0], 0))

        # Run every head once and join the columns in a single concatenation.
        logits = torch.cat([self.prediction[label](cls)
                            for label in self.labels], dim=-1)
        return torch.sigmoid(logits)


In [3]:
# Read one label per line. Blank lines are dropped so this list matches the
# one BertMultiLabelDataset builds from the same file (it filters out empty
# entries); otherwise model outputs and targets could differ in width.
with open("/home/workboots/Datasets/DHC/variations/var_3/targets/unique_labels.txt", 'r') as f:
    labels = [line.strip("\n") for line in f]
labels = [label for label in labels if label]

In [4]:
labels

['Introduction',
 'General Exceptions',
 'Of Punishments',
 'Of the Right of Private Defence',
 'Of Abetment',
 'Of Offences against the State',
 'Of Offences relating to the Army, Navy and Air Force',
 'Of Offences by or relating to Public Servants',
 'Of Contempts of Lawful Authority of Public Servants',
 'Of False Evidence and Offences against Public Justice',
 'Of Offences affecting the Public Health, Safety, Convenience, Decency and Morals.',
 'Of Offences relating to Religion',
 'Of Offences Affecting Life including murder, culpable homicide',
 'Of Hurt',
 'Of Wrongful Restraint and Wrongful Confinement',
 'Of Criminal Force and Assault',
 'Of Kidnapping, Abduction, Slavery and Forced Labour',
 'Sexual Offences including rape and Sodomy',
 'Of Extortion',
 'Of Criminal Misappropriation of Property',
 'Of Criminal Breach of Trust',
 'Of the Receiving of Stolen Property',
 'Of Cheating',
 'Of Mischief',
 'Of Criminal Trespass',
 'Offences relating to Property and Other Marks',
 'Of

In [5]:
# `device` only tells BertMultiLabel.forward where to move the tokenized
# inputs; the module's weights are moved separately by the .to("cuda") call.
test_model = BertMultiLabel(device='cuda', labels=labels)
test_model.to("cuda")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertMultiLabel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affi

In [6]:
test_model.device

'cuda'

In [4]:
Sentences = ["This is the first sentence", "This is the second sentence", "Can it handle three?"]

In [5]:
output = test_model(Sentences)

In [6]:
output.shape

torch.Size([3, 5])

In [7]:
output

tensor([[0.5285, 0.4899, 0.4230, 0.4278, 0.4766],
        [0.5333, 0.4822, 0.4268, 0.4295, 0.4828],
        [0.5681, 0.4803, 0.4585, 0.4345, 0.4612]], device='cuda:0',
       grad_fn=<SigmoidBackward0>)

# Data Loader

In [6]:
import json
import os

import torch
from torch.utils.data import DataLoader, Dataset

__author__ = "Upal Bhattacharya"
__license__ = ""
__copyright__ = ""
__version__ = "1.0"
__email__ = "upal.bhattacharya@gmail.com"


class BertMultiLabelDataset(Dataset):
    """Dataset yielding ``(document text, multi-hot target)`` pairs.

    Parameters
    ----------
    data_paths : list of str
        Directories whose files are the documents. A document's ID is its
        file name without the extension.
    targets_paths : list of str
        JSON files that are merged into one dictionary, which is then
        indexed by document ID to obtain that document's labels.
    unique_labels : str, optional
        Path to a text file with one label per line (blank lines ignored).
        When omitted, the sorted set of all labels found in the targets is
        used instead.
    """

    def __init__(self, data_paths, targets_paths, unique_labels=None):
        self.data_paths = data_paths
        self.targets_paths = targets_paths

        # Targets must be loaded before the labels are resolved:
        # get_unique_labels() reads self.targets_dict.
        self.targets_dict = self.get_targets()

        if unique_labels is None:
            self.unique_labels = self.get_unique_labels()
        else:
            with open(unique_labels, 'r') as f:
                stripped = [line.strip("\n") for line in f]
            self.unique_labels = [label for label in stripped if label]

        self.text_paths = self.get_fullpaths()
        # IDs needed for __getitem__: integer position -> document ID
        self.idx = dict(enumerate(self.text_paths))

    def __len__(self):
        """Return the number of documents found under ``data_paths``."""
        return len(self.text_paths)

    def __getitem__(self, idx):
        """Return the text and target tensor of the document at ``idx``."""
        doc = self.idx[idx]
        data = self.load_data(self.text_paths[doc])
        target = self.fetch_target(doc)

        return data, target

    def get_fullpaths(self):
        """Map every document ID to the full path of its file.

        Returns
        -------
        doc_paths : dict
            Document ID (file name without extension) to file path. If the
            same ID occurs in several directories, the later one wins.
        """
        doc_paths = {}
        for path in self.data_paths:
            for doc_idx in os.listdir(path):
                idx = os.path.splitext(doc_idx)[0]
                doc_paths[idx] = os.path.join(path, doc_idx)

        return doc_paths

    def get_targets(self) -> dict:
        """Get targets of documents from targets paths.

        Returns
        -------
        targets: dict
            Dictionary containing the targets of each document.
        """
        targets = {}
        for path in self.targets_paths:
            with open(path, 'r') as f:
                target = json.load(f)
            targets.update(target)

        return targets

    def fetch_target(self, doc):
        """Return the multi-hot target tensor of a single document.

        Parameters
        ----------
        doc : str
            Document ID; it must be a key of ``self.targets_dict``.

        Returns
        -------
        target : torch.Tensor
            1-D tensor with one entry per label in ``self.unique_labels``:
            1 if the label is among the document's targets, else 0.
        """
        doc_labels = self.targets_dict[doc]
        target = torch.tensor([int(label in doc_labels)
                               for label in self.unique_labels])

        return target

    def load_data(self, path):
        """Return the full text content of the file at ``path``."""
        with open(path, 'r') as f:
            data = f.read()
        return data

    def get_unique_labels(self):
        """Return the sorted list of all labels occurring in the targets.

        Sorting gives a deterministic order, keeping the label-to-column
        mapping consistent between datasets built from the same targets.
        """
        unique_labels = {label
                         for labels in self.targets_dict.values()
                         for label in labels}

        return sorted(unique_labels)


In [7]:
train_dataset = BertMultiLabelDataset(data_paths = ["/home/workboots/Datasets/DHC/variations/var_3/data/ipc_data/cross_val/5_fold/fold_0/train"],
                                      targets_paths = ["/home/workboots/Datasets/DHC/variations/var_3/targets/ipc_case_offences.json"],
                                      unique_labels = "/home/workboots/Datasets/DHC/variations/var_3/targets/unique_labels.txt")

In [108]:
train_dataset

<__main__.BertMultiLabelDataset at 0x7ff654d988e0>

In [28]:
train_dataset[0]

('Delhi High Court Rinku Ram Prasad vs State on [DATE] IN THE HIGH COURT OF DELHI AT NEW DELHI [ID] CRL.M.BAIL1312/2019 Judgment reserved on 2019 Date of decision 2019 RINKU RAM PRASAD .Appellant Through [PERSON] Advocate.\nversus STATE .Respondent Through [PERSON] APP for State with SI Shashi Kumar PS Shalimar Bagh.\nCORAM HONBLE MS.\nJUSTICE ANU MALHOTRA JUDGMENT ANU MALHOTRA J.\nThe appellant namely Rinku Ram Prasad vide the present appeal assails the impugned judgment dated 2019 and the impugned order on sentence dated 2019 of the learned ASJ-01 North West Rohini Delhi Special Court POCSO in Sessions Case no.44/2017 vide which the appellant herein was convicted for the offence punishable under Section 366 of the Indian Indian Penal Code, 1860, 1860 1860 and Section 6 of the Protection of Children from Sexual Offences Act 2012 hereinafter referred to as POCSO Act 2012 and was sentenced to undergo Rigorous Imprisonment for a period of seven years along [ID]  with a fine of [MONEY] an

In [9]:
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [10]:
optimizer = torch.optim.Adam(test_model.parameters(), lr=3e-5)

In [11]:
# BCELoss takes probabilities, which matches BertMultiLabel.forward ending in
# a sigmoid; reduction='sum' adds up the per-label losses instead of averaging.
loss_fn = nn.BCELoss(reduction='sum')

In [12]:
# Sanity check: overfit one batch. The batch is drawn once, before the loop,
# so every iteration trains on the same document and the cell no longer
# depends on `item` / `target` left over from an earlier cell.
item, target = next(iter(train_dataloader))
target = target.to('cuda')
for i in range(100):
    # Clear gradients first so nothing accumulated by other cells leaks in.
    optimizer.zero_grad()
    y_pred = test_model(item)
    loss = loss_fn(y_pred.float(), target.float())
    loss.backward()
    optimizer.step()
    print("Loss", loss.item())
    print("Activation", y_pred)

Loss 19.819351196289062
Activation tensor([[0.5042, 0.4357, 0.3685, 0.4575, 0.6835, 0.3397, 0.4679, 0.4171, 0.3988,
         0.5473, 0.5067, 0.5136, 0.6821, 0.5007, 0.4246, 0.5173, 0.4320, 0.5902,
         0.5301, 0.4009, 0.5511, 0.5146, 0.4745, 0.4947, 0.4871, 0.6821, 0.4926,
         0.5490, 0.5054]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Loss 10.606687545776367
Activation tensor([[0.3237, 0.3220, 0.2779, 0.3026, 0.3470, 0.2551, 0.2445, 0.2946, 0.2481,
         0.5842, 0.3160, 0.2920, 0.6367, 0.2425, 0.4079, 0.3034, 0.6579, 0.2827,
         0.3098, 0.4635, 0.2983, 0.2509, 0.3475, 0.2403, 0.2176, 0.2442, 0.2410,
         0.3897, 0.2163]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Loss 6.790558815002441
Activation tensor([[0.2034, 0.1855, 0.1335, 0.2183, 0.1672, 0.2312, 0.1650, 0.2211, 0.1591,
         0.8665, 0.2358, 0.1141, 0.7917, 0.1655, 0.1597, 0.1916, 0.8023, 0.2876,
         0.3461, 0.2016, 0.2787, 0.1868, 0.2737, 0.2083, 0.1386, 0.1942, 0.2895,
         0.1784, 0.3196

In [15]:
y_pred

tensor([[4.5783e-04, 4.0495e-04, 3.3090e-04, 4.8775e-04, 5.3766e-04, 4.0391e-04,
         3.8133e-04, 3.9583e-04, 3.4139e-04, 9.9958e-01, 4.7103e-04, 3.8015e-04,
         9.9958e-01, 3.8805e-04, 4.4206e-04, 4.2770e-04, 9.9945e-01, 5.5026e-04,
         5.7719e-04, 5.2520e-04, 5.1582e-04, 4.2667e-04, 4.4762e-04, 4.1705e-04,
         3.4564e-04, 5.3536e-04, 4.6976e-04, 4.7450e-04, 4.7204e-04]],
       device='cuda:0', grad_fn=<SigmoidBackward0>)

In [19]:
# Binarize the probabilities: 1 where the prediction exceeds 0.7, else 0.
outputs_batch = (y_pred.detach().cpu().numpy() > 0.7).astype(np.int32)


In [20]:
outputs_batch

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [13]:
target

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0]], device='cuda:0')

## Intentionally overfitting on 25 documents for debugging

#### Getting 25 documents

In [12]:
debug_cases = []
debug_targets = []

In [13]:
from itertools import islice

# Take the 25 batches from a single pass over the loader. Calling
# next(iter(train_dataloader)) on every iteration builds a fresh, reshuffled
# iterator each time, so the same document could be drawn more than once.
for case, target in islice(train_dataloader, 25):
    debug_cases.append(case)
    debug_targets.append(target)

#### Aiming to overfit

In [14]:
# Gradients are accumulated over this many documents per optimizer update.
accumulation_steps = 10

for e in range(50):
    print("epoch", e)
    for i, (case, target) in enumerate(zip(debug_cases, debug_targets)):
        target = target.to("cuda")
        y_pred = test_model(case)
        loss = loss_fn(y_pred.float(), target.float())
        loss.backward()

        # Also step after the last document: 25 is not a multiple of 10, so
        # without this the final gradients of an epoch would stay unapplied
        # and be mixed into the first update of the next epoch.
        is_last = i + 1 == len(debug_cases)
        if (i + 1) % accumulation_steps == 0 or is_last:
            optimizer.step()
            optimizer.zero_grad()

        outputs_batch = (y_pred.detach().cpu().numpy()
                         > 0.5).astype(np.int32)
        print("Loss", loss.item())
        print("Levels", y_pred)
        print("Target", target)
        print("Predicted Target", outputs_batch)
        torch.cuda.empty_cache()


epoch 0
Loss 20.300128936767578
Levels tensor([[0.5089, 0.5240, 0.3631, 0.4046, 0.3986, 0.5175, 0.4822, 0.4361, 0.5204,
         0.5228, 0.3316, 0.4556, 0.4700, 0.4205, 0.4917, 0.4166, 0.4334, 0.5350,
         0.5718, 0.5922, 0.6151, 0.4848, 0.5839, 0.5993, 0.5505, 0.4468, 0.5855,
         0.6227, 0.4543]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Target tensor([[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 1, 0, 0]], device='cuda:0')
Predicted Target [[1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 0]]
Loss 20.16674041748047
Levels tensor([[0.4892, 0.5205, 0.3944, 0.3717, 0.4187, 0.5205, 0.5498, 0.4993, 0.4738,
         0.4957, 0.3000, 0.4408, 0.4139, 0.4135, 0.4659, 0.4596, 0.4105, 0.5213,
         0.5078, 0.5612, 0.6092, 0.5426, 0.5769, 0.5557, 0.5960, 0.4848, 0.5315,
         0.6289, 0.4478]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Target tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   

KeyboardInterrupt: 

In [3]:
# truncation_side='left' makes truncation drop tokens from the start of the
# text, so an over-long document keeps its final tokens rather than its first.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          truncation_side='left')

In [4]:
test_string = "This is a sentence"

In [5]:
# Probe truncation with a tiny max_length. In the recorded output the
# tokenizer only warns and returns all six tokens, i.e. nothing is truncated.
tokenizer(test_string, truncation=True, padding="longest", max_length=2, return_tensors='pt')

We need to remove 4 to truncate the input but the first sequence has a length 4. 


{'input_ids': tensor([[ 101, 2023, 2003, 1037, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [9]:
train_dataset[0][0]

'Delhi High Court Rinku Ram Prasad vs State on [DATE] IN THE HIGH COURT OF DELHI AT NEW DELHI [ID] CRL.M.BAIL1312/2019 Judgment reserved on 2019 Date of decision 2019 RINKU RAM PRASAD .Appellant Through [PERSON] Advocate.\nversus STATE .Respondent Through [PERSON] APP for State with SI Shashi Kumar PS Shalimar Bagh.\nCORAM HONBLE MS.\nJUSTICE ANU MALHOTRA JUDGMENT ANU MALHOTRA J.\nThe appellant namely Rinku Ram Prasad vide the present appeal assails the impugned judgment dated 2019 and the impugned order on sentence dated 2019 of the learned ASJ-01 North West Rohini Delhi Special Court POCSO in Sessions Case no.44/2017 vide which the appellant herein was convicted for the offence punishable under Section 366 of the Indian Indian Penal Code, 1860, 1860 1860 and Section 6 of the Protection of Children from Sexual Offences Act 2012 hereinafter referred to as POCSO Act 2012 and was sentenced to undergo Rigorous Imprisonment for a period of seven years along [ID]  with a fine of [MONEY] and

In [23]:
encode_1 = tokenizer(train_dataset[0][0], truncation=True, padding="longest", max_length=512)

In [11]:
tokenizer_2 = BertTokenizer.from_pretrained('bert-base-uncased')

In [26]:
encode_2 = tokenizer_2(train_dataset[0][0], truncation=True, padding="longest", max_length=512)

In [24]:
tokenizer.convert_ids_to_tokens(encode_1["input_ids"])

['[CLS]',
 'later',
 'on',
 'the',
 'minor',
 'child',
 'victim',
 'c',
 'told',
 'the',
 '[',
 'id',
 ']',
 'name',
 'of',
 'that',
 'person',
 'was',
 'ja',
 '##anu',
 'ka',
 'cha',
 '##cha',
 'i',
 '.',
 'e',
 '.',
 'rink',
 '##u',
 'and',
 'further',
 'informed',
 'her',
 'mother',
 'sm',
 '##t',
 '.',
 'p',
 'that',
 'the',
 'accused',
 'i',
 '.',
 'e',
 '.',
 'the',
 'app',
 '##ellant',
 'here',
 '##in',
 'had',
 'put',
 'his',
 'private',
 'part',
 'into',
 'her',
 'private',
 'part',
 'us',
 '##ne',
 'ap',
 '##ne',
 'shu',
 'shu',
 'mere',
 'shu',
 'shu',
 'main',
 'da',
 '##al',
 'di',
 '##ya',
 'and',
 'when',
 'she',
 'cried',
 'the',
 'accused',
 'had',
 'run',
 'away',
 'from',
 'there',
 'after',
 'leaving',
 'the',
 'child',
 'in',
 'the',
 'jungle',
 '.',
 'c',
 'was',
 'examined',
 'by',
 'the',
 'learned',
 'trial',
 'court',
 'without',
 'administering',
 'her',
 'oath',
 'in',
 'view',
 'of',
 'her',
 'tender',
 'age',
 'in',
 'which',
 'she',
 'stated',
 'to',
 'th

In [27]:
tokenizer_2.convert_ids_to_tokens(encode_2["input_ids"])

['[CLS]',
 'delhi',
 'high',
 'court',
 'rink',
 '##u',
 'ram',
 'prasad',
 'vs',
 'state',
 'on',
 '[',
 'date',
 ']',
 'in',
 'the',
 'high',
 'court',
 'of',
 'delhi',
 'at',
 'new',
 'delhi',
 '[',
 'id',
 ']',
 'cr',
 '##l',
 '.',
 'm',
 '.',
 'bail',
 '##13',
 '##12',
 '/',
 '2019',
 'judgment',
 'reserved',
 'on',
 '2019',
 'date',
 'of',
 'decision',
 '2019',
 'rink',
 '##u',
 'ram',
 'prasad',
 '.',
 'app',
 '##ellant',
 'through',
 '[',
 'person',
 ']',
 'advocate',
 '.',
 'versus',
 'state',
 '.',
 'respond',
 '##ent',
 'through',
 '[',
 'person',
 ']',
 'app',
 'for',
 'state',
 'with',
 'si',
 'sha',
 '##shi',
 'kumar',
 'ps',
 'sha',
 '##lim',
 '##ar',
 'bag',
 '##h',
 '.',
 'cora',
 '##m',
 'hon',
 '##ble',
 'ms',
 '.',
 'justice',
 'an',
 '##u',
 'mal',
 '##hot',
 '##ra',
 'judgment',
 'an',
 '##u',
 'mal',
 '##hot',
 '##ra',
 'j',
 '.',
 'the',
 'app',
 '##ellant',
 'namely',
 'rink',
 '##u',
 'ram',
 'prasad',
 'vi',
 '##de',
 'the',
 'present',
 'appeal',
 'ass',
 '#

In [2]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
encoding = tokenizer("this is a sentence", return_tensors="pt")

In [15]:
model(**encoding)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.2044,  0.0647, -0.0468,  ..., -0.4177,  0.0913,  0.8801],
         [-0.6664, -0.4081, -0.0024,  ..., -0.4822,  0.1995,  0.4957],
         [-0.4097, -0.4502,  0.6113,  ..., -0.2729, -0.2079,  1.1024],
         [-0.5320, -0.4445,  0.8804,  ...,  0.0300, -0.2960,  1.4909],
         [ 0.0656, -0.1532, -0.1087,  ...,  0.2035, -0.1870,  0.1468],
         [ 0.7121,  0.0505, -0.3789,  ...,  0.5059, -0.9034, -0.1893]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-9.3256e-01, -3.1858e-01,  3.2554e-01,  7.5154e-01,  8.8071e-02,
         -2.4847e-01,  9.2342e-01,  2.2096e-01,  7.7670e-02, -9.9997e-01,
          1.7492e-01,  4.5256e-01,  9.8930e-01, -3.1794e-01,  9.4297e-01,
         -6.5648e-01, -1.4622e-01, -5.8712e-01,  4.2894e-01, -8.2384e-01,
          6.7287e-01,  9.9341e-01,  7.2679e-01,  2.3872e-01,  3.5744e-01,
          5.6887e-01, -6.3542e-01,  9.5415e-01,  9.6439e-01,  7.5773e-01,
       

In [17]:
encoding.attention_mask

tensor([[1, 1, 1, 1, 1, 1]])