In [1]:
import json
import random
import torch

from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
from IPython.display import display

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("../data/train.json") as json_file:
    data = json.load(json_file)

In [3]:
doc_num = random.randint(0,len(data)-1)
file_num = random.randint(0,len(data[doc_num]['annotations'][0]['result'])-1)
sentence = data[doc_num]['annotations'][0]['result'][file_num]['value']['text']
label = data[doc_num]['annotations'][0]['result'][file_num]['value']['labels']
print(f"Doc: {doc_num}, Sentence: {file_num}")
print(f"Sentence: {sentence}\nLabel: {label}")

Doc: 61, Sentence: 33
Sentence: 
 9.     This is not a case where there is any patent illegality or perversity in the order passed by the learned Magistrate, or by the Additional Sessions Judge.
Label: ['RATIO']


In [6]:
tokenizer1 = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case = True)
tokenizer2 = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case = True)

In [7]:
inputs1 = tokenizer1(sentence, return_tensors="pt", truncation= True, padding= True, add_special_tokens= True)
inputs2 = tokenizer2(sentence, return_tensors="pt", truncation= True, padding= True, add_special_tokens= True)
display(inputs1)
display(inputs2)

{'input_ids': tensor([[  101,  1023,  1012,  2023,  2003,  2025,  1037,  2553,  2073,  2045,
          2003,  2151,  7353,  6206,  3012,  2030,  2566, 14028,  3012,  1999,
          1996,  2344,  2979,  2011,  1996,  4342, 14351,  1010,  2030,  2011,
          1996,  3176,  6521,  3648,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

{'input_ids': tensor([[  101,  1023,  1012,  2023,  2003,  2025,  1037,  2553,  2073,  2045,
          2003,  2151,  7353,  6206,  3012,  2030,  2566, 14028,  3012,  1999,
          1996,  2344,  2979,  2011,  1996,  4342, 14351,  1010,  2030,  2011,
          1996,  3176,  6521,  3648,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [8]:
model1 = BertModel.from_pretrained('bert-base-uncased')
model2 = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [9]:
bert_out = model1(**inputs1)
distbert_out = model2(**inputs2)

In [10]:
bert_out[0].size()

torch.Size([1, 36, 768])

In [13]:
distbert_out.last_hidden_state

tensor([[[-0.1195, -0.0178, -0.3173,  ...,  0.0165,  0.0603,  0.4219],
         [ 0.5130,  0.1210,  0.7045,  ...,  0.1548,  0.5731,  0.5405],
         [-0.2301, -0.1798, -0.2700,  ...,  0.4366, -0.2500, -0.4169],
         ...,
         [ 0.2499, -0.0690, -0.2068,  ..., -0.1951,  0.0507, -0.8161],
         [ 0.6201,  0.1486, -0.3171,  ...,  0.2881, -0.5333, -0.3570],
         [ 0.0995,  0.3918,  0.1354,  ...,  0.3199, -0.7452, -0.0473]]],
       grad_fn=<NativeLayerNormBackward0>)

In [60]:
emb = torch.nn.Embedding(30,50)

In [61]:
inputs1['input_ids']

tensor([[  101,  1996,  2152,  2457,  3970,  1996,  9964,  2545,  1005, 14865,
          2008,  2382,  1012,  1017,  1012,  5594,  2003,  1996,  3058,  1997,
          2034,  5096,  7609,  5350,  2011,  1996,  9964,  2545,  1012,   102]])

In [69]:
class SimpleLSTM(torch.nn.Module):
    def __init__(self,input_size, hidden_size, num_layers, output_size) -> None:
        super().__init__()
        
        self.hidden_size = hidden_size
        
        self.emb = torch.nn.Embedding(input_size, hidden_size)
        self.lstm = torch.nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(in_features= hidden_size, out_features= 64, bias= True),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features= 64, out_features= output_size, bias= True),
            torch.nn.Softmax(),
        )
        
    def forward(self,x):
        output = self.emb(x)
        
        return output

In [70]:
simple_lstm = SimpleLSTM(30,128,1,13)

In [72]:
simple_lstm(inputs1['input_ids'].squeeze(0))

IndexError: index out of range in self

## Cant use bert ids and get own embeddings