In [16]:
import pandas as pd

# Location of the newline-delimited JSON file (one record per line).
file_path = "data.jsonl"

# Parse the JSON Lines file and index the resulting frame by its "key" column,
# as one chained expression rather than an in-place mutation.
jsonObj = pd.read_json(path_or_buf=file_path, lines=True).set_index("key")


In [18]:
# Preview the first rows of the loaded frame to sanity-check the parsed columns.
jsonObj.head()

Unnamed: 0_level_0,transcript,tool,heading,target
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"Heading is one five zero, target is green comm...",electromagnetic pulse,150,green commercial aircraft
1,"Heading is two six zero, target is black, whit...",surface-to-air missiles,260,"black, white, and yellow commercial aircraft"
2,"Heading is one zero five, target is silver, gr...",anti-air artillery,105,"silver, green, and yellow light aircraft"
3,"Heading is two niner zero, target is brown and...",electromagnetic pulse,290,brown and blue cargo aircraft
4,"Heading is zero one five, target is yellow cam...",EMP,15,yellow camouflage drone


In [22]:
# Dimensions of the loaded frame as (rows, columns).
jsonObj.shape

(51, 4)

In [25]:
from transformers import BertTokenizer

# Name of the pretrained checkpoint whose tokenizer is loaded below.
bert_checkpoint = 'bert-base-uncased'

print("Loading BERT tokenizer...")
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

Loading BERT tokenizer...


vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 525kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 10.8kB/s]
config.json: 100%|██████████| 570/570 [00:00<00:00, 103kB/s]


In [32]:
# Use the transcript stored under key 0 as a worked example.
sentence = jsonObj.transcript[0]
print(f'Original: {sentence}')

# Split the transcript into tokens once and reuse the result below.
tokens = tokenizer.tokenize(sentence)
print(f'Tokenized: {tokens}')

# Map each token to its integer id in the tokenizer's vocabulary.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(f'Token IDs: {token_ids}')

# In practice, tokenizer.encode performs tokenization and id conversion in one call.


Original: Heading is one five zero, target is green commercial aircraft, tool to deploy is electromagnetic pulse.
Tokenized: ['heading', 'is', 'one', 'five', 'zero', ',', 'target', 'is', 'green', 'commercial', 'aircraft', ',', 'tool', 'to', 'deploy', 'is', 'electromagnetic', 'pulse', '.']
Token IDs: [5825, 2003, 2028, 2274, 5717, 1010, 4539, 2003, 2665, 3293, 2948, 1010, 6994, 2000, 21296, 2003, 17225, 8187, 1012]


In [33]:
# All transcripts as a Series; these are the texts that get encoded below.
question = jsonObj["transcript"]

# Required Formatting

* Add special tokens to the start and end of the sentence
* Pad or truncate all sentences to a single constant length
* Explicitly differentiate real tokens from padding tokens with the "attention mask"

## Special Tokens
* At the end of every sentence, we need to append the [SEP] token
* This token is an artifact of two-sentence tasks, where BERT is given two separate sentences and asked to determine something about them (e.g. whether the second follows from the first)

* At the start of each sentence, we must prepend the special [CLS] token; this token is used by the classifier

## Sentence Length & Attention Mask
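* BERT has two constraints, plus a padding convention:
  1. All sentences must be padded or truncated to a single, fixed length
  2. The maximum sentence length is 512 tokens
  3. The [PAD] token is used to fill in the blank positions, and the attention mask marks which positions hold real tokens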
  



In [44]:
# Find the longest tokenized transcript so that a safe padding length can be chosen.
# Every transcript is measured, not just the first few: a longer sentence that is
# missed here would be silently cut off when sequences are later truncated to a
# fixed length.

# encode() tokenizes the text and adds the [CLS] and [SEP] special tokens,
# so each count already includes them.
token_counts = [
    len(tokenizer.encode(transcript, add_special_tokens=True))
    for transcript in question
]

# default=0 keeps the result well-defined even if there are no transcripts.
max_len = max(token_counts, default=0)

print(f'Max Lenghth: {max_len}')


Max Lenghth: 30


## tokenizer.encode_plus

1. Split the sentence into tokens
2. Add the special [CLS] and [SEP] tokens
3. Map the tokens to their IDs
4. Pad or truncate all sentences to the same length
5. Create the attention masks, which explicitly differentiate real tokens from [PAD] tokens


In [53]:
import torch

In [57]:
# Fixed sequence length that every transcript is padded or truncated to.
MAX_SEQ_LEN = 40

# Encode all transcripts with one batched tokenizer call instead of looping over
# the deprecated encode_plus(). The call tokenizes each text, adds [CLS]/[SEP],
# maps tokens to ids, pads/truncates to MAX_SEQ_LEN and builds the attention masks.
encoded_batch = tokenizer(
    question.tolist(),            # pass a plain list of strings rather than the Series
    add_special_tokens=True,      # add [CLS] and [SEP]
    max_length=MAX_SEQ_LEN,
    truncation=True,
    padding="max_length",
    return_attention_mask=True,   # 1 for real tokens, 0 for [PAD]
    return_tensors="pt",          # return PyTorch tensors
)

# One row per transcript, MAX_SEQ_LEN columns each.
input_ids = encoded_batch["input_ids"]
attention_masks = encoded_batch["attention_mask"]

# Show the first encoded transcript; the slice keeps it 2-D (1 x MAX_SEQ_LEN).
print(input_ids[:1])

# TODO: build the label tensor. The labels are meant to come from the `target`
# column, which appears to hold text and would need to be converted to numeric
# ids before it can be wrapped in torch.tensor().


tensor([[  101,  5825,  2003,  2028,  2274,  5717,  1010,  4539,  2003,  2665,
          3293,  2948,  1010,  6994,  2000, 21296,  2003, 17225,  8187,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])


NameError: name 'labels' is not defined