# Notebook for trying various things on-the-fly for the actual implementation

## Understanding how `torch.nn.CrossEntropyLoss` works

In [1]:
import torch.nn as nn
import torch

In [9]:
# The original cell read `preds.size(-1)` before `preds` existed, so it failed on a
# fresh kernel. The class count is spelled out here instead; it matches the last
# dimension of the (3, 15) logits tensor created in the next cell.
# Using the class count as `ignore_index` means a target equal to 15 (one past the
# last valid class 0..14) is skipped by the loss.
NUM_CLASSES = 15
criterion = nn.CrossEntropyLoss(ignore_index=NUM_CLASSES)

In [15]:
# Random scores in [0, 1) standing in for model logits: 3 rows, 15 classes each.
preds = torch.rand(3, 15)

In [19]:
# Confirm the logits shape: 3 rows, 15 scores per row.
preds.size()

torch.Size([3, 15])

In [21]:
# One integer class index per row of `preds`.
target = torch.tensor([12, 11, 10])

In [22]:
# The targets form a 1-D tensor with one entry per row of `preds`.
target.size()

torch.Size([3])

In [24]:
# Inputs are (3, 15) float scores and (3,) integer class indices; the result is a scalar tensor.
criterion(preds.float(), target)

tensor(2.6205)

## Understanding how HuggingFace's models actually work

In [46]:
from transformers import BertForQuestionAnswering, AutoTokenizer

In [47]:
# Load the tokenizer and the question-answering model from the same checkpoint.
QA_CHECKPOINT = "deepset/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)
model = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
# Encode a (question, context) pair and run the QA model without tracking gradients.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Positions of the highest start and end scores (argmax over the flattened logits).
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

In [57]:
# NOTE: the recorded output below carries a grad_fn even though In [48] ran under
# torch.no_grad(). This cell was executed as In [57], after In [51] rebound
# `outputs` with a forward pass that is not wrapped in no_grad.
outputs.start_logits

tensor([[ 1.0374, -6.9344, -6.9556, -2.8814, -7.0174, -8.2111, -7.6869, -7.3035,
          0.8261, -4.2656, -5.2627,  0.3830,  7.0740,  5.2306,  5.6687, -7.3035]],
       grad_fn=<CloneBackward0>)

In [49]:
# Predicted start/end token positions taken from the argmax of the logits.
print(answer_start_index, answer_end_index)

tensor(12) tensor(14)


In [50]:
# Hand-picked label positions (they differ from the predicted span 12/14), used to
# see what loss the model reports when given labels.
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

In [55]:
# Predicted start position (a 0-d tensor).
answer_start_index

tensor(12)

In [54]:
# Label start position; note it is a 1-element tensor, unlike the 0-d prediction.
target_start_index

tensor([14])

In [51]:
# Passing the label positions to the forward pass lets us read a loss from `outputs.loss`.
# NOTE: this rebinds `outputs` from the inference cell and is not wrapped in torch.no_grad().
outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss

In [53]:
# Extract the loss as a plain Python float.
loss.item()

7.4120683670043945

## Investigation of the compatibility of `start_index` and computed `end_index` of answers

In [44]:
import pandas as pd
import torch
from transformers import BertForQuestionAnswering, AutoTokenizer

In [2]:
# NOTE(review): the tokenizer is loaded from "bert-base-cased" while the model comes from
# "deepset/bert-base-cased-squad2" (the earlier section used the latter for both).
# Presumably both checkpoints share the same vocabulary — TODO confirm.
# These assignments also rebind `tokenizer` and `model` from the earlier section.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading the Training DataFrame

In [3]:
# The path is relative, so it resolves against the kernel's working directory.
train_df = pd.read_csv("../data/train_df.csv")

In [4]:
# List the available columns.
train_df.columns

Index(['idx', 'question', 'answer', 'answer_start', 'is_impossible',
       'context'],
      dtype='object')

In [14]:
# Take the first five rows and pull the needed columns out as plain Python lists.
# NOTE: `start_idx` holds the dataset's `answer_start` values here; a later cell
# rebinds the same name to token-level positions.
first_rows = train_df.iloc[0:5]
context = first_rows["context"].values.tolist()
start_idx = first_rows["answer_start"].values.tolist()
answer = first_rows["answer"].values.tolist()

In [15]:
# Sanity check: five contexts were selected.
len(context)

5

In [18]:
# Tokenize all contexts as one batch of PyTorch tensors, padded to the longest sequence.
# NOTE(review): the name `input` shadows Python's built-in `input()`; it is left
# unchanged here because a later cell reads `input.input_ids`.
input = tokenizer(context, return_tensors="pt", padding="longest", truncation=True)

In [19]:
# Tokenize the answers with the same settings; each row comes back wrapped in the
# special ids 101 ... 102 and zero-padded (see the ids displayed in the next cell).
answer_encoded = tokenizer(answer, return_tensors="pt", padding="longest", truncation=True)

In [32]:
# Inspect the encoded answers.
answer_encoded.input_ids

tensor([[ 101, 1107, 1103, 1523, 3281,  102],
        [ 101, 4241, 1105, 5923,  102,    0],
        [ 101, 1581,  102,    0,    0,    0],
        [ 101, 4666,  117, 2245,  102,    0],
        [ 101, 1523, 3281,  102,    0,    0]])

In [27]:
# Convert the context token ids to nested Python lists so slices can be compared with `==`.
input_ids_list = input.input_ids.tolist()

In [54]:
# Same conversion for the answer token ids (special tokens and padding still included).
answer_ids_list = answer_encoded.input_ids.tolist()

In [55]:
# Keep only the answer tokens: drop the leading special id and everything from the
# first separator id onwards (the separator itself and any padding).
# The list is built from `answer_encoded` rather than from `answer_ids_list` itself,
# so re-running this cell is safe. The original raised ValueError on a second run
# because the separator had already been removed.
# `tokenizer.sep_token_id` replaces re-encoding the separator string for every row.
sep_token_id = tokenizer.sep_token_id
answer_ids_list = [
    token_ids[1:token_ids.index(sep_token_id)]
    for token_ids in answer_encoded.input_ids.tolist()
]

In [56]:
# Answer token ids with special tokens and padding removed.
answer_ids_list

[[1107, 1103, 1523, 3281],
 [4241, 1105, 5923],
 [1581],
 [4666, 117, 2245],
 [1523, 3281]]

In [36]:
# For every example, collect each offset at which the answer's token ids occur as a
# contiguous run inside the context's token ids (naive sub-list search).
start_idx = [
    [
        offset
        for offset in range(len(input_ids_list[row]) - len(answer_tokens) + 1)
        if input_ids_list[row][offset:offset + len(answer_tokens)] == answer_tokens
    ]
    for row, answer_tokens in enumerate(answer_ids_list)
]

In [37]:
# One list of match offsets per example.
start_idx

[[61], [49], [125], [41], [63]]

In [41]:
# End position = first match offset + answer length, i.e. one past the last answer
# token (suitable as the upper bound of a slice).
# Raises IndexError if an example has no match at all.
end_idx = [
    [matches[0] + len(answer_ids_list[row])]
    for row, matches in enumerate(start_idx)
]

In [42]:
# One single-element list per example.
end_idx

[[65], [52], [126], [44], [65]]

In [45]:
# Convert the nested list to a tensor. Rebinds `start_idx` from list to tensor.
# NOTE(review): presumably relies on every example having exactly one match so the rows are equal length — confirm.
start_idx = torch.tensor(start_idx)

In [46]:
# Expect one start position per example.
start_idx.shape

torch.Size([5, 1])

In [47]:
# Convert the nested list to a tensor. Rebinds `end_idx` from list to tensor.
end_idx = torch.tensor(end_idx)

In [48]:
# Expect one end position per example.
end_idx.shape

torch.Size([5, 1])

In [74]:
# Decode the answer span of the first example straight from the context token ids.
# The original cell read `input_ids` and `answer_list`, which no visible cell
# defines, and hard-coded the start offset 61. It now uses the lists and positions
# computed above. `int(...)` accepts both a plain int and a 0-d tensor element.
tokenizer.decode(input_ids_list[0][int(start_idx[0][0]):int(end_idx[0][0])])

'in the late 1990s'

In [59]:
# Decode the answer span of the second example from the context token ids.
# The original cell indexed `input_ids`, which no visible cell defines; it now uses
# `input_ids_list`. `int(...)` accepts both a plain int and a 0-d tensor element.
tokenizer.decode(input_ids_list[1][int(start_idx[1][0]):int(end_idx[1][0])])

'singing and dancing'