# Notebook for trying various things on-the-fly for the actual implementation

## Understanding how `torch.nn.CrossEntropyLoss` works

In [1]:
import torch.nn as nn
import torch

In [9]:
criterion = nn.CrossEntropyLoss(ignore_index=preds.size(-1))

In [15]:
preds = torch.rand((3,15))

In [19]:
preds.size()

torch.Size([3, 15])

In [21]:
target = torch.tensor([12,
                       11,
                       10])

In [22]:
target.size()

torch.Size([3])

In [24]:
criterion(preds.float(), target)

tensor(2.6205)

## Understanding how HuggingFace's models actually work

In [46]:
from transformers import BertForQuestionAnswering, AutoTokenizer

In [47]:
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [57]:
outputs.start_logits

tensor([[ 1.0374, -6.9344, -6.9556, -2.8814, -7.0174, -8.2111, -7.6869, -7.3035,
          0.8261, -4.2656, -5.2627,  0.3830,  7.0740,  5.2306,  5.6687, -7.3035]],
       grad_fn=<CloneBackward0>)

In [49]:
print(answer_start_index, answer_end_index)

tensor(12) tensor(14)


In [50]:
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

In [55]:
answer_start_index

tensor(12)

In [54]:
target_start_index

tensor([14])

In [51]:
outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss

In [53]:
loss.item()

7.4120683670043945

## Investigation of the compatibility of `start_index` and computed `end_index` of answers

In [2]:
import pandas as pd
from transformers import BertForQuestionAnswering, AutoTokenizer

In [23]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading the Training DataFrame

In [4]:
train_df = pd.read_csv("../data/train_df.csv")

In [6]:
train_df.columns

Index(['idx', 'question', 'answer', 'answer_start', 'is_impossible',
       'context'],
      dtype='object')

In [7]:
context = train_df.iloc[0]["context"]
start_idx = train_df.iloc[0]["answer_start"]
answer = train_df.iloc[0]["answer"]

In [10]:
print(context)

Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".


In [11]:
print(start_idx)

269


In [12]:
print(answer)

in the late 1990s


In [24]:
input = tokenizer(context, return_tensors="pt")

In [37]:
answer_encoded = tokenizer(answer, return_tensors="pt")

In [38]:
answer_ids = answer_encoded.input_ids.squeeze()

In [29]:
input_ids = input.input_ids.squeeze()

In [30]:
input_ids.shape

torch.Size([163])

In [47]:
input_ids

tensor([  101, 24041,   144, 22080, 25384,   118,  5007,   113,   120,   100,
          120, 17775,   118,   162, 11414,   118,  1474,   114,   113,  1255,
         1347,   125,   117,  2358,   114,  1110,  1126,  1237,  2483,   117,
         5523,   117,  1647,  2451,  1105,  3647,   119,  3526,  1105,  2120,
         1107,  4666,   117,  2245,   117,  1131,  1982,  1107,  1672,  4241,
         1105,  5923,  6025,  1112,   170,  2027,   117,  1105,  3152,  1106,
         8408,  1107,  1103,  1523,  3281,  1112,  1730,  2483,  1104,   155,
          111,   139,  1873,   118,  1372, 16784,   112,   188,  6405,   119,
         2268, 15841,  1118,  1123,  1401,   117, 15112,  5773, 25384,   117,
         1103,  1372,  1245,  1141,  1104,  1103,  1362,   112,   188,  1436,
          118,  4147,  1873,  2114,  1104,  1155,  1159,   119,  2397, 14938,
         1486,  1103,  1836,  1104, 24041,   112,   188,  1963,  1312,   117,
        20924,  1193,  1107,  2185,   113,  1581,   114,   117, 

In [69]:
input_list = input_ids.tolist()

In [67]:
answer_ids[1:-1].tolist().isin(input_ids.tolist())

AttributeError: 'list' object has no attribute 'isin'

In [70]:
answer_list = answer_ids[1:-1].tolist()

In [71]:
points = [idx for idx in range(len(input_list) - len(answer_list) + 1) if input_list[idx: idx + len(answer_list)] == answer_list]

In [72]:
points

[61]

In [74]:
tokenizer.decode(input_ids[61:61+len(answer_list)])

'in the late 1990s'

In [36]:
tokenizer.decode(input_ids[121:-1])

'##ly in Love ( 2003 ), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number - one singles " Crazy in Love " and " Baby Boy ".'

In [35]:
input_ids[start_idx]

IndexError: index 269 is out of bounds for dimension 0 with size 163

## Converting target start tokens to BERTTokenizer-compatible values