# Notebook for trying various things on-the-fly for the actual implementation

## Understanding how `torch.nn.CrossEntropyLoss` works

In [None]:
import torch.nn as nn
import torch

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index=preds.size(-1))

In [None]:
preds = torch.rand((3,15))

In [None]:
preds.size()

In [None]:
target = torch.tensor([12,
                       11,
                       10])

In [None]:
target.size()

In [None]:
criterion(preds.float(), target)

## Understanding how HuggingFace's models actually work

In [None]:
from transformers import BertForQuestionAnswering, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [None]:
outputs.start_logits

In [None]:
print(answer_start_index, answer_end_index)

In [None]:
target_start_index = torch.tensor([14])
target_end_index = torch.tensor([15])

In [None]:
answer_start_index

In [None]:
target_start_index

In [None]:
outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
loss = outputs.loss

In [None]:
loss.item()

## Investigation of the compatibility of `start_index` and computed `end_index` of answers

In [None]:
import pandas as pd
import torch
from transformers import BertForQuestionAnswering, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")

Loading the Training DataFrame

In [None]:
train_df = pd.read_csv("../data/train_df.csv")

In [None]:
train_df.columns

In [None]:
context = train_df.iloc[0:5]["context"].values.tolist()
start_idx = train_df.iloc[0:5]["answer_start"].values.tolist()
answer = train_df.iloc[0:5]["answer"].values.tolist()

In [None]:
len(context)

In [None]:
input = tokenizer(context, return_tensors="pt", padding="longest", truncation=True)

In [None]:
answer_encoded = tokenizer(answer, return_tensors="pt", padding="longest", truncation=True)

In [None]:
answer_encoded.input_ids

In [None]:
input_ids_list = input.input_ids.tolist()

In [None]:
answer_ids_list = answer_encoded.input_ids.tolist()

In [None]:
answer_ids_list = [item[1:item.index(tokenizer.encode(tokenizer.sep_token)[1])] for item in answer_ids_list]

In [None]:
answer_ids_list

In [None]:
start_idx = [[idx 
              for idx in range(len(input_ids_list[i]) - len(answer_ids_list[i]) + 1)
                                   if input_ids_list[i][idx: idx + len(answer_ids_list[i])] == answer_ids_list[i]]
             for i in range(len(answer_ids_list))]

In [None]:
start_idx

In [None]:
end_idx = [[start_idx[i][0] + len(answer_ids_list[i])] for i in range(len(start_idx))]

In [None]:
end_idx

In [None]:
start_idx = torch.tensor(start_idx)

In [None]:
start_idx.shape

In [None]:
end_idx = torch.tensor(end_idx)

In [None]:
end_idx.shape

In [None]:
tokenizer.decode(input_ids[61:61+len(answer_list)])

In [None]:
tokenizer.decode(input_ids[1][start_idx[1]:end_idx[1]])

## Checking usage of tuples with Tokenizers

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
test_tuple = ("This is a sentence", "This is a sentence", "This is also a sentence")

In [None]:
type(test_tuple)

In [None]:
input = tokenizer(test_tuple, return_tensors="pt", padding="longest", truncation=True)

In [None]:
input.input_ids.shape

## Check DataModule and Model Compatibility

In [17]:
import sys

In [18]:
sys.path.append("/home/workboots/Repos/squad-2.0")

In [19]:
from src.datamodules.squad_datamodule import SQuADDataModule

In [20]:
from src.models.transformer_encoder_qa import TransformerEncoderQuestionAnswering

In [21]:
import torch
from functools import partial

In [22]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [23]:
optimizer = partial(torch.optim.SGD,lr=0.001)

In [24]:
model = TransformerEncoderQuestionAnswering(optimizer=optimizer,scheduler=None)

In [25]:
datamodule = SQuADDataModule(data_dir="/home/workboots/Repos/squad-2.0/data/cross_validation/5_fold/fold_1",
                             batch_size=4, num_workers=os.cpu_count(),pin_memory=False)

In [26]:
datamodule.prepare_data()
datamodule.setup(stage="fit")
train_loader = datamodule.train_dataloader()

In [27]:
model.to("cuda")

RuntimeError: CUDA unknown error - this may be due to an incorrectly set up environment, e.g. changing env variable CUDA_VISIBLE_DEVICES after program start. Setting the available devices to be zero.

In [11]:
for batch in iter(train_loader):
    q, a, a_start, c, idx = batch
    print(a_start)
    result = model.training_step(batch)
    print(result)
    break

tensor([101, 193,   3,   6])
{'loss': tensor(5.6758, grad_fn=<DivBackward0>), 'start_logits': tensor([[[-4.3277e-01],
         [-8.2399e-01],
         [-1.0148e+00],
         [-7.8357e-01],
         [ 2.5657e-01],
         [ 3.7574e-01],
         [-4.4211e-01],
         [-4.8079e-01],
         [-8.3104e-01],
         [-1.6637e+00],
         [-5.5683e-01],
         [-5.9360e-01],
         [-1.5995e-01],
         [-9.7002e-01],
         [-1.2303e+00],
         [-6.0933e-01],
         [-7.1651e-01],
         [-2.7751e-01],
         [-8.0283e-01],
         [-1.9213e+00],
         [-6.8030e-01],
         [-3.1174e-01],
         [-1.0694e+00],
         [ 6.7795e-02],
         [-1.1725e+00],
         [-9.9545e-01],
         [ 1.5383e-01],
         [-4.3724e-01],
         [-1.1556e+00],
         [-1.3367e+00],
         [-3.6823e-01],
         [-6.6669e-01],
         [-6.9790e-01],
         [-1.4481e+00],
         [-7.3576e-01],
         [-3.8087e-01],
         [-1.6029e+00],
         [-5.4137e

  rank_zero_warn(


In [12]:
result.keys()

dict_keys(['loss', 'start_logits', 'end_logits'])

In [16]:
result["start_logits"]

tensor([[[-4.3277e-01],
         [-8.2399e-01],
         [-1.0148e+00],
         [-7.8357e-01],
         [ 2.5657e-01],
         [ 3.7574e-01],
         [-4.4211e-01],
         [-4.8079e-01],
         [-8.3104e-01],
         [-1.6637e+00],
         [-5.5683e-01],
         [-5.9360e-01],
         [-1.5995e-01],
         [-9.7002e-01],
         [-1.2303e+00],
         [-6.0933e-01],
         [-7.1651e-01],
         [-2.7751e-01],
         [-8.0283e-01],
         [-1.9213e+00],
         [-6.8030e-01],
         [-3.1174e-01],
         [-1.0694e+00],
         [ 6.7795e-02],
         [-1.1725e+00],
         [-9.9545e-01],
         [ 1.5383e-01],
         [-4.3724e-01],
         [-1.1556e+00],
         [-1.3367e+00],
         [-3.6823e-01],
         [-6.6669e-01],
         [-6.9790e-01],
         [-1.4481e+00],
         [-7.3576e-01],
         [-3.8087e-01],
         [-1.6029e+00],
         [-5.4137e-01],
         [-7.2079e-01],
         [-6.2858e-01],
         [-4.8498e-01],
         [-1.233