# **A toy example for HW7 Bert QA**

If you have any questions, feel free to email us at ntu-ml-2021spring-ta@googlegroups.com

# Install transformers
Documentation for the toolkit: https://huggingface.co/transformers/

In [None]:
# Install the Hugging Face `transformers` package, pinned to version 4.5.0.
# The leading `!` makes the notebook run the line as a shell command.
!pip install transformers==4.5.0

# Import Packages

In [None]:
import torch
from transformers import AdamW, BertTokenizerFast, BertForQuestionAnswering

# Load Model and Tokenizer
A list of available pretrained models: https://huggingface.co/models

In [None]:
# `model_name` may be either a model id from the Hugging Face model hub or a
# directory previously written with `save_pretrained`.
model_name = 'bert-base-chinese'
# Load BERT with a question-answering head. The later testing and training
# cells all operate on this `model` object.
# NOTE(review): 'bert-base-chinese' looks like a base checkpoint, so the QA head
# is presumably freshly initialized and predictions are not meaningful until
# fine-tuned — confirm against the loading warnings.
model = BertForQuestionAnswering.from_pretrained(model_name)

In [None]:
# One fast tokenizer per language, reused by the cells below.
# Tokenizer of the 'bert-base-chinese' checkpoint (same checkpoint name as `model_name`).
chi_tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
# Tokenizer of the 'bert-base-cased' checkpoint, used for the English examples.
eng_tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Tokenize

In [None]:
# Sample text mixing Chinese characters, digits and Latin letters.
chi_paragraph = '李宏毅幾班大金。2021 ML'
# Split the text into tokens from the tokenizer's vocabulary and show them.
tokens = chi_tokenizer.tokenize(chi_paragraph)
print(tokens)
# Map each token to its vocabulary id. As the last expression of the cell,
# the resulting list is displayed by the notebook.
chi_tokenizer.convert_tokens_to_ids(tokens)

In [None]:
# English counterpart of the Chinese tokenization example.
eng_paragraph = 'Lee Hung-yi which class Daikin.'
# Split the text into tokens from the tokenizer's vocabulary and show them.
# Note that `tokens` is rebound here, replacing the Chinese token list.
tokens = eng_tokenizer.tokenize(eng_paragraph)
print(tokens)
# Map each token to its vocabulary id. As the last expression of the cell,
# the resulting list is displayed by the notebook.
eng_tokenizer.convert_tokens_to_ids(tokens)

# Encode vs Decode

In [None]:
# Round trip: text pair -> token ids -> text.
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'

# `encode` turns the (question, paragraph) pair into a single list of token ids.
encoded = chi_tokenizer.encode(question, paragraph)
print(encoded)

# `decode` maps the ids back to a string, so the printed text shows exactly
# what the encoded sequence contains.
decoded = chi_tokenizer.decode(encoded)
print(decoded)

# Model Inputs

In [None]:
# Calling the tokenizer directly (instead of `encode`) returns every tensor the
# model expects. `question` and `paragraph` are the strings from the
# Encode vs Decode cell; `return_tensors='pt'` requests PyTorch tensors.
inputs = chi_tokenizer(question, paragraph, return_tensors='pt')
# Indices of the input sequence tokens in the vocabulary.
print('Input ids:      ', inputs['input_ids'])
# Segment indices marking the first and second portion of the input.
# Values are selected in [0, 1].
print('Token type ids: ', inputs['token_type_ids'])
# Mask that keeps attention away from padding token indices.
# Values are selected in [0, 1].
print('Attention mask: ', inputs['attention_mask'])

# Testing (Chinese)

In [None]:
# Run the QA model on one Chinese (question, paragraph) pair and read the
# predicted answer span back out of the input ids.
question = '李宏毅幾班?'
paragraph = '李宏毅幾班大金。'
inputs = chi_tokenizer(question, paragraph, return_tensors='pt')

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # `**inputs` expands to the keyword arguments input_ids=..., token_type_ids=...
    # and attention_mask=... taken from the tokenizer output.
    output = model(**inputs)

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

# The highest-scoring position in each logit tensor is taken as the span boundary.
start = output.start_logits.argmax()
end = output.end_logits.argmax()
print("start position: ", start.item())
print("end position:   ", end.item())

# Slice the predicted span (end position inclusive) out of the only sequence in the batch.
predict_id = inputs['input_ids'][0, start:end + 1]
print("predict_id:     ", predict_id)

# Turn the span's token ids back into text.
predict_answer = chi_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

# Training (Chinese)
For question answering, the loss combines two cross-entropy terms: one for the predicted start position and one for the predicted end position, each measured against the correct answer span.

In [None]:
# One training step on the Chinese example prepared in the testing cell
# (`inputs` is the tokenized question/paragraph pair from there).
optimizer = AdamW(model.parameters(), lr=1e-4)

# `from_pretrained` returns the model in evaluation mode (dropout disabled),
# so switch to training mode for the update.
model.train()

# Passing `start_positions` / `end_positions` makes the model return a loss in
# addition to the logits. 13 and 14 are token indices into `inputs['input_ids']`
# (presumably the span "大金" in the paragraph — verify against the tokenized input).
output = model(**inputs, start_positions=torch.tensor([13]), end_positions=torch.tensor([14]))
print("loss: ", output.loss)

# Clear gradients left over from any earlier backward pass (e.g. when this cell
# is re-run); otherwise `backward()` would accumulate on top of them.
optimizer.zero_grad()
output.loss.backward()
optimizer.step()

# Back to evaluation mode so that later inference cells run without dropout.
model.eval()

# Testing (English)

In [None]:
# Run the QA model on one English (question, paragraph) pair and read the
# predicted answer span back out of the input ids.
question = "Why does Jeanie like Tom?"
paragraph = "Jeanie likes Tom because Tom is good at deep learning."
inputs = eng_tokenizer(question, paragraph, return_tensors='pt')

# `eng_tokenizer` belongs to 'bert-base-cased', whereas `model` was loaded from
# 'bert-base-chinese'. Token ids only make sense to the checkpoint whose
# vocabulary produced them, so load the matching English checkpoint.
# NOTE: this rebinds the notebook-global `model`; the cells that follow
# (English training) therefore operate on the English model. Re-run the
# "Load Model and Tokenizer" cell before going back to the Chinese examples.
model = BertForQuestionAnswering.from_pretrained('bert-base-cased')

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # `**inputs` expands to the keyword arguments input_ids=..., token_type_ids=...
    # and attention_mask=... taken from the tokenizer output.
    output = model(**inputs)

print("start_logits: ")
print(output.start_logits)

print("end_logits: ")
print(output.end_logits)

# The highest-scoring position in each logit tensor is taken as the span boundary.
start = torch.argmax(output.start_logits)
end = torch.argmax(output.end_logits)
print("start position: ", start.item())
print("end position:   ", end.item())

# Slice the predicted span (end position inclusive) out of the only sequence in the batch.
predict_id = inputs['input_ids'][0][start : end + 1]
print("predict_id:     ", predict_id)

# Turn the span's token ids back into text.
predict_answer = eng_tokenizer.decode(predict_id)
print("predict_answer: ", predict_answer)

# Training (English)
For question answering, the loss combines two cross-entropy terms: one for the predicted start position and one for the predicted end position, each measured against the correct answer span.

In [None]:
# One training step on the English example prepared in the testing cell
# (`inputs` is the tokenized question/paragraph pair from there).
# NOTE(review): `model` should be the checkpoint that matches the tokenizer
# which produced `inputs` — confirm before training on real data.
optimizer = AdamW(model.parameters(), lr=1e-4)

# `from_pretrained` returns the model in evaluation mode (dropout disabled),
# so switch to training mode for the update.
model.train()

# Passing `start_positions` / `end_positions` makes the model return a loss in
# addition to the logits. 14 and 19 are token indices into `inputs['input_ids']`
# (presumably the span "Tom is good at deep learning" — verify against the
# tokenized input).
output = model(**inputs, start_positions=torch.tensor([14]), end_positions=torch.tensor([19]))
print("loss: ", output.loss)

# Clear gradients left over from any earlier backward pass on this model;
# otherwise `backward()` would accumulate on top of them and the step would
# mix in gradients from a previous example.
optimizer.zero_grad()
output.loss.backward()
optimizer.step()

# Back to evaluation mode so that later inference cells run without dropout.
model.eval()