In [12]:
!pip install transformers==4.26.0
!pip install accelerate==0.16.0

Collecting accelerate==0.16.0
  Using cached accelerate-0.16.0-py3-none-any.whl (199 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.2.0
    Uninstalling accelerate-0.2.0:
      Successfully uninstalled accelerate-0.2.0
Successfully installed accelerate-0.16.0


In [13]:
import math
import json
import numpy as np
import random
import torch
from torch.utils.data import DataLoader, Dataset 
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
from transformers import get_linear_schedule_with_warmup

from tqdm.auto import tqdm

# Prefer the second GPU when the machine has one, otherwise fall back to GPU 0 (a hard-coded
# ordinal of 1 is invalid on single-GPU machines). Always a torch.device, also on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda", 1 if torch.cuda.device_count() > 1 else 0)
else:
    device = torch.device("cpu")

# Make runs repeatable by seeding every random number generator used in this notebook
def same_seeds(seed):
    """Seed torch (CPU and, if present, CUDA), NumPy and Python's `random` with `seed`.

    Also switches cuDNN to deterministic mode and turns its benchmark mode off.
    """
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


same_seeds(7414)

In [14]:
# Change "fp16_training" to True to support automatic mixed precision training (fp16)
fp16_training = True
fp16 = True
if fp16_training:
    %pip install accelerate==0.2.0
    from accelerate import Accelerator
    accelerator = Accelerator(fp16)
    device = accelerator.device

# Documentation for the toolkit:  https://huggingface.co/docs/accelerate/

Collecting accelerate==0.2.0
  Using cached accelerate-0.2.0-py3-none-any.whl (47 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.16.0
    Uninstalling accelerate-0.16.0:
      Successfully uninstalled accelerate-0.16.0
Successfully installed accelerate-0.2.0
Note: you may need to restart the kernel to use updated packages.


In [15]:
# The model and its tokenizer are loaded from the same pretrained checkpoint.
pretrained_checkpoint = "luhua/chinese_pretrain_mrc_macbert_large"

tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(pretrained_checkpoint)
model = model.to(device)

# A warning about randomly initialized QA prediction heads may be printed here; it can be ignored.

In [16]:
def read_data(file):
    """Parse the UTF-8 JSON file at `file` and return its questions and paragraphs.

    Returns:
        A `(questions, paragraphs)` tuple taken from the top-level "questions" and
        "paragraphs" keys of the JSON document.
    """
    with open(file, mode='r', encoding="utf-8") as handle:
        payload = json.loads(handle.read())
    questions = payload["questions"]
    paragraphs = payload["paragraphs"]
    return questions, paragraphs

# Load the three splits; each call yields that split's (questions, paragraphs) pair.
# The files are expected next to the notebook (paths are relative to the working directory).
train_questions, train_paragraphs = read_data("./hw7_train.json")
dev_questions, dev_paragraphs = read_data("./hw7_dev.json")
test_questions, test_paragraphs = read_data("./hw7_test.json")

In [17]:
# Questions and paragraphs are tokenized independently of each other.
# `add_special_tokens` is False because the special tokens are inserted later, when a
# tokenized question and a paragraph window are combined in the dataset's __getitem__.

train_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in train_questions],
    add_special_tokens=False,
)
dev_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in dev_questions],
    add_special_tokens=False,
)
test_questions_tokenized = tokenizer(
    [entry["question_text"] for entry in test_questions],
    add_special_tokens=False,
)

train_paragraphs_tokenized = tokenizer(train_paragraphs, add_special_tokens=False)
dev_paragraphs_tokenized = tokenizer(dev_paragraphs, add_special_tokens=False)
test_paragraphs_tokenized = tokenizer(test_paragraphs, add_special_tokens=False)

# Any warning printed here can be ignored: the tokenized sequences are further processed
# in the dataset's __getitem__ before they are passed to the model.

In [18]:
# Module-level copy of the dataset's doc_stride. QA_Dataset.__init__ overwrites it, and
# evaluate() reads it to turn window-relative token indices into paragraph token indices.
DOC_STRIDE = None


In [19]:
class QA_Dataset(Dataset):
    """Turns pre-tokenized questions and paragraphs into fixed-length model inputs.

    Every input sequence has the layout ``[CLS] question [SEP] paragraph-window [SEP]``
    followed by zero padding up to ``max_seq_len``.

    * ``split == "train"``: one randomly positioned window that contains the answer is
      returned together with the answer's start/end positions inside the input sequence.
    * any other split: the paragraph is cut into overlapping windows whose start positions
      are ``doc_stride`` tokens apart, and all windows are returned stacked.

    Args:
        split: Name of the split; only the value "train" selects the training behaviour.
        questions: Sequence of question dicts. "paragraph_id" is always read; "answer_start"
            and "answer_end" (character offsets) are read for the training split.
        tokenized_questions: Indexable by question index; each item exposes ``.ids``.
        tokenized_paragraphs: Indexable by paragraph id; each item exposes ``.ids``,
            ``len()`` and ``char_to_token()`` (presumably `tokenizers.Encoding` objects —
            TODO confirm).
    """

    def __init__(self, split, questions, tokenized_questions, tokenized_paragraphs):
        self.split = split
        self.questions = questions
        self.tokenized_questions = tokenized_questions
        self.tokenized_paragraphs = tokenized_paragraphs
        self.max_question_len = 40
        self.max_paragraph_len = 350

        ##### TODO: Change value of doc_stride #####
        # self.doc_stride = int(0.1 * self.max_paragraph_len)
        self.doc_stride = 2
        ############################################
        # Publish the stride at module level as well (assigned here as a global).
        global DOC_STRIDE
        DOC_STRIDE = self.doc_stride

        # Input sequence length = [CLS] + question + [SEP] + paragraph + [SEP]
        self.max_seq_len = 1 + self.max_question_len + 1 + self.max_paragraph_len + 1

    def __len__(self):
        """Number of questions in this split."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Build the model inputs for question `idx`.

        Returns:
            Training split: ``(input_ids, token_type_ids, attention_mask, answer_start,
            answer_end)`` where the first three are 1-D tensors of length ``max_seq_len``
            and the last two are positions inside ``input_ids``.
            Other splits: ``(input_ids, token_type_ids, attention_mask)``, each a 2-D tensor
            with one row per paragraph window.
        """
        question = self.questions[idx]
        tokenized_question = self.tokenized_questions[idx]
        tokenized_paragraph = self.tokenized_paragraphs[question["paragraph_id"]]

        # The question part is the same for every window (101: CLS, 102: SEP)
        input_ids_question = [101] + tokenized_question.ids[:self.max_question_len] + [102]

        if self.split == "train":
            # Convert answer's start/end positions in paragraph_text to start/end positions in tokenized_paragraph
            # NOTE(review): char_to_token presumably returns None for characters that belong
            # to no token (e.g. whitespace); that case is not handled here — confirm the data.
            answer_start_token = tokenized_paragraph.char_to_token(question["answer_start"])
            answer_end_token = tokenized_paragraph.char_to_token(question["answer_end"])

            # Pick a random window that is guaranteed to contain the whole answer, so the
            # model cannot learn that the answer always sits at a fixed position.
            paragraph_start = self._sample_window_start(
                answer_start_token, answer_end_token, len(tokenized_paragraph)
            )
            # The slice end is exclusive, so the window may extend to len(paragraph).
            paragraph_end = min(paragraph_start + self.max_paragraph_len, len(tokenized_paragraph))
            # Only has an effect when the answer is longer than a whole window.
            answer_end_token = min(answer_end_token, paragraph_end - 1)

            input_ids_paragraph = tokenized_paragraph.ids[paragraph_start : paragraph_end] + [102]

            # Convert answer's start/end positions in tokenized_paragraph to start/end positions in the window
            answer_start_token += len(input_ids_question) - paragraph_start
            answer_end_token += len(input_ids_question) - paragraph_start

            # Pad sequence and obtain inputs to model
            input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)
            return torch.tensor(input_ids), torch.tensor(token_type_ids), torch.tensor(attention_mask), answer_start_token, answer_end_token

        # Validation/Testing
        else:
            input_ids_list, token_type_ids_list, attention_mask_list = [], [], []

            # Paragraph is split into several windows, each with start positions separated by step "doc_stride"
            for i in range(0, len(tokenized_paragraph), self.doc_stride):
                input_ids_paragraph = tokenized_paragraph.ids[i : i + self.max_paragraph_len] + [102]

                # Pad sequence and obtain inputs to model
                input_ids, token_type_ids, attention_mask = self.padding(input_ids_question, input_ids_paragraph)

                input_ids_list.append(input_ids)
                token_type_ids_list.append(token_type_ids)
                attention_mask_list.append(attention_mask)

            return torch.tensor(input_ids_list), torch.tensor(token_type_ids_list), torch.tensor(attention_mask_list)

    def _sample_window_start(self, answer_start_token, answer_end_token, paragraph_len):
        """Randomly choose the first paragraph token of a training window.

        The returned start `s` satisfies ``s <= answer_start_token`` and
        ``answer_end_token < s + max_paragraph_len`` whenever the answer fits into one
        window, and it never leaves unused room at the end of the paragraph. If the answer
        is longer than a window, the window starts at the answer's first token.
        """
        earliest = max(0, answer_end_token - self.max_paragraph_len + 1)
        latest = min(answer_start_token, max(0, paragraph_len - self.max_paragraph_len))
        if latest < earliest:
            return answer_start_token
        return random.randint(earliest, latest)

    def padding(self, input_ids_question, input_ids_paragraph):
        """Concatenate question and paragraph ids and zero-pad them to ``max_seq_len``.

        Returns:
            ``(input_ids, token_type_ids, attention_mask)`` as plain Python lists.
        """
        # Pad zeros if sequence length is shorter than max_seq_len
        padding_len = self.max_seq_len - len(input_ids_question) - len(input_ids_paragraph)
        # Indices of input sequence tokens in the vocabulary
        input_ids = input_ids_question + input_ids_paragraph + [0] * padding_len
        # Segment token indices to indicate first and second portions of the inputs. Indices are selected in [0, 1]
        token_type_ids = [0] * len(input_ids_question) + [1] * len(input_ids_paragraph) + [0] * padding_len
        # Mask to avoid performing attention on padding token indices. Mask values selected in [0, 1]
        attention_mask = [1] * (len(input_ids_question) + len(input_ids_paragraph)) + [0] * padding_len

        return input_ids, token_type_ids, attention_mask

# One dataset per split, each built from that split's questions and tokenized inputs.
train_set = QA_Dataset(
    "train", train_questions, train_questions_tokenized, train_paragraphs_tokenized
)
dev_set = QA_Dataset(
    "dev", dev_questions, dev_questions_tokenized, dev_paragraphs_tokenized
)
test_set = QA_Dataset(
    "test", test_questions, test_questions_tokenized, test_paragraphs_tokenized
)

train_batch_size = 4

# Note: Do NOT change batch size of dev_loader / test_loader !
# With batch size 1, a "batch" is really the stack of all windows of a single QA pair.
eval_loader_options = dict(batch_size=1, shuffle=False, pin_memory=True)

train_loader = DataLoader(train_set, batch_size=train_batch_size, shuffle=True, pin_memory=True)
dev_loader = DataLoader(dev_set, **eval_loader_options)
test_loader = DataLoader(test_set, **eval_loader_options)

In [20]:
def _locate_answer_chars(paragraph, paragraph_tokenized, start_token_index, end_token_index):
    """Map a token span of the paragraph back to character offsets in the raw text.

    Walks the token strings in order while advancing a cursor through `paragraph`, and
    remembers where the first character of the start token and the last character of the
    end token were found.

    Returns:
        ``(first_char, last_char)`` (both inclusive), or None when either token index is
        not reached.
    """
    special_tokens = ('[UNK]', '[CLS]', '[SEP]')
    text_len = len(paragraph)
    cursor = 0
    first_char = last_char = None

    for i, token in enumerate(paragraph_tokenized):
        if i > end_token_index:
            break

        if token in special_tokens:
            # The token text says nothing about the original character: assume it stands
            # for the next non-whitespace character of the paragraph.
            while cursor < text_len and paragraph[cursor].isspace():
                cursor += 1
            positions = [cursor]
            cursor += 1
        else:
            # Drop the WordPiece continuation prefix, but keep a literal "#" token.
            piece = token[2:] if token.startswith("##") and len(token) > 2 else token
            positions = []
            for char in piece:
                # Case-insensitive because the tokenizer may have lower-cased the text.
                pos = cursor
                while pos < text_len and paragraph[pos].lower() != char.lower():
                    pos += 1
                if pos >= text_len:
                    # Character not present in the rest of the text (e.g. it was
                    # normalized by the tokenizer): leave the cursor where it is instead
                    # of running off the end and breaking every later token.
                    continue
                positions.append(pos)
                cursor = pos + 1
            if not positions:
                positions = [cursor]

        if i == start_token_index:
            first_char = positions[0]
        if i == end_token_index:
            last_char = positions[-1]

    if first_char is None or last_char is None:
        return None
    return first_char, last_char


def evaluate(data, output, paragraph, paragraph_tokenized, max_answer_len=50):
    """Pick the best answer span over all windows of one question and return its text.

    Args:
        data: ``(input_ids, token_type_ids, attention_mask)`` as produced by a DataLoader
            with batch size 1, i.e. each tensor is indexed as ``[0][window][position]``.
        output: Model output exposing ``start_logits`` and ``end_logits`` with one row per
            window.
        paragraph: Raw paragraph text the windows were cut from.
        paragraph_tokenized: Token strings of the whole paragraph (without special tokens).
        max_answer_len: A window's prediction is only accepted when its end token is at
            most this many tokens after its start token.

    Returns:
        The predicted answer string with spaces removed ('' if no window was accepted).
    """
    ##### TODO: Postprocessing #####
    answer = ''
    max_prob = float('-inf')
    num_of_windows = data[0].shape[1]

    # Token indices of the chosen span, relative to the whole paragraph
    paragraph_start_index = 0
    paragraph_end_index = 0

    logits_device = output.start_logits.device

    for k in range(num_of_windows):
        # Keep only the paragraph part of the window: token_type_ids == 1 and not padding.
        # That selection still ends with the trailing [SEP], which the [:-1] slices remove.
        mask = (data[1][0][k].bool() & data[2][0][k].bool()).to(logits_device)

        masked_output_start = torch.masked_select(output.start_logits[k], mask)
        masked_output_start = masked_output_start[:-1]

        start_prob, start_index = torch.max(masked_output_start, dim=0)

        # The end position is searched from the start position onwards only
        masked_output_end = torch.masked_select(output.end_logits[k], mask)
        masked_output_end = masked_output_end[start_index: -1]

        end_prob, end_index = torch.max(masked_output_end, dim=0)

        # end_index was relative to start_index because of the slice above
        end_index += start_index

        # Probability of answer is calculated as sum of start_prob and end_prob
        prob = start_prob + end_prob
        masked_data = torch.masked_select(data[0][0][k].to(logits_device), mask)[:-1]

        # Replace answer if calculated probability is larger than previous windows
        if (prob > max_prob) and (start_index <= end_index <= (start_index + max_answer_len)):
            max_prob = prob
            # Window k starts DOC_STRIDE * k tokens into the paragraph
            paragraph_start_index = start_index.item() + (DOC_STRIDE * k)
            paragraph_end_index = end_index.item() + (DOC_STRIDE * k)
            # Convert tokens to chars (e.g. [1920, 7032] --> "大 金")
            answer = tokenizer.decode(masked_data[start_index : end_index + 1])

    # Post-processing
    # 1. Replace [UNK]: recover the answer from the raw paragraph text instead of the
    #    decoded tokens. The alignment is only computed when it is actually needed.
    if "[UNK]" in answer:
        # Original answer
        print(f"原始答案: {answer}")
        char_span = _locate_answer_chars(
            paragraph, paragraph_tokenized, paragraph_start_index, paragraph_end_index
        )
        if char_span is not None:
            new_start, new_end = char_span
            # Corrected answer
            answer = paragraph[new_start: new_end+1]
        print(f"修正後答案: {answer}")
        print("-"*50)

    # (Earlier experiments that restored missing quotes or added/removed leading
    #  "為了"/"因為" were disabled and have been dropped from this function.)

    # Finally remove spaces (e.g. "大 金" --> "大金")
    answer = answer.replace(' ', '')

    # Drop a closing quote that has no matching opening quote inside the answer
    if len(answer) > 1:
        if "「" not in answer and answer[-1] == "」":
            answer = answer[:-1]

    return answer

In [21]:
num_epoch = 5  # 3
validation = True  # True
logging_step = 500
learning_rate = 5e-6
# accum_iter = 8

optimizer = AdamW(model.parameters(), lr=learning_rate)

##### TODO: Apply linear learning rate decay #####
# One scheduler step is taken per optimizer step, so the schedule spans all batches of all epochs.
total_steps = len(train_loader) * num_epoch
# warmup_steps = int(0.05 * total_steps)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)
##################################################

if fp16_training:
    model, optimizer, train_loader = accelerator.prepare(model, optimizer, train_loader) 

model.train()

print("Start Training ...")

for epoch in range(num_epoch):
    # Count completed optimizer steps from 0 so every logged average covers exactly
    # `logging_step` batches.
    step = 0
    train_loss = train_acc = 0
    
    for batch_idx, data in enumerate(tqdm(train_loader)):
        # Load all data into GPU
        data = [i.to(device) for i in data]
        
        # Model inputs: input_ids, token_type_ids, attention_mask, start_positions, end_positions (Note: only "input_ids" is mandatory)
        # Model outputs: start_logits, end_logits, loss (return when start_positions/end_positions are provided)  
        output = model(input_ids=data[0], token_type_ids=data[1], attention_mask=data[2], start_positions=data[3], end_positions=data[4])

        # Choose the most probable start position / end position
        start_index = torch.argmax(output.start_logits, dim=1)
        end_index = torch.argmax(output.end_logits, dim=1)

        # Prediction is correct only if both start_index and end_index are correct
        train_acc += ((start_index == data[3]) & (end_index == data[4])).float().mean()
        # Accumulate a detached copy: adding the raw loss would keep autograd history
        # alive across iterations just for logging.
        train_loss += output.loss.detach()

        if fp16_training:
            accelerator.backward(output.loss)
        else:
            output.loss.backward()

        ##### TODO: Apply linear learning rate decay #####
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        step += 1
        ##################################################
        
        # Print training loss and accuracy over past logging step
        if step % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step} | loss = {float(train_loss) / logging_step:.3f}, acc = {float(train_acc) / logging_step:.3f}")
            train_loss = train_acc = 0
            
    if validation:
        print("Evaluating Dev Set ...")
        model.eval()
        with torch.no_grad():
            dev_acc = 0
            for i, data in enumerate(tqdm(dev_loader)):
                # dev_loader has batch size 1: drop that dimension so each window becomes one batch row
                output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
                # prediction is correct only if answer text exactly matches
                dev_acc += evaluate(data, output, dev_paragraphs[dev_questions[i]['paragraph_id']], dev_paragraphs_tokenized[dev_questions[i]['paragraph_id']].tokens) == dev_questions[i]["answer_text"]
            print(f"Validation | Epoch {epoch + 1} | acc = {dev_acc / len(dev_loader):.3f}")
        model.train()

# Save a model and its configuration file to the directory given by `model_save_dir`
# i.e. there are two files under that directory: "pytorch_model.bin" and "config.json"
# Saved model can be re-loaded using `model = BertForQuestionAnswering.from_pretrained(model_save_dir)`
print("Saving Model ...")
model_save_dir = "./testmodel6" 
model.save_pretrained(model_save_dir)

# Experiment log
# part1 - acceleration (did speed things up) - simple submit
    # 991/991 [05:52<00:00, 3.45it/s]
    # to
    # 991/991 [03:00<00:00, 6.08it/s]

    # Epoch 1 | Step 900 | loss = 0.586, acc = 0.748
    # Validation | Epoch 1 | acc = 0.509
    # to
    # Epoch 1 | Step 900 | loss = 0.576, acc = 0.759
    # Validation | Epoch 1 | acc = 0.538

# part2 - LR schedule (training results improved, validation did not)
    # Validation | Epoch 1 | acc = 0.539
    
# part3 - doc_stride to 0.5 (much better)
    # Validation | Epoch 1 | acc = 0.667

# part4 - random split training answer window (much better)
    # Validation | Epoch 1 | acc = 0.733
    
# part5 - Doc Length from 150 to 300 (slightly better)
    # Validation | Epoch 1 | acc = 0.727
    
# part6 - switched model
    # Validation | Epoch 1 | acc = 0.750

Start Training ...


  0%|          | 25/6730 [00:17<1:19:23,  1.41it/s]


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\william_H\anaconda3\envs\hw7\lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\william_H\AppData\Local\Temp\ipykernel_26476\1671329177.py", line 49, in <module>
    optimizer.step()
  File "c:\Users\william_H\anaconda3\envs\hw7\lib\site-packages\accelerate\optimizer.py", line 140, in step
  File "c:\Users\william_H\anaconda3\envs\hw7\lib\site-packages\torch\optim\lr_scheduler.py", line 69, in wrapper
    return wrapped(*args, **kwargs)
  File "c:\Users\william_H\anaconda3\envs\hw7\lib\site-packages\torch\optim\optimizer.py", line 280, in wrapper
    out = func(*args, **kwargs)
  File "c:\Users\william_H\anaconda3\envs\hw7\lib\site-packages\transformers\optimization.py", line 360, in step
    exp_avg.mul_(beta1).add_(grad, alpha=(1.0 - beta1))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback 

In [None]:
print("Evaluating Test Set ...")

# Predicted answer string for every test question, in test_questions order
result = []

model.eval()
with torch.no_grad():
    for i, data in enumerate(tqdm(test_loader)):
        # test_loader has batch size 1: drop that dimension so each window becomes one batch row
        output = model(input_ids=data[0].squeeze(dim=0).to(device), token_type_ids=data[1].squeeze(dim=0).to(device),
                       attention_mask=data[2].squeeze(dim=0).to(device))
        result.append(evaluate(data, output, test_paragraphs[test_questions[i]['paragraph_id']], test_paragraphs_tokenized[test_questions[i]['paragraph_id']].tokens))

result_file = "./testresult6.csv"
# Write UTF-8 explicitly: the answers are Chinese text, and the platform default encoding
# (e.g. on Windows) may not be able to represent them.
with open(result_file, 'w', encoding="utf-8") as f:
    f.write("ID,Answer\n")
    for i, test_question in enumerate(test_questions):
        # Replace commas in answers with empty strings (since csv is separated by comma)
        # Answers in kaggle are processed in the same way
        f.write(f"{test_question['id']},{result[i].replace(',','')}\n")

print(f"Completed! Result is in {result_file}")