In [1]:
!pip install datasets transformers
!pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.26.0-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading respo

In [2]:
# Mount Google Drive at /content/drive; the checkpoint and dataset paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import json
from pathlib import Path
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
import numpy as np

In [6]:
def read_squad_train(path, data_index, context_size):
  """
  Read a slice of a SQuAD-format JSON file for training.

  Only the article at 1-based position ``data_index`` of the file's ``data``
  list is read, and at most ``context_size`` of its paragraphs are used.
  Answers whose ``answer_start`` is negative are skipped.

  Args:
    path: location of the SQuAD-format JSON file.
    data_index: 1-based index of the article to read.
    context_size: maximum number of paragraphs to take from that article.

  Returns:
    Three parallel lists ``(contexts, questions, answers)`` with one entry per
    kept answer. Each answer is the answer dict exactly as parsed from the file.
  """
  with open(Path(path), 'rb') as f:
    squad_dict = json.load(f)

  contexts = []
  questions = []
  answers = []
  for group in squad_dict['data'][data_index - 1:data_index]:
    for size, passage in enumerate(group['paragraphs']):
      # Stop once `context_size` paragraphs have been consumed. The previous
      # `size <= context_size` test let one paragraph too many through.
      if size >= context_size:
        break
      context = passage['context']
      for qa in passage['qas']:
        question = qa['question']
        for answer in qa['answers']:
          if answer['answer_start'] >= 0:
            contexts.append(context)
            questions.append(question)
            answers.append(answer)

  return contexts, questions, answers
def read_squad_val(path):
  """
  Load every answer from a SQuAD-format JSON file.

  Walks articles -> paragraphs -> questions -> answers and flattens them into
  three parallel lists with one entry per answer.
  -> returns (contexts, questions, answers)
  """
  with Path(path).open('rb') as handle:
    payload = json.load(handle)

  contexts, questions, answers = [], [], []
  for article in payload['data']:
    for paragraph in article['paragraphs']:
      text = paragraph['context']
      for qa_item in paragraph['qas']:
        prompt = qa_item['question']
        gold = list(qa_item['answers'])
        # Repeat the context and question once per answer so the three
        # lists stay index-aligned.
        contexts.extend([text] * len(gold))
        questions.extend([prompt] * len(gold))
        answers.extend(gold)

  return contexts, questions, answers
def add_token_positions(encodings, answers, fallback_position=None):
  """
  Convert each answer's character span into token start/end positions.

  For example ``i`` the start token is ``encodings.char_to_token(i, answer_start)``
  and the end token is ``encodings.char_to_token(i, answer_end - 1)``. The two
  resulting lists are stored on ``encodings`` under ``'start_positions'`` and
  ``'end_positions'``.

  Args:
    encodings: tokenizer output providing ``char_to_token`` and ``update``.
    answers: answer dicts carrying ``'answer_start'`` and ``'answer_end'``.
    fallback_position: value stored when ``char_to_token`` returns ``None``.
      When omitted, the notebook-level ``tokenizer.model_max_length`` is used;
      that global is only looked up if a fallback is actually needed.
  """
  start_positions = []
  end_positions = []
  for i, answer in enumerate(answers):
    start = encodings.char_to_token(i, answer['answer_start'])
    end = encodings.char_to_token(i, answer['answer_end'] - 1)
    # if None, the answer passage has been truncated
    if start is None or end is None:
      if fallback_position is None:
        fallback_position = tokenizer.model_max_length
      if start is None:
        start = fallback_position
      if end is None:
        end = fallback_position
    start_positions.append(start)
    end_positions.append(end)
  encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
def add_end_idx(answers, contexts):
  """
  Add an ``'answer_end'`` character index to every answer, in place.

  The SQuAD format only stores the start index of each answer; this function
  derives the end index from the start index and the length of the answer text.

  The labelled start is sometimes off by a character or two, so the answer
  text is looked for at the labelled start and then one and two characters
  earlier. On a match ``'answer_start'`` is realigned to it. If the text is
  not found at any of those offsets, the labelled start is kept as is, so
  ``'answer_end'`` is always set.

  Args:
    answers: answer dicts with ``'text'`` and ``'answer_start'``; mutated in place.
    contexts: context strings, parallel to ``answers``.
  """
  for answer, context in zip(answers, contexts):
    gold_text = answer['text']
    start_idx = answer['answer_start']
    span_len = len(gold_text)

    aligned_start = start_idx  # used as is when no offset matches
    for shift in (0, 1, 2):
      candidate = start_idx - shift
      if candidate < 0:
        # A negative start would wrap around to the end of the context.
        break
      if context[candidate:candidate + span_len] == gold_text:
        aligned_start = candidate
        break

    answer['answer_start'] = aligned_start
    answer['answer_end'] = aligned_start + span_len
class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings.

    ``encodings`` is a mapping from feature name to a per-example sequence
    and must contain an ``'input_ids'`` entry.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with one tensor per encodings key."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Return the number of examples, i.e. the length of ``input_ids``."""
        # Key lookup (not attribute access) matches the mapping-style access in
        # __getitem__ and lets a plain dict of lists be used as encodings too.
        return len(self.encodings['input_ids'])

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import AutoTokenizer

# Build the model from the public checkpoint, then overwrite its weights with
# the fine-tuned state dict stored on Drive.
checkpoint = 'twmkn9/distilbert-base-uncased-squad2'
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
# map_location='cpu' lets a state dict saved on a GPU load on a CPU-only
# runtime. torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(torch.load('/content/drive/MyDrive/DoAnCK/model/distilbert0-15000.bin', map_location='cpu'))
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# NOTE(review): the evaluation data is read from train-v2.0.json; confirm this is intended.
test_contexts, test_questions, test_answers = read_squad_val('/content/drive/MyDrive/DoAnCK/Dataset/Mailong25/train-v2.0.json')
add_end_idx(test_answers, test_contexts)

# Keep only answers that carry an 'answer_end' different from 'answer_start'
# (a non-empty span). Entries without 'answer_end' are dropped instead of
# raising KeyError.
new_test_contexts=[]
new_test_questions=[]
new_test_answers=[]
for i in range(len(test_answers)):
  if 'answer_end' in test_answers[i] and test_answers[i]['answer_start']!=test_answers[i]['answer_end']:
    new_test_contexts.append(test_contexts[i])
    new_test_questions.append(test_questions[i])
    new_test_answers.append(test_answers[i])
(test_contexts, test_questions, test_answers)=(new_test_contexts, new_test_questions, new_test_answers)

# Contexts are passed as the first sequence, which is the one that
# add_token_positions resolves character offsets against.
test_encodings = tokenizer(test_contexts, test_questions, truncation=True, padding=True, return_offsets_mapping=True)
add_token_positions(test_encodings, test_answers)
test_dataset = SquadDataset(test_encodings)

RuntimeError: ignored