# QA Labor law training phoBERT

1. Install the libraries required for the PhoBERT model

In [None]:
# Install the `datasets` and `rouge_score` packages, then upgrade `accelerate`.
# NOTE(review): no versions are pinned here, so a fresh run may pull newer releases.
!pip install datasets rouge_score
!pip install accelerate -U

2. Mount drive

In [None]:
# Mount Google Drive at /content/drive; the dataset files used below live under
# /content/drive/MyDrive/models.
from google.colab import drive
drive.mount('/content/drive')


3. Import the libraries used to train PhoBERT

In [15]:
from datasets import Dataset
import json
import torch
import re
from transformers import RobertaTokenizerFast, DefaultDataCollator, TrainingArguments, Trainer, AutoModelForQuestionAnswering
from sklearn.model_selection import train_test_split

3. Choose the model to use

In [None]:
# Load the "vinai/phobert-base" checkpoint through the question-answering auto class,
# and build a fast RoBERTa tokenizer from the same checkpoint name.
# NOTE(review): preprocess_function asks the tokenizer for `return_offsets_mapping`,
# which presumably requires a fast tokenizer. Confirm that RobertaTokenizerFast
# actually loads PhoBERT's vocabulary files and tokenizes Vietnamese text the way
# the checkpoint expects.
model = AutoModelForQuestionAnswering.from_pretrained("vinai/phobert-base")
tokenizer = RobertaTokenizerFast.from_pretrained("vinai/phobert-base")

4. Helper functions for preparing the dataset

4.1 Filter long context

In [5]:
def filter_long_contexts(data, max_context_chars=511, max_question_chars=258):
    """Collect over-length contexts and questions from SQuAD-style data.

    Lengths are measured in characters (``len`` of the string), not in tokens.
    The input is only read; nothing is removed from ``data``.

    Args:
        data: Iterable of SQuAD-style documents, each a dict with a ``'data'``
            list of articles; every article has ``'paragraphs'``, and every
            paragraph has a ``'context'`` string and a ``'qas'`` list whose
            items carry a ``'question'`` string.
        max_context_chars: A context strictly longer than this is reported.
        max_question_chars: A question strictly longer than this is reported.

    Returns:
        list[dict]: One ``{'type': 'context' | 'question', 'text': ...}`` entry
        per over-length item, in traversal order.
    """
    long_entries = []
    for squad_data in data:
        for article in squad_data['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                if len(context) > max_context_chars:
                    long_entries.append({
                        'type': 'context',
                        'text': context
                    })
                    # The context itself is already too long, so its questions
                    # are not checked individually.
                    continue
                for qa in paragraph['qas']:
                    question = qa['question']
                    if len(question) > max_question_chars:
                        long_entries.append({
                            'type': 'question',
                            'text': question
                        })
    return long_entries

Filter long contexts

In [6]:
# Load the raw SQuAD-style training file from Drive.
input_file = '/content/drive/MyDrive/models/qa_train.json'
with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect contexts longer than 511 characters and questions longer than 258
# characters. This only reports them; `data` itself is not modified.
long_entries = filter_long_contexts(data)

# Save the collected over-length entries to a new file
output_file = '/content/drive/MyDrive/models/filtered_squad_contexts.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(long_entries, f, ensure_ascii=False, indent=4)

4.2 Clean text

In [7]:
def clean_text(text):
    """Return `text` with every '.', ',' and '?' removed, converted to lowercase."""
    # Drop the punctuation first, then normalise the case of what is left.
    without_punctuation = re.compile(r'[.,?]').sub('', text)
    lowered = without_punctuation.lower()
    return lowered

4.3 Finding answer start

In [8]:
def _clean_with_offsets(text):
    """Lowercase `text` and drop '.', ',' and '?', remembering where each kept
    character came from.

    The rule mirrors `clean_text` (same three punctuation characters, then
    lowercase) but is applied one character at a time so that every character
    of the cleaned string can be traced back to an index in `text`.

    Returns:
        tuple[str, list[int]]: the cleaned string and, for each of its
        characters, the index of the originating character in `text`.
    """
    ignored = '.,?'
    cleaned_chars = []
    origin = []
    for index, char in enumerate(text):
        if char in ignored:
            continue
        # `str.lower()` can expand a single character into several; map each
        # produced character back to the same original index.
        for lowered in char.lower():
            cleaned_chars.append(lowered)
            origin.append(index)
    return ''.join(cleaned_chars), origin


def find_answer_start(context, answer):
    """Locate `answer` inside `context`, ignoring case and the characters '.,?'.

    Matching is attempted from the most to the least specific candidate:

    1. the whole cleaned answer;
    2. its first three words;
    3. its first two words;
    4. if the answer starts with 'có', 'không' or 'để', the two words that
       follow that leading word.

    Args:
        context: The paragraph text to search in.
        answer: The answer text to look for.

    Returns:
        int: Index into the ORIGINAL (uncleaned) `context` where the matched
        candidate begins, or -1 when nothing matches or the answer contains no
        searchable word. For candidate 4 the index points at the word after
        the dropped leading word.
    """
    cleaned_context, origin = _clean_with_offsets(context)
    cleaned_answer, _ = _clean_with_offsets(answer)

    words = cleaned_answer.split()
    if not words:
        # Empty, whitespace-only or punctuation-only answer: nothing to search.
        return -1

    # Longest candidates first: a three-word prefix can only match where the
    # two-word prefix also matches, so trying two words first would make the
    # three-word attempt pointless and could return an earlier, less specific hit.
    candidates = [cleaned_answer]
    if len(words) >= 3:
        candidates.append(' '.join(words[:3]))
    if len(words) >= 2:
        candidates.append(' '.join(words[:2]))
    # Answers often begin with a filler word that is absent from the context.
    if words[0] in ('có', 'không', 'để') and len(words) >= 3:
        candidates.append(' '.join(words[1:3]))

    for candidate in candidates:
        position = cleaned_context.find(candidate)
        if position != -1:
            # Translate the position in the cleaned text back to the original
            # context so the value is usable as a character offset there.
            return origin[position]
    return -1  # Not found

4.4 Update answer start for data set

In [12]:
def update_answer_starts(data):
    """Recompute `answer_start` for every answer in SQuAD-style `data`.

    Each answer dict is updated in place with the value returned by
    `find_answer_start(context, answer_text)`. Answers for which that value is
    -1 are additionally reported.

    Returns:
        tuple: (`data` itself, list of {"context", "answer"} dicts for the
        answers that could not be located).
    """
    unresolved = []
    # Flatten the document -> article -> paragraph nesting into one stream.
    all_paragraphs = (
        paragraph
        for document in data
        for article in document['data']
        for paragraph in article['paragraphs']
    )
    for paragraph in all_paragraphs:
        paragraph_context = paragraph['context']
        for qa in paragraph['qas']:
            for answer in qa['answers']:
                text = answer['text']
                position = find_answer_start(paragraph_context, text)
                answer['answer_start'] = position
                if position == -1:
                    unresolved.append({
                        "context": paragraph_context,
                        "answer": text
                    })
    return data, unresolved

Apply the answer-start update to the dataset

In [13]:
# Load the raw SQuAD-style training file.
with open('/content/drive/MyDrive/models/qa_train.json', 'r', encoding='utf-8') as file:
    squad_data = json.load(file)

# Recompute `answer_start` for every answer; `squad_data` is updated in place and
# the answers that could not be located are returned separately.
updated_data, not_found_answers = update_answer_starts(squad_data)

# Write the updated data to a separate file (qa_train.json is not overwritten).
with open('/content/drive/MyDrive/models/updated_squad_data.json', 'w', encoding='utf-8') as file:
    json.dump(updated_data, file, ensure_ascii=False, indent=2)

# Save the unresolved answers for manual inspection.
with open('/content/drive/MyDrive/models/not_found_answers.json', 'w', encoding='utf-8') as file:
    json.dump(not_found_answers, file, ensure_ascii=False, indent=2)

4.5 Load and preprocess the dataset

In [14]:
def load_and_preprocess_squad(input_file):
    """Read a SQuAD-style JSON file and flatten it into parallel columns.

    Only the first answer of each question is used. Questions without any
    answer are printed once and skipped; questions whose first answer text is
    empty are skipped silently.

    Args:
        input_file: Path to a UTF-8 JSON file holding a list of documents, each
            with ``data -> paragraphs -> (context, qas)``; every item in
            ``qas`` has a ``question`` and an ``answers`` list whose items
            carry ``text`` and ``answer_start``.

    Returns:
        dict: ``{'context': [...], 'question': [...], 'answer': [...]}`` with
        equally long lists. Each answer is
        ``{'text': [lowercased answer text], 'start': [answer_start]}``.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contexts = []
    questions = []
    answers = []
    for data_json in data:
        for article in data_json['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question']
                    qa_answers = qa['answers']
                    answer = qa_answers[0]['text'] if qa_answers else None
                    answer_start = qa_answers[0]['answer_start'] if qa_answers else None
                    if answer is None:
                        # Report the unanswered question once, then move on.
                        print(question)
                        continue
                    if answer:
                        contexts.append(context)
                        questions.append(question)
                        answers.append({
                            "text": [answer.lower()],
                            "start": [answer_start]
                        })

    # Sanity check: the three columns must stay aligned.
    assert len(contexts) == len(questions) == len(answers)

    # Build the column-oriented dataset dict
    dataset = {
        'context': contexts,
        'question': questions,
        'answer': answers
    }

    return dataset

4.6 Split data

In [16]:
def split_data(dataset, test_size=0.1, random_state=42):
    """Split the column-oriented dataset into train and validation parts.

    The three columns are shuffled and split together so that each context,
    question and answer stay aligned.

    Args:
        dataset: Dict with equally long ``'context'``, ``'question'`` and
            ``'answer'`` lists.
        test_size: Passed to ``train_test_split`` as the validation share.
        random_state: Seed passed to ``train_test_split``; the default keeps
            the split reproducible between runs.

    Returns:
        tuple[dict, dict]: ``(train_dataset, val_dataset)``, each with the same
        three keys as ``dataset``.
    """
    columns = ('context', 'question', 'answer')

    # train_test_split returns (train, val) pairs in the order the arrays were given.
    parts = train_test_split(
        *(dataset[column] for column in columns),
        test_size=test_size,
        random_state=random_state,
    )

    train_dataset = {column: parts[2 * i] for i, column in enumerate(columns)}
    val_dataset = {column: parts[2 * i + 1] for i, column in enumerate(columns)}

    return train_dataset, val_dataset

4.7 Preprocessing function to map over the dataset

In [17]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach start/end token labels.

    Uses the notebook-level `tokenizer`. Each pair is padded to 258 tokens and,
    if too long, only the context is truncated.

    Args:
        examples: Batch dict with ``"question"`` and ``"context"`` lists of
            strings and an ``"answer"`` list whose items look like
            ``{"text": [str], "start": [int]}``, where ``start`` is a character
            offset into the matching context (-1 when the answer was not located).

    Returns:
        The tokenizer output with ``"start_positions"`` and ``"end_positions"``
        added. Examples whose answer is unknown, or not fully inside the
        (possibly truncated) context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=258,  # presumably the model's maximum sequence length; confirm
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context (sequence id 1)
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Label (0, 0) when the answer offset is unknown (negative) or the answer
        # is not fully inside the tokenized context. A partially truncated answer
        # must not be labelled, otherwise its end would land on the separator token.
        answer_missing = start_char < 0
        answer_outside = (
            offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        )
        if answer_missing or answer_outside:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last token whose start offset is <= start_char
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First token (scanning from the right) whose end offset is >= end_char
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

5. Load train, eval data

In [18]:
# Flatten the SQuAD-style file into context/question/answer columns.
# NOTE(review): this reads qa_train.json, not the updated_squad_data.json written by
# the answer-start update step, so the recomputed `answer_start` values are not
# used here. Confirm which file is intended.
dataset = load_and_preprocess_squad("/content/drive/MyDrive/models/qa_train.json")

# Hold out 10% of the examples for validation.
train_dataset, val_dataset = split_data(dataset, test_size=0.1)

In [19]:
# Report how many examples ended up in each split.
print(f"Train dataset size: {len(train_dataset['answer'])}")
print(f"Validation dataset size: {len(val_dataset['answer'])}")

Train dataset size: 2655
Validation dataset size: 295


In [20]:
# Wrap the plain column dicts as `datasets.Dataset` objects so they can be mapped.
dataset_train = Dataset.from_dict(train_dataset)
dataset_eval = Dataset.from_dict(val_dataset)

6. Tokenize the training and evaluation data

In [None]:
# Tokenize both splits in batches. The original context/question/answer columns
# are dropped, leaving only what preprocess_function returns.
tokenized_squad = dataset_train.map(preprocess_function, batched=True, remove_columns=dataset_train.column_names)
tokenized_squad_eval = dataset_eval.map(preprocess_function, batched=True, remove_columns=dataset_eval.column_names)

In [22]:
from transformers import DefaultDataCollator

7. Training args

In [None]:
# Collator used to assemble batches from the already padded, tokenized examples.
data_collator = DefaultDataCollator()

# Training configuration: checkpoints and logs go to ./phobert_law, evaluation runs
# once per epoch, and nothing is pushed to the Hub.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; verify against the installed version, since the install cell
# does not pin one.
training_args = TrainingArguments(
    output_dir="phobert_law",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=50,
    push_to_hub=False,
)

# Wire the model, the tokenized train/eval splits, the tokenizer and the collator
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad,
    eval_dataset=tokenized_squad_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
  )

8. Train model

In [None]:
# Run fine-tuning with the Trainer configured in the previous section.
trainer.train()

9. Saving model

In [None]:
# Save the fine-tuned model and the tokenizer files to the same directory so they
# can be reloaded together with `from_pretrained('/content/data')`.
# `Trainer` has no `save_pretrained` method; `save_model` is its saving entry point.
# NOTE(review): /content/data is presumably on the Colab VM's local disk rather than
# on the mounted Drive. Confirm the output should not go under /content/drive.
trainer.save_model('/content/data')
tokenizer.save_pretrained('/content/data')