# QA Labor law training file

## Preprocessing data functions

* Lib for preprocessing data

In [8]:
import json
import re

1. Load data

In [9]:
def load_all_data(input_file: str) -> dict:
    """
    Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        input_file (str): Path of the JSON file to read.

    Returns:
        dict: The object decoded from the file. The value is whatever the
            JSON document holds at its top level; no validation of its
            shape is performed here.
    """
    with open(input_file, mode='r', encoding='utf-8') as handle:
        raw_json = handle.read()
    return json.loads(raw_json)

2. Clean text

In [10]:
def clean_text(text):
    """
    Normalize a piece of text for matching.

    The steps run in this order: newlines become spaces, bracketed numeric
    markers such as "[12]" are dropped, every character that is neither a
    word character nor whitespace is removed, and the result is lowercased.

    Args:
        text (str): The text to be cleaned.

    Returns:
        str: The cleaned, lowercased text.
    """
    # (pattern, replacement) pairs; order matters because the bracketed
    # markers must be removed before the generic punctuation strip would
    # leave their digits behind.
    substitutions = (
        (r'\n', ' '),
        (r'\[\d+\]', ''),
        (r'[^\w\s]', ''),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.lower()

3. Preprocess data

In [12]:
def preprocess_data(data):
    """
    Flatten SQuAD-style records into parallel lists, one entry per answer.

    Questions flagged with a truthy 'is_impossible' are skipped; a missing
    flag is treated as False.

    Args:
      data (iterable): Items that each hold a 'data' list of articles, where
        every article has 'paragraphs' and every paragraph has a 'context'
        and a list of 'qas'.

    Returns:
      tuple: Five lists of equal length, in this order:
        - contexts (list): Cleaned context for each answer.
        - questions (list): Cleaned question for each answer.
        - answers (list): Dicts with the cleaned answer 'text' and its
          'start'. The start offset is copied as-is from 'answer_start' and
          is not adjusted for characters removed by cleaning.
        - context_raws (list): The untouched context for each answer.
        - question_raws (list): The untouched question for each answer.
    """
    contexts, questions, answers = [], [], []
    context_raws, question_raws = [], []

    for dataset in data:
        for article in dataset['data']:
            for paragraph in article['paragraphs']:
                raw_context = paragraph['context']
                cleaned_context = clean_text(raw_context)
                for qa in paragraph['qas']:
                    raw_question = qa['question']
                    cleaned_question = clean_text(raw_question)
                    # Many corpus entries omit 'is_impossible', so a missing
                    # flag defaults to False (i.e. the question is answerable).
                    if qa.get('is_impossible', False):
                        continue
                    for answer in qa['answers']:
                        entry = {
                            'text': clean_text(answer['text']),
                            'start': answer['answer_start'],
                        }
                        contexts.append(cleaned_context)
                        context_raws.append(raw_context)
                        questions.append(cleaned_question)
                        question_raws.append(raw_question)
                        answers.append(entry)

    return contexts, questions, answers, context_raws, question_raws

4. Tokenize input

In [13]:
def tokenize_texts(texts):
    """
    Split every text into tokens on runs of whitespace.

    Args:
      texts (list): Texts to tokenize.

    Returns:
      list: One token list per input text, in the same order. A text that is
        empty or whitespace-only yields an empty token list.
    """
    tokenized = []
    for entry in texts:
        tokenized.append(entry.split())
    return tokenized

* Running preprocessing data

In [14]:
# Path of the JSON training file consumed by the preprocessing helpers.
data_path = '/content/data/qa_train.json'
squad_data = load_all_data(data_path)
# preprocess_data returns five parallel lists: cleaned contexts, cleaned
# questions, answers, raw contexts and raw questions.
contexts, questions, answers, contextRaws, questionRaws = preprocess_data(squad_data)
# Whitespace-tokenized versions of the cleaned texts; tokenized_questions is
# the corpus later handed to Word2Vec and BM25.
tokenized_contexts = tokenize_texts(contexts)
tokenized_questions = tokenize_texts(questions)

In [None]:
# Testing data
# tokenized_questions[:100]

## Word2vec model

1. Install lib for model word2vec

In [None]:
!pip install gensim

2. Lib import using word2vec

In [17]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

3. Configure the Word2Vec model, build its vocabulary, and train it

In [19]:
# Word2Vec over the tokenized questions: 100-dimensional vectors, a context
# window of 10, and min_count=1 so that no token is dropped for being rare.
# sg=1 selects the skip-gram training algorithm (per gensim's Word2Vec
# parameters); workers=4 sets the number of training threads.
model_word2vec_question = Word2Vec(vector_size=100, window=10, min_count=1, sg=1, workers=4)
# The model is created without a corpus, so the vocabulary has to be built
# explicitly before training.
model_word2vec_question.build_vocab(tokenized_questions)
# corpus_count is populated by build_vocab above.
model_word2vec_question.train(tokenized_questions, total_examples=model_word2vec_question.corpus_count, epochs=200)
# Persist the trained model so it can be reloaded without retraining.
model_word2vec_question.save("/content/model_word2vec_question.model")

## Algorithm BM25 (model BM25)

1. Install lib for model BM25

In [None]:
!pip install rank_bm25

2. Import lib using BM25

In [21]:
from rank_bm25 import BM25Okapi

3. Create model BM25

In [22]:
# Build a BM25 (Okapi) index over the tokenized questions; each question's
# token list is one document of the corpus.
bm25Questions = BM25Okapi(tokenized_questions)

## phoBERT

1. Install lib for model phoBERT

In [None]:
!pip install datasets rouge_score
!pip install accelerate -U

2. Import libraries for training PhoBERT

In [38]:
import datasets
import transformers
from datasets import Dataset
import json
import torch
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering, DefaultDataCollator, TrainingArguments, Trainer, AutoModelForQuestionAnswering
from transformers import DefaultDataCollator

3. Choose the pretrained model

In [None]:
# Load the "vinai/phobert-base" checkpoint into a RoBERTa question-answering
# model; this is the model that gets fine-tuned below.
model = RobertaForQuestionAnswering.from_pretrained("vinai/phobert-base")
# A fast tokenizer is required because preprocess_function asks for
# return_offsets_mapping=True.
# NOTE(review): PhoBERT is published with its own tokenizer class — confirm
# that RobertaTokenizerFast loads this checkpoint's vocabulary correctly.
tokenizer = RobertaTokenizerFast.from_pretrained("vinai/phobert-base")

4. Helper functions for preparing the dataset

4.1. Load preprocess data set

In [29]:
def load_and_preprocess_squad(input_file):
    """
    Load a SQuAD-style JSON file into column lists for `Dataset.from_dict`.

    Only the first answer of each question is used. Questions without any
    answer are printed once (so they can be inspected) and skipped; questions
    whose first answer text is empty are skipped silently.

    Args:
        input_file (str): Path to a UTF-8 JSON file holding a list of items,
            each with a 'data' list of articles made of 'paragraphs', which
            in turn hold a 'context' and a list of 'qas'.

    Returns:
        dict: Three parallel lists under the keys 'context', 'question' and
            'answer'. Each answer is {"text": [lowercased answer text],
            "start": [answer_start]}.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    contexts = []
    questions = []
    answers = []
    for data_json in data:
        for article in data_json['data']:
            for paragraph in article['paragraphs']:
                context = paragraph['context']
                for qa in paragraph['qas']:
                    question = qa['question']
                    first_answer = qa['answers'][0] if qa['answers'] else None
                    answer = first_answer['text'] if first_answer is not None else None
                    answer_start = (
                        first_answer['answer_start'] if first_answer is not None else None
                    )
                    if answer is None:
                        # Report the unanswered question exactly once.
                        print(question)
                        continue
                    if answer:
                        # The three lists are always appended together, so
                        # they stay the same length by construction.
                        contexts.append(context)
                        questions.append(question)
                        answers.append({
                            "text": [answer.lower()],
                            "start": [answer_start],
                        })

    # Column-oriented layout expected by Dataset.from_dict.
    return {
        'context': contexts,
        'question': questions,
        'answer': answers,
    }

4.2 Preprocess function

In [30]:
def preprocess_function(examples):
    """
    Tokenize question/context pairs and attach answer-span token labels.

    Written for `Dataset.map(..., batched=True)`: `examples` is a batch with
    parallel "question", "context" and "answer" columns, where every answer
    is a dict holding single-element "text" and "start" lists. The
    module-level `tokenizer` is used.

    Each pair is padded or truncated to 128 tokens and only the context (the
    second sequence) may be truncated, so an answer can end up partly or
    wholly outside the tokens that are kept.

    Args:
        examples (dict): Batch of columns as described above.

    Returns:
        The tokenizer output without "offset_mapping" and with
        "start_positions" / "end_positions" added. Examples whose answer is
        not fully contained in the kept context are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=128,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Offsets are only needed to compute the labels, not by the model.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1). The bounds check keeps the scan from running off
        # the end when the context tokens reach the last position.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The comparison must be "context starts after the answer start" or
        # "context ends before the answer end"; testing only for an answer
        # that lies completely outside would let an answer cut by truncation
        # through and give it an end label one past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


5. Load train, eval data

In [33]:
# Column dicts ('context', 'question', 'answer') for the training and
# evaluation files.
dataset = load_and_preprocess_squad("/content/data/qa_train.json")
data_val = load_and_preprocess_squad("/content/data/eval.json")

# Wrap the column dicts as `datasets.Dataset` objects so they can be mapped
# through preprocess_function.
dataset_train = Dataset.from_dict(dataset)
dataset_eval = Dataset.from_dict(data_val)

In [None]:
# print dataset train
print(dataset_train)
# print dataset eval
print(dataset_eval)

6. Tokenize data training, eval

In [None]:
# Tokenize in batches and drop the original text columns, so only the
# features produced by preprocess_function remain.
tokenized_squad = dataset_train.map(preprocess_function, batched=True, remove_columns=dataset_train.column_names)
tokenized_squad_eval = dataset_eval.map(preprocess_function, batched=True, remove_columns=dataset_eval.column_names)

In [None]:
from transformers import DefaultDataCollator

7. Training args

In [None]:
# preprocess_function already pads every example to max_length, so the
# default collator is used as-is.
data_collator = DefaultDataCollator()

# Fine-tuning configuration: evaluate once per epoch, learning rate 2e-5,
# batch size 16 for both training and evaluation, 150 epochs, and no upload
# to the Hugging Face Hub.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="phobert_law",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=150,
    push_to_hub=False,
)

# Wire the model, the tokenized train/eval splits, the tokenizer and the
# collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad,
    eval_dataset=tokenized_squad_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

8. Train model

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

9. Saving model

In [None]:
# Persist the fine-tuned model. Trainer exposes save_model(); it has no
# save_pretrained() method, so calling that raised AttributeError.
trainer.save_model('/content/data')
# Save the tokenizer files next to the model so both can be reloaded from the
# same directory.
tokenizer.save_pretrained('/content/data')