In [None]:
pip install transformers datasets rouge_score

In [None]:
import datasets
import transformers
from datasets import Dataset
import json

In [None]:
from transformers import RobertaTokenizerFast

# Load the tokenizer shipped with the PhoBERT-base checkpoint.
# NOTE(review): the PhoBERT model card describes word-segmented input and AutoTokenizer -
# confirm RobertaTokenizerFast yields the intended vocabulary for this checkpoint.
tokenizer = RobertaTokenizerFast.from_pretrained("vinai/phobert-base")
# Expose the CLS / SEP tokens as the BOS / EOS tokens used by the seq2seq setup.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [2]:

def load_and_preprocess_squad(input_file):
    """Flatten a SQuAD-style JSON file into parallel context/question/answer columns.

    The file must contain a JSON list; each element has a 'data' list of articles,
    each article a 'paragraphs' list, and each paragraph a 'context' string plus a
    'qas' list. For every QA entry only the first answer's text is used; entries
    with no answers (or an empty first answer text) are skipped.

    Args:
        input_file: path of the UTF-8 encoded JSON file to read.

    Returns:
        A dict with the keys 'context', 'question' and 'answer', each mapping to a
        list; the three lists have the same length and are aligned by index.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        documents = json.load(handle)

    # One (context, question, answer) tuple per usable QA pair, in file order.
    rows = []
    for document in documents:
        for article in document['data']:
            for paragraph in article['paragraphs']:
                passage = paragraph['context']
                for qa in paragraph['qas']:
                    question_text = qa['question']
                    candidates = qa['answers']
                    if not candidates:
                        continue
                    first_answer = candidates[0]['text']
                    if not first_answer:
                        continue
                    rows.append((passage, question_text, first_answer))

    contexts = [row[0] for row in rows]
    questions = [row[1] for row in rows]
    answers = [row[2] for row in rows]

    # Check that all columns have the same length
    assert len(contexts) == len(questions) == len(answers)

    # Build the data dictionary
    return {
        'context': contexts,
        'question': questions,
        'answer': answers,
    }

In [None]:
# Read the training and validation files into dicts of parallel
# 'context' / 'question' / 'answer' lists.
# NOTE(review): the absolute /content/... paths look Colab-specific - confirm before running elsewhere.
dataset = load_and_preprocess_squad("/content/data/qa_train.json")
data_val = load_and_preprocess_squad("/content/data/eval.json")

In [None]:
# Wrap the column dicts in `datasets.Dataset` objects so they can be mapped and formatted below.
dataset_train = Dataset.from_dict(dataset)
dataset_eval = Dataset.from_dict(data_val)

In [None]:
# Show the training Dataset as the cell output (bare last expression).
dataset_train

In [None]:
# Shared hyperparameters for tokenization and training.
batch_size = 8  # change to 16 for full training
encoder_max_length = 64  # length each tokenized question is padded/truncated to
decoder_max_length = 32  # length each tokenized answer is padded/truncated to

def process_data_to_model_inputs(batch):
  """Tokenize a batch of question/answer pairs into encoder inputs and labels.

  Args:
    batch: mapping with "question" and "answer" lists of strings, as supplied by
      `Dataset.map(..., batched=True)`.

  Returns:
    The same `batch` object with "input_ids", "attention_mask" and "labels" added.
    Questions are padded/truncated to `encoder_max_length` tokens and answers to
    `decoder_max_length` tokens.

  Note:
    `decoder_input_ids` / `decoder_attention_mask` are intentionally NOT emitted.
    Per the Transformers EncoderDecoderModel documentation (v4.12 and later), the
    model builds the decoder inputs itself by shifting `labels` one position to the
    right and computes the loss without a further shift. Passing the unshifted
    answer ids as decoder inputs, as this function previously did, shows the decoder
    each target token at the very position it has to predict.
    NOTE(review): only the question is fed to the encoder; the loaded "context"
    column is unused - confirm this is intended.
  """
  # tokenize the inputs and labels
  inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["answer"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask

  # Replace PAD ids with -100 so padded positions are ignored by the loss.
  batch["labels"] = [
      [-100 if token == tokenizer.pad_token_id else token for token in labels]
      for labels in outputs.input_ids
  ]

  return batch

# Columns handed to the trainer as torch tensors.
model_input_columns = ["input_ids", "attention_mask", "labels"]

# Tokenize the training split and drop the raw text columns.
train_data = dataset_train.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset_train.column_names
)
train_data.set_format(
    type="torch", columns=model_input_columns,
)


# Tokenize the validation split the same way.
val_data = dataset_eval.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=dataset_eval.column_names
)
val_data.set_format(
    type="torch", columns=model_input_columns,
)

In [None]:
from transformers import EncoderDecoderModel

# Build an encoder-decoder model whose encoder and decoder are both initialised
# from the "vinai/phobert-base" checkpoint.
phoBert2PhoBert = EncoderDecoderModel.from_encoder_decoder_pretrained("vinai/phobert-base", "vinai/phobert-base")

In [None]:
# Set the special tokens the seq2seq model needs for training and generation.
phoBert2PhoBert.config.decoder_start_token_id = tokenizer.bos_token_id
phoBert2PhoBert.config.eos_token_id = tokenizer.eos_token_id
phoBert2PhoBert.config.pad_token_id = tokenizer.pad_token_id

# Generation (beam search) parameters.
phoBert2PhoBert.config.vocab_size = phoBert2PhoBert.config.decoder.vocab_size
phoBert2PhoBert.config.max_length = 64
# Answers are tokenized to at most `decoder_max_length` (32) tokens, so the previous
# minimum of 56 generated tokens forced every prediction to be longer than any
# reference. Do not force a minimum length.
phoBert2PhoBert.config.min_length = 0
phoBert2PhoBert.config.no_repeat_ngram_size = 3
phoBert2PhoBert.config.early_stopping = True
# NOTE(review): a length_penalty above 1 favours longer beams; revisit for short answers.
phoBert2PhoBert.config.length_penalty = 2.0
phoBert2PhoBert.config.num_beams = 4

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EncoderDecoderModel

In [None]:
# Load the ROUGE metric used by `compute_metrics` during validation.
# NOTE(review): `datasets.load_metric` is reportedly deprecated in favour of the separate
# `evaluate` package - confirm it exists in the installed `datasets` version.
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision, recall and F-measure for one evaluation pass.

    Args:
        pred: object exposing `predictions` (generated token ids) and `label_ids`
            (reference token ids in which -100 marks ignored positions).

    Returns:
        A dict with "rouge2_precision", "rouge2_recall" and "rouge2_fmeasure",
        each taken from the metric's `mid` score and rounded to 4 decimals.

    Note:
        `pred.label_ids` is modified in place: every -100 is replaced by the
        tokenizer's pad token id before decoding.
    """
    references = pred.label_ids

    # Decode the predictions, dropping special tokens.
    decoded_predictions = tokenizer.batch_decode(pred.predictions, skip_special_tokens=True)

    # -100 is not a real token id; swap it for PAD so the references can be decoded.
    references[references == -100] = tokenizer.pad_token_id
    decoded_references = tokenizer.batch_decode(references, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=decoded_predictions,
        references=decoded_references,
        rouge_types=["rouge2"],
    )
    rouge2_mid = scores["rouge2"].mid

    metrics = {}
    metrics["rouge2_precision"] = round(rouge2_mid.precision, 4)
    metrics["rouge2_recall"] = round(rouge2_mid.recall, 4)
    metrics["rouge2_fmeasure"] = round(rouge2_mid.fmeasure, 4)
    return metrics

In [None]:
# Set training arguments - these params are not really tuned, feel free to change.
# Checkpoints go to ./baseline (existing contents may be overwritten), the train and
# eval batch sizes both reuse `batch_size`, and training runs for 80 epochs.
# NOTE(review): `evaluation_strategy` is reportedly renamed `eval_strategy` in newer
# transformers releases - confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./baseline",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=80,
    predict_with_generate=True,
    overwrite_output_dir=True,
)

# Instantiate the trainer with the PhoBERT encoder-decoder model, the tokenized
# train/validation sets and the ROUGE-2 metric callback, then start training.
trainer = Seq2SeqTrainer(
    model=phoBert2PhoBert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()