In [24]:
import datasets
import transformers
from datasets import Dataset
import json

In [2]:
from transformers import RobertaTokenizerFast

# Use model 
tokenizer = RobertaTokenizerFast.from_pretrained("vinai/phobert-base")
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token


def tokenize_function(examples):
    return tokenizer(examples["context"], padding="max_length", truncation=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'PhobertTokenizer'. 
The class this function is called from is 'RobertaTokenizerFast'.


In [3]:
# Function read file train
def read_data(filePath):
    f = open(filePath,encoding='utf-8')
    fileRead = json.load(f)
    context = []
    question = []
    answer = []
    for object in fileRead:
        for qa in object["qas"]:
            if(qa["question"] is None):
               print(object["context"].lower())
            context.append(object["context"].lower())
            question.append(qa["question"].lower())
            answer.append(qa["answer"]["text"][0].lower())
    dataset = {
    "context": context,
    "question": question,
    "answer": answer
    }
    return dataset

In [4]:
dataset = read_data("./QA_data/qa_train.json")
eval_dataset = read_data("./QA_data/qa_eval.json")

In [5]:
dataset_train = Dataset.from_dict(dataset)
dataset_eval = Dataset.from_dict(eval_dataset)

In [6]:
dataset_train

Dataset({
    features: ['context', 'question', 'answer'],
    num_rows: 1001
})

In [7]:
batch_size=8  # change to 16 for full training
encoder_max_length=64
decoder_max_length=32

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["answer"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

train_data = dataset_train.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns= dataset_train.column_names
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)


val_data = dataset_eval.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=dataset_eval.column_names
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

                                                                

In [8]:
from transformers import EncoderDecoderModel

phoBert2PhoBert = EncoderDecoderModel.from_encoder_decoder_pretrained("vinai/phobert-base", "vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForCausalLM were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['roberta.encoder.layer.7.crossattention.self.query.weight', 'roberta.encoder.layer.10.crossattention.self.query.weight', 'roberta.encod

In [9]:
# set special tokens
phoBert2PhoBert.config.decoder_start_token_id = tokenizer.bos_token_id
phoBert2PhoBert.config.eos_token_id = tokenizer.eos_token_id
phoBert2PhoBert.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
phoBert2PhoBert.config.vocab_size = phoBert2PhoBert.config.decoder.vocab_size
phoBert2PhoBert.config.max_length = 64
phoBert2PhoBert.config.min_length = 56
phoBert2PhoBert.config.no_repeat_ngram_size = 3
phoBert2PhoBert.config.early_stopping = True
phoBert2PhoBert.config.length_penalty = 2.0
phoBert2PhoBert.config.num_beams = 4

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EncoderDecoderModel

In [11]:
# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

  


In [12]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./baseline",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=80,
    predict_with_generate=True,
    overwrite_output_dir=True,
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=phoBert2PhoBert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

  0%|          | 0/10080 [00:00<?, ?it/s]You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  "You have modified the pretrained model configuration to control generation. This is a"


{'loss': 0.4626, 'learning_rate': 4.751984126984127e-05, 'epoch': 3.97}


                                                        
  5%|▍         | 500/10080 [1:21:22<20:22:37,  7.66s/it]

{'eval_loss': 23.7374267578125, 'eval_rouge2_precision': 0.0084, 'eval_rouge2_recall': 0.0221, 'eval_rouge2_fmeasure': 0.0117, 'eval_runtime': 459.8674, 'eval_samples_per_second': 0.402, 'eval_steps_per_second': 0.052, 'epoch': 3.97}


 10%|▉         | 1000/10080 [2:29:24<20:49:03,  8.25s/it] 

{'loss': 0.0196, 'learning_rate': 4.503968253968254e-05, 'epoch': 7.94}


                                                         
 10%|▉         | 1000/10080 [2:37:24<20:49:03,  8.25s/it]

{'eval_loss': 23.230838775634766, 'eval_rouge2_precision': 0.0155, 'eval_rouge2_recall': 0.0425, 'eval_rouge2_fmeasure': 0.0214, 'eval_runtime': 479.8254, 'eval_samples_per_second': 0.386, 'eval_steps_per_second': 0.05, 'epoch': 7.94}


 15%|█▍        | 1500/10080 [3:46:53<18:42:25,  7.85s/it]  

{'loss': 0.0401, 'learning_rate': 4.255952380952381e-05, 'epoch': 11.9}


                                                         
 15%|█▍        | 1500/10080 [3:55:02<18:42:25,  7.85s/it]

{'eval_loss': 23.966543197631836, 'eval_rouge2_precision': 0.0084, 'eval_rouge2_recall': 0.0247, 'eval_rouge2_fmeasure': 0.012, 'eval_runtime': 488.0567, 'eval_samples_per_second': 0.379, 'eval_steps_per_second': 0.049, 'epoch': 11.9}


 20%|█▉        | 2000/10080 [5:05:28<18:05:18,  8.06s/it]  

{'loss': 0.0047, 'learning_rate': 4.007936507936508e-05, 'epoch': 15.87}


                                                         
 20%|█▉        | 2000/10080 [5:13:12<18:05:18,  8.06s/it]

{'eval_loss': 25.438758850097656, 'eval_rouge2_precision': 0.0041, 'eval_rouge2_recall': 0.0192, 'eval_rouge2_fmeasure': 0.0064, 'eval_runtime': 463.3636, 'eval_samples_per_second': 0.399, 'eval_steps_per_second': 0.052, 'epoch': 15.87}


 25%|██▍       | 2500/10080 [6:19:52<16:56:41,  8.05s/it]  

{'loss': 0.0003, 'learning_rate': 3.759920634920635e-05, 'epoch': 19.84}


                                                         
 25%|██▍       | 2500/10080 [6:27:32<16:56:41,  8.05s/it]

{'eval_loss': 26.10679817199707, 'eval_rouge2_precision': 0.0053, 'eval_rouge2_recall': 0.0125, 'eval_rouge2_fmeasure': 0.0073, 'eval_runtime': 459.8962, 'eval_samples_per_second': 0.402, 'eval_steps_per_second': 0.052, 'epoch': 19.84}


 30%|██▉       | 3000/10080 [7:35:40<16:09:33,  8.22s/it]  

{'loss': 0.0003, 'learning_rate': 3.511904761904762e-05, 'epoch': 23.81}


                                                         
 30%|██▉       | 3000/10080 [7:43:34<16:09:33,  8.22s/it]

{'eval_loss': 25.789236068725586, 'eval_rouge2_precision': 0.0181, 'eval_rouge2_recall': 0.0367, 'eval_rouge2_fmeasure': 0.0233, 'eval_runtime': 473.5094, 'eval_samples_per_second': 0.391, 'eval_steps_per_second': 0.051, 'epoch': 23.81}


 35%|███▍      | 3500/10080 [8:51:21<14:52:41,  8.14s/it]  

{'loss': 0.0001, 'learning_rate': 3.263888888888889e-05, 'epoch': 27.78}


                                                         
 35%|███▍      | 3500/10080 [8:59:16<14:52:41,  8.14s/it]

{'eval_loss': 26.67833137512207, 'eval_rouge2_precision': 0.0259, 'eval_rouge2_recall': 0.0998, 'eval_rouge2_fmeasure': 0.0394, 'eval_runtime': 474.5596, 'eval_samples_per_second': 0.39, 'eval_steps_per_second': 0.051, 'epoch': 27.78}


 40%|███▉      | 4000/10080 [10:06:49<13:55:44,  8.25s/it] 

{'loss': 0.0016, 'learning_rate': 3.0158730158730158e-05, 'epoch': 31.75}



 40%|███▉      | 4000/10080 [10:14:53<13:55:44,  8.25s/it]

{'eval_loss': 25.463247299194336, 'eval_rouge2_precision': 0.0173, 'eval_rouge2_recall': 0.0498, 'eval_rouge2_fmeasure': 0.025, 'eval_runtime': 484.1404, 'eval_samples_per_second': 0.382, 'eval_steps_per_second': 0.05, 'epoch': 31.75}


 45%|████▍     | 4500/10080 [11:24:11<12:57:56,  8.36s/it]  

{'loss': 0.0034, 'learning_rate': 2.767857142857143e-05, 'epoch': 35.71}


                                                          
 45%|████▍     | 4500/10080 [11:32:22<12:57:56,  8.36s/it]

{'eval_loss': 24.88330841064453, 'eval_rouge2_precision': 0.0016, 'eval_rouge2_recall': 0.0056, 'eval_rouge2_fmeasure': 0.0024, 'eval_runtime': 491.5398, 'eval_samples_per_second': 0.376, 'eval_steps_per_second': 0.049, 'epoch': 35.71}


 50%|████▉     | 5000/10080 [12:41:19<11:46:59,  8.35s/it]  

{'loss': 0.0, 'learning_rate': 2.5198412698412697e-05, 'epoch': 39.68}



 50%|████▉     | 5000/10080 [12:49:18<11:46:59,  8.35s/it]

{'eval_loss': 27.238750457763672, 'eval_rouge2_precision': 0.0009, 'eval_rouge2_recall': 0.0031, 'eval_rouge2_fmeasure': 0.0014, 'eval_runtime': 478.6436, 'eval_samples_per_second': 0.387, 'eval_steps_per_second': 0.05, 'epoch': 39.68}


 55%|█████▍    | 5500/10080 [13:57:30<9:47:32,  7.70s/it]   

{'loss': 0.0, 'learning_rate': 2.271825396825397e-05, 'epoch': 43.65}


                                                         
 55%|█████▍    | 5500/10080 [14:05:00<9:47:32,  7.70s/it]

{'eval_loss': 27.46596336364746, 'eval_rouge2_precision': 0.0043, 'eval_rouge2_recall': 0.016, 'eval_rouge2_fmeasure': 0.0066, 'eval_runtime': 450.5833, 'eval_samples_per_second': 0.411, 'eval_steps_per_second': 0.053, 'epoch': 43.65}


 60%|█████▉    | 6000/10080 [15:18:29<9:54:37,  8.74s/it]   

{'loss': 0.0007, 'learning_rate': 2.023809523809524e-05, 'epoch': 47.62}



 60%|█████▉    | 6000/10080 [15:26:32<9:54:37,  8.74s/it]

{'eval_loss': 26.749732971191406, 'eval_rouge2_precision': 0.0024, 'eval_rouge2_recall': 0.0039, 'eval_rouge2_fmeasure': 0.0029, 'eval_runtime': 482.9817, 'eval_samples_per_second': 0.383, 'eval_steps_per_second': 0.05, 'epoch': 47.62}


 64%|██████▍   | 6500/10080 [16:35:51<8:45:00,  8.80s/it]   

{'loss': 0.0001, 'learning_rate': 1.775793650793651e-05, 'epoch': 51.59}


                                                         
 64%|██████▍   | 6500/10080 [16:44:44<8:45:00,  8.80s/it]

{'eval_loss': 26.940311431884766, 'eval_rouge2_precision': 0.0063, 'eval_rouge2_recall': 0.019, 'eval_rouge2_fmeasure': 0.0088, 'eval_runtime': 533.0976, 'eval_samples_per_second': 0.347, 'eval_steps_per_second': 0.045, 'epoch': 51.59}


 69%|██████▉   | 7000/10080 [17:53:21<6:54:58,  8.08s/it]   

{'loss': 0.0048, 'learning_rate': 1.527777777777778e-05, 'epoch': 55.56}



 69%|██████▉   | 7000/10080 [18:01:14<6:54:58,  8.08s/it]

{'eval_loss': 26.46912384033203, 'eval_rouge2_precision': 0.0089, 'eval_rouge2_recall': 0.0371, 'eval_rouge2_fmeasure': 0.0139, 'eval_runtime': 472.5046, 'eval_samples_per_second': 0.392, 'eval_steps_per_second': 0.051, 'epoch': 55.56}


 74%|███████▍  | 7500/10080 [19:09:20<5:51:49,  8.18s/it]   

{'loss': 0.0, 'learning_rate': 1.2797619047619047e-05, 'epoch': 59.52}



 74%|███████▍  | 7500/10080 [19:17:07<5:51:49,  8.18s/it]

{'eval_loss': 27.472139358520508, 'eval_rouge2_precision': 0.0019, 'eval_rouge2_recall': 0.0096, 'eval_rouge2_fmeasure': 0.003, 'eval_runtime': 467.264, 'eval_samples_per_second': 0.396, 'eval_steps_per_second': 0.051, 'epoch': 59.52}


 79%|███████▉  | 8000/10080 [20:25:10<4:44:32,  8.21s/it]   

{'loss': 0.0, 'learning_rate': 1.0317460317460318e-05, 'epoch': 63.49}



 79%|███████▉  | 8000/10080 [20:33:05<4:44:32,  8.21s/it]

{'eval_loss': 27.67497444152832, 'eval_rouge2_precision': 0.002, 'eval_rouge2_recall': 0.0103, 'eval_rouge2_fmeasure': 0.0032, 'eval_runtime': 474.3036, 'eval_samples_per_second': 0.39, 'eval_steps_per_second': 0.051, 'epoch': 63.49}


 84%|████████▍ | 8500/10080 [21:40:59<3:36:59,  8.24s/it]  

{'loss': 0.0, 'learning_rate': 7.837301587301588e-06, 'epoch': 67.46}



 84%|████████▍ | 8500/10080 [21:48:45<3:36:59,  8.24s/it]

{'eval_loss': 27.80921173095703, 'eval_rouge2_precision': 0.002, 'eval_rouge2_recall': 0.0103, 'eval_rouge2_fmeasure': 0.0032, 'eval_runtime': 466.4708, 'eval_samples_per_second': 0.397, 'eval_steps_per_second': 0.051, 'epoch': 67.46}


 89%|████████▉ | 9000/10080 [22:56:24<2:27:13,  8.18s/it]  

{'loss': 0.0001, 'learning_rate': 5.357142857142857e-06, 'epoch': 71.43}



 89%|████████▉ | 9000/10080 [23:04:15<2:27:13,  8.18s/it]

{'eval_loss': 27.635251998901367, 'eval_rouge2_precision': 0.0023, 'eval_rouge2_recall': 0.0108, 'eval_rouge2_fmeasure': 0.0036, 'eval_runtime': 470.5177, 'eval_samples_per_second': 0.393, 'eval_steps_per_second': 0.051, 'epoch': 71.43}


 94%|█████████▍| 9500/10080 [24:11:57<1:18:29,  8.12s/it]  

{'loss': 0.0, 'learning_rate': 2.876984126984127e-06, 'epoch': 75.4}


                                                         
 94%|█████████▍| 9500/10080 [24:19:48<1:18:29,  8.12s/it]

{'eval_loss': 27.716400146484375, 'eval_rouge2_precision': 0.0142, 'eval_rouge2_recall': 0.0596, 'eval_rouge2_fmeasure': 0.0219, 'eval_runtime': 471.74, 'eval_samples_per_second': 0.392, 'eval_steps_per_second': 0.051, 'epoch': 75.4}


 99%|█████████▉| 10000/10080 [25:27:39<10:53,  8.17s/it]   

{'loss': 0.0, 'learning_rate': 3.9682539682539683e-07, 'epoch': 79.37}



 99%|█████████▉| 10000/10080 [25:35:31<10:53,  8.17s/it]

{'eval_loss': 27.784379959106445, 'eval_rouge2_precision': 0.0043, 'eval_rouge2_recall': 0.0164, 'eval_rouge2_fmeasure': 0.0066, 'eval_runtime': 472.2021, 'eval_samples_per_second': 0.392, 'eval_steps_per_second': 0.051, 'epoch': 79.37}


100%|██████████| 10080/10080 [25:46:33<00:00,  9.21s/it]   

{'train_runtime': 92792.9874, 'train_samples_per_second': 0.863, 'train_steps_per_second': 0.109, 'train_loss': 0.026708488502735567, 'epoch': 80.0}





TrainOutput(global_step=10080, training_loss=0.026708488502735567, metrics={'train_runtime': 92792.9874, 'train_samples_per_second': 0.863, 'train_steps_per_second': 0.109, 'train_loss': 0.026708488502735567, 'epoch': 80.0})

In [14]:
trainer.save_model()

In [25]:
# Function read file test
def read_data_test(filePath):
    f = open(filePath,encoding='utf-8')
    fileRead = json.load(f)
    question = []
    answer = []
    for object in fileRead:
        question.append(object["question"].lower())
        answer.append(object["answer"].lower())
    dataset = {
    "question": question,
    "answer": answer
    }
    return dataset

In [26]:
model = EncoderDecoderModel.from_pretrained("C:\\Users\\ADMIN\\Desktop\\dataset\\IR_QA\\baseline\\checkpoint-10000")
test_data = read_data_test("./QA_data/qa_test.json")
test_data = Dataset.from_dict(test_data)
batch_size = 16  # change to 64 for full evaluation

# map data correctly
def generate_summary(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["question"], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch

results = test_data.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["question"])

pred_str = results["pred"]
label_str = results["answer"]


                                                           

In [27]:
rouge = datasets.load_metric("rouge")
rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

Score(precision=0.004604832669527387, recall=0.013055369533849028, fmeasure=0.006500846777531212)


In [28]:
from nltk.translate.bleu_score import sentence_bleu
#function to get score of results
def getScore(ref, candi):
    score = sentence_bleu(ref, candi)
    return float('{:.4f}'.format(score))

In [29]:
reference= []
for i in test_data["answer"]:
    reference.append(i.split())

In [30]:
scores = []
for i in pred_str:
    scores.append(getScore(reference,i.split()))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
