In [None]:
!pip install underthesea
!pip install evaluate 
!pip install rouge_score
!pip install sentence_transformers

In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer, MBartForConditionalGeneration, AutoConfig, TrainingArguments, Trainer
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split as tts
import pandas as pd
import os
import json
from datasets import load_dataset
import torch.nn as nn
from copy import deepcopy


# Initialize APP

In [2]:
model_path = "vinai/bartpho-word"
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
class SoftPrompt(nn.Module):
    def __init__(self, encoder, plm_embed: nn.Embedding, n_prompt: int = 1000,
                 embed_size:int=1024):
        super().__init__()
        self.plm_embed = plm_embed
        #self.encoder = encoder
        self.n_prompt = n_prompt
        self.list_prompts = [nn.parameter.Parameter(torch.randn(embed_size, dtype=torch.float)) for i in range(n_prompt)]
        #for prompt in self.list_prompts:
        #    nn.init.kaiming_uniform_(prompt)
        self.list_prompts = nn.ParameterList(self.list_prompts)
        self.attent = nn.MultiheadAttention(embed_dim=embed_size, num_heads=32, batch_first=True)
        
    def inject(self, tokens):
        attention_mask = (tokens != 1).float()
        ori = self.plm_embed(tokens)
        features = ori #self.encoder(tokens, attention_mask=attention_mask)[0]
        list_prompts = torch.cat([i.unsqueeze(0) for i in self.list_prompts]).unsqueeze(0).repeat(tokens.size(0), 1, 1)
        features, _ = self.attent(features, list_prompts, list_prompts)
        return features + ori
        
    def forward(self, tokens):
        return self.inject(tokens)
        

In [4]:
soft = SoftPrompt(deepcopy(model.get_encoder()), model.get_encoder().embed_tokens).to("cuda")
soft(torch.tensor([[4,5,6]], device="cuda"))

tensor([[[ 0.0766,  0.0526, -0.0167,  ..., -0.0088, -0.0227, -0.0014],
         [ 0.0542,  0.0680,  0.0145,  ..., -0.0073, -0.0226,  0.0119],
         [ 0.0629,  0.0345,  0.0362,  ..., -0.0631, -0.0319, -0.0133]]],
       device='cuda:0', grad_fn=<AddBackward0>)

In [5]:
model.set_input_embeddings(SoftPrompt(deepcopy(model.get_encoder()), model.get_encoder().embed_tokens))

# Load dataset

In [6]:
dataset = load_dataset(path="OpenHust/vietnamese-summarization", data_files="herding_bio_medicine.csv")

Found cached dataset csv (/home/jupyter/.cache/huggingface/datasets/OpenHust___csv/OpenHust--vietnamese-summarization-0917cc8d0d28c72d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
small= dataset["train"]

train, test = small.train_test_split(train_size=0.8, seed=0).values()
train, dev = train.train_test_split(test_size=0.125, seed=0).values()

In [8]:
train["Document"][0]

'Cho bác sĩ biết bạn đang nghi ngờ bản thân bị viêm dạ dày và yêu cầu tập trung kiểm tra vùng bụng. Mang theo danh sách ghi rõ triệu chứng bạn đang gặp phải cho bác sĩ xem. Bác sĩ sẽ tìm ra “triệu chứng đáng báo động” cho thấy bạn cần được chăm sóc khẩn cấp. Triệu chứng báo động mà bạn cần cho bác sĩ biết gồm có:  Nôn ra máu hoặc mật Phân có màu đen như hắc ín (đại tiện máu đen) Chán ăn, biếng ăn hoặc sụt cân (nhiều hơn 3 kg) Thiếu máu (dấu hiệu da tái, mệt mỏi, ốm yếu hoặc chóng mặt) Cảm giác chướng bụng Cho bác sĩ biết nếu bạn trên 55 tuổi. Mẫu máu sẽ được bác sĩ đưa đến phòng thí nghiệm để phân tích. Tại phòng thí nghiệm, chuyên viên sẽ tiến hành các xét nghiệm sau:  Xét nghiệm máu toàn bộ (CBC) để kiểm tra bệnh thiếu máu Xét nghiệm Amylase và Lipase để sàng lọc bệnh tuyến tụy Xét nghiệm chức năng gan và chức năng thận để đánh giá tình trạng mất nước và các nguyên nhân khác gây ra triệu chứng nếu bạn nôn mửa Xét nghiệm Guaiac phân để tìm máu ẩn (không nhìn thấy trong phân)  Xét nghi

In [9]:
dev

Dataset({
    features: ['Unnamed: 0', 'Document', 'Summary', 'Dataset'],
    num_rows: 1066
})

In [10]:
from underthesea import pos_tag

def add_prompt(text):
    words = pos_tag(text)
    txt = ""
    for word in words:
        if word[1] in ["N", "V", "A"]:
            txt += word[0] + " {} ".format(word[1])
       
        else:
            txt += word[0] + " "
    return txt

def split_source_target(db):
    docs = [add_prompt(i) for i in db["Document"]]
    sums = [i for i in db["Summary"]]
    return docs, sums

In [11]:
def encode_dataset(db, max_length, batch_size):
    tokenized = {"input_ids":[], "attention_mask":[]}
    for i in tqdm(range(0, len(db), batch_size)):
        encoded = tokenizer(db[i:i+batch_size], max_length=max_length, padding="max_length", truncation=True)
        tokenized["input_ids"] += encoded["input_ids"]
        tokenized["attention_mask"] += encoded["attention_mask"]
    return tokenized

In [12]:
def tokenize_sample_data(data):
    # Max token size is 14536 and 215 for inputs and labels, respectively.
    # Here I restrict these token size.
    docs = data["Document"]
    sums = data["Summary"]
    input_feature = tokenizer(docs, truncation=True, max_length=1024)
    label = tokenizer(sums, truncation=True, max_length=1024)
    return {
    "input_ids": input_feature["input_ids"],
    "attention_mask": input_feature["attention_mask"],
    "labels": label["input_ids"],
    }

train = train.map(
  tokenize_sample_data,
  remove_columns=["Summary", "Document", "Dataset"],
  batched=True,
  batch_size=128)

test = test.map(
  tokenize_sample_data,
  remove_columns=["Summary", "Document", "Dataset"],
  batched=True,
  batch_size=128)

dev = dev.map(
  tokenize_sample_data,
  remove_columns=["Summary", "Document", "Dataset"],
  batched=True,
  batch_size=128)

Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/OpenHust___csv/OpenHust--vietnamese-summarization-0917cc8d0d28c72d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-c94bd23634033845.arrow
Loading cached processed dataset at /home/jupyter/.cache/huggingface/datasets/OpenHust___csv/OpenHust--vietnamese-summarization-0917cc8d0d28c72d/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-fd8133ca44802265.arrow


Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [13]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(
  tokenizer,
  model=model,
  return_tensors="pt")

# Trainer

In [20]:
import nltk
import numpy as np
import evaluate
from datasets import load_metric

metric = evaluate.load("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    #predictions = predictions[:, :-1]
    #import pdb
    #pdb.set_trace()
    predictions[predictions == -100] = 1
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    #labels = labels[labels !=-100]
    labels = np.where(labels != -100, labels, tokenizer.eos_token_id)
    
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True,)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

def compute_bleu(eval_preds):
    
    y_true = eval_preds.label_ids
    y_pred = eval_preds.predictions
    y_true = np.where(y_true != -100, y_true, tokenizer.pad_token_id)
    y_pred = np.where(y_pred != -100, y_true, tokenizer.pad_token_id)
    #import pdb
    #pdb.set_trace()
    metric = load_metric('bleu')
    #import pdb
    #pdb.set_trace()
    try:
        y_true = tokenizer.batch_decode(y_true, skip_special_tokens=True)
        y_pred = tokenizer.batch_decode(y_pred, skip_special_tokens=True)
        y_true = [[i.split()] for i in y_true]
        y_pred= [i.split() for i in y_pred]
        report = metric.compute(predictions=y_pred, references=y_true)
    except Exception as e:
        print(e)
        import pdb
        pdb.set_trace()
    bleu = report['bleu'] * 100
    print(bleu)
    return {"bleu":bleu}

In [21]:
args = Seq2SeqTrainingArguments(output_dir="OpenHust/open-bart-herding-1024-no-prompt",
                                evaluation_strategy="epoch",
                                save_strategy="epoch",
                               per_device_train_batch_size=2,
                               per_device_eval_batch_size=2,
                               learning_rate=1e-4,
                               weight_decay=1e-2,
                               load_best_model_at_end =True,
                               predict_with_generate=True,
                               num_train_epochs=5,
                               logging_strategy="epoch",
                               generation_max_length=1024,
                                save_total_limit = 1,
                               fp16=True,)

trainer = Seq2SeqTrainer(model=model, 
                        args=args,
                         data_collator=data_collator,
                        train_dataset=train,
                        eval_dataset=dev,
                        tokenizer=tokenizer,
                        compute_metrics=compute_metrics,)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using amp half precision backend


In [16]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [17]:
test

Dataset({
    features: ['Unnamed: 0', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 2131
})

In [18]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: Unnamed: 0. If Unnamed: 0 are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 7455
  Num Epochs = 5
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 18640
  args.max_grad_norm,


Epoch,Training Loss,Validation Loss
1,3.2121,2.765098
2,2.117,2.411663
3,1.2869,2.351274
4,0.664,2.427753
5,0.2942,2.513626


The following columns in the evaluation set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: Unnamed: 0. If Unnamed: 0 are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 2
Saving model checkpoint to kaggle/working/other/OpenHust/open-bart-herding-1024-no-prompt/checkpoint-3728
Configuration saved in kaggle/working/other/OpenHust/open-bart-herding-1024-no-prompt/checkpoint-3728/config.json
Model weights saved in kaggle/working/other/OpenHust/open-bart-herding-1024-no-prompt/checkpoint-3728/pytorch_model.bin
tokenizer config file saved in kaggle/working/other/OpenHust/open-bart-herding-1024-no-prompt/checkpoint-3728/tokenizer_config.json
Special tokens file saved in kaggle/working/other/OpenHust/open-bart-herding-1024-no-prompt/checkpoint-3728/special_tokens_map.json
added tokens file saved in kaggle/working/other/OpenHus

TrainOutput(global_step=18640, training_loss=1.514847581069357, metrics={'train_runtime': 6605.0956, 'train_samples_per_second': 5.643, 'train_steps_per_second': 2.822, 'total_flos': 5.402978719572787e+16, 'train_loss': 1.514847581069357, 'epoch': 5.0})

In [22]:
trainer.evaluate(eval_dataset=test)

The following columns in the evaluation set  don't have a corresponding argument in `MBartForConditionalGeneration.forward` and have been ignored: Unnamed: 0. If Unnamed: 0 are not expected by `MBartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2131
  Batch size = 2


{'eval_loss': 2.252601146697998,
 'eval_rouge1': 0.5816,
 'eval_rouge2': 0.3122,
 'eval_rougeL': 0.4205,
 'eval_rougeLsum': 0.4204,
 'eval_gen_len': 50.3585,
 'eval_runtime': 1414.994,
 'eval_samples_per_second': 1.506,
 'eval_steps_per_second': 0.753}

# Test cases

In [23]:
import gc
gc.collect()
import torch
torch.cuda.empty_cache()


In [None]:
trainer.evaluate(eval_dataset=dev_dataset)

In [None]:
def generate(inputs, num_returns=1):
    inputs = tokenizer.encode(inputs, return_tensors="pt", max_length = 1024, padding = True, truncation = True).to(device)
    # outputs = model.generate(inputs, max_length = 1024, num_beams = 10, )
    outputs = model.generate(inputs, generation_config=genConfig)
    #outputs = model.generate(inputs, max_length = 1024, num_beams = 5,
    #                        num_beam_groups = 5, num_return_sequences = num_returns, no_repeat_ngram_size = 3)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
doc = train["Document"][100]
print(doc)

In [None]:
generate(doc)

In [None]:
train["Summary"][100]

In [None]:
help(trainer.train)