<a href="https://colab.research.google.com/github/trminhnam/CodeBERT/blob/master/Evaluate_BLOOM_BLOOMZ_for_Machine_Translation_on_IWSLT15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing libraries

In [None]:
!pip install -q peft transformers datasets evaluate seqeval wandb
!pip install accelerate -q
# !pip install bitsandbytes -q
# !python -m bitsandbytes

In [2]:
!git config --global credential.helper store
!huggingface-cli login --token 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 --add-to-git-credential

Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
# !pip install -q sacrebleu
# import locale
# locale.getpreferredencoding = lambda: "UTF-8"

## Importing libraries

In [18]:
import torch
import os
import json
import html
import evaluate
import warnings
import numpy as np

from tqdm import tqdm
from datasets import load_dataset
from peft import PeftConfig, PeftModel
from torch.utils.data import DataLoader
from transformers import default_data_collator
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, PromptTuningInit, PromptTuningConfig, TaskType, PeftType, LoraConfig

from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

warnings.filterwarnings("ignore")

device = "cuda"
model_name_or_path = "bigscience/bloomz-560m"
tokenizer_name_or_path = "bigscience/bloomz-560m"

dataset_name = "mt_eng_vietnamese"
dataset_config_name = "iwslt2015-en-vi"

SOURCE_LANG, TARGET_LANG = dataset_config_name.split('-')[1:]
FULL_LANG_MAPPING = {
    "en": "English",
    "vi": "Vietnamese",
}
FULL_SOURCE_LANG, FULL_TARGET_LANG = FULL_LANG_MAPPING[SOURCE_LANG], FULL_LANG_MAPPING[TARGET_LANG]

MAX_LENGTH = 128
BATCH_SIZE = 8

SEED = 42

## Utils

In [19]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.device_count() > 0:
        torch.cuda.manual_seed_all(seed)

def loading_dataset_with_name(dataset_name, dataset_config_name):
    dataset = load_dataset(dataset_name, dataset_config_name)
    # print(dataset["train"][0])
    if "validation" not in dataset:
        dataset["validation"] = load_dataset(
            dataset_name,
            split=f"train[:10%]",
        )

        dataset["train"] = load_dataset(
            dataset_name,
            split=f"train[10%:20%]",
        )

        dataset["train"] = load_dataset(
            dataset_name,
            split=f"train[20%:]",
        )
    return dataset

def generate_and_tokenize_prompt_for_evaluation(examples, tokenizer, prompt_template, max_length, src_lang, tgt_lang):
    full_src_lang, full_tgt_lang = FULL_LANG_MAPPING[src_lang], FULL_LANG_MAPPING[tgt_lang]

    if not isinstance(examples['translation'], list):
        source_sentences = [html.unescape(examples['translation'][src_lang])]
        target_sentences = [html.unescape(examples['translation'][tgt_lang])]
    else:
        source_sentences = [html.unescape(data[src_lang]) for data in examples['translation']]
        target_sentences = [html.unescape(data[tgt_lang]) for data in examples['translation']]

    # print(source_sentences)
    # print(target_sentences)

    batch_size = len(source_sentences)
    # print(batch_size)

    inputs = [
        prompt_template.format(
            full_source_lang=full_src_lang,
            source_sent=source_sentences[i],
            full_target_lang=full_tgt_lang,
        )
        for i in range(batch_size)
    ]

    model_inputs = tokenizer(inputs)
    labels = tokenizer(target_sentences)

    for i in range(batch_size):
        # get input ids of current index
        sample_input_ids = model_inputs["input_ids"][i]

        # add the end of string token
        label_input_ids = labels["input_ids"][i] + [tokenizer.eos_token_id]

        # create attention mask for input ids
        model_inputs["attention_mask"][i] = [1] * len(model_inputs["input_ids"][i])

    # create padding
    for i in range(batch_size):
        sample_input_ids = model_inputs["input_ids"][i]
        label_input_ids = labels["input_ids"][i]

        # padding input ids to max length
        model_inputs["input_ids"][i] = [tokenizer.pad_token_id] * (
            max_length - len(sample_input_ids)
        ) + sample_input_ids

        # padding attention mask to max length
        model_inputs["attention_mask"][i] = [0] * (
            max_length - len(sample_input_ids)
        ) + model_inputs["attention_mask"][i]

        # padding label ids to max length
        labels["input_ids"][i] = [-100] * (
            max_length - len(label_input_ids)
        ) + label_input_ids

        # convert to torch tensor
        model_inputs["input_ids"][i] = torch.tensor(
            model_inputs["input_ids"][i][:max_length]
        )
        model_inputs["attention_mask"][i] = torch.tensor(
            model_inputs["attention_mask"][i][:max_length]
        )
        labels["input_ids"][i] = torch.tensor(labels["input_ids"][i][:max_length])
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# post process function to decode the output ids and get the translated text from index: max_length
def postprocess(predictions, labels, max_seq_len=None):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    # predictions is in shape (batch_size, seq_len)
    # we cut at the seq_len dimension to get the actual predictions because the model is trained with pre padding to max length
    if max_seq_len:
        predictions = predictions[:, max_seq_len:]

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels
    return model_inputs

def evaluate_model_with_bleu(model, data_loader, max_length):
    model.eval()
    bleu_metric = evaluate.load("bleu")

    with torch.no_grad():
        for batch in tqdm(data_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            generated_tokens = model.generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=MAX_LENGTH*2,
                do_sample=True,
                top_p=0.95,
                top_k=60,
                temperature=0.9,
            )

            labels = batch["labels"]

            decoded_preds, decoded_labels = postprocess(generated_tokens, labels, max_seq_len=max_length)
            decoded_labels = [x[0] for x in decoded_labels]

            bleu_metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    bleu_result = bleu_metric.compute()
    # print(f"BLEU score on validation set: {bleu_result['bleu']*100:.2f}")
    return bleu_result

## Load tokenizer, model and PEFT parameters

In [20]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForCausalLM.from_pretrained(model_name_or_path, load_in_8bit=False)
model = model.to(device)


# peft_model_id = "tmnam20/lora_16_8_0.1_lora_only_5e-5"
# config = PeftConfig.from_pretrained(peft_model_id)
# inference_model = AutoModelForCausalLM.from_pretrained(
#     config.base_model_name_or_path
# )
# tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
# model = PeftModel.from_pretrained(inference_model, peft_model_id)
# model.print_trainable_parameters()

## Pipeline

In [25]:
def eval_machine_translation(model, tokenizer, dataset_names, prompt_templates, max_length):
    for dataset_name in dataset_names:
        if dataset_names.get(dataset_name, None) is None:
            dataset_names[dataset_name] = [None]
        for dataset_config_name in dataset_names[dataset_name]:
            # loading dataset
            print("#" * 150)
            print(f"Running evaluation on dataset: {dataset_name}.{dataset_config_name}")
            src_lang, tgt_lang = dataset_config_name.split('-')[1:]
            full_src_lang, full_tgt_lang = FULL_LANG_MAPPING[SOURCE_LANG], FULL_LANG_MAPPING[TARGET_LANG]
            print()

            for template in prompt_templates:
                # reload dataset to apply new prompt
                dataset = loading_dataset_with_name(dataset_name, dataset_config_name)
                datasets_to_eval = {}
                if "validation" in dataset:
                    datasets_to_eval['validation'] = dataset['validation']
                if "test" in dataset:
                    datasets_to_eval['test'] = dataset['test']

                # tokenizing subsets
                for k, v in datasets_to_eval.items():
                    v = v.map(
                        generate_and_tokenize_prompt_for_evaluation,
                        batched=True,
                        num_proc=4,
                        remove_columns=v.column_names,
                        desc=f"Creating prompt and tokenizing on {k} subset",
                        load_from_cache_file=False,
                        fn_kwargs={
                            "tokenizer": tokenizer,
                            "prompt_template": template,
                            "max_length": MAX_LENGTH,
                            "src_lang": src_lang,
                            "tgt_lang": tgt_lang,
                        }
                    )
                    datasets_to_eval[k] = v
                    len(v[100]['input_ids']) == max_length
                    # print(tokenizer.decode(v[100]['input_ids']))
                    assert len(v[100]['labels']) == max_length
                    assert v[100]['input_ids'].count(3) == v[100]['attention_mask'].count(0)

                eval_bleu = {}
                for k, v in datasets_to_eval.items():
                    dataloader = DataLoader(
                        v, collate_fn=default_data_collator, batch_size=BATCH_SIZE
                    )

                    bleu_result = evaluate_model_with_bleu(model, dataloader, max_length)
                    bleu_score = bleu_result['bleu']
                    eval_bleu[k] = bleu_score

                # print result
                print(f'#' * 50)
                print(f"Dataset: {dataset_name}.{dataset_config_name}")
                print(f'{src_lang} --> {tgt_lang}')
                print(f'Prompt template: {template}')
                for k, v in eval_bleu.items():
                    print(f"BLEU score on {k} set: {v*100:.2f}")
                print()

            print("#" * 100)


## Running pipeline

In [None]:
set_seed(SEED)
datasets = {
    "mt_eng_vietnamese": ["iwslt2015-en-vi", "iwslt2015-vi-en"]
    "tmnam20/PhoMT": ["phomt-en-vi", "phomt-vi-en"]
}
translation_prompt_templates = [
    "Translate the following text from {full_source_lang} to {full_target_lang} {source_sent} ",
    "Translate the following text from {full_source_lang} to {full_target_lang}: \"{source_sent}\". The {full_target_lang} text is: "
]

eval_machine_translation(model, tokenizer, datasets, translation_prompt_templates, MAX_LENGTH)

######################################################################################################################################################
Running evaluation on dataset: tmnam20/PhoMT.phomt-en-vi



Downloading builder script:   0%|          | 0.00/4.87k [00:00<?, ?B/s]

Downloading and preparing dataset pho_mt/phomt-en-vi to /root/.cache/huggingface/datasets/tmnam20___pho_mt/phomt-en-vi/1.0.0/28105c5f339b00b7069d0f5fc2e75a12386b6ea94bd4675faca47925930907ce...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset pho_mt downloaded and prepared to /root/.cache/huggingface/datasets/tmnam20___pho_mt/phomt-en-vi/1.0.0/28105c5f339b00b7069d0f5fc2e75a12386b6ea94bd4675faca47925930907ce. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Creating prompt and tokenizing on validation subset (num_proc=4):   0%|          | 0/18719 [00:00<?, ? example…

Creating prompt and tokenizing on test subset (num_proc=4):   0%|          | 0/19151 [00:00<?, ? examples/s]

100%|██████████| 2340/2340 [1:38:51<00:00,  2.53s/it]
 62%|██████▏   | 1477/2394 [50:26<25:17,  1.65s/it]

### Checking output of the processing method

In [None]:
# generate_and_tokenize_prompt_for_evaluation(
#     {
#         "translation": {
#             'en': 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
#             'vi': 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .'
#         }
#     }
# )

In [13]:
# print(len(eval_dataset[100]['input_ids']))
# print(tokenizer.decode(eval_dataset[100]['input_ids']))
# print(len(eval_dataset[100]['labels']))
# print(eval_dataset[100]['input_ids'].count(3))
# print(eval_dataset[100]['attention_mask'].count(0))

128
<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>Translate the following text from English to Vietnamese Samuel is 16 . He 's tall . He 's very handsome . 
128


## Evaluate

In [20]:
# # print an example of predictions and labels
# model.eval()
# with torch.no_grad():
#     for batch in val_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
#         generated_tokens = model.generate(
#             batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             max_length=max_length*2,
#             do_sample=True,
#             top_p=0.95,
#             top_k=60,
#             temperature=0.9,
#         )

#         labels = batch["labels"]

#         print("All generated token:" + json.dumps(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True), indent=4, ensure_ascii=False))
#         decoded_preds, decoded_labels = postprocess(generated_tokens, labels, max_seq_len=max_length)
#         decoded_labels = [x[0] for x in decoded_labels]

#         print("Decoded_preds:" + json.dumps(decoded_preds, indent=4, ensure_ascii=False))
#         print("Decoded_labels:" + json.dumps(decoded_labels, indent=4, ensure_ascii=False))

#         break

All generated token:[
    "Translate the following text from English to Vietnamese When I was little , I thought my country was the best on the planet , and I grew up singing a song called \" Nothing To Envy . \"  Tôi nghĩ khi còn nhỏ, tôi nghĩ quốc gia của tôi là nước tốt nhất trên trái đất và tôi đã lớn lên hát một bài hát nói “Không nên sợ”.",
    "Translate the following text from English to Vietnamese And I was very proud .  Và tôi rất vui được làm vậy.",
    "Translate the following text from English to Vietnamese In school , we spent a lot of time studying the history of Kim Il-Sung , but we never learned much about the outside world , except that America , South Korea , Japan are the enemies .  Trong trường học, chúng ta dành nhiều thời gian nghiên cứu lịch sử của Kim Il-Sung, nhưng chúng ta chưa học được gì về thế giới ngoài nước Mỹ, Nhật Bản, nhưng lại là kẻ thù.",
    "Translate the following text from English to Vietnamese Although I often wondered about the outside world ,

In [21]:
# # evaluate on val set

# model.eval()
# bleu_metric = evaluate.load("bleu")
# # sacrebleu_metric = evaluate.load("sacrebleu")

# with torch.no_grad():
#     for batch in tqdm(val_dataloader):
#         batch = {k: v.to(device) for k, v in batch.items()}
#         generated_tokens = model.generate(
#             batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             max_length=max_length*2,
#             do_sample=True,
#             top_p=0.95,
#             top_k=60,
#             temperature=0.9,
#         )

#         labels = batch["labels"]

#         decoded_preds, decoded_labels = postprocess(generated_tokens, labels, max_seq_len=max_length)
#         decoded_labels = [x[0] for x in decoded_labels]

#         bleu_metric.add_batch(predictions=decoded_preds, references=decoded_labels)
#         # sacrebleu_metric.add_batch(predictions=decoded_preds, references=decoded_labels)

# bleu_result = bleu_metric.compute()
# print(f"BLEU score on validation set: {bleu_result['bleu']*100:.2f}")
# # sacrebleu_result = sacrebleu_metric.compute()
# # print(f"SacreBLEU score on validation set: {sacrebleu_result['score']:.2f}")

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

100%|██████████| 318/318 [10:36<00:00,  2.00s/it]


BLEU score on validation set: 12.13


In [22]:
# # evaluate on test set

# model.eval()
# bleu_metric = evaluate.load("bleu")
# # sacrebleu_metric = evaluate.load("sacrebleu")

# with torch.no_grad():
#     for batch in tqdm(test_dataloader):
#         batch = {k: v.to(device) for k, v in batch.items()}
#         generated_tokens = model.generate(
#             batch["input_ids"],
#             attention_mask=batch["attention_mask"],
#             max_length=max_length*2,
#             do_sample=True,
#             top_p=0.95,
#             top_k=60,
#             temperature=0.9,
#         )

#         labels = batch["labels"]

#         decoded_preds, decoded_labels = postprocess(generated_tokens, labels, max_seq_len=max_length)
#         decoded_labels = [x[0] for x in decoded_labels]

#         bleu_metric.add_batch(predictions=decoded_preds, references=decoded_labels)
#         # sacrebleu_metric.add_batch(predictions=decoded_preds, references=decoded_labels)

# bleu_result = bleu_metric.compute()
# print(f"BLEU score on test set: {bleu_result['bleu']*100:.2f}")
# # sacrebleu_result = sacrebleu_metric.compute()
# # print(f"SacreBLEU score on test set: {sacrebleu_result['score']:.2f}")

 56%|█████▋    | 179/318 [06:08<04:46,  2.06s/it]


KeyboardInterrupt: ignored

In [None]:
metric = evaluate.load("bleu")
metric.add_batch(
    predictions=[
        "I go to school by public transportation",
        "They sleep at night today",
    ],
    references=[
        "I go to school by bus",
        "They sleep at night"
    ]
)

results = metric.compute()
print(f"BLEU score: {results['bleu']:.2f}")