In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install -U adapter-transformers sentencepiece
!pip install datasets
!pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0


In [2]:
import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset, TensorDataset
from transformers import AutoTokenizer

In [3]:

# Checkpoint name shared by the tokenizer created here and the model loaded in a later cell.
base_model = "bigscience/mt0-base"
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Instruction text that encode_batch prepends to every input document.
prefix = 'Summarize: '


def encode_batch(examples):
    """Tokenize one batch of document/summary pairs for seq2seq training.

    Pairs whose 'Document' or 'Summary' value is falsy (empty or missing) are
    dropped. Every kept document gets the module-level ``prefix`` prepended.
    Inputs and targets are both truncated and padded to 512 tokens.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under ``"labels"``. Padding positions in the labels keep the
        tokenizer's pad token id.
    """
    documents = examples['Document']
    summaries = examples['Summary']

    sources = []
    references = []
    for row, document in enumerate(documents):
        # Skip pairs where either side is empty/None.
        if not (document and summaries[row]):
            continue
        sources.append(prefix + document)
        references.append(summaries[row])

    # Same tokenization settings for both the source and the target side.
    tokenizer_kwargs = dict(max_length=512, padding="max_length", truncation=True)
    model_inputs = tokenizer(sources, **tokenizer_kwargs)
    model_inputs["labels"] = tokenizer(references, **tokenizer_kwargs)["input_ids"]

    return model_inputs

Downloading (…)okenizer_config.json:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

In [4]:
from datasets import load_dataset, DatasetDict

# Fetch the corpus; only its 'train' split is used below.
dataset = load_dataset("OpenHust/vietnamese-summarization")


# Two-stage split with a fixed seed: first hold out 20% as the test set, then
# take 12.5% of the remaining 80% (i.e. 10% of the total) as the validation set.
trainvalid_test = dataset['train'].train_test_split(test_size=0.2, seed=0)
train_valid = trainvalid_test['train'].train_test_split(test_size=0.125, seed=0)
# Rebind `dataset` to the final train/test/valid partition used by later cells.
dataset = DatasetDict({
    'train': train_valid['train'],
    'test': trainvalid_test['test'],
    'valid': train_valid['test']
})

Downloading readme:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset csv/OpenHust--vietnamese-summarization to /root/.cache/huggingface/datasets/OpenHust___csv/OpenHust--vietnamese-summarization-4b7575271bea4124/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/33.6M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/OpenHust___csv/OpenHust--vietnamese-summarization-4b7575271bea4124/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Handles for the two splits that are passed to load_split / the trainer later on.
train_dataset = dataset["train"]
val_dataset = dataset["valid"]

In [6]:
# Display the training split's summary (feature names and row count).
train_dataset

Dataset({
    features: ['Document', 'Summary', 'Dataset'],
    num_rows: 7455
})

In [7]:
# Display the validation split's summary (feature names and row count).
val_dataset

Dataset({
    features: ['Document', 'Summary', 'Dataset'],
    num_rows: 1066
})

In [8]:
from datasets import load_dataset


def load_split(dataset, max_items):
    """Tokenize the first ``max_items`` rows of ``dataset`` for the seq2seq model.

    Args:
        dataset: A ``datasets.Dataset`` providing the columns that
            ``encode_batch`` reads.
        max_items: Maximum number of leading rows to keep. Values at or above
            ``len(dataset)`` keep every row; negative values keep none.

    Returns:
        The encoded dataset in torch format, exposing ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    # Take the leading rows by index instead of evaluating a Python predicate
    # on every row; skip the step entirely when nothing would be dropped.
    if max_items < len(dataset):
        dataset = dataset.select(range(max(max_items, 0)))

    dataset = dataset.map(
        encode_batch,
        batched=True,
        remove_columns=dataset.column_names,
    )
    # Inputs are padded to a fixed length, so the attention mask has to be
    # exposed as well; without it the model attends to the padding positions.
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return dataset

In [9]:
from transformers import AutoModelForSeq2SeqLM
import numpy as np

# Load the pretrained seq2seq model for the same checkpoint name the tokenizer was built from.
model = AutoModelForSeq2SeqLM.from_pretrained(
    base_model
)


Downloading (…)lve/main/config.json:   0%|          | 0.00/798 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [10]:

!pip install rouge_score


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=e26a72eb46c19cb468fa5059b339e1d0a3070373cef5cfc8e97c688f71b456ef
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [11]:
# Metric object that compute_metrics uses to score generated summaries.
# NOTE(review): `datasets.load_metric` appears to be deprecated upstream in favour
# of `evaluate.load` — confirm before upgrading `datasets`. compute_metrics reads
# `.mid.fmeasure` from the result, so the two loaders may not be drop-in replacements.
rouge_score = datasets.load_metric("rouge")
import evaluate

#rouge_score = evaluate.load("rouge")
import numpy as np



  rouge_score = datasets.load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [12]:
import numpy as np
from nltk.tokenize import sent_tokenize

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures (as percentages) for generated summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays as
            handed over by the trainer. ``predictions`` may also be a tuple
            whose first element holds the generated token ids.

    Returns:
        dict mapping each ROUGE variant reported by ``rouge_score`` to its
        F-measure multiplied by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the ignore/padding marker and is not a valid token id; map it
    # back to the pad token on BOTH arrays so that decoding cannot fail.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The ROUGE-Lsum variant expects one sentence per line.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    scores = {}
    for key, value in result.items():
        # Aggregated scores expose `.mid.fmeasure`; a metric implementation
        # that already returns plain floats is accepted unchanged.
        fmeasure = value.mid.fmeasure if hasattr(value, "mid") else value
        scores[key] = round(float(fmeasure) * 100, 4)
    return scores

In [None]:
import os

from transformers import TrainingArguments, Seq2SeqTrainer, TrainerCallback , Seq2SeqTrainingArguments
from transformers import default_data_collator

# small batch size to fit in memory
batch_size = 1

# SECURITY: never commit an access token to a notebook. The token is read from
# the HF_TOKEN environment variable; when it is unset, `hub_token=None` makes
# the trainer fall back to the credentials stored by `huggingface-cli login`.
# Any token that was previously pasted into this cell should be revoked.
hub_token = os.environ.get("HF_TOKEN")

training_args = Seq2SeqTrainingArguments(
    learning_rate=3e-4,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=1000,
    output_dir="./training_output",
    overwrite_output_dir=False,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_steps=3000,
    save_total_limit=5,
    push_to_hub=True,
    generation_max_length=1024,
    # `hub_model_id` / `hub_token` replace the deprecated
    # `push_to_hub_model_id` / `push_to_hub_token` arguments.
    hub_model_id="tuna_mt0_v1.1",
    hub_token=hub_token,
    # Keep every formatted column; the dataset already exposes only model inputs.
    remove_unused_columns=False,
)


def collate_ignoring_label_padding(features):
    """Stack features into a batch and mask label padding out of the loss.

    The encoded labels are padded with the tokenizer's pad token id; replacing
    those positions with -100 makes the loss ignore them. The stored dataset
    is left untouched, so its labels remain directly decodable.
    """
    batch = default_data_collator(features)
    labels = batch["labels"]
    batch["labels"] = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
    return batch


# create the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=collate_ignoring_label_padding,
    compute_metrics=compute_metrics,
    # load the dataset
    train_dataset=load_split(train_dataset, len(train_dataset)),
    eval_dataset=load_split(val_dataset, len(val_dataset)),
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Map:   0%|          | 0/7455 [00:00<?, ? examples/s]

/content/./training_output is already a clone of https://huggingface.co/anhdt-dsai-02/tuna_mt0_v1.1. Make sure you pull the latest changes with `repo.git_pull()`.


In [None]:
import nltk
# Fetch the 'punkt' resource; presumably needed by the sent_tokenize calls in
# compute_metrics — it must be available before evaluation runs.
nltk.download('punkt')

In [None]:
# Run fine-tuning with the arguments configured on the trainer
# (checkpoints are written under ./training_output, per the log below).
trainer.train()

***** Running training *****
  Num examples = 7455
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 7455
  Number of trainable parameters = 582401280
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


Saving model checkpoint to ./training_output/checkpoint-3000
Configuration saved in ./training_output/checkpoint-3000/config.json
Configuration saved in ./training_output/checkpoint-3000/generation_config.json
Model weights saved in ./training_output/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in ./training_output/checkpoint-3000/tokenizer_config.json
Special tokens file saved in ./training_output/checkpoint-3000/special_tokens_map.json
Copy vocab file to ./training_output/checkpoint-3000/spiece.model
tokenizer config file saved in ./training_output/tokenizer_config.json
Special tokens file saved in ./training_output/special_tokens_map.json
Copy vocab file to ./training_output/spiece.model
Adding files tracked by Git LFS: ['tokenizer.json']. This may take a bit of time if the files are large.


Epoch,Training Loss,Validation Loss


In [None]:
#model.push_to_hub("anhdt-dsai-02/tuna_mt0_v1.1")

In [None]:
# Evaluate on the trainer's eval dataset; metrics come from compute_metrics.
trainer.evaluate()

In [None]:
# Inspect one raw (un-tokenized) validation record.
val_dataset[1]

In [None]:
num_validation = 10

validation_dataset = load_split(val_dataset, num_validation)

# Generate on whichever device the model currently lives on rather than
# assuming CUDA device 0 exists.
device = model.device
pad_id = tokenizer.pad_token_id
model.eval()

# encode_batch may drop rows with an empty document/summary, so iterate over
# what was actually encoded instead of assuming `num_validation` rows exist.
for i in range(len(validation_dataset)):
    example = validation_dataset[i]
    # load the input and label
    input_ids = example['input_ids'].unsqueeze(0).to(device)
    label_ids = example['labels'].unsqueeze(0)
    # -100 (ignored label positions) is not a decodable token id.
    label_ids = label_ids.masked_fill(label_ids == -100, pad_id)
    # Inputs are padded to a fixed length: tell generate() which positions are padding.
    attention_mask = input_ids.ne(pad_id).long()
    # use the model to generate the output
    output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1024,
    )
    # convert the tokens to text
    input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
    output_text = tokenizer.decode(output[0], skip_special_tokens=True)
    label_text = tokenizer.decode(label_ids[0], skip_special_tokens=True)

    print('Input:', input_text)
    print('Output:', output_text)
    print('Label:', label_text)
    print('---')

In [None]:
import gc
import torch
# Collect unreachable Python objects first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()