In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install evaluate
!pip install transformers
!pip install -U accelerate
!pip install rouge_score
!pip install wandb

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.12.0
    Uninstalling accelerate-0.12.0:
      Successfully uninstalled accelerate-0.12.0
Successfully installed accelerate-0.19.0
[0mCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l- \ done
[?25h

In [3]:
import os

import evaluate
import numpy as np
import nltk
import torch
import math
import wandb

from nltk.tokenize import sent_tokenize
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_scheduler
)
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
def show_samples(dataset, num_samples=5, seed=42):
    sample = dataset["train"].shuffle(seed=seed).select(range(num_samples))
    for example in sample:
        print(f"\n>> Title: {example['title']}")
        print(f">> Abstract: {example['abstract']}")
        print(f">> News: {example['article']}")
        print(len(example['article']))

In [5]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

In [6]:
### ==== LOAD DATA ==== ###
# load Vietnamese Text Summarization in this repo: https://huggingface.co/datasets/ithieund/VietNews-Abs-Sum
DATA_FILES = {
    'train': 'processed/train_desegmented.jsonl',
    'validation': 'processed/valid_desegmented.jsonl',
    'test': 'processed/test_desegmented.jsonl'
}
raw_dataset = load_dataset("ithieund/VietNews-Abs-Sum", data_files=DATA_FILES) # load desegmented parts

Downloading and preparing dataset json/ithieund--VietNews-Abs-Sum to /root/.cache/huggingface/datasets/json/ithieund--VietNews-Abs-Sum-9d6b9f588f315aff/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/ithieund--VietNews-Abs-Sum-9d6b9f588f315aff/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# TODO: preprocessing - deduplication, filtering characters, text normalization

In [8]:
### ==== LOAD MODEL ==== ###
# Choose T5 family such as mT5, ViT5, mBART-50
MODEL_NAME = 'VietAI/vit5-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/820k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.12k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/904M [00:00<?, ?B/s]

In [9]:
### === TOKENIZE DATA & PREPARE INPUT/OUTPUT === ###
# Let's see output of tokenizer
tmp_inputs = tokenizer(raw_dataset['train'][0]['abstract'])
tmp_inputs_ids = tokenizer.convert_ids_to_tokens(tmp_inputs.input_ids)
# print(tmp_inputs)
# print(tmp_inputs_ids)

# Let's tokenize input & output for model
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 50
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples['article'],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    labels = tokenizer(
        examples['abstract'],
        max_length=MAX_OUTPUT_LENGTH,
        truncation=True,
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Then use .map() to tokenize (train, valid, test)
tokenized_dataset = raw_dataset.map(
    preprocess_function, 
    batched=True, 
    remove_columns=raw_dataset['train'].column_names, # remove these columns to make it work with map fn
    num_proc=os.cpu_count() # count number of cpus
)

   

#0:   0%|          | 0/50 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/50 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/12 [00:00<?, ?ba/s]

#1:   0%|          | 0/12 [00:00<?, ?ba/s]

    

#0:   0%|          | 0/12 [00:00<?, ?ba/s]

#1:   0%|          | 0/12 [00:00<?, ?ba/s]

In [10]:
# Then prepare batches
# use DataCollatorForSeq2Seq designed for seq2seq problem
# might use other DataCollator for other tasks
# such as DataCollatorForTokenClassification, DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
# find more https://huggingface.co/docs/transformers/main_classes/data_collator#data-collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100, # ignore these labels when models compute loss_fn
    pad_to_multiple_of=8, # maximize the usage of tensors
)
# see output of DataCollator
# labels contains -100 value
tmp_features = [tokenized_dataset["train"][i] for i in range(5)]
tmp_features_collator = data_collator(tmp_features)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [11]:
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

# I have saved my API token with "wandb_api" as Label. 
# If you use some other Label make sure to change the same below. 
wandb_api = user_secrets.get_secret("wandb_api") 

wandb.login(key=wandb_api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [12]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mthanhduycao1202[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [13]:
### === NEW!!! DEFINE DATA LOADER === ###
# DEBUG: sample only few documents
# MAX_SAMPLES = 100
train_dataset = tokenized_dataset["train"]
eval_dataset = tokenized_dataset["validation"]
test_dataset = tokenized_dataset["test"]

BATCH_SIZE = 8
train_dataloader = DataLoader(
    train_dataset, 
    shuffle=True, 
    collate_fn=data_collator, 
    batch_size=BATCH_SIZE
)
eval_dataloader = DataLoader(
    eval_dataset, 
    shuffle=True, 
    collate_fn=data_collator, 
    batch_size=BATCH_SIZE
)

### === NEW!!! DEFINE OPTIMIZER & SCHEDULER === ###
LEARNING_RATE = 2e-5
NUM_WARMUP_STEPS = 50
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
lr_scheduler = get_scheduler(
    name='constant',
    optimizer=optimizer,
    num_warmup_steps=NUM_WARMUP_STEPS,
)

### === NEW!!!: DEFINE ACCELERATOR === ###
# Prepare everything with our accelerator
GRADIENT_ACC_STEPS = 2
accelerator = Accelerator(gradient_accumulation_steps=GRADIENT_ACC_STEPS, log_with="wandb")

model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

hps = {"num_epochs": 2, "learning_rate": LEARNING_RATE, "batch_size": BATCH_SIZE}
accelerator.init_trackers(
    "viT5_pretrain_cinnamonAI",
    config=hps,
    init_kwargs={
        "wandb": {
            "notes": "testing accelerate pipeline",
            "tags": ["tag_a", "tag_b"],
            "entity": "thanhduycao1202",
        }
    },
)


### ==== DEFINE METRICS ==== ###
# for text summarization, ROUGE score is commonly used
# ROUGE-1, ROUGE-2, ROUGE-L are commonly reported in paper
metric = evaluate.load("rouge")
# provide compute_metrics() to evaluate model during training
def compute_metrics(eval_pred):
    preds, labels = eval_pred
    # Replace -100 in the labels as we can't decode them
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # A simple post-processing: ROUGE expects a newline after each sentence
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    # Compute ROUGE scores
    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels
    )
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230530_055122-gf9jdwvz[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mscarlet-tree-13[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/thanhduycao1202/viT5_pretrain_cinnamonAI[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/thanhduycao1202/viT5_pretrain_cinnamonAI/runs/gf9jdwvz[0m


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [14]:
### ==== DEFINE TRAINING LOOP ==== ###
NUM_EPOCHS = 2
NUM_STEPS_PER_EPOCH = math.ceil(len(train_dataloader) / GRADIENT_ACC_STEPS)
NUM_TRAINING_STEPS = NUM_EPOCHS * NUM_STEPS_PER_EPOCH
OUTPUT_DIR = 'textsum_with_accelerate'
LOG_STEPS = 500
EVAL_STEPS = 2000
max_grad_norm = 1.0

best_loss = np.inf
best_loss = None

train_iter = 0
# define progress_bar for monitoring
progress_bar = tqdm(range(NUM_TRAINING_STEPS))

# define training loop
for epoch in range(NUM_EPOCHS):
    print(f'This is EPOCH: {epoch}')
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        train_iter += 1
        with accelerator.accumulate(model): # NEW!!! for gradient accumulation
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss) # NEW!!!

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            accelerator.log({"learning_rate": lr_scheduler.get_last_lr()}, step=train_iter)

            if accelerator.sync_gradients:
                progress_bar.update(1)
                accelerator.clip_grad_norm_(model.parameters(), max_grad_norm)


                
            if train_iter % LOG_STEPS == 0:
                print(f"Step: {step}, Loss: {loss}")
                accelerator.log({"train_loss": loss}, step=train_iter)

            if train_iter % EVAL_STEPS == 0:
                # Calculate eval loss and other metrics
                model.eval()
                eval_loss = 0.0
                for eval_step, eval_batch in enumerate(eval_dataloader):
                    with torch.no_grad():
                        outputs = model(**eval_batch)
                        eval_loss += outputs.loss
                eval_loss = eval_loss / (eval_step + 1)
                print(f"Step:{step}, Eval_Loss: {eval_loss}")
                accelerator.log({"train_loss": loss, "valid_loss": eval_loss}, step=train_iter)
                if (best_loss is None) or (eval_loss < best_loss):
                    best_loss = eval_loss
                    accelerator.wait_for_everyone()
                    unwrapped_model = accelerator.unwrap_model(model)
                    unwrapped_model.save_pretrained(
                        OUTPUT_DIR, save_function=accelerator.save
                    )

                model.train()

    # TODO: Evaluation
    model.eval()

    # TODO: Compute metrics

#     # Save model
#     # make sure all processes are joined
#     accelerator.wait_for_everyone()    
#     # remove all special model wrappers added during the distributed process
#     unwrapped_model = accelerator.unwrap_model(model)
#     # save
#     unwrapped_model.save_pretrained(OUTPUT_DIR, save_function=accelerator.save)
    
accelerator.end_training()

  0%|          | 0/12392 [00:00<?, ?it/s]

This is EPOCH: 0
Step: 499, Loss: 1.907934546470642
Step: 999, Loss: 2.350522994995117
Step: 1499, Loss: 1.648470163345337
Step: 1999, Loss: 2.085169553756714
Step:1999, Eval_Loss: 1.830841064453125
Step: 2499, Loss: 1.7466886043548584
Step: 2999, Loss: 1.9760586023330688
Step: 3499, Loss: 2.5985357761383057
Step: 3999, Loss: 2.2522454261779785
Step:3999, Eval_Loss: 1.7980762720108032
Step: 4499, Loss: 1.784857988357544
Step: 4999, Loss: 2.0073435306549072
Step: 5499, Loss: 2.4773383140563965
Step: 5999, Loss: 1.898173451423645
Step:5999, Eval_Loss: 1.7718571424484253
Step: 6499, Loss: 1.9947859048843384
Step: 6999, Loss: 1.9051259756088257
Step: 7499, Loss: 1.9814847707748413
Step: 7999, Loss: 1.6113377809524536
Step:7999, Eval_Loss: 1.7563807964324951
Step: 8499, Loss: 1.8056665658950806
Step: 8999, Loss: 1.8261990547180176
Step: 9499, Loss: 1.9169121980667114
Step: 9999, Loss: 1.9716260433197021
Step:9999, Eval_Loss: 1.7451715469360352
Step: 10499, Loss: 1.4725311994552612
Step: 109

[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m: train_loss ▄▇▃▅▃█▆▄▅▄▅▄▅▄▄▄▅▂▄▄▄▃▃▄▅▄▄▄▆▄▁▄▃▅▁▅▃▄▄▃
[34m[1mwandb[0m: valid_loss █▆▅▄▃▃▃▂▃▂▂▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m: train_loss 1.65272
[34m[1mwandb[0m: valid_loss 1.69785
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33mscarlet-tree-13[0m at: [34m[4mhttps://wandb.ai/thanhduycao1202/viT5_pretrain_cinnamonAI/runs/gf9jdwvz[0m
[34m[1mwandb[0m: Synced 6 W&B file(s), 0 media file(s), 0 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20230530_055122-gf9jdwvz/logs[0m


In [15]:
accelerator.end_training()