In [1]:
from pynvml import *

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
    
print_gpu_utilization()

GPU memory occupied: 382 MB.


In [2]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer, AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import notebook_login
import torch
from datasets import load_dataset, load_metric

torch.cuda.empty_cache()
print_gpu_utilization()
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load bart-large-xsum as the pretrined model
model_ckpt = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Now load the samsum dataset
dataset_samsum = load_dataset("samsum")
print_gpu_utilization()

GPU memory occupied: 382 MB.


Reusing dataset samsum (/home/tanvir/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

GPU memory occupied: 382 MB.


In [3]:
# Time to tokenize our input
# Ideally we want a bigger max_length for dialogue tokens. However, the pretrained model (facebook/bart-large-xsum)
# we are using has a max_position_embeddings of 1024. So we need to restrict to 1024
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["dialogue"], max_length = 1024, truncation = True)
    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch["summary"], max_length = 256, truncation = True)
    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

dataset_samsum_pt = dataset_samsum.map(convert_examples_to_features, batched = True)
columns = ["input_ids", "labels", "attention_mask"]
dataset_samsum_pt.set_format(type = "torch", columns = columns)
print_gpu_utilization()

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

GPU memory occupied: 382 MB.


In [4]:
# Now we need to create a data collator. This function is called in the Trainer just before the batch is fed through the model
# Then initialize the trainer with the collator
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model = model)
training_args = TrainingArguments(
    output_dir = 'meeting-summary', num_train_epochs = 1, warmup_steps = 500, per_device_train_batch_size = 1, per_device_eval_batch_size = 1,
    weight_decay = 0.01, logging_steps = 10, push_to_hub = True, evaluation_strategy = 'steps', eval_steps = 500, save_steps = 1e6, gradient_accumulation_steps = 16)
trainer = Trainer(model = model, args = training_args, tokenizer = tokenizer, data_collator = seq2seq_data_collator, 
                 train_dataset = dataset_samsum_pt["train"], eval_dataset = dataset_samsum_pt["validation"])

print_gpu_utilization()

/home/tanvir/work/huggingface-starter/meeting-summary is already a clone of https://huggingface.co/tanviraumi/meeting-summary. Make sure you pull the latest changes with `repo.git_pull()`.


GPU memory occupied: 2927 MB.


In [5]:
# Log in to the hub before training the model
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# Finally we train and push to the hub
print_gpu_utilization()
trainer.train()
print_gpu_utilization()
trainer.push_to_hub("Training Complete")
print("Done pushing to hub")

The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: dialogue, summary, id. If dialogue, summary, id are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14732
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 230


GPU memory occupied: 2927 MB.




Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to meeting-summary
Configuration saved in meeting-summary/config.json


GPU memory occupied: 11553 MB.


Model weights saved in meeting-summary/pytorch_model.bin
tokenizer config file saved in meeting-summary/tokenizer_config.json
Special tokens file saved in meeting-summary/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 32.0k/1.51G [00:00<?, ?B/s]

Upload file training_args.bin: 100%|##########| 3.11k/3.11k [00:00<?, ?B/s]

remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/tanviraumi/meeting-summary
   16af174..6caa5bf  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Sequence-to-sequence Language Modeling', 'type': 'text2text-generation'}, 'dataset': {'name': 'samsum', 'type': 'samsum', 'args': 'samsum'}}
remote: Enforcing permissions...        
remote: Allowed refs: all        
To https://huggingface.co/tanviraumi/meeting-summary
   6caa5bf..73ad6cf  main -> main



Done pushing to hub
