**Set Up Dependencies**

In [1]:
!pip install transformers torch datasets;

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.9 MB/s[0m eta [36m0:00:

In [2]:
!pip install accelerate;



**Loading Model from Hugging Face**

In [3]:
# Loading Model from HuggingFace
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-small", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

**Memory Management**

In [4]:
# Check if GPU is available
import torch
print(torch.cuda.is_available())

True


In [5]:
# Move the model to GPU, to accelerate training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

**Loading Dataset from Hugging face**

In [6]:
from datasets import load_dataset

ds = load_dataset("abisee/cnn_dailymail", "3.0.0")
dataset = ds['train'] # Gives training split

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

**Preprocessing Dataset**

In [7]:
# Split the dataset into training and testing subsets
dataset_split = dataset.train_test_split(test_size=0.1)

# Further reduce the training set size for faster testing during development
small_train_dataset = dataset_split['train'].train_test_split(test_size=0.99)['train']
eval_dataset = dataset_split['test']

In [8]:
def preprocess_function(examples):
    inputs = ['summarize: ' + doc for doc in examples['article']]
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)

    # Tokenize summaries (labels)
    with tokenizer.as_target_tokenizer():
      labels = tokenizer(examples['highlights'], max_length=128, padding='max_length', truncation=True)

      model_inputs["labels"] = labels["input_ids"]

    model_inputs = {k: torch.tensor(v) for k, v in model_inputs.items()}
    return model_inputs

In [9]:
# Tokenize the small training dataset
tokenized_train_dataset = small_train_dataset.map(preprocess_function, batched=True)

# Tokenize the evaluation dataset
tokenized_eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/2584 [00:00<?, ? examples/s]



Map:   0%|          | 0/28712 [00:00<?, ? examples/s]

In [10]:
# Moving to GPU
for batch in tokenized_train_dataset:
    batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

In [11]:
# Move Preprocessed Data to GPU
tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

**Setting Parameters and Preparing for Training**

In [12]:
from transformers import Seq2SeqTrainingArguments

# Defining Training Parameters
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',              # Directory to save the model checkpoints
    evaluation_strategy="epoch",         # Evaluate the model at the end of every epoch
    per_device_train_batch_size =2,
    per_device_eval_batch_size=2,        # Batch size for evaluation
    gradient_accumulation_steps=2,
    weight_decay=0.01,                   # Regularization to prevent overfitting
    save_total_limit=3,                  # Only keep the last 3 checkpoints
    num_train_epochs=3,                  # Number of training epochs
    predict_with_generate=True,          # Enable text generation during evaluation
    logging_dir="./logs",                 # Directory for storing training logs
    fp16=True
)



In [13]:
from transformers import Seq2SeqTrainer

# Create the trainer object
trainer = Seq2SeqTrainer(
    model=model,                         # The model to be trained
    args=training_args,                  # The training arguments defined earlier
    train_dataset=tokenized_train_dataset,  # The tokenized training dataset
    eval_dataset=tokenized_eval_dataset,    # The tokenized evaluation dataset
    tokenizer=tokenizer                  # The tokenizer to handle input and output
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [14]:
torch.cuda.empty_cache()

In [15]:
# Memory Management for PyTorch CUDA
import os
!set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [16]:
# Troubleshooting
print(os.environ.get('PYTORCH_CUDA_ALLOC_CONF'))

expandable_segments:True


**Training, Evaluation and Model Testing**

In [17]:
# Model Training, or Fine-Tuning
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,


TrainOutput(global_step=1938, training_loss=0.0, metrics={'train_runtime': 1918.2517, 'train_samples_per_second': 4.041, 'train_steps_per_second': 1.01, 'total_flos': 1441023192465408.0, 'train_loss': 0.0, 'epoch': 3.0})

In [18]:
# Model Evaluation
metrics = trainer.evaluate()

print(metrics)

{'eval_loss': nan, 'eval_runtime': 515.0106, 'eval_samples_per_second': 55.75, 'eval_steps_per_second': 27.875, 'epoch': 3.0}


In [19]:
# Summarization Function that uses the model
def summarize(text):
    # Prefix the input text with "summarize: "
    prefixed_text = "summarize: " + text

    # Tokenize the input with appropriate max length and truncation
    inputs = tokenizer(prefixed_text, return_tensors="pt", max_length=512, padding="max_length", truncation=True).to(device)

    # Generate summary using the fine-tuned model
    summary_ids = model.generate(inputs["input_ids"], max_length=128, num_beams=4, early_stopping=True)

    # Decode and return the generated summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [20]:
input_text = """
The US stock market had its worst day of the year, with the Dow Jones Industrial Average dropping by more than 800 points, or 3%, on Tuesday. Investors are concerned about the impact of inflation and potential interest rate hikes by the Federal Reserve. The tech-heavy Nasdaq also fell sharply, losing over 4% of its value. Experts warn that if inflation continues to rise, the central bank may have to increase rates more aggressively, which could lead to slower economic growth. Despite these challenges, some analysts remain optimistic that the market will recover in the long term.
"""

summary = summarize(input_text)
print(summary)

Stocks in the US stock market have fallen sharply on Tuesday, with the Dow Jones Industrial Average falling by more than 800 points.


**Deploying Model onto Hugging Face**

In [21]:
!pip install huggingface_hub;



In [24]:
# Logging into Hugging Face
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Assuming `model` and `tokenizer` are already defined in your notebook
save_directory = "./basic-flan-t5-summarizer"

# Save the fine-tuned model and tokenizer locally
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./basic-flan-t5-summarizer/tokenizer_config.json',
 './basic-flan-t5-summarizer/special_tokens_map.json',
 './basic-flan-t5-summarizer/spiece.model',
 './basic-flan-t5-summarizer/added_tokens.json')

In [28]:
from huggingface_hub import HfApi

# Set your repository name (make sure it matches your Hugging Face username)
repo_name = "varshivenkatesh/basic-flan-t5-text-summarizer"

# Push the model and tokenizer files to the Hugging Face Hub
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/varshivenkatesh/basic-flan-t5-text-summarizer/commit/826616ae2e70057db069442f304ace7b700d9a1c', commit_message='Upload tokenizer', commit_description='', oid='826616ae2e70057db069442f304ace7b700d9a1c', pr_url=None, pr_revision=None, pr_num=None)