<a href="https://colab.research.google.com/github/venkata55s/ai-agents/blob/main/clip2-summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1 - Import required libraries
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [None]:
# 2 - Load the pretrained tokenizer and seq2seq model
# Both objects are created from the same checkpoint name so the vocabulary
# used for encoding always matches the one the model was loaded with.
_CHECKPOINT = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(_CHECKPOINT)

In [3]:
def _dedupe_sentences(summary):
  """Remove repeated sentences from ``summary``, keeping first occurrences.

  Sentences are delimited by ". ". Only the final sentence still carries its
  closing period after the split ("A. B. A." -> ["A", "B", "A."]), so that
  period is set aside before comparing; otherwise a repeated final sentence
  would never match its earlier occurrence.
  """
  sentences = summary.split(". ")
  ends_with_period = sentences[-1].endswith(".")
  if ends_with_period:
    sentences[-1] = sentences[-1][:-1]

  # dict.fromkeys keeps insertion order, so first occurrences win.
  unique_sentences = list(dict.fromkeys(sentences))

  # If the final sentence was dropped as a duplicate, the sentence that is now
  # last came from the interior of the text, where it was followed by ". ",
  # so it still needs its closing period restored.
  final_was_dropped = sentences[-1] in sentences[:-1]

  deduped = ". ".join(unique_sentences)
  if ends_with_period or final_was_dropped:
    deduped += "."
  return deduped


def summarize_text(text):
  """Summarize ``text`` with the module-level ``tokenizer`` and ``model``.

  Args:
    text: The passage to summarize.

  Returns:
    The generated summary with repeated sentences removed. A blank or
    whitespace-only string yields "" without running the model.
  """
  # With nothing to summarize the model would only see the prompt itself and
  # produce an unrelated "summary", so short-circuit instead.
  if isinstance(text, str) and not text.strip():
    return ""

  # Tokenize the input text.
  # The "summarize: Focus on key impacts and industries: " prefix is prepended to guide the model.
  # The result is returned as PyTorch tensors and truncated to at most 512 tokens (prefix included).
  inputs = tokenizer.encode(
      "summarize: Focus on key impacts and industries: " + text,
      return_tensors="pt",
      max_length=512,
      truncation=True
  )

  # Generate a summary with beam search:
  # max_length: Cap the summary at 40 tokens to keep it concise.
  # min_length: Require at least 10 tokens.
  # length_penalty: Exponent applied to the sequence length when scoring beams.
  #   NOTE: per the transformers generation docs, values > 0 favor LONGER
  #   sequences and values < 0 favor shorter ones, so 3.5 pushes summaries
  #   toward max_length rather than toward brevity.
  # num_beams: Use beam search with 5 beams.
  # early_stopping: Stop as soon as num_beams complete candidates exist.
  outputs = model.generate(
      inputs,
      max_length=40,
      min_length=10,
      length_penalty=3.5,
      num_beams=5,
      early_stopping=True
  )

  # Decode the generated tokens back into text, dropping special tokens (e.g., <pad>, <eos>).
  summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

  # Post-process: drop repeated sentences to improve readability.
  return _dedupe_sentences(summary)

In [None]:
if __name__ == "__main__":
  # Demo input: a short passage about AI and the industries that use it.
  sample_text = (
      "Artificial intelligence is a rapidly growing field that involves the creation of "
      "intelligent machines capable of performing tasks that typically require human intelligence. "
      "It is being used in various industries, including healthcare, finance, and transportation, "
      "to improve efficiency and solve complex problems."
  )

  # Show the source passage under its heading.
  print("Original Text:", sample_text, sep="\n")

  # Emit the summary heading before running the model, then the summary itself.
  print("\nSummary:")
  summary = summarize_text(sample_text)
  print(summary)