<a href="https://colab.research.google.com/github/vishy-2004/text-summarizer-neurathon/blob/main/neurathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [7]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset

In [9]:
import os

# Disable W&B logging
os.environ["WANDB_DISABLED"] = "true"

In [13]:

# Load Model & Tokenizer
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")

# Load & Preprocess Dataset
def load_dataset(csv_path):
    df = pd.read_csv(csv_path)[["dialogue", "summary"]].dropna()
    return Dataset.from_pandas(df)

def preprocess(examples):
    inputs = ["summarize: " + text for text in examples["dialogue"]]
    labels = tokenizer(examples["summary"], padding="max_length", truncation=True, max_length=100)
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=256)

    # Ensure correct label format for training
    model_inputs["labels"] = [[-100 if t == tokenizer.pad_token_id else t for t in lbl] for lbl in labels["input_ids"]]

    return model_inputs

# Load and preprocess data
dataset = load_dataset("samsum-test (1).csv").map(preprocess, batched=True)
train_dataset, val_dataset = dataset.train_test_split(test_size=0.2).values()

# Training Setup
training_args = TrainingArguments(
    output_dir="./t5_summarization",
    eval_strategy="epoch",
    logging_strategy="steps",  # 🔥 Ensures training loss is logged
    logging_steps=500,  # Logs training loss every 500 steps
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    save_strategy="epoch",
    fp16=torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 7,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Save Model
model.save_pretrained("./fine_tuned_t5")
tokenizer.save_pretrained("./fine_tuned_t5")
print("Training completed and model saved!")

# ------------------------ Inference ------------------------

# Load Fine-Tuned Model
model = T5ForConditionalGeneration.from_pretrained("./fine_tuned_t5").to("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = T5Tokenizer.from_pretrained("./fine_tuned_t5")

def generate_summary(text, audience="general", keywords=None, style="balanced"):
    """
    Generate a customized summary.

    :param text: Input text to summarize
    :param audience: "general", "technical", "executive", "legal", "research"
    :param keywords: List of key terms to focus on
    :param style: "precise" (greedy), "balanced" (beam search), "creative" (sampling)
    :return: Generated summary
    """
    length_map = {"technical": (50, 150), "executive": (20, 50), "legal": (40, 120), "research": (40, 120)}
    min_len, max_len = length_map.get(audience, (30, 80))

    if audience in ["legal", "research"] and keywords:
        text = f"summarize with focus on {', '.join(keywords)}: {text}"

    inputs = tokenizer(f"summarize for {audience}: {text}", return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: val.to(model.device) for key, val in inputs.items()}  # Move to correct device

    decoding_params = {
        "precise": {"length_penalty": 2.0, "repetition_penalty": 2.5, "no_repeat_ngram_size": 3},
        "creative": {"do_sample": True, "top_k": 30, "top_p": 0.9, "temperature": 0.7, "repetition_penalty": 2.5, "no_repeat_ngram_size": 3},
        "balanced": {"length_penalty": 1.5, "num_beams": 3, "early_stopping": True, "repetition_penalty": 2.5, "no_repeat_ngram_size": 3}
    }[style]

    summary_ids = model.generate(**inputs, min_length=min_len, max_length=max_len, **decoding_params)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example Usage
text = """The legal implications of AI-generated content are being debated globally.
New regulations are being proposed to ensure AI transparency, data privacy, and ethical responsibility.
Legal experts warn about intellectual property challenges in AI-generated works."""

print("\nGeneral Summary:", generate_summary(text, audience="general"))
print("\nTechnical Summary:", generate_summary(text, audience="technical"))
print("\nExecutive Summary:", generate_summary(text, audience="executive"))
print("\nLegal Summary:", generate_summary(text, audience="legal", keywords=["legal risks"]))


Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,2.264579


Epoch,Training Loss,Validation Loss
1,No log,2.264579
2,No log,2.209448


Training completed and model saved!

General Summary: new regulations are being proposed to ensure AI transparency, data privacy and ethical responsibility. legal experts warn about intellectual property challenges in AI-generated works.

Technical Summary: new regulations are being proposed to ensure AI transparency, data privacy and ethical responsibility. legal experts warn about intellectual property challenges in AI-generated works. I am not aware of the implications of artificial intelligence for AI-based content. but it is not clear whether or not they will be subjected to strict rules.

Executive Summary: new regulations are being proposed to ensure AI transparency, data privacy and ethical responsibility. legal experts warn about intellectual property challenges in AI-generated works.

Legal Summary: legal implications of AI-generated content are being debated globally. new regulations are being proposed to ensure AI transparency, data privacy and ethical responsibility. Legal