1. Install Dependencies

In [30]:
!pip install transformers datasets torch requests beautifulsoup4 --quiet

2. Download Dataset

In [31]:
from google.colab import userdata
userdata.get('HuggingFace')

'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'

In [32]:
from datasets import load_dataset

# Load CNN/DailyMail summarization dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data = dataset["train"].select(range(1000))   # small sample for faster training
test_data  = dataset["test"].select(range(200))


3. Preprocess the Data

In [33]:
from transformers import AutoTokenizer

model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(example):
    inputs = ["summarize: " + doc for doc in example["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    labels = tokenizer(example["highlights"], max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_data.map(preprocess_data, batched=True)
tokenized_test  = test_data.map(preprocess_data, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

4. Fine-tune the Model

In [34]:
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq
)
import os

# Disable W&B
os.environ["WANDB_DISABLED"] = "true"

# Model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",   # ✅ Corrected
    learning_rate=2e-4,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=1,
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,           # ✅ Added
    data_collator=data_collator,   # ✅ Added
)

# Train
trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.0257,2.164441


TrainOutput(global_step=500, training_loss=2.1757991523742675, metrics={'train_runtime': 85.4255, 'train_samples_per_second': 11.706, 'train_steps_per_second': 5.853, 'total_flos': 134474768056320.0, 'train_loss': 2.1757991523742675, 'epoch': 1.0})

5. Save the Model

In [35]:
trainer.save_model("web_summarizer")
tokenizer.save_pretrained("web_summarizer")

('web_summarizer/tokenizer_config.json',
 'web_summarizer/special_tokens_map.json',
 'web_summarizer/spiece.model',
 'web_summarizer/added_tokens.json',
 'web_summarizer/tokenizer.json')

6. Summarizer Setup + Website Text Extraction

In [36]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline

# Load summarizer
summarizer = pipeline("summarization", model="web_summarizer", tokenizer="web_summarizer")

# --- Fetch website text ---
def get_website_text(url):
    try:
        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract readable paragraphs
        paragraphs = [p.get_text() for p in soup.find_all("p")]
        text = " ".join(paragraphs)
        text = " ".join(text.split())  # clean up spaces
        return text
    except Exception as e:
        print(f"❌ Error fetching website: {e}")
        return ""


Device set to use cuda:0


Handle Long Websites Automatically

In [37]:
def split_text(text, max_words=800):
    words = text.split()
    for i in range(0, len(words), max_words):
        yield " ".join(words[i:i+max_words])

def summarize_long_text(text):
    chunks = list(split_text(text))
    print(f"🧩 Split into {len(chunks)} chunks")

    summaries = []
    for i, chunk in enumerate(chunks):
        print(f"⏳ Summarizing chunk {i+1}/{len(chunks)}...")
        summary = summarizer(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(summary[0]['summary_text'])

    # Combine partial summaries
    combined_summary = " ".join(summaries)

    # Optional: summarize again to compress
    final_summary = summarizer(combined_summary, max_length=200, min_length=60, do_sample=False)
    return final_summary[0]['summary_text']


8. Smart Website Summarization

In [40]:
url = input("🔗 Enter website URL: ")
article_text = get_website_text(url)

if len(article_text.strip()) == 0:
    print("⚠️ No readable text found on this website.")
else:
    print("\n🧾 Summary:\n")
    sentences = final_summary.split(". ")
    for i, sentence in enumerate(sentences, start=1):
        if sentence.strip():
            print(f"• {sentence.strip()}.")


🔗 Enter website URL: https://en.wikipedia.org/wiki/Iron_Man

🧾 Summary:

• HDToday is the best site to watch movies and TV series online for free.
• No ads - Free and Fast streaming server - No account required to watch - One click streaming.
• Huge library with over 400,000 videos - 400,000 downloads of movies ..


9. Download Model

In [41]:
!zip -r web_summarizer.zip ./web_summarizer

  adding: web_summarizer/ (stored 0%)
  adding: web_summarizer/tokenizer.json (deflated 74%)
  adding: web_summarizer/tokenizer_config.json (deflated 95%)
  adding: web_summarizer/spiece.model (deflated 48%)
  adding: web_summarizer/generation_config.json (deflated 27%)
  adding: web_summarizer/special_tokens_map.json (deflated 85%)
  adding: web_summarizer/model.safetensors (deflated 9%)
  adding: web_summarizer/config.json (deflated 63%)
  adding: web_summarizer/training_args.bin (deflated 53%)
