# Unzip the Model Archive Exported from the Model Notebook

In [None]:
!unzip -q bart_summarizer_finetuned.zip -d ./bart_summarizer_finetuned


# Install Packages

In [None]:
!pip -q install -U transformers accelerate torch
!pip -q install -U newspaper3k lxml_html_clean
!pip -q install ipywidgets

# Import newspaper3k and widgets

In [None]:
import newspaper
from newspaper import Article
# google.colab is only importable inside a Colab runtime. Guard the import so
# the notebook also runs elsewhere; in that case `output` is None and no
# custom widget manager is enabled.
try:
    from google.colab import output
except ImportError:
    output = None
else:
    output.enable_custom_widget_manager()

# Load the Model from Directory

In [None]:
import os
from pathlib import Path
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# When the notebook runs from a "notebooks" folder, the project root is its parent.
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

_MODEL_DIR_NAME = "bart_summarizer_finetuned"

# Places the checkpoint may live, in priority order:
#   1. the project's models/ folder (the original location used here),
#   2. the directory the unzip cell at the top of this notebook extracts into,
#   3. the same directory with one extra nesting level, in case the archive
#      itself contains a top-level folder (assumption: archive layout is not
#      visible from this notebook).
_candidate_dirs = [
    PROJECT_ROOT / "models" / _MODEL_DIR_NAME,
    Path.cwd() / _MODEL_DIR_NAME,
    Path.cwd() / _MODEL_DIR_NAME / _MODEL_DIR_NAME,
]

# from_pretrained needs a config.json inside a local model directory, so use
# that file as the marker for "a checkpoint is really here".
save_dir = next((d for d in _candidate_dirs if (d / "config.json").is_file()), None)
if save_dir is None:
    raise FileNotFoundError(
        "Could not find a model directory containing config.json. Looked in: "
        + ", ".join(str(d) for d in _candidate_dirs)
    )

# Prefer the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(save_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(save_dir).to(device)
model.eval()  # inference only

print("Loaded on:", device)

# Fetching the Title and Contents of an Article from a Link

In [None]:
def fetch_article(url: str, language: str = "en", char_cap: int = 30000):
    """Download and parse one article, returning its URL, title and body text.

    Args:
        url: Address of the article to fetch.
        language: Language code handed to ``Article``.
        char_cap: Maximum number of characters of body text to keep;
            ``None`` disables the cap.

    Returns:
        A dict with the keys ``"url"``, ``"title"`` and ``"text"``. Title and
        text are whitespace-stripped and are empty strings when the parsed
        article has none.
    """
    article = Article(url, language=language)
    article.download()
    article.parse()

    raw_text = article.text
    body = raw_text.strip() if raw_text else ""

    # Very long pages (Yahoo can be huge) are cut down to the first
    # char_cap characters.
    over_cap = char_cap is not None and len(body) > char_cap
    if over_cap:
        body = body[:char_cap]

    raw_title = article.title
    headline = raw_title.strip() if raw_title else ""

    return dict(url=url, title=headline, text=body)


# Summarize Text Using the Fine-Tuned Model

In [None]:
def summarize_texts(
    texts,
    max_new_tokens: int = 200,
    min_new_tokens: int = 40,
    num_beams: int = 4,
):
    """Summarize a batch of texts with the loaded seq2seq model.

    Args:
        texts: Iterable of strings to summarize. A single string is treated
            as a batch of one. ``None`` or whitespace-only entries are
            replaced by ``""`` before tokenization.
        max_new_tokens: Upper bound on generated tokens per summary.
        min_new_tokens: Lower bound on generated tokens per summary.
        num_beams: Beam width used for generation.

    Returns:
        A list with one stripped summary string per input, in input order.
        An empty input yields an empty list without calling the model.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(texts, str):
        texts = [texts]

    cleaned = [t if (t and t.strip()) else "" for t in texts]
    if not cleaned:
        # Nothing to tokenize or generate from; skip the model entirely.
        return []

    # Inputs longer than the model's position limit are truncated
    # (BART is typically 1024).
    max_input_len = getattr(model.config, "max_position_embeddings", 1024)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        inputs = tokenizer(
            cleaned,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_input_len,
        ).to(device)

        output_ids = model.generate(
            **inputs,
            num_beams=num_beams,
            max_new_tokens=max_new_tokens,
            min_new_tokens=min_new_tokens,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
            early_stopping=True,
        )

    return [tokenizer.decode(ids, skip_special_tokens=True).strip() for ids in output_ids]


# Summarizing the Article from a Link

In [None]:
import re
from typing import List
def summarize_links(
    urls,
    language: str = "en",
    batch_size: int = 4,   # increase if GPU can handle it
):
    """
    Variable number of links -> one summary per link.

    Args:
        urls: Iterable of article URLs. A single string is treated as one URL.
        language: Language code forwarded to ``fetch_article``.
        batch_size: Number of article texts summarized per model call;
            must be at least 1.

    Returns:
        A list with one dict per URL, in input order, with the keys
        ``"url"``, ``"title"``, ``"chars_extracted"`` and ``"summary"``.
        When no text could be obtained for a URL, ``"summary"`` holds an
        explanatory message instead of a model summary.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    # A bare string would otherwise be iterated character by character.
    if isinstance(urls, str):
        urls = [urls]

    no_text_msg = "Could not extract article text (site may block scraping)."

    # 1) scrape -- a failure on one link must not abort the others, so each
    #    fetch is isolated and the reason is kept for the result message.
    articles = []
    fetch_errors = []
    for u in urls:
        try:
            articles.append(fetch_article(u, language=language))
            fetch_errors.append(None)
        except Exception as exc:  # best-effort scraping: report, don't crash
            articles.append({"url": u, "title": "", "text": ""})
            fetch_errors.append(f"{type(exc).__name__}: {exc}")

    # 2) summarize in batches -- only articles that actually have text;
    #    empty ones would just waste a model call and be discarded anyway.
    pending = [idx for idx, art in enumerate(articles) if art["text"]]
    summary_by_index = {}
    for start in range(0, len(pending), batch_size):
        chunk = pending[start:start + batch_size]
        chunk_summaries = summarize_texts([articles[idx]["text"] for idx in chunk])
        summary_by_index.update(zip(chunk, chunk_summaries))

    # 3) pack results
    results = []
    for idx, art in enumerate(articles):
        if idx in summary_by_index:
            summary = summary_by_index[idx]
        elif fetch_errors[idx]:
            summary = f"{no_text_msg} [{fetch_errors[idx]}]"
        else:
            summary = no_text_msg
        results.append({
            "url": art["url"],
            "title": art["title"],
            "chars_extracted": len(art["text"]),
            "summary": summary,
        })
    return results

# Creating Widget Interface to paste links

Articles to use:
https://finance.yahoo.com/news/why-shares-wix-com-stock-204914840.html

https://finance.yahoo.com/news/exxon-mobil-xom-forecasts-lower-205226995.html

https://finance.yahoo.com/news/chevron-cvx-talks-expand-oil-205238211.html

In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output

MAX_LINKS = 3  # number of URL fields the form offers


def _url_box(i):
    """Build the text field for link slot ``i`` (zero-based)."""
    return widgets.Text(
        placeholder=f"Paste link {i+1} (optional)",
        description=f"URL {i+1}:",
        layout=widgets.Layout(width="90%"),
    )


# One input field per slot.
boxes = list(map(_url_box, range(MAX_LINKS)))

# Trigger button, plus the output area used as a context manager by on_click.
btn = widgets.Button(description="Summarize", button_style="primary")
out = widgets.Output()

def on_click(_):
    """Button handler: summarize every non-empty URL field and print the results.

    The argument (the clicked button, supplied by ipywidgets) is unused.
    Output is written into ``out``, replacing whatever was shown before.
    """
    # Summarizing can take a while. Keep the button disabled for the whole
    # run so repeated clicks do not queue duplicate runs, and always
    # re-enable it afterwards, even on an early return or an error.
    btn.disabled = True
    try:
        with out:
            clear_output()
            urls = [b.value.strip() for b in boxes if b.value.strip()]
            if not urls:
                print("Please enter at least one URL.")
                return

            # One article per model call.
            # NOTE(review): presumably chosen to limit memory use -- confirm.
            results = summarize_links(urls, batch_size=1)

            for i, r in enumerate(results, 1):
                print(f"\n--- Article {i} ---")
                print("Title:", r["title"])
                print("URL:", r["url"])
                print("Extracted chars:", r["chars_extracted"])
                print("Summary:", r["summary"])
    finally:
        btn.disabled = False

# Register the handler so pressing the button runs on_click.
btn.on_click(on_click)

# Show the URL fields, then the button, then the output area.
display(*boxes, btn, out)
