In [1]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from urllib.parse import urlparse
import re

# The pipeline is built once at module load and kept in the module-level
# `summarizer` name, which summarize_text() calls for every chunk.
_MODEL_NAME = "facebook/bart-large-cnn"

print("Loading summarization model... (This may take 30–60 seconds)")
summarizer = pipeline("summarization", model=_MODEL_NAME)
print("Model loaded successfully!\n")


def extract_text_from_url(url):
    """
    Fetch a webpage and return the whitespace-normalized text of its <p> tags.

    Works best for blogs, reports, and article pages, where the main content
    lives in paragraph elements.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The paragraph text joined by single spaces (an empty string when the
        page has no <p> content), or None if the request failed or the server
        answered with an HTTP error status. Failures are reported on stdout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Covers connection errors, timeouts, malformed URLs and HTTP error
        # statuses; unrelated programming errors are left to propagate.
        print(f"Error fetching URL: {e}")
        return None

    # Parse the raw bytes rather than response.text: when the Content-Type
    # header carries no charset, requests falls back to ISO-8859-1 for text
    # responses, which garbles UTF-8 pages. Only a charset the server actually
    # declared is forwarded; otherwise BeautifulSoup sniffs the encoding from
    # the document itself (e.g. a <meta charset> tag).
    content_type = response.headers.get("Content-Type", "")
    declared_encoding = None
    if "charset=" in content_type.lower():
        declared_encoding = response.encoding
    soup = BeautifulSoup(
        response.content, "html.parser", from_encoding=declared_encoding
    )

    # Remove unwanted tags (scripts, styles, page chrome) so that paragraphs
    # nested inside them do not end up in the extracted text.
    for tag in soup(["script", "style", "header", "footer", "nav", "form", "aside"]):
        tag.extract()

    # Extract paragraphs
    text = " ".join(p.get_text() for p in soup.find_all("p"))

    # Collapse every run of whitespace (including newlines) to a single space.
    return re.sub(r"\s+", " ", text).strip()

def summarize_text(text, max_chunk=1024):
    """
    Split long text into word-based chunks and summarize each part.

    Args:
        text: The text to summarize. None, empty, or whitespace-only input
            is treated as "nothing to summarize".
        max_chunk: Maximum number of whitespace-separated words per chunk.
            Note this counts words, not model tokens.

    Returns:
        The per-chunk summaries joined by single spaces, or the message
        "No text found to summarize." when there are no words in `text`.
        Chunks whose summarization raises are reported on stdout and skipped,
        so the result may be an empty string if every chunk fails.
    """
    # Splitting first also catches whitespace-only input, which is truthy but
    # contains nothing to summarize.
    words = text.split() if text else []
    if not words:
        return "No text found to summarize."

    summaries = []

    for start in range(0, len(words), max_chunk):
        chunk = " ".join(words[start:start + max_chunk])
        try:
            # truncation=True makes the tokenizer cut inputs that exceed the
            # model's maximum input length; a chunk of max_chunk words can
            # tokenize to more tokens than the model accepts, which would
            # otherwise raise and cause the whole chunk to be dropped.
            summary = summarizer(
                chunk,
                max_length=200,
                min_length=50,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary[0]['summary_text'])
        except Exception as e:
            # Deliberate best-effort: one bad chunk should not abort the run.
            print(f"Skipping chunk due to error: {e}")
            continue

    return " ".join(summaries)

if __name__ == "__main__":
    # Interactive entry point: prompt for a URL, extract the page's paragraph
    # text, and print a summary of it.
    print("Web Page Text Summarizer\n")
    url = input("Enter the URL of a report, article, or research paper:\n> ").strip()

    # Input without a scheme (e.g. "example.com/post") is assumed to be HTTPS.
    if urlparse(url).scheme == "":
        url = "https://" + url

    print("\nExtracting text from webpage...")
    text = extract_text_from_url(url)

    # Pages yielding fewer than 300 characters (or a failed fetch, which
    # returns None) are rejected as not worth summarizing.
    if text and len(text) >= 300:
        print("Text extracted successfully!\n")
        print(f"Extracted text preview (first 500 characters):\n{text[:500]}...\n")

        print("Generating summary... Please wait...\n")
        summary = summarize_text(text)

        print("Final Summary:\n")
        print(summary)
    else:
        print("Not enough readable text found on this page. Try another URL.")


  from .autonotebook import tqdm as notebook_tqdm


Loading summarization model... (This may take 30–60 seconds)



Device set to use cpu


Model loaded successfully!

Web Page Text Summarizer


Extracting text from webpage...
Text extracted successfully!

Extracted text preview (first 500 characters):
Introduction to Operating System Types of Operating Systems Kernel in Operating System System Call What happens when we turn on computer? Introduction of Process Management CPU Scheduling in Operating Systems Introduction to Process Synchronization Solutions to Process Synchronization Problems Classical IPC Problems Introduction of Deadlock in Operating System Handling Deadlocks Multithreading in OS - Different Models Introduction to memory and memory units Memory Management in Operating System ...

Generating summary... Please wait...

Final Summary:

In an operating system that uses paging, a page replacement algorithm is needed when a page fault occurs and no free page frame is available. In this case, one of the existing pages in memory must be replaced with the new page. Optimal page replacement is perfect, but not poss