In [1]:
import requests
from bs4 import BeautifulSoup
import json


In [2]:
# Constants
# URL of the Ollama chat endpoint; points at a server on localhost, port 11434.
OLLAMA_API = "http://localhost:11434/api/chat"
# HTTP headers sent with every request to the Ollama API (the payload is JSON).
HEADERS = {"Content-Type": "application/json"}
# Name of the model requested in each chat payload.
MODEL = "llama3.2"

In [3]:
def fetch_website_content(url, timeout=10):
    """
    Fetch the textual content of a website, ignoring irrelevant elements.

    Boilerplate containers (<script>, <style>, <header>, <footer>, <nav>,
    <aside>) are removed, then the text of paragraphs, headings and list items
    is collected in document order and whitespace-normalized.

    :param url: The URL of the website to summarize.
    :param timeout: Seconds to wait for the server before giving up. Without a
        timeout, ``requests`` may block indefinitely on an unresponsive host.
    :return: Cleaned and relevant text content of the website, or ``None`` if
        the request failed (the error is printed).
    """
    # Tags whose text is considered content.
    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Ensure request was successful

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove irrelevant elements like <script>, <style>, etc.
        for element in soup(["script", "style", "header", "footer", "nav", "aside"]):
            element.decompose()

        text_parts = [
            # A separator keeps inline children apart: without it,
            # "<p>Hello <b>world</b></p>" collapses to "Helloworld".
            elem.get_text(separator=" ", strip=True)
            for elem in soup.find_all(text_tags)
            # Skip elements nested inside another matched element (e.g. a <p>
            # inside an <li>): the ancestor's text already contains theirs, so
            # emitting both would duplicate the content.
            if elem.find_parent(text_tags) is None
        ]

        # Collapse any remaining runs of whitespace into single spaces.
        return ' '.join(' '.join(text_parts).split())

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the website content: {e}")
        return None

In [4]:
def split_text_into_chunks(text, max_length=2000):
    """
    Splits the text into smaller chunks on whitespace boundaries.

    Words are never broken apart. Each chunk is at most ``max_length``
    characters long, except when a single word is itself longer than
    ``max_length``; such a word is emitted as a chunk of its own.

    :param text: The input text to split.
    :param max_length: The maximum length of each chunk, in characters.
    :return: A list of non-empty text chunks (empty list for blank input).
    """
    chunks = []
    current_chunk = []
    current_length = 0  # Length of " ".join(current_chunk), tracked incrementally.

    for word in text.split():
        # A joining space is only needed when the chunk already holds a word.
        candidate_length = current_length + len(word) + (1 if current_chunk else 0)

        # Only flush when there is something to flush; otherwise an oversized
        # first word would produce an empty chunk.
        if current_chunk and candidate_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            current_chunk.append(word)
            current_length = candidate_length

    # Add the last chunk if it exists
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [5]:
def summarize_text(text, timeout=300):
    """
    Send text to the Ollama API to summarize, handling streaming responses.

    The response body is read line by line; each non-empty line is parsed as a
    JSON object and the ``message.content`` fragments are concatenated.

    :param text: The text to summarize.
    :param timeout: Seconds to wait for the server (connection and each read)
        before giving up, so a stalled server cannot block forever.
    :return: The summary returned by the Ollama API, the string
        ``"No summary returned."`` if the stream carried no content, or
        ``None`` if the request failed (the error is printed).
    """
    payload = {
        "model": MODEL,
        "messages": [
            {"role": "system", "content": "Summarize the following text."},
            {"role": "user", "content": text}
        ]
    }
    try:
        # Enable streaming by setting stream=True. The context manager makes
        # sure the connection is released even if reading fails part-way.
        with requests.post(
            OLLAMA_API, headers=HEADERS, json=payload, stream=True, timeout=timeout
        ) as response:
            response.raise_for_status()
            # JSON is UTF-8; do not depend on the server declaring a charset.
            response.encoding = "utf-8"

            # Combine the streamed chunks into a single message
            fragments = []
            for line in response.iter_lines(decode_unicode=True):
                if not line:  # Process only non-empty lines
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    print("Invalid JSON in response line:", line)
                    continue

                # Tolerate lines that are not objects or carry null fields.
                message = data.get("message") if isinstance(data, dict) else None
                if isinstance(message, dict):
                    fragments.append(message.get("content") or "")

        full_message = "".join(fragments)
        return full_message.strip() if full_message else "No summary returned."

    except requests.exceptions.RequestException as e:
        print(f"Error communicating with the Ollama API: {e}")
        return None

In [6]:
def summarize_large_text(text, max_length=2000):
    """
    Splits large text into smaller chunks and summarizes each chunk separately.

    Chunks whose summarization fails (``summarize_text`` returns ``None``) are
    reported and skipped, so one failed request does not abort the whole run.

    :param text: The large text to summarize.
    :param max_length: The maximum length of each chunk.
    :return: The per-chunk summaries joined with single spaces; an empty
        string if no chunk could be summarized.
    """
    # Drop empty chunks so the model is never asked to summarize nothing.
    chunks = [chunk for chunk in split_text_into_chunks(text, max_length) if chunk]
    summaries = []

    print(f"Text split into {len(chunks)} chunks for summarization.")
    for i, chunk in enumerate(chunks, start=1):
        print(f"Summarizing chunk {i} of {len(chunks)}...")
        summary = summarize_text(chunk)
        if summary is None:
            # Joining None into the final string would raise a TypeError.
            print(f"Skipping chunk {i}: no summary was produced.")
            continue
        summaries.append(summary)

    # Combine all summaries into a final summary
    return " ".join(summaries)

In [7]:
def website_summarizer(url):
    """
    Run the full pipeline for one page: download its text, then summarize it.

    :param url: The URL of the website to summarize.
    :return: The summarized text, or a failure message when no content
        could be retrieved.
    """
    print(f"Fetching content from: {url}")
    page_text = fetch_website_content(url)

    # Only proceed to summarization when some text actually came back.
    if page_text:
        print("Summarizing the content...")
        return summarize_large_text(page_text)

    return "Failed to fetch website content."

In [8]:
if __name__ == "__main__":
    # Ask for the page to summarize, then run the whole pipeline on it.
    url = input("Enter a website URL to summarize: ")
    summary = website_summarizer(url)
    # Heading and summary are emitted as separate lines, in a single call.
    print("\nSummary:\n", summary, sep="\n")

Enter a website URL to summarize:  https://www.teachermagazine.com/sea_en/articles/a-student-diary-project-improving-literacy-skills-and-wellbeing


Fetching content from: https://www.teachermagazine.com/sea_en/articles/a-student-diary-project-improving-literacy-skills-and-wellbeing
Summarizing the content...
Text split into 2 chunks for summarization.
Summarizing chunk 1 of 2...
Summarizing chunk 2 of 2...

Summary:

The Ar Ridha Al Salaam School, an environmentally friendly Islamic school in Indonesia, is implementing a student diary project to improve literacy skills and wellbeing. Due to the pandemic, children were limited to home environments, leading to boredom. To address this issue, the school introduced a diary writing activity where students write about their daily lives for one week at a time, sharing their thoughts and ideas in simple language. The program was highly successful, with almost all students, including those from Grade 1, participating and showing enthusiasm. The project has been shown to be an effective way to improve literacy skills while also providing students with a platform to express themselves and co