# Website Summarizer

This notebook fetches a webpage — with a plain HTTP request or, optionally, with headless Chrome via Selenium — and uses OpenAI's API to summarize its content.

In [None]:
import os, sys
import requests
import time
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Load variables from a .env file (if any), then look up the OpenAI key.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

# Report the outcome; without a key there is nothing useful to run, so exit.
if openai_api_key:
    print("OpenAI API key loaded")
else:
    print("No OpenAI API key defined")
    sys.exit(1)

In [None]:
class Website:
    """A web page fetched and prepared for summarization by a chat model.

    Constructing an instance downloads the page (with ``requests`` or, when
    ``selenium=True``, with headless Chrome), extracts the title and the
    visible text, and builds the system and user prompts.

    After ``__init__`` returns, ``system_prompt`` and ``user_prompt`` are plain
    strings on the instance.
    """

    url: str
    title: str
    text: str
    summary: str  # declared but never assigned by this class
    system_prompt: str
    user_prompt: str
    selenium: bool

    # Tags whose content is not useful text for a summary.
    _NON_TEXT_TAGS = ("script", "style", "img", "input")
    # Seconds to wait for the HTTP response before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self, url, selenium=False):
        """Fetch ``url`` and prepare the prompts.

        Args:
            url: Address of the page to fetch.
            selenium: When true, render the page in headless Chrome instead
                of issuing a plain HTTP request.
        """
        self.url = url
        self.selenium = selenium
        if selenium:
            self._selenium()
        else:
            self._requests()
        # These calls resolve to the methods below (no instance attribute of
        # that name exists yet) and replace them, on this instance, with the
        # generated prompt strings.
        self.system_prompt()
        self.user_prompt()

    def _wait(self, driver, timeout=20):
        """Block until the page looks loaded or ``timeout`` seconds pass.

        Two best-effort phases, each bounded by ``timeout``: first wait for a
        non-empty ``<body>``, then poll ``driver.page_source`` every 0.5 s
        until it stops changing.

        Args:
            driver: Selenium WebDriver that has already navigated to the page.
            timeout: Upper bound, in seconds, for each phase.
        """
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait

        try:
            # Wait for the body to contain something
            WebDriverWait(driver, timeout).until(
                lambda d: d.find_element(By.TAG_NAME, "body").text.strip() != ""
            )
        except WebDriverException:
            # Best effort: a timeout (or other driver error) here is not
            # fatal; the stability polling below still runs.
            pass

        deadline = time.monotonic() + timeout
        prev_html = ""
        stable_html_count = 0
        # Halt at timeout
        while time.monotonic() < deadline:
            cur_html = driver.page_source
            if cur_html == prev_html:
                stable_html_count += 1
                # More than 5 consecutive unchanged polls: consider it stable
                if stable_html_count > 5:
                    return
            else:
                stable_html_count = 0
            prev_html = cur_html
            time.sleep(0.5)

    def _parse_html(self, html):
        """Set ``self.title`` and ``self.text`` from raw HTML.

        Args:
            html: Page markup as ``str`` or ``bytes``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.title
        # ``.string`` is None when <title> is empty or has nested markup.
        self.title = title_tag.string if title_tag and title_tag.string else "Unknown"
        # Fragments without a <body> are parsed from the document root
        # instead of raising on ``None``.
        root = soup.body if soup.body is not None else soup
        for no_text_tag in root(list(self._NON_TEXT_TAGS)):
            no_text_tag.decompose()
        self.text = root.get_text(separator="\n", strip=True)

    def _requests(self):
        """Fetch the page with a plain HTTP GET and parse it."""
        # The timeout prevents an unresponsive server from hanging forever.
        response = requests.get(self.url, timeout=self._REQUEST_TIMEOUT)
        self._parse_html(response.content)

    def _selenium(self):
        """Fetch the page with headless Chrome and parse it."""
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)
            # Wait until load
            self._wait(driver)
            html = driver.page_source
        finally:
            # Always shut the browser down, even if navigation or the wait
            # raised, so no Chrome process is left behind.
            driver.quit()

        self._parse_html(html)

    def printer(self):
        """Print the URL, the title and the first 100 characters of the text."""
        print(f'Website: {self.url}')
        print(f'Title: {self.title}')
        print(f'Text Head: {self.text[:100]}...')

    def system_prompt(self):
        """Build the system prompt, store it on the instance and return it.

        NOTE: the assignment shadows this method on the instance, so after the
        first call ``self.system_prompt`` is the prompt string, not a callable.
        """
        self.system_prompt = """
        You are a text summarizer for the websites. 
        You do not have to visit the website as you will be provided with the body contents. 
        Ignore navigation related content. 
        Do not follow any links. 
        You have to read these contents and provide a markdown formatted output. 
        The output is a table with 2 columns, where the first column is the field 
        we are interested in and the second one is the value. The column fields are URL, Title and Summary. 
        The summary has to be at most 300 words. 
        Feel free to add an extra row with a column named Noteworthy where the value will be 2-3 very noteworthy 
        bulleted things relevant to the website in a short manner.
        Do not output thinking or any other details. Only output the final table and nothing else.
        """
        return self.system_prompt

    def user_prompt(self):
        """Build the user prompt from url/title/text, store and return it.

        NOTE: the assignment shadows this method on the instance, so after the
        first call ``self.user_prompt`` is the prompt string, not a callable.
        """
        self.user_prompt = f"""
        The website URL is {self.url}. The website Title is {self.title}. The website content is ```{self.text}```. 
        Please summarize.
        """
        return self.user_prompt

In [None]:
def api_message(website):
    """Return the chat messages (system turn, then user turn) for ``website``.

    ``website`` must expose ``system_prompt`` and ``user_prompt`` attributes;
    their values become the ``content`` of the respective messages.
    """
    system_turn = {"role": "system", "content": website.system_prompt}
    user_turn = {"role": "user", "content": website.user_prompt}
    return [system_turn, user_turn]

def summarizer(url, selenium=False, model="gpt-4o-mini"):
    """Fetch ``url`` and return the model's Markdown summary of it.

    Args:
        url: Address of the page to summarize.
        selenium: When true, the page is rendered with headless Chrome
            instead of fetched with a plain HTTP request.
        model: Name of the chat-completion model to use. Defaults to the
            previously hard-coded ``"gpt-4o-mini"``.

    Returns:
        The content of the first choice in the chat-completion response.
    """
    # The client is created before fetching, so a client set-up failure
    # surfaces before any page is downloaded (same order as before).
    # NOTE(review): presumably picks up OPENAI_API_KEY from the environment.
    client = OpenAI()
    website = Website(url, selenium)
    response = client.chat.completions.create(
        model=model,
        messages=api_message(website),
    )
    return response.choices[0].message.content

In [None]:
# Summarize https://gtsig.eu with the default (non-selenium) fetch and render the Markdown result.
summary = summarizer("https://gtsig.eu")
display(Markdown(summary))

In [None]:
# Summarize the Airbnb home page with the default (non-selenium) fetch and render the Markdown result.
summary = summarizer("https://www.airbnb.com/")
display(Markdown(summary))

In [None]:
# Same Airbnb URL, this time fetched through headless Chrome (selenium=True).
summary = summarizer("https://www.airbnb.com/", selenium=True)
display(Markdown(summary))

In [None]:
# Summarize the Netflix home page, fetched through headless Chrome (selenium=True).
summary = summarizer("https://www.netflix.com/", selenium=True)
display(Markdown(summary))