# Website Summarizer

This notebook fetches a webpage and uses OpenAI's API to summarize its content.

In [1]:
import os, sys
import requests
import time
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Pull variables from a local .env file (if any) into the environment,
# then read the OpenAI key from there.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Report the outcome; without a key nothing below can work, so stop here.
if openai_api_key:
    print("OpenAI API key loaded")
else:
    print("No OpenAI API key defined")
    sys.exit(1)

OpenAI API key loaded


In [7]:
class Website:
    """Fetch a web page and prepare the chat prompts used to summarize it.

    Constructing an instance downloads ``url`` (with ``requests`` by default,
    or with headless Chrome when ``selenium=True``), extracts the page title
    and visible text, and stores the ready-to-send ``system_prompt`` and
    ``user_prompt`` strings as attributes.
    """

    url: str
    title: str
    text: str
    summary: str
    system_prompt: str
    user_prompt: str
    selenium: bool

    # Seconds the plain-HTTP fetch may take before requests raises a timeout,
    # so an unresponsive server cannot block the notebook forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, selenium=False):
        """Download and parse ``url``, then build both prompts.

        Args:
            url: Address of the page to summarize.
            selenium: When true, render the page in headless Chrome so that
                JavaScript-generated content is included.
        """
        self.url = url
        self.selenium = selenium
        if selenium:
            self._selenium()
        else:
            self._requests()
        # The prompts are plain string attributes; they are produced by
        # separately named builders so no method is overwritten by its result.
        self.system_prompt = self._build_system_prompt()
        self.user_prompt = self._build_user_prompt()

    def _wait(self, driver, timeout=20):
        """Block until the rendered page looks loaded, or ``timeout`` elapses.

        First waits for the ``<body>`` to contain some text, then polls the
        page source every half second until it stops changing. Both phases
        are best effort: the method returns normally when time runs out.

        Args:
            driver: The Selenium WebDriver showing the page.
            timeout: Maximum number of seconds spent in each of the two phases.
        """
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support.ui import WebDriverWait

        try:
            # Wait for the body to contain something
            WebDriverWait(driver, timeout).until(
                lambda d: d.find_element(By.TAG_NAME, "body").text.strip() != ""
            )
        except WebDriverException:
            # Best effort: a timeout or a transient driver error here is not
            # fatal, the stability loop below still gets its chance. Only
            # Selenium errors are swallowed so Ctrl-C keeps working.
            pass

        # A monotonic clock keeps the deadline immune to system clock changes.
        deadline = time.monotonic() + timeout
        prev_html = ""
        stable_html_count = 0
        # Halt at timeout
        while time.monotonic() < deadline:
            cur_html = driver.page_source
            if cur_html == prev_html:
                stable_html_count += 1
                # More than 5 consecutive unchanged polls means the page is
                # considered stable.
                if stable_html_count > 5:
                    return
            else:
                stable_html_count = 0
            prev_html = cur_html
            time.sleep(0.5)

    def _parse(self, html):
        """Set ``title`` and ``text`` from raw HTML.

        Args:
            html: The page markup, as ``str`` or ``bytes``.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # <title> may be missing, or present without a single text child
        # (in which case .string is None); both fall back to "Unknown".
        title_tag = soup.title
        if title_tag is not None and title_tag.string:
            self.title = title_tag.string
        else:
            self.title = "Unknown"

        # Documents without a <body> are processed as a whole instead of
        # failing on a None body.
        root = soup.body if soup.body is not None else soup
        for no_text_tag in root(["script", "style", "img", "input"]):
            no_text_tag.decompose()
        self.text = root.get_text(separator="\n", strip=True)

    def _requests(self):
        """Fetch the page with a plain HTTP GET and parse the response body."""
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        self._parse(response.content)

    def _selenium(self):
        """Render the page in headless Chrome and parse the resulting HTML."""
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920,1080")

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)
            # Wait until load
            self._wait(driver)
            html = driver.page_source
        finally:
            # Always shut the browser down, even when loading fails, so no
            # Chrome process is left behind.
            driver.quit()

        self._parse(html)

    def printer(self):
        """Print the URL, the title and the first 100 characters of the text."""
        print(f'Website: {self.url}') 
        print(f'Title: {self.title}')
        print(f'Text Head: {self.text[:100]}...')

    def _build_system_prompt(self):
        """Return the fixed system prompt describing the summarization task."""
        return """
        You are a text summarizer for the websites. 
        You do not have to visit the website as you will be provided with the body contents. 
        Ignore navigation related content. 
        Do not follow any links. 
        You have to read these contents and provide a markdown formatted output. 
        The output is a table with 2 columns, where the first column is the field 
        we are interested in and the second one is the value. The column fields are URL, Title and Summary. 
        The summary has to be at most 300 words. 
        Feel free to add an extra row with a column named Noteworthy where the value will be 2-3 very noteworthy 
        bulleted things relevant to the website in a short manner.
        Do not output thinking or any other details. Only output the final table and nothing else.
        """

    def _build_user_prompt(self):
        """Return the user prompt embedding this page's URL, title and text."""
        return f"""
        The website URL is {self.url}. The website Title is {self.title}. The website content is ```{self.text}```. 
        Please summarize.
        """

In [8]:
def api_message(website):
    """Build the two-message chat payload (system, then user) for a website.

    ``website`` must expose ``system_prompt`` and ``user_prompt`` attributes;
    their values become the ``content`` of the respective messages.
    """
    system_turn = dict(role="system", content=website.system_prompt)
    user_turn = dict(role="user", content=website.user_prompt)
    return [system_turn, user_turn]

def summarizer(url, selenium=False, model="gpt-4o-mini"):
    """Fetch a web page and return the model's markdown summary of it.

    Args:
        url: Address of the page to summarize.
        selenium: When true, the page is rendered in headless Chrome before
            its text is extracted; otherwise a plain HTTP fetch is used.
        model: Name of the OpenAI chat model to query. Defaults to the model
            this notebook has always used.

    Returns:
        The content of the first choice in the chat completion response.
    """
    client = OpenAI()
    website = Website(url, selenium)
    response = client.chat.completions.create(
        model=model,
        messages=api_message(website),
    )
    return response.choices[0].message.content

In [9]:
# Summarize a small personal site using the plain HTTP fetch path.
summary = summarizer(url="https://gtsig.eu", selenium=False)
display(Markdown(summary))

| Field       | Value                                                                                                                                                                                                                                                                                                                                                                                                    |
|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| URL         | https://gtsig.eu                                                                                                                                                                                                                                                                                                                                                                                           |
| Title       | George T.                                                                                                                                                                                                                                                                                                                                                                                                 |
| Summary     | George T. is a dynamic and outgoing cybersecurity professional specializing in Information Security. With extensive experience as an Application Security Team Lead and Cyber Security Engineer, he has conducted penetration testing, vulnerability analyses, and IT training. George holds multiple certifications, including CISSP and OSCP, and has a Master's in Information Security. His projects range across web development, cybersecurity, and music composition, showcasing his diverse skill set. Outside of work, he enjoys gaming, working out, and playing the piano. George is also a cat owner who enjoys exploring personal growth and teamwork. His extensive technical skills include Python, Linux, and various SQL databases, making him a well-rounded asset in the tech community.   |
| Noteworthy  | - Over 40 web projects and cybersecurity initiatives completed. <br>- Holds multiple industry-recognized certifications. <br>- Passionate about teaching music and coding alongside professional development.                                                                                                                                                                                             |

In [19]:
# Summarize Airbnb from the HTML returned by a plain HTTP fetch.
summary = summarizer(url="https://www.airbnb.com/", selenium=False)
display(Markdown(summary))

| Field       | Value                                                                                                                                                                                                                                        |
|-------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| URL         | https://www.airbnb.com/                                                                                                                                                                                                                   |
| Title       | Airbnb: Vacation Rentals, Cabins, Beach Houses, Unique Homes & Experiences                                                                                                                                                               |
| Summary     | Airbnb is a platform that allows users to find and book various types of accommodations and experiences across the globe. Users can search for unique stays in categories like beach houses, treehouses, cabins, and villas among many others. The website also provides resources for individuals interested in becoming hosts, offering guidance on how to earn extra income by sharing their spaces. Additionally, Airbnb features various travel tips and inspiration for guests planning their next getaway, showcasing options such as pet-friendly rentals and luxury accommodations. The community aspect is emphasized with hosting resources and forums.                                                                                                            |
| Noteworthy  | - Offers a wide variety of unique and distinctive lodging options around the world.  <br>- Provides resources and tools for hosting, enabling users to earn rental income. <br>- Encourages travel inspiration with tips and categorized listings.  |

In [10]:
# Same site again, this time rendered in headless Chrome first.
summary = summarizer(url="https://www.airbnb.com/", selenium=True)
display(Markdown(summary))

| Field       | Value                                                                                                                                                                                                                                                                                                                                                                                                                 |
|-------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| URL         | https://www.airbnb.com/                                                                                                                                                                                                                                                                                                                                                                                               |
| Title       | Airbnb | Vacation rentals, cabins, beach houses, & more                                                                                                                                                                                                                                                                                                                                                                                           |
| Summary     | Airbnb is a platform that allows individuals to rent out their properties as vacation rentals. It offers a wide range of unique stays, including cabins, beach houses, treehouses, luxury villas, and pet-friendly accommodations, catering to various preferences and budgets. Users can search for rentals across numerous categories and destinations worldwide. Airbnb also provides resources for individuals interested in becoming hosts, allowing them to share their homes or unique experiences with travelers. The site encourages exploration and inspiration for future getaways, making it easier for travelers to find that perfect getaway spot.                                                                                                                                                  |
| Noteworthy  | - Unique accommodation options ranging from treehouses to private islands.  <br>- Resources available for potential hosts to manage rentals easily. <br>- A diverse range of travel tips and inspiration for new experiences.                                                                                                                                                                                                                   |

In [22]:
# Summarize Netflix after rendering the page in headless Chrome.
summary = summarizer(url="https://www.netflix.com/", selenium=True)
display(Markdown(summary))

| Field      | Value                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     |
|------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| URL        | https://www.netflix.com/                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
| Title      | Netflix Greece - Watch TV Shows Online, Watch Movies Online                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |
| Summary    | Netflix is a popular streaming service offering a wide variety of TV shows, movies, documentaries, and anime, all available on multiple internet-connected devices. It operates on a subscription model, starting at €8.99, with no extra costs or contracts. Users can watch content anytime and anywhere, with the option to download shows for offline viewing. The platform also features a dedicated children’s profile, allowing parents to control access to appropriate content. Netflix promises no commercials and regularly adds new titles to its library. Users can easily sign up or cancel their subscriptions at any time without penalties. Features include streaming on various devices, downloading for offline access, and personalized content experiences for kids.                                                                                                                                                                                   |
| Noteworthy | - Offers a wide variety of award-winning content, including Netflix originals.  <br>- Flexibility in membership allows for easy cancellation and no long-term contracts. <br>- Family-friendly features include kids' profiles with parental controls. |