In [3]:
# imports

import os
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI

In [4]:
# Read the API key from the environment (optionally populated from a local
# .env file) rather than embedding it in the notebook, where it would be
# saved alongside the outputs and leak when the notebook is shared.
load_dotenv()
openai = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [5]:
# A class to represent a Webpage

class Website:
    """A web page reduced to its title and its visible text.

    The page is downloaded and parsed eagerly when the object is constructed.
    """

    url: str
    title: str
    text: str

    def __init__(self, url, timeout=30):
        """Download ``url`` and extract its title and text content.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the kernel indefinitely.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.content, 'html.parser')

        # <title> can be absent, or present without a single string child
        # (in which case .string is None); fall back to the placeholder in both cases.
        has_title = soup.title is not None and soup.title.string
        self.title = soup.title.string if has_title else "No title found"

        # A document without a <body> tag leaves soup.body as None; use the
        # whole parsed document instead of crashing.
        root = soup.body if soup.body is not None else soup

        # Drop elements that carry no readable text before extracting it.
        for irrelevant in root(["script", "style", "img", "input"]):
            irrelevant.decompose()
        self.text = root.get_text(separator="\n", strip=True)

In [7]:
# Quick check of the scraper against a live page: show its title, then its text

ed = Website("https://www.kaggle.com/")
print(ed.title, ed.text, sep="\n")

Kaggle: Your Machine Learning and Data Science Community



In [8]:
# System message sent with every request: sets the assistant's role and asks
# for a short markdown summary that skips navigation text.
system_prompt = (
    "You are an assistant and an expert in analysis that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)

In [53]:
def user_prompt_for(website):
    """Build the user message asking for a summary of ``website``.

    Args:
        website: Object exposing ``title`` and ``text`` string attributes.

    Returns:
        The prompt string: a line naming the page title, the summary
        instructions, a blank line, then the page text.
    """
    # The newline keeps the title from running straight into the next
    # sentence ("...titled FooThe contents...").
    user_prompt = f"You are looking at a website titled {website.title}\n"
    user_prompt += (
        "The contents of this website is as follows; "
        "please provide a short summary of this website in markdown. "
        "If it includes news or announcements, then summarize these too.\n\n"
    )
    user_prompt += website.text
    return user_prompt

In [54]:
def messages_for(website):
    """Return the two-message chat payload (system, then user) for ``website``."""
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]

## Time to bring it together - the API for OpenAI is very simple!

In [55]:
def summarize(url, model="gpt-4o-mini"):
    """Scrape ``url`` and return a summary of it from the chat model.

    Args:
        url: Address of the page to summarize.
        model: Name of the chat model to request; defaults to the model
            this notebook was written against.

    Returns:
        The content of the first choice in the chat completion response.
    """
    website = Website(url)
    response = openai.chat.completions.create(
        model=model,
        messages=messages_for(website),
    )
    return response.choices[0].message.content

In [58]:
# Leaving the call as the cell's last expression echoes the returned summary
# as a plain Python string (markdown markup shown unrendered).
summarize("https://www.youtube.com/")

'# Summary of YouTube Website\n\nYouTube is a video-sharing platform where users can upload, view, and engage with video content. The website serves a wide range of audiences, including content creators, advertisers, and developers. Key features include a variety of content categories, support for content creators, and policies regarding copyright and privacy.\n\n### Notable Sections:\n- **Content Creators**: Resources and guidelines for individuals who produce videos for the platform.\n- **Advertising**: Information on advertising opportunities available to businesses.\n- **Privacy and Policies**: Detailed policies regarding user privacy and community standards.\n- **YouTube Features**: Insights into new features and updates to enhance user experience.\n\nOverall, YouTube functions as a comprehensive platform that connects viewers, content creators, and advertisers while adhering to legal guidelines and privacy standards.'

In [59]:
def display_summary(url):
    """Summarize the page at ``url`` and render the result as Markdown."""
    display(Markdown(summarize(url)))

In [61]:
# Same page as before, but rendered as formatted Markdown instead of a raw string.
display_summary("https://www.youtube.com/")

# YouTube Overview

YouTube is a video sharing platform that allows users to upload, share, and view videos. It serves as a social media site where content creators can reach audiences through a wide range of video content. The website offers various features for both audience engagement and content creation.

## Key Sections
- **Content Creation**: Tools and resources for creators to produce and upload videos.
- **Advertising**: Information for businesses looking to advertise on the platform.
- **Legal**: Sections detailing copyright, terms of service, privacy policies, and community guidelines.
- **Support**: Resources for press inquiries, developer support, and user help.

The website also highlights a commitment to user privacy and security, ensuring a safe environment for all users.

*Note: There are no specific news or announcements mentioned in the provided content.*

# Let's try more websites

Note that this will only work on websites that can be scraped using this simplistic approach.

Websites that are rendered with JavaScript, like React apps, won't show up. See the community-contributions folder for a Selenium implementation that gets around this.

Also, websites protected with CloudFront (and similar) may give 403 errors - many thanks Andy J for pointing this out.

But many websites will work just fine!

In [None]:
# Try the summarizer on another site; see the scraping caveats noted above.
display_summary("https://cnn.com")

In [None]:
# One more site to summarize; the same scraping caveats apply.
display_summary("https://anthropic.com")