In [63]:
import datetime
import os
import time

import openai
from bs4 import BeautifulSoup
from IPython.display import display, Markdown
from openai import OpenAI
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [64]:
# Configure the module-level OpenAI API key from the OPENAI_API_KEY environment
# variable. The value is None when the variable is not set; keep real keys out
# of the notebook itself.
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [65]:
# Explicit client instance. The key is read from the OPENAI_API_KEY environment
# variable so no credential is stored in the notebook. The original placeholder
# is kept only as a fallback, so this cell still runs when the variable is
# unset or empty (requests made with the placeholder will be rejected by the API).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY") or "your-api-key-here")

In [66]:
class WebsiteCrawler:
    """Fetch a page with headless Chrome and reduce it to a title plus plain text.

    Scraping runs eagerly from the constructor. After construction:

    * ``title`` is the browser's page title, or ``"Error occurred"`` if any
      step of the scrape raised.
    * ``text`` is at most ``MAX_LINES`` cleaned lines of page text joined by
      newlines, or ``"Could not scrape website content"`` on failure.
    """

    # Tags removed from the document before text extraction.
    UNWANTED_TAGS = ["script", "style", "img", "input", "button", "nav", "footer", "header"]
    # Upper bound on the number of text lines kept in ``text``.
    MAX_LINES = 200

    def __init__(self, url):
        """Store ``url`` and immediately scrape it.

        Args:
            url: Address of the page to load.
        """
        self.url = url
        self.title = ""
        self.text = ""
        self.scrape()

    @staticmethod
    def _log(message):
        """Print ``message`` prefixed with the current wall-clock time."""
        print(f"🕐 [{datetime.datetime.now().strftime('%H:%M:%S')}] {message}")

    @staticmethod
    def _simulate_progress(steps, desc, delay):
        """Show a cosmetic tqdm bar that sleeps ``delay`` seconds per step.

        The bar does not track real work; it only paces the console output.
        """
        for _ in tqdm(steps, desc=desc, unit="step"):
            time.sleep(delay)

    def _fetch_page_source(self):
        """Load ``self.url`` in headless Chrome, set ``self.title``, return the HTML.

        The browser is always shut down, even when loading fails, so a failed
        scrape does not leave a Chrome process behind.
        """
        chrome_options = Options()
        for argument in (
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-gpu",
            "--window-size=1920,1080",
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        ):
            chrome_options.add_argument(argument)

        # ChromeDriverManager downloads (or reuses) a matching ChromeDriver binary.
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            driver.set_page_load_timeout(30)

            self._log(f"🔍 Loading: {self.url}")
            driver.get(self.url)

            # Cosmetic bar; it also acts as a fixed 5 s pause before the
            # explicit waits below.
            self._simulate_progress(range(10), "📥 Loading Page", 0.5)

            # Best effort: prefer <main>, fall back to <body>, and carry on
            # with whatever has loaded if neither appears in time.
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "main"))
                )
            except Exception:
                try:
                    WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.TAG_NAME, "body"))
                    )
                except Exception:
                    pass  # Continue anyway

            self.title = driver.title
            return driver.page_source
        finally:
            driver.quit()

    def _extract_text(self, page_source):
        """Return the cleaned text of the page's main content area."""
        soup = BeautifulSoup(page_source, 'html.parser')

        for element in soup(self.UNWANTED_TAGS):
            element.decompose()

        # '.content' is a CSS class selector, so it needs select_one();
        # find() would look for a tag literally named ".content".
        main = (
            soup.find('main')
            or soup.find('article')
            or soup.select_one('.content')
            or soup.find('body')
        )
        container = main if main else soup
        raw_text = container.get_text(separator="\n", strip=True)

        # Drop blank lines and fragments of two characters or fewer.
        stripped = (line.strip() for line in raw_text.split('\n'))
        lines = [line for line in stripped if len(line) > 2]
        return '\n'.join(lines[:self.MAX_LINES])

    def scrape(self):
        """Populate ``title`` and ``text`` from ``self.url``.

        Any exception raised while scraping is caught and reported on stdout,
        and ``title``/``text`` are set to their error sentinel values.
        """
        try:
            self._log("Starting scrape process...")
            self._simulate_progress(
                ["Configuring Chrome", "Setting up driver", "Initializing browser"],
                "🔧 Browser Setup",
                0.5,
            )

            page_source = self._fetch_page_source()
            self._log(f"✅ Page loaded: {self.title}")

            self._simulate_progress(
                ["Parsing HTML", "Removing unwanted elements", "Extracting text", "Cleaning content"],
                "🔍 Processing Content",
                0.3,
            )
            self.text = self._extract_text(page_source)
            self._log(f"📄 Extracted {len(self.text)} characters")

        except Exception as e:
            self._log(f"❌ Error occurred: {e}")
            self.title = "Error occurred"
            self.text = "Could not scrape website content"


In [67]:
# System message sent with every summarization request: it fixes the
# assistant's role and asks for markdown output.
system_prompt = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary, ignoring text that might be navigation related. "
    "Respond in markdown."
)


In [68]:
def user_prompt_for(website):
    """Build the user message for a scraped page.

    Args:
        website: Object exposing ``title`` and ``text`` string attributes.

    Returns:
        A single string: a heading naming the page title, the summarization
        instructions, a blank line, then the page text.
    """
    heading = f"You are looking at a website titled {website.title}"
    instructions = "\nThe contents of this website is as follows; please provide a short summary of this website in markdown. If it includes news or announcements, then summarize these too.\n\n"
    return "".join((heading, instructions, website.text))

In [69]:
def messages_for(website):
    """Return the chat message list (system, then user) for ``website``.

    The system entry carries the module-level ``system_prompt``; the user
    entry carries the text produced by ``user_prompt_for(website)``.
    """
    system_message = {"role": "system", "content": system_prompt}
    user_message = {"role": "user", "content": user_prompt_for(website)}
    return [system_message, user_message]


In [70]:
def summarize_gpt(url, model="gpt-3.5-turbo"):
    """Scrape ``url`` and print/display a GPT-generated markdown summary.

    Progress, the summary and any failure are reported on stdout; the summary
    is additionally rendered as Markdown via ``display``. Nothing is returned.

    Args:
        url: Address of the page to summarize.
        model: Name of the OpenAI chat model to call.

    Returns:
        None. The function stops early (after printing a message) when the
        scrape fails, fewer than 50 characters were extracted, the API key is
        not set, or the API returns no content.
    """
    def log(message):
        # Same timestamped format as the rest of the notebook's output.
        print(f"🕐 [{datetime.datetime.now().strftime('%H:%M:%S')}] {message}")

    log(f"🚀 Starting website analysis for: {url}")

    site = WebsiteCrawler(url)
    if "Error occurred" in site.title or len(site.text) < 50:
        log(f"❌ Failed to scrape meaningful content from {url}")
        return

    log("🤖 Creating summary...")

    try:
        # Debug: text length and the first 500 characters of the scrape.
        log(f"📊 Text length: {len(site.text)} characters")
        print(f"📝 First 500 characters: {site.text[:500]}...")

        if not openai.api_key:
            log("❌ OpenAI API key not set. Please set your API key.")
            return

        # Cosmetic bar: it finishes before the request is actually sent.
        api_steps = ["Preparing request", "Sending to OpenAI", "Processing response", "Formatting output"]
        for _ in tqdm(api_steps, desc="🤖 AI Processing", unit="step"):
            time.sleep(0.5)

        log("🔄 Calling OpenAI API...")
        response = openai.chat.completions.create(
            model=model,
            messages=messages_for(site),  # shared builder instead of an inline copy
            max_tokens=500,  # Limit response length
            temperature=0.7
        )

        log("✅ Got response from OpenAI")
        web_summary = response.choices[0].message.content
        if not web_summary:
            # Avoid printing "None" / rendering an empty Markdown cell.
            log("❌ OpenAI returned an empty response.")
            return

        print(f"\n🕐 [{datetime.datetime.now().strftime('%H:%M:%S')}] " + "="*50)
        print("📋 WEBSITE SUMMARY:")
        print("="*50)
        print(web_summary)
        print("="*50 + "\n")

        # Also display as markdown if in Jupyter
        try:
            display(Markdown(web_summary))
        except NameError:
            # If not in Jupyter, just print
            pass

    except openai.RateLimitError:
        log("❌ OpenAI API rate limit exceeded. Please wait and try again.")
    except openai.AuthenticationError:
        log("❌ OpenAI API authentication failed. Please check your API key.")
    except openai.APIConnectionError:
        log("❌ Failed to connect to OpenAI API. Please check your internet connection.")
    except Exception as e:
        log(f"❌ Error calling OpenAI API: {e}")
        print(f"📄 Scraped content preview: {site.text[:200]}...")

In [71]:
# Smoke test: run the whole scrape-and-summarize pipeline against one live site.
# Other pages tried with the same call:
#   https://stripe.com, https://vercel.com, https://react.dev
target_url = 'https://openai.com'
print(f"🕐 [{datetime.datetime.now().strftime('%H:%M:%S')}] 🎯 Starting website analysis...")
summarize_gpt(target_url)

🕐 [18:15:12] 🎯 Starting website analysis...
🕐 [18:15:12] 🚀 Starting website analysis for: https://openai.com
🕐 [18:15:12] Starting scrape process...


🔧 Browser Setup: 100%|████████████████████| 3/3 [00:01<00:00,  1.98step/s]


🕐 [18:15:18] 🔍 Loading: https://openai.com


📥 Loading Page: 100%|███████████████████| 10/10 [00:05<00:00,  1.99step/s]


🕐 [18:15:32] ✅ Page loaded: OpenAI


🔍 Processing Content: 100%|███████████████| 4/4 [00:01<00:00,  3.32step/s]


🕐 [18:15:33] 📄 Extracted 3329 characters
🕐 [18:15:37] 🤖 Creating summary...
🕐 [18:15:37] 📊 Text length: 3329 characters
📝 First 500 characters: OpenAI
What can I help with?
Message ChatGPT
Quiz me on vocabulary
Plan a surf trip to Costa Rica in August
India stock market today
Explica por qué el maíz palomitas explota
Teach me Mahjong for beginners
Find hiking boots for wide feet
Explain this code
Was mach ich in Berlin wenn es regnet?
What are some outdoor markets in Mexico City?
Rédigez une note de remerciement
Recommend an easy potluck dish
ハーフマラソンのトレーニングを手伝ってください
Help me improve this job description
Write a Python script
Draw a pictu...


🤖 AI Processing: 100%|████████████████████| 4/4 [00:02<00:00,  1.99step/s]


🕐 [18:15:39] 🔄 Calling OpenAI API...
🕐 [18:15:41] ✅ Got response from OpenAI

📋 WEBSITE SUMMARY:
# Summary
The website is for OpenAI, a company that provides various AI services and products. The site includes features like ChatGPT for interacting with AI, vocabulary quizzes, travel planning assistance, coding help, and more. There are also sections for news, releases, stories, and research related to GPT-5, the latest model. The company focuses on AI advancements and their applications across different domains, including business and healthcare. Recent updates include the introduction of GPT-5, safety evaluations, and partnerships with companies like Oracle.



# Summary
The website is for OpenAI, a company that provides various AI services and products. The site includes features like ChatGPT for interacting with AI, vocabulary quizzes, travel planning assistance, coding help, and more. There are also sections for news, releases, stories, and research related to GPT-5, the latest model. The company focuses on AI advancements and their applications across different domains, including business and healthcare. Recent updates include the introduction of GPT-5, safety evaluations, and partnerships with companies like Oracle.