In [1]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchWindowException
from urllib.parse import urlparse, parse_qs
import time
import json

# Set up Firefox WebDriver
service = Service(executable_path="/opt/homebrew/bin/geckodriver")
driver = webdriver.Firefox(service=service)

# Function to track activity and content
def track_activity_with_content():
    tracked_data = []

    try:
        print("Tracking started. Navigate your Firefox browser as usual.")
        
        while True:
            try:
                # Check if the browser window is still open
                current_url = driver.current_url
                current_title = driver.title

                # Capture the entire page content
                page_content = driver.page_source

                # Create an entry for the current browsing session
                entry = {
                    'timestamp': time.time(),
                    'url': current_url,
                    'title': current_title,
                    'page_content': page_content
                }

                # If the current URL is new, add it to the list
                if not tracked_data or tracked_data[-1]['url'] != current_url:
                    tracked_data.append(entry)
                    print(f"Captured content for URL: {current_url}")

            except NoSuchWindowException:
                # Break the loop if the browser window is closed
                print("Browser window closed.")
                break

            # Throttle checks to reduce performance impact
            time.sleep(5)

    except KeyboardInterrupt:
        print("Tracking stopped by user.")

    finally:
        # Save tracked data to a JSON file
        with open('tracked_data_with_content_new.json', 'w') as f:
            json.dump(tracked_data, f, indent=4)

        driver.quit()
        print("Data saved to tracked_data_with_content.json and browser session closed.")

# Open a blank tab to start tracking
driver.get('about:blank')

# Start tracking
track_activity_with_content()

Tracking started. Navigate your Firefox browser as usual.
Captured content for URL: about:blank
Captured content for URL: https://www.youtube.com/
Captured content for URL: https://www.youtube.com/results?search_query=tools+to+use+for+trading
Captured content for URL: https://www.youtube.com/watch?v=e7vzK-fEQks
Browser window closed.
Data saved to tracked_data_with_content.json and browser session closed.


In [3]:
import json
from bs4 import BeautifulSoup

# Load the JSON data from file
with open('tracked_data_with_content_new.json', 'r') as f:
    data = json.load(f)

def parse_google_search_results(html_content):
    """Extract search results from Google search page content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []

    # Find each search result element (specific to Google Search)
    for result in soup.select('div.g'):
        title = result.find('h3').text if result.find('h3') else ''
        link = result.find('a')['href'] if result.find('a') else ''
        results.append({'title': title, 'link': link})

    return results

def parse_amazon_product_listings(html_content):
    """Extract product listings from Amazon search results."""
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    # Find each product listing element (specific to Amazon)
    for product in soup.select('.s-main-slot .s-result-item'):
        title = product.find('span', class_='a-size-medium').text if product.find('span', class_='a-size-medium') else ''
        price = product.find('span', class_='a-price-whole').text if product.find('span', class_='a-price-whole') else ''
        products.append({'title': title, 'price': price})

    return products

# Process each entry in the data
for entry in data:
    url = entry['url']
    html_content = entry['page_content']

    print(f"\nProcessing URL: {url}")

    # Check the type of URL and parse accordingly
    if 'google.com/search' in url:
        # Parse Google search results
        search_results = parse_google_search_results(html_content)
        print(f"Google Search Results: {search_results}")

    elif 'amazon.co.uk/s' in url:
        # Parse Amazon product listings
        product_listings = parse_amazon_product_listings(html_content)
        print(f"Amazon Product Listings: {product_listings}")

    else:
        # For other URLs, you may want to extract general information
        soup = BeautifulSoup(html_content, 'html.parser')
        page_text = soup.get_text(strip=True)
        print(f"Page Text (truncated): {page_text[:200]}...")



Processing URL: about:blank
Page Text (truncated): ...

Processing URL: https://www.google.com/
Page Text (truncated): GooglePlease clickhereif you are not redirected within a few seconds.AboutStoreGmailImagesSign inChoose what you’re giving feedback onSee moreDeleteDeleteReport inappropriate predictionsI'm Feeling Cu...

Processing URL: https://www.google.com/search?q=israel+hezbollah+war&sca_esv=8536d3cccb765549&sca_upv=1&source=hp&ei=iO_2Zr_KMfawhbIPiK_voQY&iflsig=AL9hbdgAAAAAZvb9mMMXrGiNBRJJPIJcPUh6Z7o5O7MB&gs_ss=1&oq=&gs_lp=Egdnd3Mtd2l6IgAqAggBMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQLhgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BMhAQABgDGOUCGOoCGIwDGI8BSIcUUABYAHABeACQAQCYAQCgAQCqAQC4AQHIAQCYAgGgAgeoAgqYAweSBwExoAcA&sclient=gws-wiz
Google Search Results: []

Processing URL: https://www.timesofisrael.com/israel-is-pounding-hezbollah-but-in-war-the-iran-b