In [2]:
import json
from bs4 import BeautifulSoup

# Load the JSON data from file
with open('tracked_data_with_content_new.json', 'r') as f:
    data = json.load(f)

def parse_google_search_results(html_content):
    """Extract search results from Google search page content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []

    # Find each search result element (specific to Google Search)
    for result in soup.select('div.g'):
        title = result.find('h3').text if result.find('h3') else ''
        link = result.find('a')['href'] if result.find('a') else ''
        results.append({'title': title, 'link': link})

    return results

def parse_amazon_product_listings(html_content):
    """Extract product listings from Amazon search results."""
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    # Find each product listing element (specific to Amazon)
    for product in soup.select('.s-main-slot .s-result-item'):
        title = product.find('span', class_='a-size-medium').text if product.find('span', class_='a-size-medium') else ''
        price = product.find('span', class_='a-price-whole').text if product.find('span', class_='a-price-whole') else ''
        products.append({'title': title, 'price': price})

    return products

# Process each entry in the data
for entry in data:
    url = entry['url']
    html_content = entry['page_content']

    print(f"\nProcessing URL: {url}")

    # Check the type of URL and parse accordingly
    if 'google.com/search' in url:
        # Parse Google search results
        search_results = parse_google_search_results(html_content)
        print(f"Google Search Results: {search_results}")

    elif 'amazon.co.uk/s' in url:
        # Parse Amazon product listings
        product_listings = parse_amazon_product_listings(html_content)
        print(f"Amazon Product Listings: {product_listings}")

    else:
        # For other URLs, you may want to extract general information
        soup = BeautifulSoup(html_content, 'html.parser')
        page_text = soup.get_text(strip=True)
        print(f"Page Text (truncated): {page_text[:200]}...")



Processing URL: about:blank
Page Text (truncated): ...

Processing URL: https://www.youtube.com/
Page Text (truncated): YouTube•NaN / NaN•NaN / NaNBackGBSkip navigationSearchSearchSign inGBHomeHomeShortsShortsSubscriptionsSubscriptionsYouYouHistoryHistoryYour YouTube History is offYou can turn on watch and search histo...

Processing URL: https://www.youtube.com/results?search_query=tools+to+use+for+trading
Page Text (truncated): tools to use for trading - YouTube•NaN / NaN•NaN / NaNBackGBSkip navigationSearchSearchSign inGBHomeHomeShortsShortsSubscriptionsSubscriptionsYouYouHistoryHistoryYour YouTube History is offYou can tur...

Processing URL: https://www.youtube.com/watch?v=e7vzK-fEQks
Page Text (truncated): YouTube•NaN / NaN•NaN / NaNBackGBSkip navigationSearchSearchSign inGBHomeHomeShortsShortsSubscriptionsSubscriptionsYouYouHistoryHistoryYour YouTube History is offYou can turn on watch and search histo...


In [4]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchWindowException
from datetime import datetime
import time
import json

# Set up Firefox WebDriver
service = Service(executable_path="/opt/homebrew/bin/geckodriver")
driver = webdriver.Firefox(service=service)

# Function to capture and store page data
def capture_page_data(url):
    try:
        # Load the page
        driver.get(url)

        # Wait for the page to load
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Get the page URL
        page_url = driver.current_url

        # Get the text content of the page (visible text only)
        page_text = driver.find_element(By.TAG_NAME, "body").text

        # Get the current date and time
        capture_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Create a dictionary with the captured data
        page_data = {
            "url": page_url,
            "text": page_text,
            "capture_time": capture_time
        }

        # Store the data in JSON format
        json_filename = "page_data.json"
        with open(json_filename, "w") as json_file:
            json.dump(page_data, json_file, indent=4)

        print(f"Page data captured and stored in {json_filename}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Function to track URL changes and capture new page data
def track_url_changes():
    print("Tracking URL changes... Press Ctrl+C to stop.")
    try:
        previous_url = driver.current_url
        
        while True:
            current_url = driver.current_url
            
            # Check if the URL has changed
            if current_url != previous_url:
                print(f"New page detected: {current_url}")
                
                # Capture data for the new page
                capture_page_data(current_url)
                
                # Update the previous URL to the current one
                previous_url = current_url
            
            # Wait a short period before checking again to avoid excessive CPU usage
            time.sleep(1)

    except KeyboardInterrupt:
        print("Stopping tracking...")
    
    finally:
        # Close the browser when done
        driver.quit()
        print("Browser closed.")

# Example usage: open a page to start tracking
driver.get('about:blank')  # Start with a blank page
track_url_changes()  # Track URL changes and capture data for each new page


Tracking URL changes... Press Ctrl+C to stop.
New page detected: https://www.amazon.co.uk/
Page data captured and stored in page_data.json
New page detected: https://www.amazon.co.uk/s/ref=nb_sb_noss_2?url=search-alias%3Daps&field-keywords=electric+sscrew+driver&crid=6OP69GJEV6QI&sprefix=electric+sscrew+driver%2Caps%2C100
Page data captured and stored in page_data.json
New page detected: https://www.amazon.co.uk/s?k=electric+sscrew+driver&crid=6OP69GJEV6QI&sprefix=electric+sscrew+driver%2Caps%2C100&ref=nb_sb_noss_2
Page data captured and stored in page_data.json
New page detected: https://www.amazon.co.uk/Cordless-Screwdriver-MYMULIKE-Rechargeable-Furniture/dp/B09SPYQXFQ/ref=sr_1_5?crid=6OP69GJEV6QI&dib=eyJ2IjoiMSJ9.OBD8BnFEf4igurLNRScLyDOeGG5E99EJF_E6-tZ1KBJdVtmO6iVqK6Glnqs24s0Wh59CS_LLxEZ38LmwOQkAogUo60MLiYfYyMBBJvFXbhMikZ1UKpZuctxOZFqBVBZlnYaaq3XLAJz1LYe_FMvjyDFz14Wvjg1jh2pMZ3iXq-vANR_REnbvGbaz_7NgSLqNxnElQ8VtBEglNGF59vcjAEKeBXlBno7zPmLTblDRSg2STnWR4MVdwXFagp2WD_XsPbCc8fZThPC-z9RXwY

NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:193:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:679:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:515:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:147:4
GeckoDriver.prototype.getCurrentUrl@chrome://remote/content/marionette/driver.sys.mjs:908:15
despatch@chrome://remote/content/marionette/server.sys.mjs:318:40
execute@chrome://remote/content/marionette/server.sys.mjs:289:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:262:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:263:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:494:20
