In [6]:
import json
import re
import time
from datetime import datetime
from urllib.parse import parse_qs, unquote_plus, urlparse

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchWindowException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Set up Firefox WebDriver.
# The geckodriver path is hard-coded (a Homebrew-style location); adjust it
# when running on a machine where geckodriver lives elsewhere.
service = Service(executable_path="/opt/homebrew/bin/geckodriver")
# Module-level browser session; track_activity_with_content() reads this
# global directly and quits it when tracking ends.
driver = webdriver.Firefox(service=service)


# Function to extract search query from the URL
def extract_search_query(url):
    """Return the search term embedded in *url*, or None if there is none.

    The parsed query string is checked first for the parameter names ``q``,
    ``query`` and ``k`` (in that order). If none of them is present, the raw
    URL is scanned for an ``/s?k=<term>`` pattern, which covers URLs where
    that text sits outside the parsed query component (for example inside a
    ``#`` fragment).

    Args:
        url: The URL to inspect. Empty or None input yields None.

    Returns:
        The decoded search term as a string, or None when no term is found.
    """
    if not url:
        return None

    query_params = parse_qs(urlparse(url).query)

    # Check common query parameter names for search queries (e.g., q, query, k)
    for param_name in ("q", "query", "k"):
        values = query_params.get(param_name)
        if values:
            return values[0]  # Return the first match

    # Fallback for Amazon-style "/s?k=..." text that urlparse did not expose
    # as a query parameter. unquote_plus is used so "+" and "%20" decode to a
    # space exactly as parse_qs does for the parameters handled above.
    match = re.search(r'/s\?k=([^&]+)', url)
    if match:
        return unquote_plus(match.group(1))

    return None

# Function to track activity and content
def track_activity_with_content():
    """Poll the browser and record each newly visited page until stopped.

    Every 5 seconds the module-level ``driver`` is asked for its current URL.
    When that URL differs from the most recently recorded one, an entry with
    the URL, title, visible body text, capture timestamp and extracted search
    query is appended to the in-memory log.

    The loop ends when the browser window is closed
    (``NoSuchWindowException``) or on ``KeyboardInterrupt``. In every case the
    collected entries are written to ``page_test4.json`` and the driver is
    quit.
    """
    page_data = []

    try:
        print("Tracking started. Navigate your Firefox browser as usual.")

        while True:
            try:
                # Wait for the page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                # Raises NoSuchWindowException if the window is gone
                page_url = driver.current_url

                # Only capture when the URL changed; extracting the full body
                # text on every poll would be wasted work for an entry that
                # is then discarded.
                if not page_data or page_data[-1]['url'] != page_url:
                    search_query = extract_search_query(page_url)
                    entry = {
                        "url": page_url,
                        "title": driver.title,
                        # Visible text only
                        "text": driver.find_element(By.TAG_NAME, "body").text,
                        "capture_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        "search_query": search_query if search_query else "No search query",
                    }
                    page_data.append(entry)
                    print(f"Captured content for URL: {page_url}")

            except NoSuchWindowException:
                # Break the loop if the browser window is closed
                print("Browser window closed.")
                break

            except (TimeoutException, StaleElementReferenceException):
                # The page was still loading, or navigated away while it was
                # being read. Neither is fatal: try again on the next poll
                # instead of ending the whole tracking session.
                print("Page not ready yet; will retry.")

            # Throttle checks to reduce performance impact
            time.sleep(5)

    except KeyboardInterrupt:
        print("Tracking stopped by user.")

    finally:
        try:
            # Save tracked data to a JSON file
            with open('page_test4.json', 'w') as f:
                json.dump(page_data, f, indent=4)
        finally:
            # Always release the browser, even if writing the file failed
            driver.quit()
        print("Data saved to page_test4.json and browser session closed.")

# Open a blank tab to start tracking; about:blank becomes the first page
# the tracker sees.
driver.get('about:blank')

# Start tracking. This call blocks until the browser window is closed or the
# run is interrupted, then writes page_test4.json and quits the driver.
track_activity_with_content()

Tracking started. Navigate your Firefox browser as usual.
Captured content for URL: about:blank
Captured content for URL: https://www.google.com/
Captured content for URL: https://www.google.com/search?q=the+incredibles&sca_esv=b2f4af84c6259c63&source=hp&ei=gsYLZ63iHOm0hbIPt7iB4Ac&iflsig=AL9hbdgAAAAAZwvUkpGIUvCl1INW1BXuNp09eunotJ_c&ved=0ahUKEwitq5HutouJAxVpWkEAHTdcAHwQ4dUDCA8&uact=5&oq=the+incredibles&gs_lp=Egdnd3Mtd2l6Ig90aGUgaW5jcmVkaWJsZXMyCxAuGIAEGLEDGIMBMgsQLhiABBixAxiDATILEAAYgAQYsQMYgwEyCBAAGIAEGLEDMgsQABiABBixAxiDATILEC4YgAQYsQMY1AIyCBAuGIAEGLEDMgUQABiABDIFEAAYgAQyCBAuGIAEGLEDSOgtUN0KWIQqcAF4AJABAJgBogGgAZsJqgEEMTMuMrgBA8gBAPgBAZgCEKAC9wmoAgrCAhAQABgDGOUCGOoCGIwDGI8BwgIQEC4YAxjlAhjqAhiMAxiPAcICERAuGIAEGLEDGNEDGIMBGMcBwgIOEC4YgAQYsQMY0QMYxwHCAg4QABiABBixAxiDARiKBcICDhAuGIAEGLEDGIMBGIoFwgIFEC4YgATCAg4QLhiABBixAxjHARivAcICERAuGIAEGLEDGIMBGNQCGIoFwgIREC4YgAQYsQMYgwEYxwEYrwHCAgsQLhiABBjHARivAcICDhAAGIAEGLEDGIMBGMkDwgIOEC4YgAQYsQMYgwEY1AKYAweSBwQxNC4yoAf1mQI&sclient=gws-wiz
Captured 