In [1]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchWindowException
from datetime import datetime
from urllib.parse import urlparse, parse_qs
import time
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download necessary NLTK data.
# These calls run every time the cell/module is executed; NLTK reports
# "already up-to-date" when the package is present locally.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define stop words (English) as a set for O(1) membership tests.
# NOTE(review): in the visible part of this file, stop_words is referenced only
# by the commented-out process_text() helper — confirm it is still needed.
stop_words = set(stopwords.words('english'))

# Function to extract search query from the URL
def extract_search_query(url):
    """Return the search term embedded in *url*, or ``None`` if there is none.

    The parsed query string is checked first for the parameters ``q``,
    ``query`` and ``k`` (in that order); the first value of the first one
    present is returned. If none is present, the raw URL is scanned for a
    ``/s?k=<term>`` pattern, which catches URLs where that pattern sits
    outside the parsed query string (for example inside a ``#`` fragment).

    Args:
        url: The URL to inspect.

    Returns:
        The decoded search term, or ``None`` when no search term is found.
    """
    # Imported locally because the file-level import only brings in urlparse
    # and parse_qs; the fallback below previously called an undefined
    # ``unquote`` and raised NameError.
    from urllib.parse import unquote_plus

    query_params = parse_qs(urlparse(url).query)

    search_query = (
        query_params.get('q')
        or query_params.get('query')
        or query_params.get('k')
    )
    if search_query:
        return search_query[0]  # parse_qs returns a list per key; take the first value

    # Fallback for search terms that are not in the parsed query string.
    match = re.search(r'/s\?k=([^&]+)', url)
    if match:
        # unquote_plus decodes both %XX escapes and '+' as space, matching how
        # parse_qs decodes values on the primary path above.
        return unquote_plus(match.group(1))

    return None

# # Function to tokenize and remove stop words from text
# def process_text(text):
#     # Tokenize the text, ensuring @handles and emails are treated as single tokens
#     words = re.findall(r'\w+@\w+\.\w+|@\w+|\w+', text)

#     # Modify the filtering criteria to retain email addresses, Twitter handles, and alphanumeric words
#     filtered_words = [word for word in words if word.lower() not in stop_words and 
#                       (word.isalnum() or '@' in word or re.match(r'^[£$€]\d+', word))]

#     # Join the filtered words back into a string
#     return ' '.join(filtered_words)
    

# Function to track activity and content
def track_activity_with_content(output_path='page_test12.json', poll_interval=5):
    """Poll the browser and record every newly visited page until stopped.

    Uses the module-level ``driver``. On each poll the current URL is compared
    with the URL of the last recorded entry; when it differs, the page title,
    the visible text of ``<body>``, a timestamp and any search query found in
    the URL are appended to the in-memory list. The loop ends when the browser
    window is closed (``NoSuchWindowException``) or on ``KeyboardInterrupt``.
    In all cases the collected entries are written as JSON and the driver is
    quit.

    Args:
        output_path: File the captured entries are written to as JSON.
        poll_interval: Seconds to sleep between polls.
    """
    # Imported locally so this function does not depend on changes to the
    # file-level import section.
    from selenium.common.exceptions import (
        StaleElementReferenceException,
        TimeoutException,
    )

    page_data = []

    try:
        print("Tracking started. Navigate your Chrome browser as usual.")

        while True:
            try:
                # Wait for the page to load
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                page_url = driver.current_url

                # Only read the (potentially large) page content when the URL
                # differs from the last recorded one; unchanged pages would be
                # discarded anyway.
                if not page_data or page_data[-1]['url'] != page_url:
                    page_title = driver.title

                    # Visible text only
                    page_text = driver.find_element(By.TAG_NAME, "body").text

                    capture_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    search_query = extract_search_query(page_url)

                    page_data.append({
                        "url": page_url,
                        "title": page_title,
                        "text": page_text,
                        "capture_time": capture_time,
                        "search_query": search_query if search_query else "No search query",
                    })
                    print(f"Captured content for URL: {page_url}")

            except NoSuchWindowException:
                # Break the loop if the browser window is closed
                print("Browser window closed.")
                break
            except (TimeoutException, StaleElementReferenceException):
                # The page was still loading or navigated away mid-capture.
                # Nothing was appended, so the next poll retries this URL
                # instead of the whole session aborting.
                pass

            # Throttle checks to reduce performance impact
            time.sleep(poll_interval)

    except KeyboardInterrupt:
        print("Tracking stopped by user.")

    finally:
        # Save tracked data to a JSON file
        with open(output_path, 'w') as f:
            json.dump(page_data, f, indent=4)

        driver.quit()
        print(f"Data saved to {output_path} and browser session closed.")

# Initialize the undetected Chrome driver.
# NOTE: `driver` is a module-level global that track_activity_with_content()
# reads directly (it takes no driver argument), so it must be assigned before
# that function is called.
driver = uc.Chrome()

# Open a blank tab to start tracking
driver.get('about:blank')

# Start tracking. This call blocks in a polling loop and returns once the
# browser window is closed or a KeyboardInterrupt is raised; on the way out it
# writes the captured pages to a JSON file and quits the driver.
track_activity_with_content()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tarunkashyap/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tarunkashyap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tracking started. Navigate your Chrome browser as usual.
Captured content for URL: about:blank
Captured content for URL: https://www.google.com/
Captured content for URL: https://www.google.com/search?q=ezekiel&sca_esv=fdf7728f28b59cda&source=hp&ei=ksRQZ426EvvOhbIP-azI0Qs&iflsig=AL9hbdgAAAAAZ1DSouSIx8OXKckUk8cD4uHUFCej2C3U&ved=0ahUKEwiNy8Xbgo-KAxV7Z0EAHXkWMroQ4dUDCBE&uact=5&oq=ezekiel&gs_lp=Egdnd3Mtd2l6IgdlemVraWVsMggQLhiABBixAzIIEC4YgAQYsQMyCBAuGIAEGLEDMgsQLhiABBixAxiDATIFEC4YgAQyBRAAGIAEMggQABiABBixAzIFEAAYgAQyCBAAGIAEGLEDMgUQABiABEiYbVC2GljXX3AFeACQAQCYAYEBoAGHB6oBAzguM7gBA8gBAPgBAZgCEKAC0weoAgrCAgoQABgDGOoCGI8BwgIREC4YgAQYsQMY0QMYgwEYxwHCAgsQABiABBixAxiDAcICDhAAGIAEGLEDGIMBGIoFwgIOEC4YgAQYsQMY0QMYxwHCAg4QLhiABBixAxiDARiKBcICCxAuGIAEGMcBGK8BwgIHEC4YgAQYCsICBxAAGIAEGArCAg0QLhiABBjHARgKGK8BwgIKEC4YgAQYsQMYCsICDRAuGIAEGLEDGIMBGArCAgoQABiABBixAxgKmAMGkgcEMTIuNKAH75QB&sclient=gws-wiz
Captured content for URL: https://www.google.com/search?q=anal+beads&sca_esv=fdf7728f28b59cda&ei=2MRQZ4v7