In [2]:
import json
import re
import time
from datetime import datetime
from urllib.parse import urlparse, parse_qs, unquote

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchWindowException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Fetch the NLTK resources used below: the tokenizer models and the stop-word lists
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

# Configure and launch the Firefox session that the tracker observes
firefox_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"
options = Options()
options.set_preference("general.useragent.override", firefox_user_agent)
# Additional preferences that were tried and are currently disabled:
# options.set_preference("dom.webdriver.enabled", False)
# options.set_preference('useAutomationExtension', False)
service = Service(executable_path="/opt/homebrew/bin/geckodriver")
driver = webdriver.Firefox(service=service, options=options)

# English stop words, held in a set for fast membership checks in process_text
stop_words = set(stopwords.words('english'))

# Function to extract search query from the URL
def extract_search_query(url):
    """Return the search term carried by ``url``, or ``None`` if none is found.

    The query string is checked first for the parameters ``q``, ``query`` and
    ``k`` (in that order of preference); the first value of the first
    parameter present is returned, already percent-decoded by ``parse_qs``.

    If the parsed query string has none of those parameters, the raw URL is
    searched for a literal ``/s?k=<term>`` sequence. This catches URLs where
    that sequence is not part of the parsed query component, e.g. when it
    appears after a ``#`` fragment marker.

    Args:
        url: The full URL string to inspect.

    Returns:
        The decoded search term as a string, or ``None``.
    """
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)

    search_query = query_params.get('q') or query_params.get('query') or query_params.get('k')
    if search_query:
        return search_query[0]  # Return the first match

    # Fallback: look for "/s?k=" anywhere in the raw URL string
    match = re.search(r'/s\?k=([^&]+)', url)
    if match:
        # The captured group is still percent-encoded, so decode it here
        return unquote(match.group(1))

    return None

# Function to tokenize and remove stop words from text
def process_text(text):
    """Tokenize ``text`` and return the retained tokens joined by single spaces.

    A token is retained when its lowercase form is not in the module-level
    ``stop_words`` set and it is either purely alphanumeric or begins with a
    currency symbol (£, $ or €) immediately followed by a digit.
    """
    currency_amount = re.compile(r'^[£$€]\d+')

    kept_tokens = []
    for token in word_tokenize(text):
        # Drop stop words first, comparing case-insensitively
        if token.lower() in stop_words:
            continue
        # Keep plain words/numbers and currency-prefixed amounts only
        if token.isalnum() or currency_amount.match(token):
            kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Function to track activity and content
def track_activity_with_content():
    """Poll the module-level ``driver`` and record each newly visited page.

    Every 5 seconds the current URL is read. When it differs from the URL of
    the most recent entry, the page title, the stop-word-filtered body text,
    a timestamp and any search query found in the URL are appended to the
    in-memory log.

    The loop ends when the browser window is closed (``NoSuchWindowException``)
    or the user interrupts the cell (``KeyboardInterrupt``). A page that has
    not produced a ``<body>`` within the wait timeout, or whose ``<body>`` is
    replaced mid-capture, is skipped and retried on the next poll rather than
    ending the session.

    On exit the log is written to ``page_test9.json`` and the driver is quit;
    the driver is quit even if writing the file fails.
    """
    page_data = []

    try:
        print("Tracking started. Navigate your Firefox browser as usual.")

        while True:
            try:
                # Wait for the page to load
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))

                page_url = driver.current_url

                # Only capture when the URL changed; this avoids re-reading and
                # re-tokenizing the whole page on every poll of the same page
                if not page_data or page_data[-1]['url'] != page_url:
                    page_title = driver.title

                    # Get the text content of the page (visible text only)
                    page_text = driver.find_element(By.TAG_NAME, "body").text

                    # Process the page text by tokenizing and removing stop words
                    processed_text = process_text(page_text)

                    # Get the current date and time
                    capture_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

                    # Extract search query from the URL (if any)
                    search_query = extract_search_query(page_url)

                    page_data.append({
                        "url": page_url,
                        "title": page_title,
                        "text": processed_text,
                        "capture_time": capture_time,
                        "search_query": search_query if search_query else "No search query"
                    })
                    print(f"Captured content for URL: {page_url}")

            except (TimeoutException, StaleElementReferenceException):
                # Page still loading or navigating mid-capture; try again on
                # the next poll instead of aborting the whole session
                pass

            except NoSuchWindowException:
                # Break the loop if the browser window is closed
                print("Browser window closed.")
                break

            # Throttle checks to reduce performance impact
            time.sleep(5)

    except KeyboardInterrupt:
        print("Tracking stopped by user.")

    finally:
        try:
            # Save tracked data to a JSON file
            with open('page_test9.json', 'w') as f:
                json.dump(page_data, f, indent=4)
        finally:
            # Always release the browser, even if the file could not be written
            driver.quit()
        print("Data saved to page_test9.json and browser session closed.")

# Load a blank page so the tracker's first poll has a document to read
driver.get('about:blank')

# Run the polling loop; it writes page_test9.json and quits the driver on exit
track_activity_with_content()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tarunkashyap/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tarunkashyap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Tracking started. Navigate your Firefox browser as usual.
Captured content for URL: about:blank
Captured content for URL: https://www.google.com/
Captured content for URL: https://accounts.google.com/v3/signin/identifier?continue=https%3A%2F%2Fwww.google.com%2F&ec=GAZAmgQ&hl=en&ifkv=ARpgrqdjMmliF8Aa8Pd4vh7ZY8FTVcUJNutKbLskCgm7ogqvDMTIPlvWXs4VGzjz1kDNLBmR6834_A&passive=true&flowName=GlifWebSignIn&flowEntry=ServiceLogin&dsh=S34428907%3A1728843711678769&ddm=0
Captured content for URL: https://accounts.google.com/v3/signin/rejected?continue=https%3A%2F%2Fwww.google.com%2F&ddm=0&dsh=S34428907%3A1728843711678769&ec=GAZAmgQ&epd=AX4S1m-IOUvTxDocKmkhq-G6bwTfA3BZc_EkSYHL0HDgM225jRyRGBBH5w&flowEntry=ServiceLogin&flowName=GlifWebSignIn&hl=en&idnf=contact.tarun1911&ifkv=ARpgrqdjMmliF8Aa8Pd4vh7ZY8FTVcUJNutKbLskCgm7ogqvDMTIPlvWXs4VGzjz1kDNLBmR6834_A&rhlk=le&rrk=46
Browser window closed.
Data saved to page_test9.json and browser session closed.


In [7]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

# Set up Edge options
edge_options = Options()
# NOTE(review): presumably only meaningful on older Selenium releases — confirm it is still needed
edge_options.use_chromium = True  # Ensure you're using the Chromium-based version of Edge

# Create a service object with the path to msedgedriver
service = Service('/Users/tarunkashyap/Desktop/LLM_auto/edgedriver_mac64_m1/msedgedriver')  # Replace with your msedgedriver path

# Initialize the Edge WebDriver
driver = webdriver.Edge(service=service, options=edge_options)

try:
    # Navigate to a website
    driver.get('https://www.google.com')

    # Print the page title to confirm it's working
    print(driver.title)
finally:
    # Close the browser even if navigation or the title lookup fails,
    # so no Edge window / msedgedriver process is left behind
    driver.quit()


Google


In [8]:
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options

# Command-line switches for this Edge session: turn off the Blink
# "AutomationControlled" feature and override the user-agent string.
# NOTE(review): the user-agent below identifies as Firefox although the browser is Edge — confirm this is intended
edge_options = Options()
for edge_switch in (
    "--disable-blink-features=AutomationControlled",
    "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0",
):
    edge_options.add_argument(edge_switch)

# Point the service at the local msedgedriver binary and start the browser
service = Service('/Users/tarunkashyap/Desktop/LLM_auto/edgedriver_mac64_m1/msedgedriver')
driver = webdriver.Edge(service=service, options=edge_options)

# Load Google; the session is left open for manual inspection
driver.get("https://www.google.com")

In [10]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Values used to configure this Chrome session
chromedriver_path = "/opt/homebrew/bin/chromedriver"
automation_switch = "--disable-blink-features=AutomationControlled"
# NOTE(review): this user-agent identifies as Firefox although the browser is Chrome — confirm this is intended
user_agent_switch = "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:92.0) Gecko/20100101 Firefox/92.0"

# Build the options: disable the automation flag first, then override the user agent
chrome_options = Options()
chrome_options.add_argument(automation_switch)
chrome_options.add_argument(user_agent_switch)

# Start Chrome through the local chromedriver binary
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)

# Load Google; the session is left open for manual inspection
driver.get("https://www.google.com")

In [11]:
!pip3 install undetected-chromedriver

Collecting undetected-chromedriver
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting websockets (from undetected-chromedriver)
  Downloading websockets-13.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.8 kB)
Downloading websockets-13.1-cp312-cp312-macosx_11_0_arm64.whl (155 kB)
Building wheels for collected packages: undetected-chromedriver
  Building wheel for undetected-chromedriver (pyproject.toml) ... [?25ldone
[?25h  Created wheel for undetected-chromedriver: filename=undetected_chromedriver-3.5.5-py3-none-any.whl size=47049 sha256=e536072f7a46467274c35e76ae96226ea8f4ce4a274a5f738d07841be3249c97
  Stored in directory: /Users/tarunkashyap/Library/Caches/pip/wheels/c4/f1/aa/9de6cf276210554d91e9c0526864563e850a428c5e76da4914
Successfully built undetected-chromedriver
Installing collected packages: w

In [12]:
import undetected_chromedriver as uc

# Start Chrome through undetected_chromedriver with its default settings.
# The module-level name `driver` is rebound to this new session.
driver = uc.Chrome()