In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button and return the HTML content of the page after the click.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be clickable
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking...")
                more_options_button.click()
                self.add_random_delay(2, 4)

                # Wait for the page to load after the click
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                print("Page loaded after clicking More Options button.")

                # Add a delay to ensure the page is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def parse_tbody_to_dataframe(self, tbody_html):
        """
        Parse the tbody HTML and convert it into a pandas DataFrame.

        Args:
            tbody_html (str): The HTML content of the tbody.

        Returns:
            pd.DataFrame: A DataFrame containing the extracted data.
        """
        soup = BeautifulSoup(tbody_html, 'html.parser')
        rows = soup.find_all('tr', class_='table__row--qDMbhLBz')

        data = []
        for row in rows:
            cells = row.find_all('td', class_='table__cell--9cdFfes6')
            row_data = []
            for cell in cells:
                # Extract text from the span if it exists, otherwise extract text from the cell
                span = cell.find('span')
                if span:
                    row_data.append(span.get_text(strip=True))
                else:
                    row_data.append(cell.get_text(strip=True))
            data.append(row_data)

        # Define column names based on the table structure
        columns = [
            "CitationOrigin", "Publication", "Title", "Earliest PriorityDate",
            "PublicationDate", "Applicants", "IPC", "CPC"
        ]

        # Create a DataFrame
        df = pd.DataFrame(data, columns=columns)
        return df

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action
    
    df =pd.read_csv('classification_df.csv')
    # Add a new column to the original DataFrame to store the IPC classes of citations
    df['citations IPC'] = None

    try:
        # Iterate over each row in the original DataFrame
        for index, row in df.iterrows():
            # Construct the URL for the current patent
            url = f"https://worldwide.espacenet.com/patent/search/family/{row['Family number']}/publication/{row['first publication number']}?q=pn%3D{row['first publication number']}"
            print(f"Processing patent: {row['first publication number']}")

            # Get the page HTML
            html = scraper.get_page_html(url, retries=3)
            if html:
                print("Page HTML retrieved successfully.")

                # Click the "More Options" button and get the HTML content after the click
                page_html_after_click = scraper.click_more_options_button(retries=3)
                if page_html_after_click:
                    print(f"More Options button clicked successfully for {row['first publication number']}.")

                    # You can now parse the page HTML after the click as needed
                    # For example, you can use BeautifulSoup to extract specific elements
                    soup = BeautifulSoup(page_html_after_click, 'html.parser')
                    # Add your parsing logic here

                else:
                    print(f"Unable to click the More Options button for {row['first publication number']}.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

        # Display the updated DataFrame
        print(df[['first publication number', 'citations IPC']])

        # Save the updated DataFrame to a CSV file
        #df.to_csv("updated_patents_with_citations_ipc.csv", index=False, encoding="utf-8")
        #print("DataFrame saved to 'updated_patents_with_citations_ipc.csv'.")

KeyboardInterrupt: 

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be clickable
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking...")
                more_options_button.click()
                self.add_random_delay(2, 4)

                # Wait for the dropdown menu to appear
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='menu']"))
                )
                print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Click the "More Options" button and get the HTML content after the click
            page_html_after_click = scraper.click_more_options_button(retries=3)
            if page_html_after_click:
                print("More Options button clicked successfully.")

                # You can now parse the page HTML after the click as needed
                # For example, you can use BeautifulSoup to extract specific elements
                soup = BeautifulSoup(page_html_after_click, 'html.parser')
                # Add your parsing logic here

            else:
                print("Unable to click the More Options button.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
More Options button found. Clicking...
Timed out waiting for the 'More Options' button to load. Retrying (1/3)...
More Options button found. Clicking...
An error occurred while clicking the 'More Options' button: Message: element click intercepted: Element <button class="prod-jss36 prod-jss27" tabindex="0" type="button" aria-haspopup="true" id="more-options-selector--publication-list-header" aria-owns="simple-dropdown">...</button> is not clickable at point (767, 217). Other element would receive the click: <section class="prod-jss36 prod-jss306 prod-jss302 prod-jss303 prod-jss313 prod-jss314" tabindex="-1" role="menuitem" aria-disabled="false" style="padding-right: 8px;">...</section>
  (Session info: chrome=134.0.6998.36)
Stack

In [4]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be clickable
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking...")
                more_options_button.click()
                self.add_random_delay(2, 4)

                # Wait for the dropdown menu to appear
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='menu']"))
                )
                print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()



    def click_download_drop_down(self, retries=3):
        """
        Click the "download" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "download" button to be clickable
                Download_drop_down = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > section:nth-child(1)"))
                )
                print("More Options button found. Clicking...")
                Download_drop_down.click()
                self.add_random_delay(2, 4)

                # Wait for list button
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > li:nth-child(3)"))
                )
                print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Click the "More Options" button and get the HTML content after the click
            page_html_after_click = scraper.click_more_options_button(retries=3)
            if page_html_after_click:
                print("More Options button clicked successfully.")
                page_html_after_click_download = scraper.click_download_drop_down(retries=3)

                # You can now parse the page HTML after the click as needed
                # For example, you can use BeautifulSoup to extract specific elements
                soup = BeautifulSoup(page_html_after_click_download, 'html.parser')
                # Add your parsing logic here

            else:
                print("Unable to click the More Options button.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
More Options button found. Clicking...
Timed out waiting for the 'More Options' button to load. Retrying (1/3)...
More Options button found. Clicking...
An error occurred while clicking the 'More Options' button: Message: element click intercepted: Element <button class="prod-jss36 prod-jss27" tabindex="0" type="button" aria-haspopup="true" id="more-options-selector--publication-list-header" aria-owns="simple-dropdown">...</button> is not clickable at point (767, 217). Other element would receive the click: <section class="prod-jss36 prod-jss306 prod-jss302 prod-jss303 prod-jss313 prod-jss314" tabindex="-1" role="menuitem" aria-disabled="false" style="padding-right: 8px;">...</section>
  (Session info: chrome=134.0.6998.36)
Stack

In [6]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be present
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking using JavaScript...")

                # Use JavaScript to click the button
                self.driver.execute_script("arguments[0].click();", more_options_button)
                self.add_random_delay(2, 4)

                # Wait for the dropdown menu to appear
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss741.prod-jss623.prod-jss626.prod-jss752 > ul > section:nth-child(1)"))
                )
                print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def click_download_drop_down(self, retries=3):
        """
        Click the "download" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "download" button to be clickable
                download_drop_down = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > section:nth-child(1)"))
                )
                print("Download dropdown found. Clicking...")
                download_drop_down.click()
                self.add_random_delay(2, 4)

                # Wait for the list button to appear
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > li:nth-child(3)"))
                )
                print("List button appeared after clicking Download dropdown.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'Download' dropdown to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'Download' dropdown.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'Download' dropdown: {e}")
                return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Click the "More Options" button and get the HTML content after the click
            page_html_after_click = scraper.click_more_options_button(retries=3)
            if page_html_after_click:
                print("More Options button clicked successfully.")

                # Click the "Download" dropdown and get the HTML content after the click
                page_html_after_download = scraper.click_download_drop_down(retries=3)
                if page_html_after_download:
                    print("Download dropdown clicked successfully.")

                    # You can now parse the page HTML after the click as needed
                    soup = BeautifulSoup(page_html_after_download, 'html.parser')
                    # Add your parsing logic here

                else:
                    print("Unable to click the Download dropdown.")
            else:
                print("Unable to click the More Options button.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
More Options button found. Clicking using JavaScript...
Timed out waiting for the 'More Options' button to load. Retrying (1/3)...
More Options button found. Clicking using JavaScript...
Timed out waiting for the 'More Options' button to load. Retrying (2/3)...
More Options button found. Clicking using JavaScript...
Timed out waiting for the 'More Options' button to load. Retrying (3/3)...
Max retries reached. Unable to click the 'More Options' button.
Unable to click the More Options button.
Scraper closed.


In [8]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be clickable
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking...")
                more_options_button.click()
                self.add_random_delay(2, 4)

                # # Wait for the dropdown menu to appear
                # WebDriverWait(self.driver, 30).until(
                #     EC.presence_of_element_located((By.CSS_SELECTOR, "div[role='menu']"))
                # )
                # print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    def click_download_dropdown_option(self, retries=3):
        """
        Click the "Download" dropdown option in the menu.
        If the option is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the option is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "Download" dropdown option to be clickable
                download_option = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > li:nth-child(3)"))
                )
                print("Download dropdown option found. Clicking...")
                download_option.click()
                self.add_random_delay(2, 4)

                # Wait for the action to complete (e.g., a file download or page update)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                print("Download dropdown option clicked successfully.")

                # Add a delay to ensure the action is fully processed
                self.add_random_delay(2, 4)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'Download' dropdown option to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'Download' dropdown option.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'Download' dropdown option: {e}")
                return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Click the "More Options" button and get the HTML content after the click
            page_html_after_click = scraper.click_more_options_button(retries=3)
            if page_html_after_click:
                print("More Options button clicked successfully.")

                # Click the "Download" dropdown option and get the HTML content after the click
                page_html_after_download = scraper.click_download_dropdown_option(retries=3)
                if page_html_after_download:
                    print("Download dropdown option clicked successfully.")

                    # You can now parse the page HTML after the click as needed
                    soup = BeautifulSoup(page_html_after_download, 'html.parser')
                    # Add your parsing logic here

                else:
                    print("Unable to click the Download dropdown option.")
            else:
                print("Unable to click the More Options button.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
More Options button found. Clicking...
More Options button clicked successfully.
Timed out waiting for the 'Download' dropdown option to load. Retrying (1/3)...
Timed out waiting for the 'Download' dropdown option to load. Retrying (2/3)...
Timed out waiting for the 'Download' dropdown option to load. Retrying (3/3)...
Max retries reached. Unable to click the 'Download' dropdown option.
Unable to click the Download dropdown option.
Scraper closed.


In [10]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def click_more_options_button(self, retries=3):
        """
        Click the "More Options" button on the search results page.
        If the button is not found, return None.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str or None: The HTML content of the page after the click, or None if the button is not found.
        """
        for attempt in range(retries):
            try:
                # Wait for the "More Options" button to be clickable
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#more-options-selector--publication-list-header"))
                )
                print("More Options button found. Clicking...")
                more_options_button.click()
                self.add_random_delay(2, 4)
                print('more options clicked')
                download_button = WebDriverWait(self.driver,30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss970.prod-jss966.prod-jss969.prod-jss981 > ul > section:nth-child(1)"))
                )
                self.add_random_delay(2, 4)
                download_button.click()
                print('download dropdown clicked')
                list_csv_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss970.prod-jss966.prod-jss969.prod-jss981 > ul > li:nth-child(3)"))
                )
                list_csv_button.click()
                print('list csv clicked')

                #  # Wait for the dropdown menu to appear
                # WebDriverWait(self.driver, 30).until(
                #      EC.presence_of_element_located((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss970.prod-jss966.prod-jss969.prod-jss981 > ul > section:nth-child(1)"))
                #  )
                # # print("Dropdown menu appeared after clicking More Options button.")

                # Add a delay to ensure the dropdown is fully rendered
                self.add_random_delay(2, 4)

                # Return the updated page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the 'More Options' button to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to click the 'More Options' button.")
                    return None
            except Exception as e:
                print(f"An error occurred while clicking the 'More Options' button: {e}")
                return None

    # def click_download_dropdown_option(self, retries=3):
    #     """
    #     Click the "Download" dropdown option in the menu.
    #     If the option is not found, return None.

    #     Args:
    #         retries (int): Number of retry attempts.

    #     Returns:
    #         str or None: The HTML content of the page after the click, or None if the option is not found.
    #     """
    #     for attempt in range(retries):
    #         try:
    #             # Wait for the "Download" dropdown option to be clickable
    #             download_option = WebDriverWait(self.driver, 30).until(
    #                 EC.element_to_be_clickable((By.CSS_SELECTOR, "#simple-dropdown > div.prod-jss694.prod-jss623.prod-jss626.prod-jss705 > ul > li:nth-child(3)"))
    #             )
    #             print("Download dropdown option found. Clicking...")
    #             download_option.click()
    #             self.add_random_delay(2, 4)

    #             # Wait for the action to complete (e.g., a file download or page update)
    #             WebDriverWait(self.driver, 30).until(
    #                 EC.presence_of_element_located((By.TAG_NAME, "body"))
    #             )
    #             print("Download dropdown option clicked successfully.")

    #             # Add a delay to ensure the action is fully processed
    #             self.add_random_delay(2, 4)

    #             # Return the updated page HTML
    #             return self.driver.page_source

    #         except TimeoutException:
    #             print(f"Timed out waiting for the 'Download' dropdown option to load. Retrying ({attempt + 1}/{retries})...")
    #             if attempt == retries - 1:
    #                 print("Max retries reached. Unable to click the 'Download' dropdown option.")
    #                 return None
    #         except Exception as e:
    #             print(f"An error occurred while clicking the 'Download' dropdown option: {e}")
    #             return None

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Click the "More Options" button and get the updated page HTML
            page_html_after_click = scraper.click_more_options_button(retries=3)
            if page_html_after_click:
                print("More Options button clicked successfully.")

                # Click the "Download" dropdown option and get the updated page HTML
                page_html_after_download = scraper.click_download_dropdown_option(retries=3)
                if page_html_after_download:
                    print("Download dropdown option clicked successfully.")

                    # You can now parse the updated page HTML as needed
                    soup = BeautifulSoup(page_html_after_download, 'html.parser')
                    # Add your parsing logic here

                else:
                    print("Unable to click the Download dropdown option.")
            else:
                print("Unable to click the More Options button.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
More Options button found. Clicking...
more options clicked
Timed out waiting for the 'More Options' button to load. Retrying (1/3)...
More Options button found. Clicking...
An error occurred while clicking the 'More Options' button: Message: element click intercepted: Element <button class="prod-jss36 prod-jss27" tabindex="0" type="button" aria-haspopup="true" id="more-options-selector--publication-list-header" aria-owns="simple-dropdown">...</button> is not clickable at point (767, 217). Other element would receive the click: <section class="prod-jss36 prod-jss306 prod-jss302 prod-jss303 prod-jss313 prod-jss314" tabindex="-1" role="menuitem" aria-disabled="false" style="padding-right: 8px;">...</section>
  (Session info: chrome

In [11]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def download_csv(self, retries=3):
        """
        Complete the sequence of clicking:
        1. More Options button
        2. Download dropdown
        3. List (CSV) option
        
        This handles overlapping elements and ensures each click is successful
        before proceeding to the next.

        Args:
            retries (int): Number of retry attempts for the entire sequence.

        Returns:
            bool: True if the download sequence was successful, False otherwise.
        """
        for attempt in range(retries):
            try:
                print(f"Attempting download sequence (Attempt {attempt + 1})...")
                
                # Step 1: Click "More Options" button
                print("Looking for More Options button...")
                more_options_selector = "#more-options-selector--publication-list-header"
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, more_options_selector))
                )
                
                # Try to click, but handle intercepted clicks
                try:
                    print("More Options button found. Clicking...")
                    more_options_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#more-options-selector--publication-list-header")', more_options_button)
                    
                self.add_random_delay(2, 3)
                print('More Options clicked successfully')
                
                # Step 2: Click "Download" section in the dropdown
                print("Looking for Download section...")
                # Use a more general selector to find the Download section
                # This uses contains() to match the text rather than a fixed CSS path
                download_section_xpath = "/html/body/div[2]/div[3]/ul/section[1]"
                download_section = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_section_xpath))
                )
                
                try:
                    print("Download section found. Clicking...")
                    download_section.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > section:nth-child(1)")', download_section)
                    
                self.add_random_delay(1, 2)
                print('Download section clicked successfully')
                
                # Step 3: Click "List (CSV)" option
                print("Looking for List (CSV) option...")
                # Use contains() with the XPATH to find the CSV option based on text
                csv_option_xpath = "/html/body/div[2]/div[3]/ul/li[2]"
                csv_option = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, csv_option_xpath))
                )
                
                try:
                    print("List (CSV) option found. Clicking...")
                    csv_option.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > li:nth-child(3)")', csv_option)
                    
                self.add_random_delay(2, 3)
                print('List (CSV) option clicked successfully')
                
                # Wait for download to start or download dialog to appear
                # Note: We can't directly check for file download completion,
                # but we can check if the page remains responsive
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                
                print("Download sequence completed successfully")
                return True
                
            except TimeoutException as e:
                print(f"Timeout during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
            except Exception as e:
                print(f"Error during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
                
            # If we reach here, there was an error and we need to try again
            # Refresh the page before the next attempt
            try:
                self.driver.refresh()
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                self.add_random_delay(3, 5)
            except Exception as e:
                print(f"Error refreshing page: {e}")

        return False

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Perform the download sequence
            if scraper.download_csv(retries=3):
                print("CSV download initiated successfully.")
                # Wait a bit to ensure the download starts
                time.sleep(10)
                print("Download should be complete or in progress.")
            else:
                print("Failed to download CSV.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
Attempting download sequence (Attempt 1)...
Looking for More Options button...
More Options button found. Clicking...
More Options clicked successfully
Looking for Download section...
Download section found. Clicking...
Download section clicked successfully
Looking for List (CSV) option...
List (CSV) option found. Clicking...
List (CSV) option clicked successfully
Download sequence completed successfully
CSV download initiated successfully.
Download should be complete or in progress.
Scraper closed.


final code

In [15]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

class EspacenetScraper:
    def __init__(self, headless=True):
        """Initialize the scraper with configurable options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url, retries=3):
        """
        Navigate to the given URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            url (str): The URL to navigate to.
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def download_csv(self, retries=3, max_results=500):
        """
        Complete the sequence of clicking:
        1. More Options button
        2. Download dropdown
        3. List (CSV) option
        4. Handle download dialog by:
           - Setting the "To" value to max_results (e.g., 500)
           - Clicking the Download button
        
        Args:
            retries (int): Number of retry attempts for the entire sequence.
            max_results (int): Maximum number of results to download (1-500).

        Returns:
            bool: True if the download sequence was successful, False otherwise.
        """
        for attempt in range(retries):
            try:
                print(f"Attempting download sequence (Attempt {attempt + 1})...")
                
                # Step 1: Click "More Options" button
                print("Looking for More Options button...")
                more_options_selector = "#more-options-selector--publication-list-header"
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, more_options_selector))
                )
                
                # Try to click, but handle intercepted clicks
                try:
                    print("More Options button found. Clicking...")
                    more_options_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#more-options-selector--publication-list-header")', more_options_button)
                    
                self.add_random_delay(2, 3)
                print('More Options clicked successfully')
                
                # Step 2: Click "Download" section in the dropdown
                print("Looking for Download section...")
                # Use a more general selector to find the Download section
                # This uses contains() to match the text rather than a fixed CSS path
                download_section_xpath = "/html/body/div[2]/div[3]/ul/section[1]"
                download_section = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_section_xpath))
                )
                
                try:
                    print("Download section found. Clicking...")
                    download_section.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > section:nth-child(1)")', download_section)
                    
                self.add_random_delay(1, 2)
                print('Download section clicked successfully')
                
                # Step 3: Click "List (CSV)" option
                print("Looking for List (CSV) option...")
                # Use contains() with the XPATH to find the CSV option based on text
                csv_option_xpath = "/html/body/div[2]/div[3]/ul/li[2]"
                csv_option = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, csv_option_xpath))
                )
                
                try:
                    print("List (CSV) option found. Clicking...")
                    csv_option.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > li:nth-child(3)")', csv_option)
                    
                self.add_random_delay(2, 3)
                print('List (CSV) option clicked successfully')
                
                # Step 4: Handle the download dialog
                print("Waiting for download dialog to appear...")
                
                # Wait for the dialog to appear
                download_dialog_xpath = "/html/body/div[2]/div[3]/div/div"
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_dialog_xpath))
                )
                print("Download dialog appeared")
                
                # Find the "To" input field
                to_input_xpath = "/html/body/div[2]/div[3]/div/div/div/div[1]/input[2]"
                to_input = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, to_input_xpath))
                )
                
                # Clear the input and set it to max_results
                print(f"Setting maximum results to {max_results}...")
                to_input.clear()
                to_input.send_keys(str(max_results))
                self.add_random_delay(1, 2)
                
                # Click the Download button in the dialog
                download_button_xpath = "/html/body/div[2]/div[3]/div/div/div/button"
                download_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_button_xpath))
                )
                
                try:
                    print("Download button found. Clicking...")
                    download_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("body > div.prod-jss12 > div.prod-jss15.prod-jss13 > div > div > div > button")', download_button)
                
                print("Download button clicked")
                
                # Wait for a moment to ensure the download starts
                self.add_random_delay(3, 5)
                
                # Check if there are any error messages
                try:
                    error_message = self.driver.find_element(By.XPATH, "//div[contains(@class, 'download-modal__validation')]//span")
                    if error_message.is_displayed() and error_message.text.strip():
                        print(f"Error in download dialog: {error_message.text}")
                        return False
                except:
                    # No error message found, continue
                    pass
                
                print("Download sequence completed successfully")
                return True
                
            except TimeoutException as e:
                print(f"Timeout during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
            except Exception as e:
                print(f"Error during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
                
            # If we reach here, there was an error and we need to try again
            # Refresh the page before the next attempt
            try:
                self.driver.refresh()
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                self.add_random_delay(3, 5)
            except Exception as e:
                print(f"Error refreshing page: {e}")

        return False

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()


if __name__ == '__main__':
    # Initialize the scraper
    scraper = EspacenetScraper(headless=False)  # Set headless to False to see the browser in action

    # Define the search URL
    search_url = 'https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr'

    try:
        # Get the page HTML
        html = scraper.get_page_html(search_url, retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Perform the download sequence with max 500 results
            if scraper.download_csv(retries=3, max_results=500):
                print("CSV download initiated successfully.")
                # Wait a bit to ensure the download starts
                time.sleep(10)
                print("Download should be complete or in progress.")
            else:
                print("Failed to download CSV.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")

Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt%20all%20"hydrogen"%20AND%20ctxt%20all%20"battery"%20AND%20ctxt%20%3D%20"electric%20vehicle"%20AND%20ctxt%20all%20"charging"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
Attempting download sequence (Attempt 1)...
Looking for More Options button...
More Options button found. Clicking...
More Options clicked successfully
Looking for Download section...
Download section found. Clicking...
Download section clicked successfully
Looking for List (CSV) option...
List (CSV) option found. Clicking...
List (CSV) option clicked successfully
Waiting for download dialog to appear...
Download dialog appeared
Setting maximum results to 500...
Download button found. Clicking...
Download button clicked
Download sequence completed successfully
CSV download initiated successfully.
Download should be complete or in progress.
Scraper closed.
