read the results into a dataframe 

changing the query to a list of keywords 

In [20]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as ucwe
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

class EspacenetScraper:
    def __init__(self, search_keywords, headless=True):
        """Initialize the scraper with configurable options and search keywords."""
        self.search_keywords = search_keywords
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def construct_search_url(self):
        """Construct the search URL based on the provided keywords."""
        base_url = 'https://worldwide.espacenet.com/patent/search?q='
        query = ' AND '.join([f'ctxt all "{keyword}"' for keyword in self.search_keywords])
        query += '&queryLang=en%3Ade%3Afr'
        return base_url + query

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, retries=3):
        """
        Navigate to the constructed URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        url = self.construct_search_url()
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def download_csv(self, retries=3, max_results=500):
        """
        Complete the sequence of clicking:
        1. More Options button
        2. Download dropdown
        3. List (CSV) option
        4. Handle download dialog by:
           - Setting the "To" value to max_results (e.g., 500)
           - Clicking the Download button
        
        Args:
            retries (int): Number of retry attempts for the entire sequence.
            max_results (int): Maximum number of results to download (1-500).

        Returns:
            bool: True if the download sequence was successful, False otherwise.
        """
        for attempt in range(retries):
            try:
                print(f"Attempting download sequence (Attempt {attempt + 1})...")
                
                # Step 1: Click "More Options" button
                print("Looking for More Options button...")
                more_options_selector = "#more-options-selector--publication-list-header"
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, more_options_selector))
                )
                
                # Try to click, but handle intercepted clicks
                try:
                    print("More Options button found. Clicking...")
                    more_options_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#more-options-selector--publication-list-header").click()', more_options_button)
                    
                self.add_random_delay(2, 3)
                print('More Options clicked successfully')
                
                # Step 2: Click "Download" section in the dropdown
                print("Looking for Download section...")
                # Use a more general selector to find the Download section
                # This uses contains() to match the text rather than a fixed CSS path
                download_section_xpath = "/html/body/div[2]/div[3]/ul/section[1]"
                download_section = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_section_xpath))
                )
                
                try:
                    print("Download section found. Clicking...")
                    download_section.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > section:nth-child(1)").click()', download_section)
                    
                self.add_random_delay(1, 2)
                print('Download section clicked successfully')
                
                # Step 3: Click "List (CSV)" option
                print("Looking for List (CSV) option...")
                # Use contains() with the XPATH to find the CSV option based on text
                csv_option_xpath = "/html/body/div[2]/div[3]/ul/li[2]"
                csv_option = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, csv_option_xpath))
                )
                
                try:
                    print("List (CSV) option found. Clicking...")
                    csv_option.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > li:nth-child(3)").click()', csv_option)
                    
                self.add_random_delay(2, 3)
                print('List (CSV) option clicked successfully')
                
                # Step 4: Handle the download dialog
                print("Waiting for download dialog to appear...")
                
                # Wait for the dialog to appear
                download_dialog_xpath = "/html/body/div[2]/div[3]/div/div"
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_dialog_xpath))
                )
                print("Download dialog appeared")
                
                # Find the "To" input field
                to_input_xpath = "/html/body/div[2]/div[3]/div/div/div/div[1]/input[2]"
                to_input = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, to_input_xpath))
                )
                
                # Clear the input and set it to max_results
                print(f"Setting maximum results to {max_results}...")
                to_input.clear()
                to_input.send_keys(str(max_results))
                self.add_random_delay(1, 2)
                
                # Click the Download button in the dialog
                download_button_xpath = "/html/body/div[2]/div[3]/div/div/div/button"
                download_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_button_xpath))
                )
                
                try:
                    print("Download button found. Clicking...")
                    download_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("body > div.prod-jss12 > div.prod-jss15.prod-jss13 > div > div > div > button").click()', download_button)
                
                print("Download button clicked")
                
                # Wait for a moment to ensure the download starts
                self.add_random_delay(3, 5)
                
                # Check if there are any error messages
                try:
                    error_message = self.driver.find_element(By.XPATH, "//div[contains(@class, 'download-modal__validation')]//span")
                    if error_message.is_displayed() and error_message.text.strip():
                        print(f"Error in download dialog: {error_message.text}")
                        return False
                except:
                    # No error message found, continue
                    pass
                
                print("Download sequence completed successfully")
                return True
                
            except TimeoutException as e:
                print(f"Timeout during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
            except Exception as e:
                print(f"Error during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
                
            # If we reach here, there was an error and we need to try again
            # Refresh the page before the next attempt
            try:
                self.driver.refresh()
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                self.add_random_delay(3, 5)
            except Exception as e:
                print(f"Error refreshing page: {e}")

        return False

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()




In [21]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

class EspacenetScraper:
    def __init__(self, search_keywords, headless=False):
        """Initialize the scraper with configurable options and search keywords."""
        self.search_keywords = search_keywords
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1600, 1300)

    def construct_search_url(self):
        """Construct the search URL based on the provided keywords and their search fields."""
        base_url = 'https://worldwide.espacenet.com/patent/search?q='
        
        # Mapping of search fields to Espacenet query parameters
        field_mapping = {
            'title': 'ti',
            'abstract': 'ab',
            'claims': 'cl',
            'title,abstract or claims': 'ctxt' ,
            'all text fields' : 'ftxt',
            'title or abstract' : 'ta',
            'description' : 'desc',
            'all text fields or names' : 'nftxt',
            'title , abstract or names' : 'ntxt'
              # Full text search
        }
        
        query_parts = []
        for keyword, field in self.search_keywords.items():
            field_param = field_mapping.get(field, 'ctxt')  # Default to 'ctxt' if field is unknown
            query_parts.append(f'{field_param} = "{keyword}"')
        
        query = ' AND '.join(query_parts)
        query += '&queryLang=en%3Ade%3Afr'
        
        return base_url + query

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, retries=3):
        """
        Navigate to the constructed URL and return the page HTML.
        Retry the operation if a timeout occurs.

        Args:
            retries (int): Number of retry attempts.

        Returns:
            str: The page HTML, or None if all retries fail.
        """
        url = self.construct_search_url()
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # Return the page HTML
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load. Retrying ({attempt + 1}/{retries})...")
                if attempt == retries - 1:
                    print("Max retries reached. Unable to load the page.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None

    def download_csv(self, retries=3, max_results=500):
        """
        Complete the sequence of clicking:
        1. More Options button
        2. Download dropdown
        3. List (CSV) option
        4. Handle download dialog by:
           - Setting the "To" value to max_results (e.g., 500)
           - Clicking the Download button
        
        Args:
            retries (int): Number of retry attempts for the entire sequence.
            max_results (int): Maximum number of results to download (1-500).

        Returns:
            bool: True if the download sequence was successful, False otherwise.
        """
        for attempt in range(retries):
            try:
                print(f"Attempting download sequence (Attempt {attempt + 1})...")
                
                # Step 1: Click "More Options" button
                print("Looking for More Options button...")
                more_options_selector = "#more-options-selector--publication-list-header"
                more_options_button = WebDriverWait(self.driver, 30).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, more_options_selector))
                )
                
                # Try to click, but handle intercepted clicks
                try:
                    print("More Options button found. Clicking...")
                    more_options_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#more-options-selector--publication-list-header").click()', more_options_button)
                    
                self.add_random_delay(2, 3)
                print('More Options clicked successfully')
                
                # Step 2: Click "Download" section in the dropdown
                print("Looking for Download section...")
                # Use a more general selector to find the Download section
                # This uses contains() to match the text rather than a fixed CSS path
                download_section_xpath = "/html/body/div[2]/div[3]/ul/section[1]"
                download_section = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_section_xpath))
                )
                
                try:
                    print("Download section found. Clicking...")
                    download_section.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > section:nth-child(1)").click()', download_section)
                    
                self.add_random_delay(1, 2)
                print('Download section clicked successfully')
                
                # Step 3: Click "List (CSV)" option
                print("Looking for List (CSV) option...")
                # Use contains() with the XPATH to find the CSV option based on text
                csv_option_xpath = "/html/body/div[2]/div[3]/ul/li[2]"
                csv_option = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, csv_option_xpath))
                )
                
                try:
                    print("List (CSV) option found. Clicking...")
                    csv_option.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("#simple-dropdown > div.prod-jss1034.prod-jss966.prod-jss969.prod-jss1045 > ul > li:nth-child(3)").click()', csv_option)
                    
                self.add_random_delay(2, 3)
                print('List (CSV) option clicked successfully')
                
                # Step 4: Handle the download dialog
                print("Waiting for download dialog to appear...")
                
                # Wait for the dialog to appear
                download_dialog_xpath = "/html/body/div[2]/div[3]/div/div"
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, download_dialog_xpath))
                )
                print("Download dialog appeared")
                
                # Find the "To" input field
                to_input_xpath = "/html/body/div[2]/div[3]/div/div/div/div[1]/input[2]"
                to_input = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, to_input_xpath))
                )
                
                # Clear the input and set it to max_results
                print(f"Setting maximum results to {max_results}...")
                to_input.clear()
                to_input.send_keys(str(max_results))
                self.add_random_delay(1, 2)
                
                # Click the Download button in the dialog
                download_button_xpath = "/html/body/div[2]/div[3]/div/div/div/button"
                download_button = WebDriverWait(self.driver, 10).until(
                    EC.element_to_be_clickable((By.XPATH, download_button_xpath))
                )
                
                try:
                    print("Download button found. Clicking...")
                    download_button.click()
                except ElementClickInterceptedException:
                    print("Click intercepted, trying JavaScript click...")
                    self.driver.execute_script('document.querySelector("body > div.prod-jss12 > div.prod-jss15.prod-jss13 > div > div > div > button").click()', download_button)
                
                print("Download button clicked")
                
                # Wait for a moment to ensure the download starts
                self.add_random_delay(3, 5)
                
                # Check if there are any error messages
                try:
                    error_message = self.driver.find_element(By.XPATH, "//div[contains(@class, 'download-modal__validation')]//span")
                    if error_message.is_displayed() and error_message.text.strip():
                        print(f"Error in download dialog: {error_message.text}")
                        return False
                except:
                    # No error message found, continue
                    pass
                
                print("Download sequence completed successfully")
                return True
                
            except TimeoutException as e:
                print(f"Timeout during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
            except Exception as e:
                print(f"Error during download sequence: {e}")
                if attempt == retries - 1:
                    print("Max retries reached. Download sequence failed.")
                    return False
                
            # If we reach here, there was an error and we need to try again
            # Refresh the page before the next attempt
            try:
                self.driver.refresh()
                WebDriverWait(self.driver, 30).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )
                self.add_random_delay(3, 5)
            except Exception as e:
                print(f"Error refreshing page: {e}")

        return False

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()




        'title': 'ti',
            'abstract': 'ab',
            'claims': 'cl',
            'title,abstract or claims': 'ctxt' ,
            'all text fields' : 'ftxt',
            'title or abstract' : 'ta',
            'description' : 'desc',
            'all text fields or names' : 'nftxt',
            'title , abstract or names' : 'ntxt'

In [23]:
if __name__ == '__main__':
    # Define the search keywords with their search fields
    search_keywords = {
        "Autonomous ": "title,abstract and claims",
        "Vehicles":"title,abstract and claims"
        

    }

    # Initialize the scraper with the search keywords
    scraper = EspacenetScraper(search_keywords, headless=False)  # Set headless to False to see the browser in action

    try:
        # Construct and print the search URL
        search_url = scraper.construct_search_url()
        print("Constructed Search URL:", search_url)
        
        # Get the page HTML
        html = scraper.get_page_html(retries=3)
        if html:
            print("Page HTML retrieved successfully.")

            # Perform the download sequence with max 500 results
            if scraper.download_csv(retries=3, max_results=500):
                print("CSV download initiated successfully.")
                # Wait a bit to ensure the download starts
                time.sleep(10)
                print("Download should be complete or in progress.")
            else:
                print("Failed to download CSV.")

    finally:
        # Close the browser
        scraper.close()
        print("Scraper closed.")


Constructed Search URL: https://worldwide.espacenet.com/patent/search?q=ctxt = "Autonomous " AND ctxt = "Vehicles"&queryLang=en%3Ade%3Afr
Navigating to: https://worldwide.espacenet.com/patent/search?q=ctxt = "Autonomous " AND ctxt = "Vehicles"&queryLang=en%3Ade%3Afr (Attempt 1)
Page HTML retrieved successfully.
Attempting download sequence (Attempt 1)...
Looking for More Options button...
More Options button found. Clicking...
More Options clicked successfully
Looking for Download section...
Download section found. Clicking...
Download section clicked successfully
Looking for List (CSV) option...
List (CSV) option found. Clicking...
List (CSV) option clicked successfully
Waiting for download dialog to appear...
Download dialog appeared
Setting maximum results to 500...
Download button found. Clicking...
Download button clicked
Download sequence completed successfully
CSV download initiated successfully.
Download should be complete or in progress.
Scraper closed.


In [1]:
import os
import glob

# Get the Downloads folder path
downloads_folder = os.path.expanduser("~/Downloads")

# Get all CSV files in the Downloads folder
list_of_files = glob.glob(os.path.join(downloads_folder, "*.csv"))

if list_of_files:  # Ensure there are CSV files
    latest_file = max(list_of_files, key=os.path.getmtime)
    print("Latest downloaded file:", latest_file)

    # Read the latest CSV file into a DataFrame
    import pandas as pd
    df = pd.read_csv(latest_file,delimiter=';', skiprows=7)
    df.head()
else:
    print("No CSV files found in Downloads.")


Latest downloaded file: C:\Users\tasni/Downloads\Résultat_de_la_recherche_dans_Espacenet_20250505_1339.csv


In [2]:
import pandas as pd

# Assuming df is your DataFrame
df.rename(columns={
    'Titre': 'Title',
    'Inventeurs': 'Inventors',
    'Demandeurs': 'Applicants',
    'Numéro de publication': 'Publication number',
    'Priorité la plus ancienne': 'Earliest priority',
    'CIB': 'IPC',
    'CPC': 'CPC',
    'Date de publication': 'Publication date',
    'Publication la plus ancienne': 'Earliest publication',
    'Numéro de famille': 'Family number'
}, inplace=True)


In [3]:

df[['first publication date','second publication date']] = df['Publication date'].str.split(' ' , n=1 , expand= True)
df['second publication date'] = df['second publication date'].str.strip('\n')
df['second publication date'] = df['second publication date'].str.strip('\r')
df['second publication date'] = df['second publication date'].str.strip('\n')


In [4]:
#first filing country 
df[['first publication number', 'second publication number']] = df['Publication number'].str.split(' ' , n=1 , expand=True)



In [5]:
df.head()

Unnamed: 0,No,Title,Inventors,Applicants,Publication number,Earliest priority,IPC,CPC,Publication date,Earliest publication,Family number,Unnamed: 11,first publication date,second publication date,first publication number,second publication number
0,1,AUTONOMOUS GANGED VEHICLES,HIGH DONALD R [US] \r\nWINKLE DAVID C [US] \r\...,WAL MART STORES INC [US] \r\nWALMART APOLLO LL...,US10254766B2 \r\nUS2018129223A1,2016-11-09,G05D1/00 \r\nG05D1/02 \r\nB64C1/20,"G05D1/0022 (GB,US) \r\nG05D1/0027 (GB,US) \r\n...",2018-05-10 \r\n2019-04-09,2018-05-10,62065506,,2018-05-10,2019-04-09,US10254766B2,\r\nUS2018129223A1
1,2,Method of communication of travel destination ...,SCHUSTER KILIAN [CH] \r\nFRIEDLI PAUL [CH],INVENTIO AG [CH],US6394231B1,1999-04-22,B66B1/18 \r\nB66B3/00 \r\nB66B9/16,"B66B3/00 (EP,US) \r\nB66B3/006 (EP,US)",2002-05-28,2000-10-22,8242784,,2002-05-28,,US6394231B1,
2,3,AUTONOMOUS UNMANNED UNDERWATER VEHICLES,LICHTER HARRY J [US] \r\nBAKER WAYNE A [US] \r...,LOCKHEED CORP [US],US10196117B2 \r\nUS2017081004A1,2015-09-21,B63G8/32 \r\nB63G8/00 \r\nB63G8/08 \r\nB63G8/1...,"B63G8/001 (EP,US) \r\nB63G8/08 (US) \r\nB63G8/...",2017-03-23 \r\n2019-02-05,2017-03-23,58276622,,2017-03-23,2019-02-05,US10196117B2,\r\nUS2017081004A1
3,4,DISPATCH SYSTEM FOR AUTONOMOUS VEHICLES,HUANG SHIH-CHIA [TW] \r\nJIAU MING-KAI [TW] \r...,UNIV NAT TAIPEI TECHNOLOGY [TW],US2017059336A1 \r\nUS9772197B2,2015-08-31,G01C21/34 \r\nG05D1/02 \r\nG06Q10/08 \r\nG06Q1...,"G01C21/34 (US) \r\nG01C21/343 (EP,US) \r\nG05D...",2017-03-02 \r\n2017-09-26,2017-03-01,58104312,,2017-03-02,2017-09-26,US2017059336A1,\r\nUS9772197B2
4,5,INTELLIGENT COLLABORATION BETWEEN AUTONOMOUS A...,RAKSHIT SARBAJIT K [IN] \r\nJAKKULA SATYAM [IN...,IBM [US],US2024242599A1,2023-01-13,B60K35/00 \r\nB64C39/02 \r\nB64U20/87 \r\nG08G...,"B60K35/00 (EP,US) \r\nB64C39/024 (EP,US) \r\nB...",2024-07-18,2024-07-18,91854893,,2024-07-18,,US2024242599A1,


In [6]:
df_patents = df

In [7]:
df_patents = df_patents.applymap(
    lambda x: x.replace('\r\n', ' ') if isinstance(x, str) else x
)


  df_patents = df_patents.applymap(


In [8]:
df_patents.rename(columns={'No': 'id'}, inplace=True)


In [9]:
if 'Unnamed: 11' in df_patents.columns:
    df_patents.drop(columns=['Unnamed: 11','Publication date'], inplace=True)


In [10]:
df_patents['Family number'] = pd.to_numeric(df_patents['Family number'], errors='coerce')


In [11]:
df.rename(columns={'Family number': 'family number'}, inplace=True)

In [None]:
#3 threads
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd
import threading

class PatentsSearch:
    def __init__(self, headless=True):
        """Initialize the scraper with enhanced compatibility options."""
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--disable-extensions')
        
        try:
            self.driver = uc.Chrome(
                options=options,
                use_subprocess=True,
                version_main=None,
                suppress_welcome=True,
                debug=False
            )
            self.driver.set_page_load_timeout(30)
            self.driver.set_window_size(1920, 1080)
        except Exception as e:
            print(f"Failed to initialize ChromeDriver: {e}")
            print("Trying alternative initialization method...")
            self.driver = uc.Chrome(
                options=options,
                driver_executable_path=None
            )

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url):
        """Navigate to the given URL and return the page HTML."""
        try:
            print(f"Navigating to: {url}")
            self.driver.get(url)
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "h5"))
            )
            self.add_random_delay(3, 5)
            return self.driver.page_source
        except TimeoutException:
            print("Timed out waiting for the page to load.")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def parse_html(self, html):
        """Parse the HTML and extract all span elements inside the 'Published as' content."""
        soup = BeautifulSoup(html, 'html.parser')
        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Publié en tant que" in tag.text or "Published as" in tag.text))
        if published_as_element:
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                spans = content_element.find_all('span')
                return [span.get_text(strip=True) for span in spans]
        return []

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()

def process_rows(df, indices):
    """Process a subset of DataFrame rows using a dedicated PatentsSearch instance."""
    scraper = PatentsSearch(headless=False)  # Set to False as per task requirement for visible windows
    try:
        for index in indices:
            row = df.loc[index]
            url = f"https://worldwide.espacenet.com/patent/search/family/{row['family number']}/publication/{row['first publication number']}?q=hydrogen%20battery"
            html = scraper.get_page_html(url)
            if html:
                family_members = scraper.parse_html(html)
                df.at[index, 'family_members'] = family_members
            else:
                print(f"Failed to retrieve the page HTML for {row['first publication number']}.")
    finally:
        scraper.close()

if __name__ == "__main__":
    # Assuming df is defined elsewhere with 'family number' and 'first publication number' columns
    df = df.head(280)
    df['family_members'] = None

    # Split the DataFrame indices into three parts
    indices = df.index.tolist()
    n = len(indices)
    part_size = n // 5
    remainder = n % 5
    parts = []
    start = 0
    for i in range(5):
        if i < remainder:
            end = start + part_size + 1
        else:
            end = start + part_size
        parts.append(indices[start:end])
        start = end

    # Create three threads, each with its own PatentsSearch instance
    threads = []
    for part in parts:
        thread = threading.Thread(target=process_rows, args=(df, part))
        threads.append(thread)

    # Start all three threads to run three browser windows concurrently
    for thread in threads:
        thread.start()

    # Wait for all threads to complete
    for thread in threads:
        thread.join()

    print("All threads finished.")
    # df now contains the 'family_members' column filled from all three threads

Failed to initialize ChromeDriver: [WinError 183] Impossible de créer un fichier déjà existant: 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected\\chromedriver-win32\\chromedriver.exe' -> 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected_chromedriver.exe'
Trying alternative initialization method...
Navigating to: https://worldwide.espacenet.com/patent/search/family/88237912/publication/WO2024088680A1?q=hydrogen%20battery
Navigating to: https://worldwide.espacenet.com/patent/search/family/62065506/publication/US10254766B2?q=hydrogen%20battery
Failed to initialize ChromeDriver: [WinError 183] Impossible de créer un fichier déjà existant: 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected\\chromedriver-win32\\chromedriver.exe' -> 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected_chromedriver.exe'
Trying alternative initialization method...
Navigating to: https://worldwide.espacenet.com/patent/search

Exception in thread Thread-7 (process_rows):
Traceback (most recent call last):
  File "C:\Users\tasni\AppData\Local\Temp\ipykernel_12584\2025829514.py", line 25, in __init__
  File "C:\Users\tasni\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\undetected_chromedriver\__init__.py", line 258, in __init__
    self.patcher.auto()
  File "C:\Users\tasni\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\undetected_chromedriver\patcher.py", line 178, in auto
    self.unzip_package(self.fetch_package())
  File "C:\Users\tasni\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\undetected_chromedriver\patcher.py", line 310, in unzip_package
    os.rename(os.path.join(self.zip_path, exe_path), self.executable_path)
FileExistsError: [WinError 183] Impossible de créer un fichie

Timed out waiting for the page to load.
Failed to retrieve the page HTML for WO2024088680A1.
Navigating to: https://worldwide.espacenet.com/patent/search/family/65275375/publication/US10453275B2?q=hydrogen%20battery
Timed out waiting for the page to load.
Failed to retrieve the page HTML for US10254766B2.
Navigating to: https://worldwide.espacenet.com/patent/search/family/8242784/publication/US6394231B1?q=hydrogen%20battery
Timed out waiting for the page to load.
Failed to retrieve the page HTML for US11763672B2.
Navigating to: https://worldwide.espacenet.com/patent/search/family/58584087/publication/US10255168B2?q=hydrogen%20battery
Timed out waiting for the page to load.
Failed to retrieve the page HTML for US2016021178A1.
Navigating to: https://worldwide.espacenet.com/patent/search/family/67541817/publication/US11158194B2?q=hydrogen%20battery
Timed out waiting for the page to load.
Failed to retrieve the page HTML for US10453275B2.
Navigating to: https://worldwide.espacenet.com/pate

In [28]:
df[df['family_members'].apply(lambda x: isinstance(x, list) and len(x) == 0)].head()

Unnamed: 0,No,Title,Inventors,Applicants,Publication number,Earliest priority,IPC,CPC,Publication date,Earliest publication,family number,Unnamed: 11,first publication date,second publication date,first publication number,second publication number,family_members
0,1,AUTONOMOUS GANGED VEHICLES,HIGH DONALD R [US] \r\nWINKLE DAVID C [US] \r\...,WAL MART STORES INC [US] \r\nWALMART APOLLO LL...,US10254766B2 \r\nUS2018129223A1,2016-11-09,G05D1/00 \r\nG05D1/02 \r\nB64C1/20,"G05D1/0022 (GB,US) \r\nG05D1/0027 (GB,US) \r\n...",2018-05-10 \r\n2019-04-09,2018-05-10,62065506,,2018-05-10,2019-04-09,US10254766B2,\r\nUS2018129223A1,[]
1,2,Method of communication of travel destination ...,SCHUSTER KILIAN [CH] \r\nFRIEDLI PAUL [CH],INVENTIO AG [CH],US6394231B1,1999-04-22,B66B1/18 \r\nB66B3/00 \r\nB66B9/16,"B66B3/00 (EP,US) \r\nB66B3/006 (EP,US)",2002-05-28,2000-10-22,8242784,,2002-05-28,,US6394231B1,,[]
2,3,AUTONOMOUS UNMANNED UNDERWATER VEHICLES,LICHTER HARRY J [US] \r\nBAKER WAYNE A [US] \r...,LOCKHEED CORP [US],US10196117B2 \r\nUS2017081004A1,2015-09-21,B63G8/32 \r\nB63G8/00 \r\nB63G8/08 \r\nB63G8/1...,"B63G8/001 (EP,US) \r\nB63G8/08 (US) \r\nB63G8/...",2017-03-23 \r\n2019-02-05,2017-03-23,58276622,,2017-03-23,2019-02-05,US10196117B2,\r\nUS2017081004A1,[]
7,8,MANAGEMENT OF MULTIPLE AUTONOMOUS VEHICLES,MARCZUK KATARZYNA ANNA [SG] \r\nALBERT MARC LA...,APTIV TECH LTD [BB],US2020042019A1,2018-08-02,G05D1/02 \r\nG06Q10/02 \r\nG06Q50/30 \r\nG08G1/00,G05D1/0214 (CN) \r\nG05D1/0221 (CN) \r\nG05D1/...,2020-02-06,2020-02-06,69228552,,2020-02-06,,US2020042019A1,,[]
9,10,GESTION DE PLUSIEURS VÉHICULES AUTONOMES,MARCZUK KATARZYNA ANNA [SG] \r\nALBERT MARC LA...,APTIV TECH LTD [BB] \r\nMOTIONAL AD LLC [US],EP3605488A1 \r\nEP3605488B1,2018-08-02,G05D1/02 \r\nG06Q50/30 \r\nG01C21/34 \r\nG06Q1...,G01C21/3415 (EP) \r\nG01C21/3438 (EP) \r\nG06Q...,2020-02-05 \r\n2024-12-11,2020-02-05,67438109,,2020-02-05,2024-12-11,EP3605488A1,\r\nEP3605488B1,[]


In [27]:
len(df['family_members'].dropna())

280

In [1]:
empty_family_members_count = df['family_members'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
print(f"Number of rows with empty family_members: {empty_family_members_count}")

NameError: name 'df' is not defined

In [37]:
#parallel processing chatgpt
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv

# Global token cache
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token() -> str:
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country')
                country = country.get('$') if isinstance(country, dict) else country
                doc_number = doc.get('doc-number')
                doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                kind = doc.get('kind')
                kind = kind.get('$') if isinstance(kind, dict) else kind
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent(patent: str) -> dict:
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception:
        return {'jurisdictions': None, 'family_members': None}

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = 10) -> pd.DataFrame:
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    patents = df[patent_col].tolist()
    results = {}

    # Use ThreadPoolExecutor for I/O-bound API calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_patent = {executor.submit(process_patent, p): p for p in patents}
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                results[patent] = future.result()
            except Exception:
                results[patent] = {'jurisdictions': None, 'family_members': None}
            # Optional small sleep to space requests
            time.sleep(0.1)

    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage:
if __name__ == "__main__":
    #df = pd.read_csv('your_patents.csv')
    processed_df = process_dataframe_parallel(df, 'first publication number', max_workers=20)
    processed_df[['first publication number', 'family_jurisdictions', 'family_members']]



In [41]:
df[['first publication number', 'family_members']].isnull().sum()

first publication number      0
family_members              278
dtype: int64

In [2]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

NameError: name 'processed_df' is not defined

In [16]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

Unnamed: 0,first publication number,family_jurisdictions,family_members
0,US2006250902A1,"[AU, CA, CN, EP, HK, JP, KR, US, WO]","[AU2006295147A1, AU2006295147B2, CA2623398A1, ..."
1,KR102511398B1,[KR],[KR102511398B1]
2,KR102511391B1,[KR],[KR102511391B1]
3,GB2631101A,"[GB, WO]","[GB2631101A, WO2024261465A1]"
4,KR20230163874A,[KR],[KR20230163874A]
...,...,...,...
495,JP3334587B2,,
496,CN111452613A,[CN],[CN111452613A]
497,CN211844104U,[CN],[CN211844104U]
498,CN205544506U,[CN],[CN205544506U]


In [None]:
import concurrent.futures
import os
import requests
import time
import logging
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
from functools import lru_cache

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Global token cache
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Rate limiting settings
REQUEST_INTERVAL = 0.2  # seconds between requests
MAX_RETRIES = 3

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token() -> str:
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    
    logger.info("Getting new access token")
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
            response.raise_for_status()
            TOKEN = response.json()["access_token"]
            TOKEN_EXPIRY = time.time() + 3500  # Slightly less than 1 hour
            logger.info("New token acquired")
            return TOKEN
        except Exception as e:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"Failed to get token (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
            time.sleep(wait_time)
    
    logger.error("Failed to get access token after multiple attempts")
    raise Exception("Failed to get access token")

def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

@lru_cache(maxsize=1000)
def process_patent_with_cache(patent: str, token: str) -> dict:
    """Cached version of process_patent to avoid redundant API calls"""
    return process_patent_internal(patent, token)

def process_patent_internal(patent: str, token: str) -> dict:
    """Internal function that actually processes the patent"""
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}
    
    for attempt in range(MAX_RETRIES):
        try:
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            
            logger.debug(f"Requesting data for patent: {patent}")
            response = requests.get(url, headers=headers, timeout=15)
            
            if response.status_code == 401:  # Unauthorized - token expired
                logger.warning("Token expired, getting a new one")
                return None  # Signal to the caller that we need a new token
                
            if response.status_code == 403:
                logger.warning(f"Access forbidden for patent {patent}. Possibly rate limited.")
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                time.sleep(wait_time)
                continue
                
            if response.status_code == 404:
                logger.info(f"No data found for patent {patent}")
                return {'jurisdictions': None, 'family_members': None}
                
            response.raise_for_status()
            result = extract_jurisdictions_and_members(response.json())
            logger.debug(f"Successfully processed patent {patent}")
            return result
            
        except Exception as e:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"Error processing patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
            time.sleep(wait_time)
    
    logger.error(f"Failed to process patent {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    country = country.get('$') if isinstance(country, dict) else country
                    doc_number = doc.get('doc-number')
                    doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                    kind = doc.get('kind')
                    kind = kind.get('$') if isinstance(kind, dict) else kind
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.error(f"Error extracting jurisdictions and members: {str(e)}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    """Wrapper function to process a patent with proper token management"""
    # Skip None/NaN values
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    
    # Get the current token
    token = get_access_token()
    
    # Try to process the patent
    result = process_patent_with_cache(patent, token)
    
    # If we got None, the token expired, get a new one and retry
    if result is None:
        token = get_access_token()  # Force a token refresh
        result = process_patent_with_cache(patent, token)
    
    return result

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = 5) -> pd.DataFrame:
    """Process patents in parallel with better rate limiting and error handling"""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Make a copy to avoid modifying the original
    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    
    logger.info(f"Starting to process {len(patents)} patents with {max_workers} workers")
    
    # Get an access token before we start processing
    get_access_token()
    
    # Use ThreadPoolExecutor for I/O-bound API calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_patent = {executor.submit(process_patent, p): p for p in patents}
        
        completed = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                results[patent] = future.result()
                completed += 1
                if completed % 10 == 0:
                    logger.info(f"Processed {completed}/{len(patents)} patents")
            except Exception as e:
                logger.error(f"Unhandled exception for patent {patent}: {str(e)}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
            
            # Space out requests to avoid rate limiting
            time.sleep(REQUEST_INTERVAL)
    
    logger.info(f"Completed processing {len(patents)} patents")
    
    # Apply results to the dataframe
    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    
    # Log statistics
    success_count = sum(1 for r in results.values() if r['jurisdictions'] is not None)
    logger.info(f"Successfully processed {success_count} out of {len(patents)} patents ({success_count/len(patents)*100:.1f}%)")
    
    return result_df

# Example usage:
if __name__ == "__main__":
    # df = pd.read_csv('your_patents.csv')
    processed_df = process_dataframe_parallel(df, 'first publication number', max_workers=5)
    processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

2025-04-21 18:16:52,832 - INFO - Starting to process 500 patents with 5 workers
2025-04-21 18:16:52,833 - INFO - Getting new access token


2025-04-21 18:16:55,326 - INFO - New token acquired
2025-04-21 18:17:23,306 - ERROR - Failed to process patent US10958679B2 after 3 attempts
2025-04-21 18:17:23,634 - INFO - Processed 10/500 patents
2025-04-21 18:17:35,343 - ERROR - Failed to process patent US11546360B2 after 3 attempts
2025-04-21 18:17:42,477 - ERROR - Failed to process patent US2015100357A1 after 3 attempts
2025-04-21 18:17:42,879 - INFO - Processed 20/500 patents
2025-04-21 18:17:53,709 - ERROR - Failed to process patent US10142346B2 after 3 attempts
2025-04-21 18:18:01,450 - ERROR - Failed to process patent US2014053280A1 after 3 attempts
2025-04-21 18:18:01,450 - INFO - Processed 30/500 patents
2025-04-21 18:18:14,835 - INFO - Processed 40/500 patents
2025-04-21 18:18:23,180 - ERROR - Failed to process patent US2020007553A1 after 3 attempts
2025-04-21 18:18:23,303 - ERROR - Failed to process patent US11122426B2 after 3 attempts
2025-04-21 18:18:31,767 - INFO - Processed 50/500 patents
2025-04-21 18:18:39,115 - INF

In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

In [None]:
import concurrent.futures
import os
import requests
import time
import logging
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
from functools import lru_cache
import threading

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Global token cache
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Rate limiting settings - much more conservative
REQUEST_INTERVAL = 1.0  # 1 second between requests
MAX_WORKERS = 3         # Limit concurrent connections
MAX_RETRIES = 5         # More retries
TIMEOUT = 30            # Longer timeout

# Rate limiter with token bucket algorithm
class RateLimiter:
    def __init__(self, rate=1, capacity=5):
        self.rate = rate            # tokens per second
        self.capacity = capacity    # maximum tokens
        self.tokens = capacity      # current tokens
        self.last_refill = time.time()
        self.lock = threading.Lock()
    
    def consume(self, tokens=1):
        with self.lock:
            self._refill()
            if self.tokens >= tokens:
                self.tokens -= tokens
                return True
            else:
                sleep_time = (tokens - self.tokens) / self.rate
                logger.debug(f"Rate limited, sleeping for {sleep_time:.2f}s")
                time.sleep(sleep_time)
                self._refill()
                self.tokens -= tokens
                return True
    
    def _refill(self):
        now = time.time()
        elapsed = now - self.last_refill
        new_tokens = elapsed * self.rate
        self.tokens = min(self.capacity, self.tokens + new_tokens)
        self.last_refill = now

# Create a rate limiter - 1 request per second, max 3 burst
rate_limiter = RateLimiter(rate=0.5, capacity=3)

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Token lock to prevent multiple simultaneous token requests
token_lock = threading.Lock()

def get_access_token(force_refresh=False) -> str:
    global TOKEN, TOKEN_EXPIRY
    
    with token_lock:
        if not force_refresh and TOKEN and time.time() < TOKEN_EXPIRY:
            return TOKEN
        
        logger.info("Getting new access token")
        data = {
            "grant_type": "client_credentials",
            "client_id": CONSUMER_KEY,
            "client_secret": CONSUMER_SECRET
        }
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                response.raise_for_status()
                TOKEN = response.json()["access_token"]
                TOKEN_EXPIRY = time.time() + 3500  # Slightly less than 1 hour
                logger.info("New token acquired")
                return TOKEN
            except Exception as e:
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                logger.warning(f"Failed to get token (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
                time.sleep(wait_time)
        
        logger.error("Failed to get access token after multiple attempts")
        raise Exception("Failed to get access token")

def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

@lru_cache(maxsize=1000)
def process_patent_with_cache(patent: str) -> dict:
    """Cached version of process_patent to avoid redundant API calls"""
    return process_patent_internal(patent)

def backoff_sleep(attempt, base=2.0, max_sleep=60):
    """Calculate and sleep for exponential backoff time with jitter"""
    sleep_time = min(base ** attempt + random.uniform(0, 1), max_sleep)
    logger.debug(f"Backing off for {sleep_time:.2f}s")
    time.sleep(sleep_time)

def process_patent_internal(patent: str) -> dict:
    """Internal function that actually processes the patent"""
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}
    
    for attempt in range(MAX_RETRIES):
        try:
            # Consume a token from the rate limiter
            rate_limiter.consume()
            
            # Get a fresh token if needed
            token = get_access_token()
            
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            
            logger.debug(f"Requesting data for patent: {patent}")
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
            
            if response.status_code == 401:  # Unauthorized - token expired
                logger.warning("Token expired, getting a new one")
                get_access_token(force_refresh=True)
                backoff_sleep(attempt)
                continue
                
            if response.status_code == 403:
                logger.warning(f"Access forbidden for patent {patent}. Possibly rate limited.")
                backoff_sleep(attempt + 2)  # More aggressive backoff for rate limiting
                continue
                
            if response.status_code == 404:
                logger.info(f"No data found for patent {patent}")
                return {'jurisdictions': None, 'family_members': None}
                
            response.raise_for_status()
            result = extract_jurisdictions_and_members(response.json())
            logger.debug(f"Successfully processed patent {patent}")
            return result
            
        except requests.exceptions.Timeout:
            logger.warning(f"Timeout for patent {patent} (attempt {attempt+1}/{MAX_RETRIES})")
            backoff_sleep(attempt)
            continue
            
        except requests.exceptions.ConnectionError as e:
            logger.warning(f"Connection error for patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}")
            backoff_sleep(attempt + 1)  # More aggressive backoff for connection errors
            continue
            
        except Exception as e:
            logger.warning(f"Error processing patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}")
            backoff_sleep(attempt)
            continue
    
    logger.error(f"Failed to process patent {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    country = country.get('$') if isinstance(country, dict) else country
                    doc_number = doc.get('doc-number')
                    doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                    kind = doc.get('kind')
                    kind = kind.get('$') if isinstance(kind, dict) else kind
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.error(f"Error extracting jurisdictions and members: {str(e)}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    """Wrapper function to call the cached version"""
    # Skip None/NaN values
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, chunk_size=50) -> pd.DataFrame:
    """Process patents in smaller chunks to avoid overwhelming the API"""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Make a copy to avoid modifying the original
    result_df = df.copy()
    patents = df[patent_col].tolist()
    total_patents = len(patents)
    results = {}
    
    logger.info(f"Starting to process {total_patents} patents in chunks of {chunk_size}")
    
    # Get an access token before we start processing
    get_access_token()
    
    # Process in chunks to avoid overwhelming the API
    for chunk_start in range(0, total_patents, chunk_size):
        chunk_end = min(chunk_start + chunk_size, total_patents)
        current_chunk = patents[chunk_start:chunk_end]
        
        logger.info(f"Processing chunk {chunk_start//chunk_size + 1}/{(total_patents+chunk_size-1)//chunk_size} " +
                   f"(patents {chunk_start+1}-{chunk_end})")
        
        # Use ThreadPoolExecutor for I/O-bound API calls
        with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            # Submit all tasks for this chunk
            future_to_patent = {executor.submit(process_patent, p): p for p in current_chunk}
            
            for future in concurrent.futures.as_completed(future_to_patent):
                patent = future_to_patent[future]
                try:
                    results[patent] = future.result()
                except Exception as e:
                    logger.error(f"Unhandled exception for patent {patent}: {str(e)}")
                    results[patent] = {'jurisdictions': None, 'family_members': None}
        
        # Calculate success rate for this chunk
        chunk_success = sum(1 for p in current_chunk if results.get(p, {}).get('jurisdictions') is not None)
        logger.info(f"Chunk complete: {chunk_success}/{len(current_chunk)} successful " +
                   f"({chunk_success/len(current_chunk)*100:.1f}%)")
        
        # If success rate is too low, pause to avoid rate limiting
        if chunk_success < len(current_chunk) * 0.5:
            pause_time = 60  # 1 minute pause
            logger.warning(f"Low success rate detected, pausing for {pause_time} seconds")
            time.sleep(pause_time)
    
    logger.info(f"Completed processing {total_patents} patents")
    
    # Apply results to the dataframe
    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    
    # Log statistics
    success_count = sum(1 for r in results.values() if r['jurisdictions'] is not None)
    logger.info(f"Successfully processed {success_count} out of {total_patents} patents ({success_count/total_patents*100:.1f}%)")
    
    return result_df

# Example usage:
if __name__ == "__main__":
    # df = pd.read_csv('your_patents.csv')
    processed_df = process_dataframe_parallel(df, 'first publication number', chunk_size=50)
    #processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

2025-04-21 18:34:44,767 - INFO - Starting to process 500 patents in chunks of 50
2025-04-21 18:34:44,768 - INFO - Getting new access token
2025-04-21 18:34:45,074 - INFO - New token acquired
2025-04-21 18:34:45,075 - INFO - Processing chunk 1/10 (patents 1-50)
2025-04-21 18:37:10,882 - INFO - Chunk complete: 50/50 successful (100.0%)
2025-04-21 18:37:10,883 - INFO - Processing chunk 2/10 (patents 51-100)
2025-04-21 18:40:33,596 - INFO - Chunk complete: 50/50 successful (100.0%)
2025-04-21 18:40:33,596 - INFO - Processing chunk 3/10 (patents 101-150)
2025-04-21 18:42:11,940 - INFO - Chunk complete: 50/50 successful (100.0%)
2025-04-21 18:42:11,941 - INFO - Processing chunk 4/10 (patents 151-200)
2025-04-21 18:43:52,078 - INFO - Chunk complete: 50/50 successful (100.0%)
2025-04-21 18:43:52,079 - INFO - Processing chunk 5/10 (patents 201-250)
2025-04-21 18:45:31,923 - INFO - Chunk complete: 50/50 successful (100.0%)
2025-04-21 18:45:31,924 - INFO - Processing chunk 6/10 (patents 251-300)


In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

parallelism + single token

In [16]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
import logging
from functools import lru_cache

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Global token cache
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Fast but not too fast settings
MAX_WORKERS = 10  # High parallelism
TIMEOUT = 20      # Reasonable timeout
MAX_RETRIES = 3   # Quick retries

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token() -> str:
    global TOKEN, TOKEN_EXPIRY
    current_time = time.time()
    if TOKEN and current_time < TOKEN_EXPIRY:
        return TOKEN
    
    logger.info("Getting new access token")
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    
    for attempt in range(MAX_RETRIES):
        try:
            response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
            response.raise_for_status()
            TOKEN = response.json()["access_token"]
            TOKEN_EXPIRY = current_time + 3500  # Slightly less than 1 hour
            logger.info("New token acquired")
            return TOKEN
        except Exception as e:
            wait_time = (2 ** attempt) + random.uniform(0, 1)
            logger.warning(f"Failed to get token (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
            time.sleep(wait_time)
    
    logger.error("Failed to get access token after multiple attempts")
    raise Exception("Failed to get access token")

def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

@lru_cache(maxsize=1000)
def process_patent_with_cache(patent: str) -> dict:
    """Cached version of process_patent to avoid redundant API calls"""
    # Get token here once, not for every retry
    token = get_access_token()
    return process_patent_internal(patent, token)

def process_patent_internal(patent: str, token: str) -> dict:
    """Internal function that actually processes the patent"""
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}
    
    for attempt in range(MAX_RETRIES):
        try:
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
            
            if response.status_code == 401:  # Unauthorized - token expired
                logger.warning("Token expired, getting a new one")
                token = get_access_token()  # Get a fresh token
                continue
                
            if response.status_code == 403:
                # Fast retry for rate limiting
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                logger.debug(f"Rate limited for patent {patent}, backing off for {wait_time:.2f}s")
                time.sleep(wait_time)
                continue
                
            if response.status_code == 404:
                logger.debug(f"No data found for patent {patent}")
                return {'jurisdictions': None, 'family_members': None}
                
            response.raise_for_status()
            result = extract_jurisdictions_and_members(response.json())
            return result
            
        except Exception as e:
            wait_time = 0.5 * (2 ** attempt) + random.uniform(0, 0.5)
            if attempt < MAX_RETRIES - 1:  # Only log if we'll retry
                logger.debug(f"Error processing patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
                time.sleep(wait_time)
    
    logger.warning(f"Failed to process patent {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    country = country.get('$') if isinstance(country, dict) else country
                    doc_number = doc.get('doc-number')
                    doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                    kind = doc.get('kind')
                    kind = kind.get('$') if isinstance(kind, dict) else kind
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.debug(f"Error extracting jurisdictions and members: {str(e)}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    """Wrapper function to call the cached version"""
    # Skip None/NaN values
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    """Process patents in parallel with optimized speed and reliability"""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Make a copy to avoid modifying the original
    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    
    logger.info(f"Starting to process {len(patents)} patents with {max_workers} workers")
    
    # Get a token before starting
    get_access_token()
    
    # Use ThreadPoolExecutor for I/O-bound API calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_patent = {executor.submit(process_patent, p): p for p in patents}
        
        # Track progress
        completed = 0
        successful = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                result = future.result()
                results[patent] = result
                completed += 1
                
                # Count successful results
                if result['jurisdictions'] is not None:
                    successful += 1
                
                # Log progress at intervals
                if completed % 20 == 0:
                    success_rate = (successful / completed) * 100
                    logger.info(f"Processed {completed}/{len(patents)} patents - {success_rate:.1f}% success rate")
                    
                    # Optional: If success rate drops too low, slow down
                    if completed >= 50 and success_rate < 30:
                        logger.warning("Low success rate detected - consider reducing parallelism or spacing requests")
                
            except Exception as e:
                logger.error(f"Unhandled exception for patent {patent}: {str(e)}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    logger.info(f"Completed processing {len(patents)} patents")
    
    # Apply results to the dataframe
    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    
    # Log statistics
    success_count = sum(1 for r in results.values() if r['jurisdictions'] is not None)
    logger.info(f"Successfully processed {success_count} out of {len(patents)} patents ({success_count/len(patents)*100:.1f}%)")
    
    return result_df

def auto_recover_process(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    """Process with automatic recovery for failures"""
    # First attempt with high parallelism
    logger.info("Starting high-speed processing run")
    result_df = process_dataframe_parallel(df, patent_col, max_workers=MAX_WORKERS)
    
    # Check for patents that failed and retry them with more conservative settings
    failed_patents = result_df[result_df['family_jurisdictions'].isna()]
    if not failed_patents.empty:
        failed_count = len(failed_patents)
        logger.info(f"First pass completed with {failed_count} failures. Running recovery pass...")
        
        # Create a new dataframe with just the failed patents
        recovery_df = failed_patents.copy()
        
        # Process with more conservative settings
        recovered_df = process_dataframe_parallel(recovery_df, patent_col, max_workers=3)
        
        # Update the original results with any recovered data
        for idx, row in recovered_df.iterrows():
            if row['family_jurisdictions'] is not None:
                result_df.at[idx, 'family_jurisdictions'] = row['family_jurisdictions']
                result_df.at[idx, 'family_members'] = row['family_members']
        
        # Log recovery statistics
        final_failed = len(result_df[result_df['family_jurisdictions'].isna()])
        logger.info(f"Recovery complete. Recovered {failed_count - final_failed} patents. Final failure count: {final_failed}")
    
    return result_df

# Example usage:
if __name__ == "__main__":
    # df = pd.read_csv('your_patents.csv')
    
    # Option 1: Fast but may have failures
    # processed_df = process_dataframe_parallel(df, 'first publication number')
    
    # Option 2: Fast with automatic recovery of failures
    processed_df = auto_recover_process(df, 'first publication number')
    
    processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

2025-04-21 19:19:31,257 - INFO - Starting high-speed processing run
2025-04-21 19:19:31,259 - INFO - Starting to process 500 patents with 10 workers
2025-04-21 19:19:31,260 - INFO - Getting new access token
2025-04-21 19:19:31,633 - INFO - New token acquired
2025-04-21 19:19:32,746 - INFO - Processed 20/500 patents - 100.0% success rate
2025-04-21 19:19:33,316 - INFO - Processed 40/500 patents - 100.0% success rate
2025-04-21 19:19:34,119 - INFO - Processed 60/500 patents - 100.0% success rate
2025-04-21 19:19:34,754 - INFO - Processed 80/500 patents - 100.0% success rate
2025-04-21 19:19:35,344 - INFO - Processed 100/500 patents - 100.0% success rate
2025-04-21 19:19:35,993 - INFO - Processed 120/500 patents - 100.0% success rate
2025-04-21 19:19:36,869 - INFO - Processed 140/500 patents - 100.0% success rate
2025-04-21 19:19:39,489 - INFO - Processed 160/500 patents - 100.0% success rate
2025-04-21 19:19:42,544 - INFO - Processed 180/500 patents - 100.0% success rate
2025-04-21 19:19

In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

In [20]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
import logging
from functools import lru_cache

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Global token cache
TOKEN = None
TOKEN_EXPIRY = 0
TOKEN_REQUEST_COUNT = 0  # Track requests per token
TOKEN_REQUEST_LIMIT = 150  # Get new token after this many requests

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Fast but not too fast settings
MAX_WORKERS = 15  # High parallelism
TIMEOUT = 20      # Reasonable timeout
MAX_RETRIES = 3   # Quick retries

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token(force_refresh=False) -> str:
    global TOKEN, TOKEN_EXPIRY, TOKEN_REQUEST_COUNT
    current_time = time.time()
    
    # Refresh token if forced, expired, or reached request limit
    if force_refresh or TOKEN_REQUEST_COUNT >= TOKEN_REQUEST_LIMIT or not TOKEN or current_time >= TOKEN_EXPIRY:
        logger.info(f"Getting new access token (request count: {TOKEN_REQUEST_COUNT})")
        data = {
            "grant_type": "client_credentials",
            "client_id": CONSUMER_KEY,
            "client_secret": CONSUMER_SECRET
        }
        headers = {"Content-Type": "application/x-www-form-urlencoded"}
        
        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                response.raise_for_status()
                TOKEN = response.json()["access_token"]
                TOKEN_EXPIRY = current_time + 3500  # Slightly less than 1 hour
                TOKEN_REQUEST_COUNT = 0  # Reset counter
                logger.info("New token acquired")
                return TOKEN
            except Exception as e:
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                logger.warning(f"Failed to get token (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
                time.sleep(wait_time)
        
        logger.error("Failed to get access token after multiple attempts")
        raise Exception("Failed to get access token")
    else:
        return TOKEN

def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

def process_patent_internal(patent: str) -> dict:
    """Internal function that actually processes the patent"""
    global TOKEN_REQUEST_COUNT
    
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}
    
    for attempt in range(MAX_RETRIES):
        try:
            # Get token and increment request counter
            token = get_access_token()
            TOKEN_REQUEST_COUNT += 1
            
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            
            response = requests.get(url, headers=headers, timeout=TIMEOUT)
            
            if response.status_code == 401:  # Unauthorized - token expired
                logger.warning("Token expired, getting a new one")
                get_access_token(force_refresh=True)
                continue
                
            if response.status_code == 403:
                # Fast retry for rate limiting
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                logger.debug(f"Rate limited for patent {patent}, backing off for {wait_time:.2f}s")
                
                # If we get rate limited, try rotating the token
                if attempt > 0:
                    get_access_token(force_refresh=True)
                    
                time.sleep(wait_time)
                continue
                
            if response.status_code == 404:
                logger.debug(f"No data found for patent {patent}")
                return {'jurisdictions': None, 'family_members': None}
                
            response.raise_for_status()
            result = extract_jurisdictions_and_members(response.json())
            return result
            
        except Exception as e:
            wait_time = 0.5 * (2 ** attempt) + random.uniform(0, 0.5)
            if attempt < MAX_RETRIES - 1:  # Only log if we'll retry
                logger.debug(f"Error processing patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {str(e)}. Retrying in {wait_time:.2f}s")
                time.sleep(wait_time)
    
    logger.warning(f"Failed to process patent {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

# Use a smaller cache size to prevent memory issues with the modified function
@lru_cache(maxsize=500)
def process_patent_with_cache(patent: str) -> dict:
    """Cached version of process_patent to avoid redundant API calls"""
    return process_patent_internal(patent)

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    country = country.get('$') if isinstance(country, dict) else country
                    doc_number = doc.get('doc-number')
                    doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                    kind = doc.get('kind')
                    kind = kind.get('$') if isinstance(kind, dict) else kind
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.debug(f"Error extracting jurisdictions and members: {str(e)}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    """Wrapper function to call the cached version"""
    # Skip None/NaN values
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    """Process patents in parallel with optimized speed and reliability"""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Make a copy to avoid modifying the original
    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    
    logger.info(f"Starting to process {len(patents)} patents with {max_workers} workers")
    
    # Get a fresh token before starting
    get_access_token(force_refresh=True)
    
    # Use ThreadPoolExecutor for I/O-bound API calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_patent = {}
        for p in patents:
            future_to_patent[executor.submit(process_patent, p)] = p
            time.sleep(0.01)  # Small delay between submissions to avoid overwhelming the API
        
        # Track progress
        completed = 0
        successful = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                result = future.result()
                results[patent] = result
                completed += 1
                
                # Count successful results
                if result['jurisdictions'] is not None:
                    successful += 1
                
                # Log progress at intervals
                if completed % 20 == 0:
                    success_rate = (successful / completed) * 100
                    logger.info(f"Processed {completed}/{len(patents)} patents - {success_rate:.1f}% success rate")
                    
                    # Force token refresh every chunk_size patents
                    if completed % TOKEN_REQUEST_LIMIT == 0:
                        logger.info(f"Reached {completed} patents, forcing token refresh")
                        get_access_token(force_refresh=True)
                
            except Exception as e:
                logger.error(f"Unhandled exception for patent {patent}: {str(e)}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    logger.info(f"Completed processing {len(patents)} patents")
    
    # Apply results to the dataframe
    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    
    # Log statistics
    success_count = sum(1 for r in results.values() if r['jurisdictions'] is not None)
    logger.info(f"Successfully processed {success_count} out of {len(patents)} patents ({success_count/len(patents)*100:.1f}%)")
    
    return result_df

def process_in_batches(df: pd.DataFrame, patent_col: str, batch_size: int = 200) -> pd.DataFrame:
    """Process the dataframe in smaller batches with fresh tokens for each batch"""
    total_rows = len(df)
    result_df = df.copy()
    
    logger.info(f"Processing {total_rows} patents in batches of {batch_size}")
    
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df.iloc[start_idx:end_idx].copy()
        
        logger.info(f"Processing batch {start_idx//batch_size + 1} (rows {start_idx} to {end_idx-1})")
        
        # Force new token for each batch
        get_access_token(force_refresh=True)
        
        # Process this batch
        processed_batch = process_dataframe_parallel(batch_df, patent_col)
        
        # Update results in the main dataframe
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_jurisdictions')] = processed_batch['family_jurisdictions']
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_members')] = processed_batch['family_members']
        
        # Add a pause between batches to avoid rate limiting
        if end_idx < total_rows:
            pause_time = 5
            logger.info(f"Batch complete. Pausing for {pause_time} seconds before next batch.")
            time.sleep(pause_time)
    
    return result_df

def auto_recover_process(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    """Process with automatic recovery for failures"""
    # First process in batches with token rotation
    logger.info("Starting batch processing with token rotation")
    result_df = process_in_batches(df, patent_col, batch_size=200)
    
    # Check for patents that failed and retry them with more conservative settings
    failed_patents = result_df[result_df['family_jurisdictions'].isna()]
    if not failed_patents.empty:
        failed_count = len(failed_patents)
        logger.info(f"First pass completed with {failed_count} failures. Running recovery pass...")
        
        # Create a new dataframe with just the failed patents
        recovery_df = failed_patents.copy()
        
        # Force new token for recovery
        get_access_token(force_refresh=True)
        
        # Process with more conservative settings
        recovered_df = process_dataframe_parallel(recovery_df, patent_col, max_workers=3)
        
        # Update the original results with any recovered data
        for idx, row in recovered_df.iterrows():
            if row['family_jurisdictions'] is not None:
                result_df.at[idx, 'family_jurisdictions'] = row['family_jurisdictions']
                result_df.at[idx, 'family_members'] = row['family_members']
        
        # Log recovery statistics
        final_failed = len(result_df[result_df['family_jurisdictions'].isna()])
        logger.info(f"Recovery complete. Recovered {failed_count - final_failed} patents. Final failure count: {final_failed}")
    
    return result_df

# Example usage:
if __name__ == "__main__":
    # df = pd.read_csv('your_patents.csv')
    
    # Initialize result columns
    if 'family_jurisdictions' not in df.columns:
        df['family_jurisdictions'] = None
    if 'family_members' not in df.columns:
        df['family_members'] = None
    
    # Process in batches with token rotation and recovery
    processed_df = auto_recover_process(df, 'first publication number')
    
    # Save results to avoid reprocessing
    processed_df.to_csv('processed_patents.csv', index=False)
    
    processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

2025-04-22 09:56:42,642 - INFO - Starting batch processing with token rotation
2025-04-22 09:56:42,643 - INFO - Processing 500 patents in batches of 200
2025-04-22 09:56:42,644 - INFO - Processing batch 1 (rows 0 to 199)
2025-04-22 09:56:42,645 - INFO - Getting new access token (request count: 0)
2025-04-22 09:56:43,369 - INFO - New token acquired
2025-04-22 09:56:43,370 - INFO - Starting to process 200 patents with 15 workers
2025-04-22 09:56:43,370 - INFO - Getting new access token (request count: 0)
2025-04-22 09:56:43,862 - INFO - New token acquired
2025-04-22 09:56:45,955 - INFO - Processed 20/200 patents - 100.0% success rate
2025-04-22 09:56:46,285 - INFO - Processed 40/200 patents - 100.0% success rate
2025-04-22 09:56:47,191 - INFO - Processed 60/200 patents - 100.0% success rate
2025-04-22 09:56:49,003 - INFO - Processed 80/200 patents - 100.0% success rate
2025-04-22 09:56:50,006 - INFO - Getting new access token (request count: 101)
2025-04-22 09:56:50,729 - INFO - Getting 

In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

In [21]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].head(10)

Unnamed: 0,first publication number,family_jurisdictions,family_members
0,US10254766B2,"[CA, GB, MX, US, WO]","[CA3042744A1, GB201907387D0, GB2570843A, MX201..."
1,US6394231B1,"[AT, CA, DE, EP, JP, US]","[ATE271512T1, CA2306359A1, CA2306359C, DE50007..."
2,US10196117B2,"[US, WO]","[US10106233B2, US10196117B2, US2017081002A1, U..."
3,US2017059336A1,"[CN, TW, US]","[CN106485340A, TW201708996A, TWI611279B, US201..."
4,US2024242599A1,[US],[US2024242599A1]
5,US11836985B2,[US],"[US10755111B2, US11836985B2, US2019236379A1, U..."
6,US12056529B2,"[US, WO]","[US12056529B2, US2023088692A1, WO2023045493A1]"
7,US10216190B2,[US],"[US10216190B2, US10409285B2, US2018081360A1, U..."
8,US2020125120A1,[US],"[US10216196B2, US10303182B2, US10303183B2, US1..."
9,EP3605488A1,"[CN, DK, EP, US]","[CN110850866A, DK201870686A1, EP3605488A1, EP3..."


rotating tokens

In [22]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
import logging
from functools import lru_cache

# Set up logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load credentials from .env file
load_dotenv()
CREDENTIALS = [
    {
        "CONSUMER_KEY": os.getenv("CONSUMER_KEY_1").strip(),
        "CONSUMER_SECRET": os.getenv("CONSUMER_SECRET_1").strip()
    },
    {
        "CONSUMER_KEY": os.getenv("CONSUMER_KEY_2").strip(),
        "CONSUMER_SECRET": os.getenv("CONSUMER_SECRET_2").strip()
    }
]

# Token cache
TOKENS = [None] * len(CREDENTIALS)
TOKEN_EXPIRIES = [0] * len(CREDENTIALS)
TOKEN_COUNTS = [0] * len(CREDENTIALS)
CURRENT_INDEX = 0
TOKEN_REQUEST_LIMIT = 150  # Optional: rotate early

# API Endpoints and constants
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"
MAX_WORKERS = 15
TIMEOUT = 20
MAX_RETRIES = 3

# Token manager
def get_rotating_access_token(force_refresh=False):
    global CURRENT_INDEX, TOKENS, TOKEN_EXPIRIES, TOKEN_COUNTS
    num_keys = len(CREDENTIALS)
    now = time.time()

    for i in range(num_keys):
        idx = (CURRENT_INDEX + i) % num_keys
        cred = CREDENTIALS[idx]

        if (force_refresh or not TOKENS[idx] or 
            TOKEN_COUNTS[idx] >= TOKEN_REQUEST_LIMIT or 
            now >= TOKEN_EXPIRIES[idx]):
            
            logger.info(f"Fetching new token using key index {idx}")
            data = {
                "grant_type": "client_credentials",
                "client_id": cred["CONSUMER_KEY"],
                "client_secret": cred["CONSUMER_SECRET"]
            }
            headers = {"Content-Type": "application/x-www-form-urlencoded"}

            for attempt in range(MAX_RETRIES):
                try:
                    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                    response.raise_for_status()
                    TOKENS[idx] = response.json()["access_token"]
                    TOKEN_EXPIRIES[idx] = now + 3500
                    TOKEN_COUNTS[idx] = 0
                    CURRENT_INDEX = (idx + 1) % num_keys
                    return TOKENS[idx]
                except Exception as e:
                    wait = (2 ** attempt) + random.random()
                    logger.warning(f"Token fetch failed (key {idx}, attempt {attempt+1}): {e}")
                    time.sleep(wait)
        else:
            TOKEN_COUNTS[idx] += 1
            CURRENT_INDEX = (idx + 1) % num_keys
            return TOKENS[idx]

    raise Exception("All credential sets failed to provide a valid token.")

# Patent processing functions
def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

def process_patent_internal(patent: str) -> dict:
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}

    for attempt in range(MAX_RETRIES):
        try:
            token = get_rotating_access_token()
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            response = requests.get(url, headers=headers, timeout=TIMEOUT)

            if response.status_code == 401:
                logger.warning("Token expired. Refreshing...")
                get_rotating_access_token(force_refresh=True)
                continue

            if response.status_code == 403:
                wait_time = (2 ** attempt) + random.random()
                logger.warning(f"Rate limit hit for patent {patent}, backing off for {wait_time:.2f}s")
                time.sleep(wait_time)
                continue

            if response.status_code == 404:
                logger.debug(f"No data found for {patent}")
                return {'jurisdictions': None, 'family_members': None}

            response.raise_for_status()
            return extract_jurisdictions_and_members(response.json())

        except Exception as e:
            wait_time = 0.5 * (2 ** attempt) + random.random()
            logger.debug(f"Error processing patent {patent} (attempt {attempt+1}/{MAX_RETRIES}): {e}. Retrying in {wait_time:.2f}s")
            time.sleep(wait_time)

    logger.warning(f"Failed to process {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

@lru_cache(maxsize=500)
def process_patent_with_cache(patent: str) -> dict:
    return process_patent_internal(patent)

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    doc_number = doc.get('doc-number')
                    kind = doc.get('kind')
                    if isinstance(country, dict): country = country.get('$')
                    if isinstance(doc_number, dict): doc_number = doc_number.get('$')
                    if isinstance(kind, dict): kind = kind.get('$')
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.debug(f"Error extracting family info: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    logger.info(f"Processing {len(patents)} patents with {max_workers} workers")
    get_rotating_access_token(force_refresh=True)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_patent = {executor.submit(process_patent, p): p for p in patents}
        completed = 0
        successful = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                result = future.result()
                results[patent] = result
                completed += 1
                if result['jurisdictions']:
                    successful += 1
                if completed % 20 == 0:
                    logger.info(f"Processed {completed}/{len(patents)} patents - {successful/completed*100:.1f}% success")
            except Exception as e:
                logger.error(f"Unhandled error for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}

    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    return result_df

def process_in_batches(df: pd.DataFrame, patent_col: str, batch_size: int = 200) -> pd.DataFrame:
    total_rows = len(df)
    result_df = df.copy()
    logger.info(f"Processing {total_rows} patents in batches of {batch_size}")
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df.iloc[start_idx:end_idx].copy()
        logger.info(f"Processing batch {start_idx//batch_size + 1}: rows {start_idx}–{end_idx-1}")
        get_rotating_access_token(force_refresh=True)
        processed_batch = process_dataframe_parallel(batch_df, patent_col)
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_jurisdictions')] = processed_batch['family_jurisdictions']
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_members')] = processed_batch['family_members']
        if end_idx < total_rows:
            logger.info("Pausing before next batch...")
            time.sleep(5)
    return result_df

def auto_recover_process(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    logger.info("Starting primary batch processing...")
    result_df = process_in_batches(df, patent_col)
    failed = result_df[result_df['family_jurisdictions'].isna()]
    if not failed.empty:
        logger.info(f"Recovery pass for {len(failed)} failed patents...")
        recovery_df = failed.copy()
        get_rotating_access_token(force_refresh=True)
        recovered_df = process_dataframe_parallel(recovery_df, patent_col, max_workers=3)
        for idx, row in recovered_df.iterrows():
            if row['family_jurisdictions']:
                result_df.at[idx, 'family_jurisdictions'] = row['family_jurisdictions']
                result_df.at[idx, 'family_members'] = row['family_members']
    return result_df

# Main entry
if __name__ == "__main__":
    # Example:
    # df = pd.read_csv("your_patents.csv")
    
    if 'family_jurisdictions' not in df.columns:
        df['family_jurisdictions'] = None
    if 'family_members' not in df.columns:
        df['family_members'] = None

    processed_df = auto_recover_process(df, 'first publication number')
    #processed_df.to_csv("processed_patents.csv", index=False)


2025-04-22 10:15:48,165 - INFO - Starting primary batch processing...
2025-04-22 10:15:48,167 - INFO - Processing 500 patents in batches of 200
2025-04-22 10:15:48,168 - INFO - Processing batch 1: rows 0–199
2025-04-22 10:15:48,168 - INFO - Fetching new token using key index 0
2025-04-22 10:15:48,527 - INFO - Processing 200 patents with 15 workers
2025-04-22 10:15:48,528 - INFO - Fetching new token using key index 1
2025-04-22 10:15:49,860 - INFO - Processed 20/200 patents - 100.0% success
2025-04-22 10:15:50,276 - INFO - Processed 40/200 patents - 100.0% success
2025-04-22 10:15:50,932 - INFO - Processed 60/200 patents - 100.0% success
2025-04-22 10:15:51,520 - INFO - Processed 80/200 patents - 100.0% success
2025-04-22 10:15:52,130 - INFO - Processed 100/200 patents - 100.0% success
2025-04-22 10:15:52,742 - INFO - Processed 120/200 patents - 100.0% success
2025-04-22 10:15:53,570 - INFO - Processed 140/200 patents - 100.0% success
2025-04-22 10:15:54,315 - INFO - Processed 160/200 p

In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

rotating 3 keys

In [17]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
import logging
from functools import lru_cache

# Logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load credentials
load_dotenv()
CREDENTIALS = [
    {
        "CONSUMER_KEY": os.getenv("CONSUMER_KEY_1").strip(),
        "CONSUMER_SECRET": os.getenv("CONSUMER_SECRET_1").strip()
    },
    {
        "CONSUMER_KEY": os.getenv("CONSUMER_KEY_2").strip(),
        "CONSUMER_SECRET": os.getenv("CONSUMER_SECRET_2").strip()
    },
    {
        "CONSUMER_KEY": os.getenv("CONSUMER_KEY_3").strip(),
        "CONSUMER_SECRET": os.getenv("CONSUMER_SECRET_3").strip()
    }
]

# Constants
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"
MAX_WORKERS = 15
TIMEOUT = 20
MAX_RETRIES = 3
TOKEN_REQUEST_LIMIT = 150

# Token state
TOKENS = [None] * len(CREDENTIALS)
TOKEN_EXPIRIES = [0] * len(CREDENTIALS)
TOKEN_COUNTS = [0] * len(CREDENTIALS)
CURRENT_INDEX = 0

# Rotate tokens
def get_rotating_access_token(force_refresh=False):
    global CURRENT_INDEX, TOKENS, TOKEN_EXPIRIES, TOKEN_COUNTS
    now = time.time()

    for i in range(len(CREDENTIALS)):
        idx = (CURRENT_INDEX + i) % len(CREDENTIALS)
        cred = CREDENTIALS[idx]

        if (force_refresh or not TOKENS[idx] or 
            TOKEN_COUNTS[idx] >= TOKEN_REQUEST_LIMIT or 
            now >= TOKEN_EXPIRIES[idx]):

            logger.info(f"Fetching new token using key index {idx}")
            data = {
                "grant_type": "client_credentials",
                "client_id": cred["CONSUMER_KEY"],
                "client_secret": cred["CONSUMER_SECRET"]
            }
            headers = {"Content-Type": "application/x-www-form-urlencoded"}

            for attempt in range(MAX_RETRIES):
                try:
                    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                    response.raise_for_status()
                    TOKENS[idx] = response.json()["access_token"]
                    TOKEN_EXPIRIES[idx] = now + 3500
                    TOKEN_COUNTS[idx] = 0
                    CURRENT_INDEX = (idx + 1) % len(CREDENTIALS)
                    return TOKENS[idx]
                except Exception as e:
                    wait = (2 ** attempt) + random.random()
                    logger.warning(f"Token fetch failed (key {idx}, attempt {attempt+1}): {e}")
                    time.sleep(wait)
        else:
            TOKEN_COUNTS[idx] += 1
            CURRENT_INDEX = (idx + 1) % len(CREDENTIALS)
            return TOKENS[idx]

    raise Exception("All credential sets failed to provide a valid token.")

def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

def process_patent_internal(patent: str) -> dict:
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}

    for attempt in range(MAX_RETRIES):
        try:
            token = get_rotating_access_token()
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            response = requests.get(url, headers=headers, timeout=TIMEOUT)

            if response.status_code == 401:
                logger.warning("Token expired. Refreshing...")
                get_rotating_access_token(force_refresh=True)
                continue

            if response.status_code == 403:
                wait_time = (5 * (2 ** attempt)) + random.uniform(1, 2)
                logger.warning(f"Rate limit hit for patent {patent}, waiting {wait_time:.1f}s before retry")
                time.sleep(wait_time)
                continue

            if response.status_code == 404:
                logger.debug(f"No data found for {patent}")
                return {'jurisdictions': None, 'family_members': None}

            response.raise_for_status()
            return extract_jurisdictions_and_members(response.json())

        except Exception as e:
            wait_time = 0.5 * (2 ** attempt) + random.random()
            logger.debug(f"Error processing patent {patent} (attempt {attempt+1}): {e}. Retrying in {wait_time:.2f}s")
            time.sleep(wait_time)

    logger.warning(f"Failed to process {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

@lru_cache(maxsize=500)
def process_patent_with_cache(patent: str) -> dict:
    return process_patent_internal(patent)

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        world_data = data.get('ops:world-patent-data', {})
        patent_family = world_data.get('ops:patent-family', {})
        members = patent_family.get('ops:family-member', []) or []
        if isinstance(members, dict):
            members = [members]

        for member in members:
            pub_ref = member.get('publication-reference', {})
            docs = pub_ref.get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country')
                    doc_number = doc.get('doc-number')
                    kind = doc.get('kind')
                    if isinstance(country, dict): country = country.get('$')
                    if isinstance(doc_number, dict): doc_number = doc_number.get('$')
                    if isinstance(kind, dict): kind = kind.get('$')
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.debug(f"Error extracting family info: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")

    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    logger.info(f"Processing {len(patents)} patents with {max_workers} workers")
    get_rotating_access_token(force_refresh=True)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_patent = {}
        for p in patents:
            future_to_patent[executor.submit(process_patent, p)] = p
            if max_workers <= 5:
                time.sleep(1)  # Throttle more in recovery mode

        completed = 0
        successful = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                result = future.result()
                results[patent] = result
                completed += 1
                if result['jurisdictions']:
                    successful += 1
                if completed % 20 == 0:
                    logger.info(f"Processed {completed}/{len(patents)} patents - {successful/completed*100:.1f}% success")
            except Exception as e:
                logger.error(f"Unhandled error for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}

    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    return result_df

def process_in_batches(df: pd.DataFrame, patent_col: str, batch_size: int = 200) -> pd.DataFrame:
    total_rows = len(df)
    result_df = df.copy()
    logger.info(f"Processing {total_rows} patents in batches of {batch_size}")
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df.iloc[start_idx:end_idx].copy()
        logger.info(f"Processing batch {start_idx//batch_size + 1}: rows {start_idx}–{end_idx-1}")
        get_rotating_access_token(force_refresh=True)
        processed_batch = process_dataframe_parallel(batch_df, patent_col)
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_jurisdictions')] = processed_batch['family_jurisdictions']
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_members')] = processed_batch['family_members']
        if end_idx < total_rows:
            logger.info("Pausing before next batch...")
            time.sleep(5)
    return result_df

def auto_recover_process(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    logger.info("Starting primary batch processing...")
    result_df = process_in_batches(df, patent_col)

    failed = result_df[result_df['family_jurisdictions'].isna()]
    if not failed.empty:
        logger.info(f"Recovery pass for {len(failed)} failed patents...")

        failed_patents = failed.copy()
        for i in range(0, len(failed_patents), 20):
            mini_df = failed_patents.iloc[i:i+20].copy()
            get_rotating_access_token(force_refresh=True)
            recovered = process_dataframe_parallel(mini_df, patent_col, max_workers=3)
            for idx, row in recovered.iterrows():
                if row['family_jurisdictions']:
                    result_df.at[idx, 'family_jurisdictions'] = row['family_jurisdictions']
                    result_df.at[idx, 'family_members'] = row['family_members']
            logger.info(f"Processed recovery batch {i}-{min(i+20, len(failed_patents))}")
            time.sleep(10)

    return result_df

# Main runner
if __name__ == "__main__":
    # df = pd.read_csv("your_patents.csv")

    if 'family_jurisdictions' not in df.columns:
        df['family_jurisdictions'] = None
    if 'family_members' not in df.columns:
        df['family_members'] = None

    processed_df = auto_recover_process(df, 'first publication number')
    #processed_df.to_csv("processed_patents.csv", index=False)


2025-05-03 18:32:47,204 - INFO - Starting primary batch processing...
2025-05-03 18:32:47,205 - INFO - Processing 500 patents in batches of 200
2025-05-03 18:32:47,206 - INFO - Processing batch 1: rows 0–199
2025-05-03 18:32:47,207 - INFO - Fetching new token using key index 0
2025-05-03 18:32:47,672 - INFO - Processing 200 patents with 15 workers
2025-05-03 18:32:47,673 - INFO - Fetching new token using key index 1
2025-05-03 18:32:48,008 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,010 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,011 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,012 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,015 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,016 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,017 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,019 - INFO - Fetching new token using key index 2
2025-05-03 18:32:48,

KeyboardInterrupt: 

In [31]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

first publication number     0
family_jurisdictions        43
family_members              43
dtype: int64

improved token rotation

In [None]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
import random
import logging
import threading
from functools import lru_cache

# Logging Configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Load Environment Variables
load_dotenv()
CREDENTIALS = [
    {
        "key": os.getenv("CONSUMER_KEY_1").strip(),
        "secret": os.getenv("CONSUMER_SECRET_1").strip()
    },
    {
        "key": os.getenv("CONSUMER_KEY_2").strip(),
        "secret": os.getenv("CONSUMER_SECRET_2").strip()
    },
    {
        "key": os.getenv("CONSUMER_KEY_3").strip(),
        "secret": os.getenv("CONSUMER_SECRET_3").strip()
    }
]

# API Configuration
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"
MAX_WORKERS = 15
TIMEOUT = 20
MAX_RETRIES = 3
TOKEN_REQUEST_LIMIT = 150

# Token Management State
TOKEN_LOCK = threading.Lock()
TOKEN_LOCKS = [threading.Lock() for _ in CREDENTIALS]
TOKENS = [None] * len(CREDENTIALS)
TOKEN_EXPIRIES = [0] * len(CREDENTIALS)
TOKEN_COUNTS = [0] * len(CREDENTIALS)
CURRENT_INDEX = 0

def get_rotating_access_token():
    """Thread-safe token rotation with automatic refresh"""
    global CURRENT_INDEX, TOKENS, TOKEN_EXPIRIES, TOKEN_COUNTS
    now = time.time()

    with TOKEN_LOCK:
        # Check for existing valid tokens
        for i in range(len(CREDENTIALS)):
            idx = (CURRENT_INDEX + i) % len(CREDENTIALS)
            if (TOKENS[idx] and 
                TOKEN_COUNTS[idx] < TOKEN_REQUEST_LIMIT and 
                now < TOKEN_EXPIRIES[idx]):
                
                TOKEN_COUNTS[idx] += 1
                CURRENT_INDEX = (idx + 1) % len(CREDENTIALS)
                return TOKENS[idx]

        # No valid tokens found, select next index
        idx_to_refresh = CURRENT_INDEX
        CURRENT_INDEX = (idx_to_refresh + 1) % len(CREDENTIALS)

    # Refresh token with per-credential lock
    with TOKEN_LOCKS[idx_to_refresh]:
        # Double-check inside lock
        with TOKEN_LOCK:
            if (TOKENS[idx_to_refresh] and 
                TOKEN_COUNTS[idx_to_refresh] < TOKEN_REQUEST_LIMIT and 
                now < TOKEN_EXPIRIES[idx_to_refresh]):
                
                TOKEN_COUNTS[idx_to_refresh] += 1
                return TOKENS[idx_to_refresh]

        # Fetch new token
        cred = CREDENTIALS[idx_to_refresh]
        data = {
            "grant_type": "client_credentials",
            "client_id": cred["key"],
            "client_secret": cred["secret"]
        }
        headers = {"Content-Type": "application/x-www-form-urlencoded"}

        for attempt in range(MAX_RETRIES):
            try:
                response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                response.raise_for_status()
                new_token = response.json()["access_token"]
                expiry = now + 3500  # 58.3 minutes

                with TOKEN_LOCK:
                    TOKENS[idx_to_refresh] = new_token
                    TOKEN_EXPIRIES[idx_to_refresh] = expiry
                    TOKEN_COUNTS[idx_to_refresh] = 1

                logger.info(f"Refreshed token for index {idx_to_refresh}")
                return new_token
            except Exception as e:
                wait = (2 ** attempt) + random.random()
                logger.warning(f"Token fetch failed (index {idx_to_refresh}, attempt {attempt+1}): {e}")
                time.sleep(wait)
        
        raise Exception(f"Failed to fetch token for index {idx_to_refresh} after {MAX_RETRIES} attempts")

def force_token_refresh():
    """Force refresh all tokens on next request"""
    with TOKEN_LOCK:
        TOKEN_EXPIRIES[:] = [0] * len(CREDENTIALS)
        logger.info("Forced token refresh queued")

# Patent Processing Functions
def validate_patent_number(patent: str) -> bool:
    return bool(patent and isinstance(patent, str) and len(patent.strip()) >= 4)

def process_patent_internal(patent: str) -> dict:
    if not validate_patent_number(patent):
        logger.warning(f"Invalid patent number: '{patent}'")
        return {'jurisdictions': None, 'family_members': None}

    for attempt in range(MAX_RETRIES):
        try:
            token = get_rotating_access_token()
            url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
            headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
            response = requests.get(url, headers=headers, timeout=TIMEOUT)

            if response.status_code == 401:
                logger.warning("Token expired detected")
                force_token_refresh()
                continue

            if response.status_code == 403:
                wait_time = (5 * (2 ** attempt)) + random.uniform(1, 2)
                logger.warning(f"Rate limit hit for {patent}, waiting {wait_time:.1f}s")
                time.sleep(wait_time)
                continue

            if response.status_code == 404:
                logger.debug(f"No data found for {patent}")
                return {'jurisdictions': None, 'family_members': None}

            response.raise_for_status()
            return extract_jurisdictions_and_members(response.json())

        except Exception as e:
            wait_time = 0.5 * (2 ** attempt) + random.random()
            logger.debug(f"Retry {patent} (attempt {attempt+1}): {e}")
            time.sleep(wait_time)

    logger.warning(f"Failed {patent} after {MAX_RETRIES} attempts")
    return {'jurisdictions': None, 'family_members': None}

@lru_cache(maxsize=500)
def process_patent_with_cache(patent: str) -> dict:
    return process_patent_internal(patent)

def extract_jurisdictions_and_members(data: dict) -> dict:
    try:
        jurisdictions = set()
        family_members = []
        members = data.get('ops:world-patent-data', {}).get('ops:patent-family', {}).get('ops:family-member', []) or []
        
        if isinstance(members, dict):
            members = [members]

        for member in members:
            docs = member.get('publication-reference', {}).get('document-id', []) or []
            if isinstance(docs, dict):
                docs = [docs]
                
            for doc in docs:
                if doc.get('@document-id-type') == 'docdb':
                    country = doc.get('country', {}).get('$') if isinstance(doc.get('country'), dict) else doc.get('country')
                    doc_number = doc.get('doc-number', {}).get('$') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number')
                    kind = doc.get('kind', {}).get('$') if isinstance(doc.get('kind'), dict) else doc.get('kind')
                    
                    if country and doc_number and kind:
                        jurisdictions.add(country)
                        family_members.append(f"{country}{doc_number}{kind}")
                        
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
    except Exception as e:
        logger.debug(f"Extraction error: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent(patent: str) -> dict:
    if pd.isna(patent) or patent is None:
        return {'jurisdictions': None, 'family_members': None}
    return process_patent_with_cache(patent)

# Batch Processing Functions
def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found")

    result_df = df.copy()
    patents = df[patent_col].tolist()
    results = {}
    logger.info(f"Processing {len(patents)} patents with {max_workers} workers")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_patent = {executor.submit(process_patent, p): p for p in patents}
        
        completed = 0
        successful = 0
        for future in concurrent.futures.as_completed(future_to_patent):
            patent = future_to_patent[future]
            try:
                result = future.result()
                results[patent] = result
                completed += 1
                if result['jurisdictions']:
                    successful += 1
                if completed % 20 == 0:
                    logger.info(f"Progress: {completed}/{len(patents)} ({successful/completed*100:.1f}% success)")
            except Exception as e:
                logger.error(f"Error for {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}

    result_df['family_jurisdictions'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    result_df['family_members'] = result_df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    return result_df

def process_in_batches(df: pd.DataFrame, patent_col: str, batch_size: int = 200) -> pd.DataFrame:
    total_rows = len(df)
    result_df = df.copy()
    logger.info(f"Processing {total_rows} patents in batches of {batch_size}")
    
    for start_idx in range(0, total_rows, batch_size):
        end_idx = min(start_idx + batch_size, total_rows)
        batch_df = df.iloc[start_idx:end_idx].copy()
        logger.info(f"Processing batch {start_idx//batch_size + 1}: rows {start_idx}–{end_idx-1}")
        
        # Force refresh before each batch
        force_token_refresh()
        processed_batch = process_dataframe_parallel(batch_df, patent_col)
        
        # Update results
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_jurisdictions')] = processed_batch['family_jurisdictions']
        result_df.iloc[start_idx:end_idx, result_df.columns.get_loc('family_members')] = processed_batch['family_members']
        
        if end_idx < total_rows:
            logger.info("Pausing before next batch...")
            time.sleep(5)
            
    return result_df

def auto_recover_process(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    logger.info("Starting primary batch processing...")
    result_df = process_in_batches(df, patent_col)

    failed = result_df[result_df['family_jurisdictions'].isna()]
    if not failed.empty:
        logger.info(f"Recovery pass for {len(failed)} failed patents...")
        failed_patents = failed.copy()
        
        for i in range(0, len(failed_patents), 20):
            mini_df = failed_patents.iloc[i:i+20].copy()
            force_token_refresh()
            recovered = process_dataframe_parallel(mini_df, patent_col, max_workers=3)
            
            for idx, row in recovered.iterrows():
                if row['family_jurisdictions']:
                    result_df.at[idx, 'family_jurisdictions'] = row['family_jurisdictions']
                    result_df.at[idx, 'family_members'] = row['family_members']
            
            logger.info(f"Processed recovery batch {i}-{min(i+20, len(failed_patents))}")
            time.sleep(10)

    return result_df

# Main Execution
if __name__ == "__main__":
    # Example usage:
    # df = pd.read_csv("your_patents.csv")
     if 'family_jurisdictions' not in df.columns:
         df['family_jurisdictions'] = None
     if 'family_members' not in df.columns:
         df['family_members'] = None
     processed_df = auto_recover_process(df, 'first publication number')
    # processed_df.to_csv("processed_patents.csv", index=False)
     pass

2025-05-03 18:28:26,081 - INFO - Starting primary batch processing...
2025-05-03 18:28:26,082 - INFO - Processing 500 patents in batches of 200
2025-05-03 18:28:26,083 - INFO - Processing batch 1: rows 0–199
2025-05-03 18:28:26,083 - INFO - Forced token refresh queued
2025-05-03 18:28:26,084 - INFO - Processing 200 patents with 15 workers
2025-05-03 18:28:26,343 - INFO - Refreshed token for index 0
2025-05-03 18:28:26,345 - INFO - Refreshed token for index 1
2025-05-03 18:28:46,544 - INFO - Progress: 20/200 (100.0% success)
2025-05-03 18:28:57,634 - INFO - Progress: 40/200 (100.0% success)


In [41]:
# Count the number of None (NaN) values in the specified columns
none_counts = processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

# Display the counts
print(none_counts)

first publication number     0
family_jurisdictions        58
family_members              58
dtype: int64


In [None]:
# Count the number of None (NaN) values in the specified columns
none_counts = processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

# Display the counts
print(none_counts)

parallel extraction for abstracts

In [27]:
#improved token rotation for abstract
import os
import requests
import time
import concurrent.futures
import pandas as pd
from urllib.parse import quote
from dotenv import load_dotenv
import logging
import random
import threading

# === Setup ===
load_dotenv()
CREDENTIALS = [
    {"key": os.getenv("CONSUMER_KEY_1").strip(), "secret": os.getenv("CONSUMER_SECRET_1").strip()},
    {"key": os.getenv("CONSUMER_KEY_2").strip(), "secret": os.getenv("CONSUMER_SECRET_2").strip()},
    {"key": os.getenv("CONSUMER_KEY_3").strip(), "secret": os.getenv("CONSUMER_SECRET_3").strip()}
]

TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"
TIMEOUT = 15
MAX_WORKERS = 10
DELAY_BETWEEN_SUBMITS = 0.1
TOKEN_REQUEST_LIMIT = 150

# Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Token management
TOKENS = [None] * len(CREDENTIALS)
TOKEN_EXPIRIES = [0] * len(CREDENTIALS)
TOKEN_COUNTS = [0] * len(CREDENTIALS)
CURRENT_INDEX = 0
TOKEN_LOCK = threading.Lock()
TOKEN_LOCKS = [threading.Lock() for _ in CREDENTIALS]

def get_rotating_access_token():
    global CURRENT_INDEX, TOKENS, TOKEN_EXPIRIES, TOKEN_COUNTS
    now = time.time()

    # First check for existing valid tokens
    with TOKEN_LOCK:
        for i in range(len(CREDENTIALS)):
            idx = (CURRENT_INDEX + i) % len(CREDENTIALS)
            if (TOKENS[idx] and 
                TOKEN_COUNTS[idx] < TOKEN_REQUEST_LIMIT and 
                now < TOKEN_EXPIRIES[idx]):
                
                TOKEN_COUNTS[idx] += 1
                CURRENT_INDEX = (idx + 1) % len(CREDENTIALS)
                return TOKENS[idx]

        # No valid token found, select next credential to refresh
        idx = CURRENT_INDEX
        CURRENT_INDEX = (idx + 1) % len(CREDENTIALS)

    # Use per-index lock to prevent concurrent refreshes
    with TOKEN_LOCKS[idx]:
        # Double-check after acquiring index lock
        with TOKEN_LOCK:
            if (TOKENS[idx] and 
                TOKEN_COUNTS[idx] < TOKEN_REQUEST_LIMIT and 
                now < TOKEN_EXPIRIES[idx]):
                
                TOKEN_COUNTS[idx] += 1
                return TOKENS[idx]

        # Fetch new token
        cred = CREDENTIALS[idx]
        data = {
            "grant_type": "client_credentials",
            "client_id": cred["key"],
            "client_secret": cred["secret"]
        }
        headers = {"Content-Type": "application/x-www-form-urlencoded"}

        for attempt in range(3):
            try:
                response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=TIMEOUT)
                response.raise_for_status()
                new_token = response.json()["access_token"]
                expiry = time.time() + 3500

                with TOKEN_LOCK:
                    TOKENS[idx] = new_token
                    TOKEN_EXPIRIES[idx] = expiry
                    TOKEN_COUNTS[idx] = 1  # Reset count for new token

                logger.info(f"Successfully refreshed token for index {idx}")
                return new_token
            except Exception as e:
                wait = (2 ** attempt) + random.random()
                logger.warning(f"Token fetch failed (index {idx}, attempt {attempt+1}): {e}")
                time.sleep(wait)
        
        raise Exception(f"Failed to fetch token for index {idx} after 3 attempts")

# === Abstract Extraction (unchanged) ===
def get_abstract_json(publication_number: str, token: str) -> dict:
    url = f"{BASE_URL}/published-data/publication/docdb/{quote(publication_number)}/abstract"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/json"
    }
    response = requests.get(url, headers=headers, timeout=TIMEOUT)
    response.raise_for_status()
    return response.json()

def extract_english_abstract_from_json(json_data: dict) -> str:
    try:
        world_data = json_data.get("ops:world-patent-data", {})
        exch_docs = world_data.get("exchange-documents", {})
        doc = exch_docs.get("exchange-document", {})
        abstract_section = doc.get("abstract", [])
        if isinstance(abstract_section, dict):
            abstract_section = [abstract_section]
        for abstract in abstract_section:
            if abstract.get("@lang") == "en":
                p = abstract.get("p")
                if isinstance(p, dict):
                    return p.get("$", "").strip()
                elif isinstance(p, list):
                    return " ".join(item.get("$", "").strip() for item in p if isinstance(item, dict))
                elif isinstance(p, str):
                    return p.strip()
        return None
    except Exception as e:
        logger.debug(f"Error parsing JSON: {e}")
        return None

def process_patent_abstract(pub_number: str) -> str:
    for attempt in range(3):
        try:
            token = get_rotating_access_token()
            json_data = get_abstract_json(pub_number, token)
            return extract_english_abstract_from_json(json_data)
        except Exception as e:
            wait = 0.5 * (2 ** attempt) + random.random()
            logger.warning(f"Retrying {pub_number} (attempt {attempt+1}): {e}")
            time.sleep(wait)
    return None

# === Main Parallel Processor ===
def add_abstracts_to_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = MAX_WORKERS) -> pd.DataFrame:
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")

    df = df.copy()
    patents = df[patent_col].tolist()
    abstracts = {}

    logger.info(f"Processing {len(patents)} patents with {max_workers} workers...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_patent = {}
        for pub_number in patents:
            future = executor.submit(process_patent_abstract, pub_number)
            future_to_patent[future] = pub_number
            time.sleep(DELAY_BETWEEN_SUBMITS)

        for i, future in enumerate(concurrent.futures.as_completed(future_to_patent)):
            pub_number = future_to_patent[future]
            try:
                result = future.result()
                abstracts[pub_number] = result
                if (i + 1) % 20 == 0:
                    logger.info(f"Processed {i + 1}/{len(patents)} abstracts")
            except Exception as e:
                logger.error(f"Unexpected error for {pub_number}: {e}")
                abstracts[pub_number] = None

    df["abstract"] = df[patent_col].map(lambda pub: abstracts.get(pub))
    return df

# === Main Entry ===
if __name__ == "__main__":
    # Example usage:
    # df = pd.read_csv("your_patents.csv")
    enriched_df = add_abstracts_to_dataframe_parallel(df, "first publication number")
    # enriched_df.to_csv("abstracts_with_rotation.csv", index=False)
    pass

2025-04-22 11:13:22,391 - INFO - Processing 500 patents with 10 workers...
2025-04-22 11:13:22,732 - INFO - Successfully refreshed token for index 1
2025-04-22 11:13:22,738 - INFO - Successfully refreshed token for index 0
2025-04-22 11:13:52,798 - INFO - Successfully refreshed token for index 0
2025-04-22 11:13:52,913 - INFO - Successfully refreshed token for index 1
2025-04-22 11:14:12,561 - INFO - Processed 20/500 abstracts
2025-04-22 11:14:12,561 - INFO - Processed 40/500 abstracts
2025-04-22 11:14:12,562 - INFO - Processed 60/500 abstracts
2025-04-22 11:14:12,562 - INFO - Processed 80/500 abstracts
2025-04-22 11:14:12,563 - INFO - Processed 100/500 abstracts
2025-04-22 11:14:12,563 - INFO - Processed 120/500 abstracts
2025-04-22 11:14:12,563 - INFO - Processed 140/500 abstracts
2025-04-22 11:14:12,564 - INFO - Processed 160/500 abstracts
2025-04-22 11:14:12,564 - INFO - Processed 180/500 abstracts
2025-04-22 11:14:12,564 - INFO - Processed 200/500 abstracts
2025-04-22 11:14:12,564

In [28]:
enriched_df['abstract'].isnull().sum()

np.int64(18)

In [56]:
import concurrent.futures
import os
import requests
import time
import pandas as pd
import numpy as np
from urllib.parse import quote
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Web scraping constants
ESPACENET_BASE_URL = "https://worldwide.espacenet.com/patent/search/family/"

# =========== API PROCESSING FUNCTIONS ===========

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(str(patent).strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """
    Extract jurisdictions and family member publication numbers from API response.
    """
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country')
                country = country.get('$') if isinstance(country, dict) else country
                doc_number = doc.get('doc-number')
                doc_number = doc_number.get('$') if isinstance(doc_number, dict) else doc_number
                kind = doc.get('kind')
                kind = kind.get('$') if isinstance(kind, dict) else kind
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO OPS API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(str(patent))}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        if response.status_code in (403, 404):
            print(f"API: Patent {patent} not found or access forbidden")
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API Error processing patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_api_batch(patents, batch_size=20, delay=1.2):
    """Process a batch of patents using the API with rate limiting."""
    results = {}
    for i in range(0, len(patents), batch_size):
        batch = patents[i:i+batch_size]
        print(f"API: Processing batch {i//batch_size + 1}/{(len(patents)-1)//batch_size + 1}")
        
        for patent in batch:
            results[patent] = process_patent_api(patent)
            time.sleep(delay)  # Rate limiting
    
    return results

# =========== WEB SCRAPING FUNCTIONS ===========

def setup_driver():
    """Setup and return a configured Chrome WebDriver."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920,1080")
    
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def scrape_patent_data(patent, driver):
    """Scrape data for a single patent using Selenium."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    
    try:
        url = f"{ESPACENET_BASE_URL}{quote(str(patent))}"
        driver.get(url)
        
        # Wait for the family members section to load
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, ".table-family-members")))
        
        # Extract family members
        family_members = []
        jurisdictions = set()
        
        # Find all rows in the family members table
        rows = driver.find_elements(By.CSS_SELECTOR, ".table-family-members tbody tr")
        
        for row in rows:
            try:
                # Extract publication number
                pub_cell = row.find_element(By.CSS_SELECTOR, "td:nth-child(1)")
                pub_number = pub_cell.text.strip()
                
                if pub_number:
                    # Extract country code (first two characters usually)
                    country_code = pub_number[:2]
                    jurisdictions.add(country_code)
                    family_members.append(pub_number)
            except NoSuchElementException:
                continue
        
        return {
            'jurisdictions': sorted(jurisdictions),
            'family_members': sorted(set(family_members))
        }
        
    except TimeoutException:
        print(f"Scraper: Timeout for patent {patent}")
        return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraper Error processing patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent_batch_with_scraper(patents, worker_id):
    """Process a batch of patents with a single scraper instance."""
    results = {}
    driver = setup_driver()
    
    try:
        for i, patent in enumerate(patents):
            print(f"Scraper {worker_id}: Processing patent {i+1}/{len(patents)}: {patent}")
            results[patent] = scrape_patent_data(patent, driver)
            # Random delay between requests to avoid detection
            time.sleep(np.random.uniform(2.0, 4.0))
    finally:
        driver.quit()
    
    return results

def scrape_patents_parallel(patents, num_workers=3):
    """Process patents using parallel web scrapers."""
    results = {}
    
    # Handle case where there are fewer patents than workers
    num_workers = min(num_workers, len(patents))
    if num_workers == 0:
        return {}
        
    # Split patents into chunks for each worker
    chunk_size = max(1, len(patents) // num_workers)
    chunks = [patents[i:i+chunk_size] for i in range(0, len(patents), chunk_size)]
    
    # Use ThreadPoolExecutor for parallel scraping
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_chunk = {
            executor.submit(process_patent_batch_with_scraper, chunk, i): i 
            for i, chunk in enumerate(chunks)
        }
        
        for future in concurrent.futures.as_completed(future_to_chunk):
            worker_id = future_to_chunk[future]
            try:
                chunk_results = future.result()
                results.update(chunk_results)
                print(f"Worker {worker_id} completed processing {len(chunk_results)} patents")
            except Exception as e:
                print(f"Worker {worker_id} generated an exception: {e}")
    
    return results

# =========== MAIN PROCESSING FUNCTION ===========

def process_dataframe_combined(df, patent_col, api_fraction=0.5, api_batch_size=20, 
                              api_delay=1.2, scraper_workers=3):
    """
    Process a dataframe using both API and web scraping approaches.
    
    Parameters:
    - df: DataFrame containing patent numbers
    - patent_col: Column name containing patent numbers
    - api_fraction: Fraction of data to process with API (0-1)
    - api_batch_size: Number of patents to process in each API batch
    - api_delay: Delay between API requests in seconds
    - scraper_workers: Number of parallel web scraper instances
    
    Returns:
    - DataFrame with added family_jurisdictions and family_members columns
    """
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Make a copy of the dataframe
    result_df = df.copy()
    
    # Get all patents
    patents = result_df[patent_col].tolist()
    total = len(patents)
    
    # Handle empty dataframe case
    if total == 0:
        return result_df
        
    # Split patents between API and scraping
    api_count = max(1, int(total * api_fraction))
    # Ensure we don't take more than the available patents
    api_count = min(api_count, total)
    api_patents = patents[:api_count]
    scrape_patents = patents[api_count:]
    
    print(f"Processing {len(api_patents)} patents with API and {len(scrape_patents)} with scrapers")
    
    # Process API patents
    api_results = {}
    if api_patents:
        print("\n===== STARTING API PROCESSING =====")
        api_results = process_api_batch(api_patents, batch_size=api_batch_size, delay=api_delay)
        print(f"API processing completed for {len(api_results)} patents")
    
    # Process scraper patents
    scraper_results = {}
    if scrape_patents:
        print("\n===== STARTING WEB SCRAPING =====")
        scraper_results = scrape_patents_parallel(scrape_patents, num_workers=scraper_workers)
        print(f"Web scraping completed for {len(scraper_results)} patents")
    
    # Combine results
    all_results = {**api_results, **scraper_results}
    
    # Map the processed results to new DataFrame columns
    result_df['family_jurisdictions'] = result_df[patent_col].map(
        lambda p: all_results.get(p, {}).get('jurisdictions')
    )
    result_df['family_members'] = result_df[patent_col].map(
        lambda p: all_results.get(p, {}).get('family_members')
    )
    
    # Check for missing values
    null_count = result_df[['family_jurisdictions', 'family_members']].isnull().sum()
    print(f"\nMissing values after processing:\n{null_count}")
    
    return result_df

# ======= USAGE EXAMPLE =======
if __name__ == "__main__":
    # Load your dataframe
    # df = pd.read_csv('your_patents.csv')
    
    # Example: Create a small test dataframe
    # test_df = pd.DataFrame({
    #     'first publication number': ['EP1234567A1', 'US20100123456A1', 'WO2015123456A1', 'JP2010123456A']
    # })
    test_df = df.head(50)
    
    # Process the dataframe using combined approach
    processed_df = process_dataframe_combined(
        df=test_df,
        patent_col='first publication number',
        api_fraction=0.5,     # 50% of patents processed by API
        api_batch_size=20,    # Process 20 patents in each API batch
        api_delay=1.2,        # 1.2 seconds delay between API requests
        scraper_workers=3     # Run 3 parallel web scrapers
    )
    
    # Show results
    print("\nFinal Results:")
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])
    
    # Optionally save results
    # processed_df.to_csv('patent_jurisdictions_combined.csv', index=False)

Processing 25 patents with API and 25 with scrapers

===== STARTING API PROCESSING =====
API: Processing batch 1/2
API: Processing batch 2/2
API processing completed for 25 patents

===== STARTING WEB SCRAPING =====
Scraper 1: Processing patent 1/8: CN221914114U
Scraper 0: Processing patent 1/8: CN115158041A
Scraper 2: Processing patent 1/8: US10960783B2
Scraper: Timeout for patent US10960783B2
Scraper: Timeout for patent CN221914114U
Scraper: Timeout for patent CN115158041A
Scraper 2: Processing patent 2/8: CN104210379A
Scraper 1: Processing patent 2/8: US2022258626A1
Scraper 0: Processing patent 2/8: CN101685890A
Scraper: Timeout for patent CN104210379A
Scraper: Timeout for patent US2022258626A1
Scraper: Timeout for patent CN101685890A
Scraper 1: Processing patent 3/8: CN108248409A
Scraper 2: Processing patent 3/8: JPH11164401A
Scraper 0: Processing patent 3/8: CN108674229A
Scraper: Timeout for patent CN108248409A
Scraper: Timeout for patent JPH11164401A
Scraper: Timeout for patent C

In [58]:
processed_df[['family_jurisdictions','family_members']].isnull().sum()

family_jurisdictions    25
family_members          25
dtype: int64

In [46]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def scrape_patent(driver, patent: str) -> dict:
    """
    Placeholder for web scraping logic.
    Replace this with your actual scraping implementation.
    """
    try:
        # Example: Navigate to a patent search page (adjust URL as needed)
        driver.get(f"{BASE_URL}/family/publication/docdb/{quote(patent)}")
        # Add your scraping logic here, e.g., using driver.find_element()
        # For demonstration, return dummy data
        time.sleep(1)  # Simulate scraping delay
        return {'jurisdictions': ['US', 'EP'], 'family_members': [f"{patent}A1"]}
    except Exception as e:
        print(f"Scraping error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent_scrape(patent: str) -> dict:
    """Process a single patent using web scraping with Selenium."""
    try:
        driver = webdriver.Chrome()  # Ensure ChromeDriver is installed
        data = scrape_patent(driver, patent)
        driver.quit()
        return data
    except Exception as e:
        print(f"Scraping setup error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_dataframe_combined(df: pd.DataFrame, patent_col: str) -> pd.DataFrame:
    """Process the dataframe by splitting it and using API and web scraping in parallel."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    
    # Split the dataframe into two halves
    patents = df[patent_col].tolist()
    mid = len(patents) // 2
    patents_api = patents[:mid]  # First half for API
    patents_scrape = patents[mid:]  # Second half for scraping
    
    results = {}
    
    # Use two ThreadPoolExecutors for parallel processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:
        
        # Submit API tasks
        future_api = {api_executor.submit(process_patent_api, p): p for p in patents_api}
        
        # Submit scraping tasks (3 workers for 3 browser instances)
        future_scrape = {scrape_executor.submit(process_patent_scrape, p): p for p in patents_scrape}
        
        # Collect results from both
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    # Map results back to the dataframe
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage
if __name__ == "__main__":
    # Replace with your actual dataframe loading
    # df = pd.read_csv('your_patents.csv')
    # For demonstration, create a dummy dataframe
    df = df.head(20)
    
    processed_df = process_dataframe_combined(df, 'first publication number')
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']].head())
    # Check for null values
    print("\nNull values check:")
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum())

  first publication number                  family_jurisdictions  \
0           US2006250902A1  [AU, CA, CN, EP, HK, JP, KR, US, WO]   
1            KR102511398B1                                  [KR]   
2            KR102511391B1                              [US, EP]   
3               GB2631101A                              [US, EP]   
4           KR20230163874A                              [US, EP]   

                                      family_members  
0  [AU2006295147A1, AU2006295147B2, CA2623398A1, ...  
1                                    [KR102511398B1]  
2                                  [KR102511391B1A1]  
3                                     [GB2631101AA1]  
4                                 [KR20230163874AA1]  

Null values check:
first publication number    0
family_jurisdictions        0
family_members              0
dtype: int64


In [47]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

Unnamed: 0,first publication number,family_jurisdictions,family_members
0,US2006250902A1,"[AU, CA, CN, EP, HK, JP, KR, US, WO]","[AU2006295147A1, AU2006295147B2, CA2623398A1, ..."
1,KR102511398B1,[KR],[KR102511398B1]
2,KR102511391B1,"[US, EP]",[KR102511391B1A1]
3,GB2631101A,"[US, EP]",[GB2631101AA1]
4,KR20230163874A,"[US, EP]",[KR20230163874AA1]


In [16]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']]

Unnamed: 0,first publication number,family_jurisdictions,family_members
0,US2006250902A1,,
1,KR102511398B1,,
2,KR102511391B1,,
3,GB2631101A,,
4,KR20230163874A,,
5,CN105098267A,[CN],"[CN105098267A, CN105098267B]"
6,TW201310755A,,
7,US2022140365A1,,
8,WO2011019133A2,"[KR, WO]","[KR100949260B1, WO2011019133A2, WO2011019133A3]"
9,CN108550935A,,


In [34]:
df.columns

Index(['No', 'Title', 'Inventors', 'Applicants', 'Publication number',
       'Earliest priority', 'IPC', 'CPC', 'Publication date',
       'Earliest publication', 'family number', 'Unnamed: 11',
       'first publication date', 'second publication date',
       'first publication number', 'second publication number'],
      dtype='object')

In [33]:
df.rename(columns={'Family number': 'family number'}, inplace=True)

In [38]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

def get_access_token() -> str:
    """Get or refresh the OAuth access token for EPO API."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Expires in ~58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Basic validation for patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from EPO API JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Respect API rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

from urllib.parse import quote
import pandas as pd
from selenium import webdriver

def scrape_patent(driver, publication_number: str, family_number: str) -> dict:
    # Construct URL
    url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}"
    driver.get(url)
    # Add logic to extract family members (simplified example)
    family_members = driver.find_elements_by_xpath("//some/xpath/to/family/members")  # Adjust XPath
    family_members_list = [elem.text for elem in family_members]
    return {'jurisdictions': None, 'family_members': family_members_list}

def process_patent_scrape(row: pd.Series) -> dict:
    publication_number = row['first publication number']
    family_number = row['family number']
    if pd.isnull(publication_number) or pd.isnull(family_number):
        print(f"Skipping patent due to missing values: {row}")
        return {'jurisdictions': None, 'family_members': None}
    publication_number = str(publication_number)
    family_number = str(family_number)
    try:
        # Configure ChromeDriver with explicit path and options
        options = webdriver.ChromeOptions()
        options.headless = False  # Ensure browser is visible
        driver = webdriver.Chrome(executable_path='C:/chromedriver/chromedriver.exe', options=options)
        data = scrape_patent(driver, publication_number, family_number)
        driver.quit()
        return data
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}


def process_dataframe_combined(df: pd.DataFrame, patent_col: str = 'first publication number', family_col: str = 'family number') -> pd.DataFrame:
    """Process the DataFrame by splitting it and using API and web scraping in parallel."""
    # Validate required columns
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")

    # Split the DataFrame into two halves
    mid = len(df) // 2
    df_api = df.iloc[:mid]  # First half for API
    df_scrape = df.iloc[mid:]  # Second half for scraping

    results = {}

    # Parallel processing with two ThreadPoolExecutors
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:

        # Submit API tasks for the first half
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}

        # Submit scraping tasks for the second half
        future_scrape = {scrape_executor.submit(process_patent_scrape, row): row[patent_col] for _, row in df_scrape.iterrows()}

        # Collect results from both
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}

    # Map results back to the DataFrame
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results.get(p, {}).get('jurisdictions'))
    df['family_members'] = df[patent_col].map(lambda p: results.get(p, {}).get('family_members'))
    return df

# Example usage
if __name__ == "__main__":
    # Create a dummy DataFrame for demonstration
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'] * 10,
    #     'family number': ['06543210', '05432109'] * 10
    # })
    df = df.head(10)

    # Process the DataFrame
    processed_df = process_dataframe_combined(df)
    print("\nProcessed DataFrame (first 5 rows):")
    print(processed_df[['first publication number', 'family number', 'family_jurisdictions', 'family_members']].head())

    # Check for null values
    print("\nNull values check:")
    print(processed_df[['family_jurisdictions', 'family_members']].isnull().sum())

Scraping error for patent CN105098267A: WebDriver.__init__() got an unexpected keyword argument 'executable_path'
Scraping error for patent TW201310755A: WebDriver.__init__() got an unexpected keyword argument 'executable_path'
Scraping error for patent US2022140365A1: WebDriver.__init__() got an unexpected keyword argument 'executable_path'
Scraping error for patent WO2011019133A2: WebDriver.__init__() got an unexpected keyword argument 'executable_path'
Scraping error for patent CN108550935A: WebDriver.__init__() got an unexpected keyword argument 'executable_path'

Processed DataFrame (first 5 rows):
  first publication number  family number  \
0           US2006250902A1       37397092   
1            KR102511398B1       85796445   
2            KR102511391B1       85796535   
3               GB2631101A       91738186   
4           KR20230163874A       89124565   

                   family_jurisdictions  \
0  [AU, CA, CN, EP, HK, JP, KR, US, WO]   
1                               

In [40]:
processed_df[['first publication number', 'family number', 'family_jurisdictions', 'family_members']]

Unnamed: 0,first publication number,family number,family_jurisdictions,family_members
0,US2006250902A1,37397092,"[AU, CA, CN, EP, HK, JP, KR, US, WO]","[AU2006295147A1, AU2006295147B2, CA2623398A1, ..."
1,KR102511398B1,85796445,[KR],[KR102511398B1]
2,KR102511391B1,85796535,[KR],[KR102511391B1]
3,GB2631101A,91738186,"[GB, WO]","[GB2631101A, WO2024261465A1]"
4,KR20230163874A,89124565,[KR],[KR20230163874A]
5,CN105098267A,54578217,,
6,TW201310755A,48482101,,
7,US2022140365A1,75243546,,
8,WO2011019133A2,42183715,,
9,CN108550935A,63515557,,


In [61]:
df.rename(columns={'Family number': 'family number'}, inplace=True)

In [None]:
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import pandas as pd

class PatentsSearch:
    def __init__(self, headless=True):
        """Initialize the scraper with enhanced compatibility options."""
        
        options = uc.ChromeOptions()
        
        
        if headless:
            options.add_argument('--headless')
        
        
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--disable-extensions')
        
        try:
            
            self.driver = uc.Chrome(
                options=options, 
                use_subprocess=True,  
                version_main=None,    
                suppress_welcome=True,
                debug=False
            )
            
            
            self.driver.set_page_load_timeout(30)
            self.driver.set_window_size(1920, 1080)
        
        except Exception as e:
            print(f"Failed to initialize ChromeDriver: {e}")
            print("Trying alternative initialization method...")
            
            # Alternative initialization method
            self.driver = uc.Chrome(
                options=options,
                driver_executable_path=None  
            )

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Add a random delay to mimic human behavior."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url):
        """Navigate to the given URL and return the page HTML."""
        try:
            print(f"Navigating to: {url}")
            self.driver.get(url)

            
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            
            self.add_random_delay(3, 5)

            
            return self.driver.page_source

        except TimeoutException:
            print("Timed out waiting for the page to load.")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def parse_html(self, html):
        """Parse the HTML and extract all span elements inside the 'Published as' content."""
        soup = BeautifulSoup(html, 'html.parser')
        
        # Look for the element containing "Publié en tant que" or "Published as"
        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Publié en tant que" in tag.text or "Published as" in tag.text))
        
        if published_as_element:
            # Get the next sibling span that contains the relevant content
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                # Extract all span elements within the content
                spans = content_element.find_all('span')
                return [span.get_text(strip=True) for span in spans]
        return []

    def close(self):
        """Close the browser when done."""
        if self.driver:
            self.driver.quit()



if __name__ == '__main__':
    
    scraper = PatentsSearch(headless=False)  # Set headless to False to see the browser in action

    df=df.head(10)
  
    
    df['family_members'] = None

    try:
        for index, row in df.iterrows():
            
            url = f"https://worldwide.espacenet.com/patent/search/family/{row['Family number']}/publication/{row['first publication number']}?q=hydrogen%20battery"

            
            html = scraper.get_page_html(url)
            if html:
                print(f"Page HTML retrieved successfully for {row['first publication number']}.")
                
                family_members = scraper.parse_html(html)
                df.at[index, 'family_members'] = family_members  
            else:
                print(f"Failed to retrieve the page HTML for {row['first publication number']}.")

    finally:
        
        scraper.close()
        print("Scraper closed.")

    
    df.head()

Navigating to: https://worldwide.espacenet.com/patent/search/family/37397092/publication/US2006250902A1
Page HTML retrieved successfully for US2006250902A1.
Navigating to: https://worldwide.espacenet.com/patent/search/family/85796445/publication/KR102511398B1
Page HTML retrieved successfully for KR102511398B1.
Navigating to: https://worldwide.espacenet.com/patent/search/family/85796535/publication/KR102511391B1
Page HTML retrieved successfully for KR102511391B1.
Navigating to: https://worldwide.espacenet.com/patent/search/family/91738186/publication/GB2631101A
Page HTML retrieved successfully for GB2631101A.
Navigating to: https://worldwide.espacenet.com/patent/search/family/89124565/publication/KR20230163874A
Page HTML retrieved successfully for KR20230163874A.
Scraper closed.


In [None]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import re
import random

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Define search keywords


# Construct the query from search_keywords
keywords = [key.strip() for key in search_keywords.keys()]  # Remove trailing space from "Autonomous "
QUERY = " AND ".join([f'tac="{keyword}"' for keyword in keywords])

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def extract_country_code(pub_number):
    """Extract the two-letter country code from a publication number."""
    match = re.match(r'^[A-Z]{2}', pub_number)
    return match.group(0) if match else None

def scrape_patent(driver, publication_number: str, family_number: str) -> dict:
    """Scrape family information from Espacenet with the predefined query."""
    try:
        # Construct the URL with the query parameter
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        driver.get(url)

        # Wait for the 'Published as' or 'Publié en tant que' section to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, "//h5[contains(text(), 'Published as') or contains(text(), 'Publié en tant que')]"))
        )

        # Add a random delay to mimic human behavior
        time.sleep(random.uniform(1, 3))

        # Get the page source
        html = driver.page_source

        # Parse the HTML with BeautifulSoup
        soup = BeautifulSoup(html, 'html.parser')

        # Find the 'Published as' or 'Publié en tant que' h5 element
        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Published as" in tag.text or "Publié en tant que" in tag.text))
        
        if published_as_element:
            # Get the next sibling span containing family member details
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                # Extract all span texts as family members
                spans = content_element.find_all('span')
                family_members = [span.get_text(strip=True) for span in spans if span.get_text(strip=True)]

                # Extract unique jurisdictions from family members
                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))

                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }

        return {'jurisdictions': None, 'family_members': None}

    except TimeoutException:
        print(f"Timed out waiting for page to load for patent {publication_number}")
        return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent_scrape(row: pd.Series) -> dict:
    """Process a single patent using web scraping with Selenium."""
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])

    # Check for missing values
    if pd.isnull(publication_number) or pd.isnull(family_number):
        return {'jurisdictions': None, 'family_members': None}

    try:
        driver = webdriver.Chrome()  # Ensure ChromeDriver is installed
        data = scrape_patent(driver, publication_number, family_number)
        driver.quit()
        return data
    except Exception as e:
        print(f"Scraping setup error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_dataframe_combined(df: pd.DataFrame, patent_col: str, family_col: str) -> pd.DataFrame:
    """Process the dataframe by splitting it and using API and web scraping in parallel."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")
    
    # Split the dataframe into two halves
    mid = len(df) // 2
    df_api = df.iloc[:mid]
    df_scrape = df.iloc[mid:]
    
    results = {}
    
    # Use two ThreadPoolExecutors for parallel processing
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:
        
        # Submit API tasks for the first half
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}
        
        # Submit scraping tasks for the second half
        future_scrape = {scrape_executor.submit(process_patent_scrape, row): row[patent_col] for _, row in df_scrape.iterrows()}
        
        # Collect results from both
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    # Map results back to the dataframe
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage
if __name__ == "__main__":
    # Replace with your actual dataframe loading
    df = pd.DataFrame({
        'first publication number': ['US2020000000A1', 'EP1234567A1'],
        'family number': ['06543210', '05432109']
    })
    
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number')
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

In [None]:
import urllib.parse

# Define the field mapping based on the provided dictionary
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt",
    "title,abstract and claims": "ctxt"  # Interpret as 'ctxt' (title, abstract, or claims)
}

# Define the search keywords and their corresponding fields
search_keywords = {
    "Autonomous": "title,abstract and claims",
    "Vehicles": "title,abstract and claims"
}

# Function to construct the query
def construct_query(keywords_dict):
    parts = []
    for keyword, field in keywords_dict.items():
        # Map the field to its code; default to 'ctxt' if not found
        field_code = field_mapping.get(field, "ctxt")
        # Construct the query part for this keyword
        parts.append(f'{field_code}="{keyword}"')
    # Join all parts with " AND "
    return " AND ".join(parts)

# Function to build the full URL
def build_patent_url(family_number, publication_number, query):
    base_url = "https://worldwide.espacenet.com/patent/search/family/"
    encoded_query = urllib.parse.quote(query)
    url = f"{base_url}{urllib.parse.quote(family_number)}/publication/{urllib.parse.quote(publication_number)}?q={encoded_query}"
    return url

# Example usage
family_number = "12345678"
publication_number = "EP1234567A1"

# Construct the query
query = construct_query(search_keywords)
# Expected query: 'ctxt="Autonomous" AND ctxt="Vehicles"'

# Build the URL
url = build_patent_url(family_number, publication_number, query)
print(f"Generated URL: {url}")

# Optional: Add language parameter if needed
url += "&queryLang=en%3Ade%3Afr"

Generated URL: https://worldwide.espacenet.com/patent/search/family/12345678/publication/EP1234567A1?q=ctxt%3D%22Autonomous%22%20AND%20ctxt%3D%22Vehicles%22


In [69]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import re
import random

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Define field mapping for query construction
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt"
}

# Define search keywords and their corresponding fields

# Construct the query for the URL
def construct_query():
    parts = []
    for keyword, field in search_keywords.items():
        field_code = field_mapping.get(field, "ctxt")
        parts.append(f'{field_code}="{keyword}"')
    return " AND ".join(parts)

QUERY = construct_query()

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def extract_country_code(pub_number):
    """Extract the two-letter country code from a publication number."""
    match = re.match(r'^[A-Z]{2}', pub_number)
    return match.group(0) if match else None

def scrape_patent(driver, publication_number: str, family_number: str) -> dict:
    """Scrape family information from Espacenet with the predefined query."""
    try:
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        url += "&queryLang=en%3Ade%3Afr"
        driver.get(url)

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.XPATH, "//h5[contains(text(), 'Published as') or contains(text(), 'Publié en tant que')]"))
        )

        time.sleep(random.uniform(1, 3))

        html = driver.page_source

        soup = BeautifulSoup(html, 'html.parser')

        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Published as" in tag.text or "Publié en tant que" in tag.text))
        
        if published_as_element:
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                spans = content_element.find_all('span')
                family_members = [span.get_text(strip=True) for span in spans if span.get_text(strip=True)]

                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))

                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }

        return {'jurisdictions': None, 'family_members': None}

    except TimeoutException:
        print(f"Timed out waiting for page to load for patent {publication_number}")
        return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent_scrape(row: pd.Series) -> dict:
    """Process a single patent using web scraping with Selenium."""
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])

    if pd.isnull(publication_number) or pd.isnull(family_number):
        return {'jurisdictions': None, 'family_members': None}

    try:
        driver = webdriver.Chrome()
        data = scrape_patent(driver, publication_number, family_number)
        driver.quit()
        return data
    except Exception as e:
        print(f"Scraping setup error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_dataframe_combined(df: pd.DataFrame, patent_col: str, family_col: str) -> pd.DataFrame:
    """Process the dataframe by splitting it and using API and web scraping in parallel."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")
    
    mid = len(df) // 2
    df_api = df.iloc[:mid]
    df_scrape = df.iloc[mid:]
    
    results = {}
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=1) as scrape_executor:
        
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}
        future_scrape = {scrape_executor.submit(process_patent_scrape, row): row[patent_col] for _, row in df_scrape.iterrows()}
        
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage
if __name__ == "__main__":
    # Sample DataFrame
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'],
    #     'family number': ['06543210', '05432109']
    # })
    df=df.head(8)
    
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number')
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

Timed out waiting for page to load for patent KR20230163874A
Timed out waiting for page to load for patent CN105098267A
Timed out waiting for page to load for patent TW201310755A
Timed out waiting for page to load for patent US2022140365A1
  first publication number                  family_jurisdictions  \
0           US2006250902A1  [AU, CA, CN, EP, HK, JP, KR, US, WO]   
1            KR102511398B1                                  [KR]   
2            KR102511391B1                                  [KR]   
3               GB2631101A                              [GB, WO]   
4           KR20230163874A                                  None   
5             CN105098267A                                  None   
6             TW201310755A                                  None   
7           US2022140365A1                                  None   

                                      family_members  
0  [AU2006295147A1, AU2006295147B2, CA2623398A1, ...  
1                                    

In [65]:
df['family_jurisdictions']

0    [AU, CA, CN, EP, HK, JP, KR, US, WO]
1                                    [KR]
2                                    [KR]
3                                [GB, WO]
4                                    [KR]
5                                    None
6                                    None
7                                    None
8                                    None
9                                    None
Name: family_jurisdictions, dtype: object

In [13]:
df.rename(columns={'Family number': 'family number'}, inplace=True)

In [None]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import re
import random
import undetected_chromedriver as uc

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Define field mapping for query construction
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt"
}

# Define search keywords and their corresponding fields
search_keywords = {
    "Autonomous": "title,abstract or claims",
    "Vehicles": "title,abstract or claims"
}

# Construct the query for the URL
def construct_query():
    parts = []
    for keyword, field in search_keywords.items():
        field_code = field_mapping.get(field, "ctxt")
        parts.append(f'{field_code}="{keyword}"')
    return " AND ".join(parts)

QUERY = construct_query()

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def extract_country_code(pub_number):
    """Extract the two-letter country code from a publication number."""
    match = re.match(r'^[A-Z]{2}', pub_number)
    return match.group(0) if match else None

def scrape_patent(driver, publication_number: str, family_number: str) -> dict:
    """Scrape family information from Espacenet with the predefined query."""
    try:
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        driver.get(url)

        # Add random delay to mimic human behavior
        time.sleep(random.uniform(5, 10))

        # Wait for the page to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_element_located((By.XPATH, "//h5[contains(text(), 'Published as') or contains(text(), 'Publié en tant que')]"))
        )

        html = driver.page_source

        soup = BeautifulSoup(html, 'html.parser')

        # Check for 403 error
        if soup.find('h1', string='403 Forbidden'):
            print(f"403 Forbidden error for patent {publication_number}")
            return {'jurisdictions': None, 'family_members': None}

        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Published as" in tag.text or "Publié en tant que" in tag.text))
        
        if published_as_element:
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                spans = content_element.find_all('span')
                family_members = [span.get_text(strip=True) for span in spans if span.get_text(strip=True)]

                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))

                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }

        return {'jurisdictions': None, 'family_members': None}

    except TimeoutException:
        print(f"Timed out waiting for page to load for patent {publication_number}")
        return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_patent_scrape(row: pd.Series) -> dict:
    """Process a single patent using web scraping with Selenium."""
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])

    if pd.isnull(publication_number) or pd.isnull(family_number):
        return {'jurisdictions': None, 'family_members': None}

    try:
        # Use undetected_chromedriver to avoid detection
        driver = uc.Chrome()
        data = scrape_patent(driver, publication_number, family_number)
        driver.quit()
        return data
    except Exception as e:
        print(f"Scraping setup error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def process_dataframe_combined(df: pd.DataFrame, patent_col: str, family_col: str) -> pd.DataFrame:
    """Process the dataframe by splitting it and using API and web scraping in parallel."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")
    
    mid = len(df) // 2
    df_api = df.iloc[:mid]
    df_scrape = df.iloc[mid:]
    
    results = {}
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:
        
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}
        future_scrape = {scrape_executor.submit(process_patent_scrape, row): row[patent_col] for _, row in df_scrape.iterrows()}
        
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage
if __name__ == "__main__":
    # Sample DataFrame
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'],
    #     'family number': ['06543210', '05432109']
    # })
    df=df.head(200)
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number')
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

Scraping setup error for patent DE102013016943A1: [WinError 183] Impossible de créer un fichier déjà existant: 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected\\chromedriver-win32\\chromedriver.exe' -> 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected_chromedriver.exe'
Scraping setup error for patent CN111683053A: [WinError 183] Impossible de créer un fichier déjà existant: 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected\\chromedriver-win32\\chromedriver.exe' -> 'C:\\Users\\tasni\\appdata\\roaming\\undetected_chromedriver\\undetected_chromedriver.exe'
Timed out waiting for page to load for patent CN102761521A
Timed out waiting for page to load for patent CN106254315A
Timed out waiting for page to load for patent CN108418786A
Timed out waiting for page to load for patent CN103220294A
Timed out waiting for page to load for patent CN118862092A
Timed out waiting for page to load for patent US12204638B1
Timed out waitin

In [None]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

first publication number    0
family_jurisdictions        6
family_members              6
dtype: int64

In [106]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].tail(20)

Unnamed: 0,first publication number,family_jurisdictions,family_members
180,US12162688B2,,
181,CN117894195A,,
182,SE1751173A1,,
183,CN114973735A,,
184,US11036370B2,,
185,US2025141270A1,,
186,US2014358353A1,,
187,US11978338B2,,
188,US11580521B2,,
189,US2015309510A1,,


In [18]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

first publication number    0
family_jurisdictions        6
family_members              6
dtype: int64

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
import time
import random

# Set up Chrome options for headless browsing
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Initialize the WebDriver
driver = webdriver.Chrome(options=chrome_options)

try:
    # Replace with the target URL you want to bypass
    url = "https://example.com"
    driver.get(url)
    
    # Wait for the page to load (adjust time as needed)
    time.sleep(random.uniform(3, 5))
    
    # Look for Cloudflare's checkbox (iframe containing the CAPTCHA)
    iframe = driver.find_elements(By.TAG_NAME, "iframe")
    if iframe:
        driver.switch_to.frame(iframe[0])  # Switch to the iframe
        
        # Find the checkbox element (adjust selector based on actual structure)
        checkbox = driver.find_element(By.ID, "checkbox")  # Cloudflare checkbox ID might differ
        if checkbox:
            # Simulate human-like mouse movement to the checkbox
            actions = ActionChains(driver)
            actions.move_to_element(checkbox).pause(random.uniform(0.5, 1.5)).click().perform()
            print("Checkbox clicked successfully.")
            
            # Wait for verification to complete
            time.sleep(random.uniform(2, 4))
            
            # Switch back to the main content
            driver.switch_to.default_content()
    
    # Check if we successfully bypassed Cloudflare
    if "example.com" in driver.current_url:
        print("Successfully accessed the page!")
        print(driver.page_source)  # Output the page source
    else:
        print("Failed to bypass Cloudflare.")

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    # Clean up and close the browser
    driver.quit()

In [None]:
#patent scraper class with proxy rotation
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium.common.exceptions import TimeoutException
import random
import undetected_chromedriver as uc
from patentscraper import PatentsSearch  # Assuming PatentsSearch is defined in patentscraper.py

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Define field mapping for query construction
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt"
}

# Define search keywords and their corresponding fields
search_keywords = {
    "Autonomous": "title,abstract or claims",
    "Vehicles": "title,abstract or claims"
}

# Construct the query for the URL
def construct_query():
    parts = []
    for keyword, field in search_keywords.items():
        field_code = field_mapping.get(field, "ctxt")
        parts.append(f'{field_code}="{keyword}"')
    return " AND ".join(parts)

QUERY = construct_query()

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def extract_country_code(pub_number):
    """Extract the two-letter country code from a publication number."""
    match = re.match(r'^[A-Z]{2}', pub_number)
    return match.group(0) if match else None

def process_patent_scrape(row: pd.Series, proxy_list: list) -> dict:
    """Process a single patent using web scraping with PatentsSearch class and rotating proxies."""
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])

    if pd.isnull(publication_number) or pd.isnull(family_number):
        return {'jurisdictions': None, 'family_members': None}

    try:
        # Select a random proxy from the list
        proxy = random.choice(proxy_list)
        
        # Initialize PatentsSearch with proxy
        scraper = PatentsSearch(headless=True, proxy=proxy)
        
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        
        html = scraper.get_page_html(url)
        if html:
            family_members = scraper.parse_html(html)
            if family_members:
                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))
                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }
            else:
                return {'jurisdictions': None, 'family_members': None}
        else:
            return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}
    finally:
        scraper.close()

def process_dataframe_combined(df: pd.DataFrame, patent_col: str, family_col: str, proxy_list: list) -> pd.DataFrame:
    """Process the dataframe by splitting it and using API and web scraping in parallel with rotating proxies."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")
    
    mid = len(df) // 2
    df_api = df.iloc[:mid]
    df_scrape = df.iloc[mid:]
    
    results = {}
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:
        
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}
        future_scrape = {scrape_executor.submit(process_patent_scrape, row, proxy_list): row[patent_col] for _, row in df_scrape.iterrows()}
        
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

# Example usage
if __name__ == "__main__":
    # Sample DataFrame
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'],
    #     'family number': ['06543210', '05432109']
    # })
    # List of proxies (example format: 'http://ip:port')
    proxy_list = ['http://proxy1:port', 'http://proxy2:port', 'http://proxy3:port']
    
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number', proxy_list)
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

In [22]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv
from selenium.common.exceptions import TimeoutException
import random
import undetected_chromedriver as uc
from patentscraper import PatentsSearch

# Global token cache for API
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
CONSUMER_KEY = os.getenv("CONSUMER_KEY").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET").strip()

# Define field mapping for query construction
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt"
}

# Define search keywords and their corresponding fields
search_keywords = {
    "Autonomous": "title,abstract or claims",
    "Vehicles": "title,abstract or claims"
}

# Construct the query for the URL
def construct_query():
    parts = []
    for keyword, field in search_keywords.items():
        field_code = field_mapping.get(field, "ctxt")
        parts.append(f'{field_code}="{keyword}"')
    return " AND ".join(parts)

QUERY = construct_query()

# Proxy list provided by the user
proxy_list = [
    "http://162.159.241.204:80",
    "http://162.159.242.122:80",
    "http://162.159.242.37:80",
    "http://162.159.242.187:80",
    "http://1.0.0.45:80",
    "http://162.159.247.22:80",
    "http://1.0.0.54:80",
    "http://1.0.0.37:80",
    "http://1.0.0.105:80",
    "http://1.0.0.193:80"
]

def get_access_token() -> str:
    """Get or refresh the OAuth access token."""
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    TOKEN = response.json()["access_token"]
    TOKEN_EXPIRY = time.time() + 3500  # Approximately 58 minutes
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """Perform a basic validation for the patent number format."""
    return bool(patent and len(patent.strip()) >= 4)

def extract_jurisdictions_and_members(data: dict) -> dict:
    """Extract jurisdictions and family members from the JSON response."""
    jurisdictions = set()
    family_members = []
    world_data = data.get('ops:world-patent-data', {})
    patent_family = world_data.get('ops:patent-family', {})
    members = patent_family.get('ops:family-member', []) or []
    if isinstance(members, dict):
        members = [members]

    for member in members:
        pub_ref = member.get('publication-reference', {})
        docs = pub_ref.get('document-id', []) or []
        if isinstance(docs, dict):
            docs = [docs]
        for doc in docs:
            if doc.get('@document-id-type') == 'docdb':
                country = doc.get('country', {}).get('$', '') if isinstance(doc.get('country'), dict) else doc.get('country', '')
                doc_number = doc.get('doc-number', {}).get('$', '') if isinstance(doc.get('doc-number'), dict) else doc.get('doc-number', '')
                kind = doc.get('kind', {}).get('$', '') if isinstance(doc.get('kind'), dict) else doc.get('kind', '')
                if country and doc_number and kind:
                    jurisdictions.add(country)
                    family_members.append(f"{country}{doc_number}{kind}")
    return {
        'jurisdictions': sorted(jurisdictions),
        'family_members': sorted(set(family_members))
    }

def process_patent_api(patent: str) -> dict:
    """Process a single patent using the EPO API."""
    if not validate_patent_number(patent):
        return {'jurisdictions': None, 'family_members': None}
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        time.sleep(2)  # Delay to respect rate limits
        if response.status_code in (403, 404):
            return {'jurisdictions': None, 'family_members': None}
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        print(f"API error for patent {patent}: {e}")
        return {'jurisdictions': None, 'family_members': None}

def extract_country_code(pub_number):
    """Extract the two-letter country code from a publication number."""
    match = re.match(r'^[A-Z]{2}', pub_number)
    return match.group(0) if match else None

def process_patent_scrape(row: pd.Series, proxy_list: list) -> dict:
    """Process a single patent using web scraping with PatentsSearch and rotating proxies."""
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])

    if pd.isnull(publication_number) or pd.isnull(family_number):
        return {'jurisdictions': None, 'family_members': None}

    try:
        # Select a random proxy or use None if list is empty
        proxy = random.choice(proxy_list) if proxy_list else None
        print(f"Using proxy: {proxy} for patent {publication_number}")
        
        # Initialize PatentsSearch
        scraper = PatentsSearch(headless=True, proxy=proxy)
        
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        
        html = scraper.get_page_html(url)
        if html:
            family_members = scraper.parse_html(html)
            if family_members:
                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))
                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }
            else:
                return {'jurisdictions': None, 'family_members': None}
        else:
            return {'jurisdictions': None, 'family_members': None}
    except Exception as e:
        print(f"Scraping error for patent {publication_number}: {e}")
        return {'jurisdictions': None, 'family_members': None}
    finally:
        scraper.close()

def process_dataframe_combined(df: pd.DataFrame, patent_col: str, family_col: str, proxy_list: list) -> pd.DataFrame:
    """Process the dataframe using API and web scraping with rotating proxies."""
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    if family_col not in df.columns:
        raise ValueError(f"Column '{family_col}' not found in DataFrame")
    
    mid = len(df) // 2
    df_api = df.iloc[:mid]
    df_scrape = df.iloc[mid:]
    
    results = {}
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as api_executor, \
         concurrent.futures.ThreadPoolExecutor(max_workers=3) as scrape_executor:
        
        future_api = {api_executor.submit(process_patent_api, row[patent_col]): row[patent_col] for _, row in df_api.iterrows()}
        future_scrape = {scrape_executor.submit(process_patent_scrape, row, proxy_list): row[patent_col] for _, row in df_scrape.iterrows()}
        
        all_futures = list(future_api.keys()) + list(future_scrape.keys())
        for future in concurrent.futures.as_completed(all_futures):
            patent = future_api.get(future) or future_scrape.get(future)
            try:
                results[patent] = future.result()
            except Exception as e:
                print(f"Error collecting result for patent {patent}: {e}")
                results[patent] = {'jurisdictions': None, 'family_members': None}
    
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

if __name__ == "__main__":
    # Sample DataFrame
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'],
    #     'family number': ['06543210', '05432109']
    # })
    df = df.head(100)
    # Use provided proxy list
    if not proxy_list:
        print("No proxies available, running without proxies")
        proxy_list = [None]
    
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number', proxy_list)
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

Using proxy: http://1.0.0.105:80 for patent US2015127942A1
Scraping error for patent US2015127942A1: PatentsSearch.__init__() got an unexpected keyword argument 'proxy'
Using proxy: http://162.159.241.204:80 for patent US11477269B2
Scraping error for patent US11477269B2: PatentsSearch.__init__() got an unexpected keyword argument 'proxy'
Using proxy: http://1.0.0.193:80 for patent US2016112455A1
Scraping error for patent US2016112455A1: PatentsSearch.__init__() got an unexpected keyword argument 'proxy'
Using proxy: http://1.0.0.37:80 for patent US2024394760A1
Scraping error for patent US2024394760A1: PatentsSearch.__init__() got an unexpected keyword argument 'proxy'
Using proxy: http://1.0.0.37:80 for patent US10411903B2
Scraping error for patent US10411903B2: PatentsSearch.__init__() got an unexpected keyword argument 'proxy'
Using proxy: http://1.0.0.193:80 for patent US10187381B2
Scraping error for patent US10187381B2: PatentsSearch.__init__() got an unexpected keyword argument 'p

In [23]:
processed_df[['first publication number', 'family_jurisdictions', 'family_members']].isnull().sum()

first publication number     0
family_jurisdictions        50
family_members              50
dtype: int64

In [None]:
import pandas as pd
import time
import random
from urllib.parse import quote
import requests
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
field_mapping = {
    "title": "ti",
    "abstract": "ab",
    "claims": "cl",
    "title,abstract or claims": "ctxt",
    "all text fields": "ftxt",
    "title or abstract": "ta",
    "description": "desc",
    "all text fields or names": "nftxt",
    "title , abstract or names": "ntxt"
}

# Constants (replace with your actual values)
BASE_URL = "https://api.epo.org/3.2/rest-services"
#QUERY = "pa=google"
# Construct the query for the URL
def construct_query():
    parts = []
    for keyword, field in search_keywords.items():
        field_code = field_mapping.get(field, "ctxt")
        parts.append(f'{field_code}="{keyword}"')
    return " AND ".join(parts)

QUERY = construct_query()
# Error tracking variables
scraping_error_count = 0
api_error_count = 0
current_method = 'scraping'
MAX_ERRORS = 3

# Helper functions (assuming these are defined elsewhere or as needed)
def extract_country_code(patent):
    return patent[:2] if patent else None

def validate_patent_number(patent):
    return bool(patent and isinstance(patent, str) and len(patent) > 2)

def get_access_token():
    # Replace with your actual token retrieval logic
    return "your_access_token"

def extract_jurisdictions_and_members(data):
    # Replace with your actual data parsing logic
    family_members = data.get('family_members', [])
    jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))
    return {
        'jurisdictions': jurisdictions if jurisdictions else None,
        'family_members': family_members if family_members else None
    }

# Web scraping class
class PatentsSearch:
    def __init__(self, headless=True):
        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-blink-features=AutomationControlled')
        options.add_argument('--disable-extensions')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(30)
        self.driver.set_window_size(1920, 1080)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        time.sleep(random.uniform(min_seconds, max_seconds))

    def get_page_html(self, url):
        try:
            self.driver.get(url)
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            self.add_random_delay(3, 5)
            return self.driver.page_source
        except TimeoutException:
            raise TimeoutException("Timed out waiting for page to load")
        except Exception as e:
            raise Exception(f"Error during page load: {e}")

    def parse_html(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        published_as_element = soup.find(lambda tag: tag.name == "h5" and ("Publié en tant que" in tag.text or "Published as" in tag.text))
        if published_as_element:
            content_element = published_as_element.find_next_sibling("span")
            if content_element:
                spans = content_element.find_all('span')
                return [span.get_text(strip=True) for span in spans]
        return []

    def close(self):
        if self.driver:
            self.driver.quit()

# Processing functions
def process_patent_scrape(row):
    publication_number = str(row['first publication number'])
    family_number = str(row['family number'])
    if pd.isnull(publication_number) or pd.isnull(family_number):
        raise ValueError("Invalid publication or family number")
    try:
        scraper = PatentsSearch(headless=False)  # Browser visible
        url = f"https://worldwide.espacenet.com/patent/search/family/{quote(family_number)}/publication/{quote(publication_number)}?q={quote(QUERY)}"
        html = scraper.get_page_html(url)
        if html:
            family_members = scraper.parse_html(html)
            if family_members:
                jurisdictions = sorted(set([extract_country_code(member) for member in family_members if extract_country_code(member)]))
                return {
                    'jurisdictions': jurisdictions if jurisdictions else None,
                    'family_members': family_members if family_members else None
                }
            else:
                raise Exception("No family members found")
        else:
            raise Exception("Failed to retrieve HTML")
    except Exception as e:
        raise e
    finally:
        scraper.close()

def process_patent_api(patent):
    if not validate_patent_number(patent):
        raise ValueError("Invalid patent number")
    try:
        token = get_access_token()
        url = f"{BASE_URL}/family/publication/docdb/{quote(patent)}"
        headers = {"Authorization": f"Bearer {token}", "Accept": "application/json"}
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        return extract_jurisdictions_and_members(response.json())
    except Exception as e:
        raise e

def process_dataframe_combined(df, patent_col, family_col):
    global scraping_error_count, api_error_count, current_method
    results = {}
    for index, row in df.iterrows():
        method = current_method
        try:
            if method == 'scraping':
                result = process_patent_scrape(row)
            else:
                result = process_patent_api(row[patent_col])
            results[row[patent_col]] = result
            if method == 'scraping':
                scraping_error_count = 0
            else:
                api_error_count = 0
        except Exception as e:
            print(f"Error processing {row[patent_col]} with {method}: {e}")
            if method == 'scraping':
                scraping_error_count += 1
                if scraping_error_count >= MAX_ERRORS:
                    current_method = 'api'
                    scraping_error_count = 0
            else:
                api_error_count += 1
                if api_error_count >= MAX_ERRORS:
                    current_method = 'scraping'
                    api_error_count = 0
            results[row[patent_col]] = {'jurisdictions': None, 'family_members': None}
        time.sleep(2)  # Delay to avoid rate limiting
    df['family_jurisdictions'] = df[patent_col].map(lambda p: results[p]['jurisdictions'])
    df['family_members'] = df[patent_col].map(lambda p: results[p]['family_members'])
    return df

if __name__ == "__main__":
    # Sample DataFrame
    # df = pd.DataFrame({
    #     'first publication number': ['US2020000000A1', 'EP1234567A1'],
    #     'family number': ['06543210', '05432109']
    # })
    df=df.head(100)
    processed_df = process_dataframe_combined(df, 'first publication number', 'family number')
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

Error processing US10254766B2 with scraping: No family members found
Error processing US6394231B1 with scraping: No family members found
Error processing US10196117B2 with scraping: No family members found
Error processing US2017059336A1 with api: HTTPSConnectionPool(host='api.epo.org', port=443): Max retries exceeded with url: /3.2/rest-services/family/publication/docdb/US2017059336A1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E5CBF68DD0>: Failed to resolve 'api.epo.org' ([Errno 11001] getaddrinfo failed)"))
Error processing US2024242599A1 with api: HTTPSConnectionPool(host='api.epo.org', port=443): Max retries exceeded with url: /3.2/rest-services/family/publication/docdb/US2024242599A1 (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000001E5CBF69D00>: Failed to resolve 'api.epo.org' ([Errno 11001] getaddrinfo failed)"))
Error processing US11836985B2 with api: HTTPSConnectionPool(host='api.epo.org', port=443):