final test

In [24]:
from bs4 import BeautifulSoup
import time
import random
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class EspacenetScraper:
    """Browser-driven scraper for the Espacenet CPC browser pages.

    A Chrome session (via ``undetected_chromedriver``) loads a CPC browser
    URL, the HTML of the first iframe on the page is captured, and
    ``parse_html`` turns that HTML into a ``{symbol: title}`` dictionary.
    Call ``close()`` when finished to shut the browser down.
    """

    def __init__(self, headless=True):
        """Start the Chrome session.

        Args:
            headless (bool): When true, Chrome is started with ``--headless``.
        """
        # Assigned before anything that can fail, so close() is always safe
        # to call even if Chrome could not be started.
        self.driver = None

        options = uc.ChromeOptions()
        if headless:
            options.add_argument('--headless')  # Run in headless mode

        options.add_argument('--disable-blink-features=AutomationControlled')
        self.driver = uc.Chrome(options=options)
        self.driver.set_page_load_timeout(60)  # Generous page load timeout
        self.driver.set_window_size(1300, 800)

    def add_random_delay(self, min_seconds=1, max_seconds=3):
        """Sleep for a random duration to mimic human behavior.

        Args:
            min_seconds (float): Lower bound of the pause, in seconds.
            max_seconds (float): Upper bound of the pause, in seconds.
        """
        time.sleep(random.uniform(min_seconds, max_seconds))

    def _leave_iframe(self):
        """Best-effort switch back to the top-level document."""
        try:
            self.driver.switch_to.default_content()
        except Exception as e:
            # The session may already be unusable; report it, but do not mask
            # the result (or error) of the attempt that just finished.
            print(f"Could not switch back to the default content: {e}")

    def get_page_html(self, url, retries=3):
        """Navigate to ``url`` and return the HTML of the page's first iframe.

        Only a ``TimeoutException`` triggers another attempt; any other
        exception is printed and ends the call immediately.

        Args:
            url (str): Address to load.
            retries (int): Maximum number of attempts.

        Returns:
            str | None: The iframe's page source, or ``None`` when every
            attempt timed out, another error occurred, or ``retries`` is
            not positive.
        """
        for attempt in range(retries):
            try:
                print(f"Navigating to: {url} (Attempt {attempt + 1})")
                self.driver.get(url)
                WebDriverWait(self.driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "body"))
                )

                # Add a small delay before switching to the iframe
                self.add_random_delay(2, 4)

                # Switch to the first iframe found on the page
                iframe = WebDriverWait(self.driver, 60).until(
                    EC.presence_of_element_located((By.TAG_NAME, "iframe"))
                )
                self.driver.switch_to.frame(iframe)

                # Wait for the results holder inside the iframe, which
                # indicates the classification table is present
                WebDriverWait(self.driver, 60).until(
                    EC.presence_of_element_located(
                        (By.CSS_SELECTOR, "div.cpcbrowser-results-holder")
                    )
                )

                # Add a random delay to mimic human behavior
                self.add_random_delay(3, 5)

                # While switched into the iframe, page_source is read from it
                return self.driver.page_source

            except TimeoutException:
                print(f"Timed out waiting for the page to load (Attempt {attempt + 1}).")
                if attempt < retries - 1:
                    print("Retrying...")
                else:
                    print("Max retries reached. Moving to the next URL.")
                    return None
            except Exception as e:
                print(f"An error occurred: {e}")
                return None
            finally:
                # Always return to the top-level document, including after a
                # timeout or error raised while inside the iframe, so the
                # next attempt or URL starts from a known frame context.
                self._leave_iframe()

        # Reached only when retries <= 0 (the loop body never ran).
        return None

    def parse_html(self, html):
        """
        Parses the HTML content and extracts a dictionary of classification symbols and titles.

        Args:
            html (str): The HTML content to parse. ``None`` or an empty
                string yields an empty dictionary.

        Returns:
            dict: A dictionary where keys are classification symbols and values are their corresponding titles.
        """
        # get_page_html() may hand back None; treat "no page" as "no items"
        # instead of failing inside the parser.
        if not html:
            return {}

        soup = BeautifulSoup(html, 'html.parser')
        result_dict = {}

        # Each classification entry lives in a <div class="classitem">
        for classitem in soup.find_all('div', class_='classitem'):
            titlebar = classitem.find('div', class_='titlebar')
            symbol_holder = classitem.find('div', class_='symbol-holder')
            if not (titlebar and symbol_holder):
                continue

            # A title may be split over several <span class="raw-text">
            # elements; join their text with single spaces.
            raw_text_spans = titlebar.find_all('span', class_='raw-text')
            combined_text = ' '.join(span.get_text(strip=True) for span in raw_text_spans)

            symbol_ref = symbol_holder.find('a', class_='symbol classref')

            if combined_text and symbol_ref:
                # Symbol text is the key; a repeated symbol overwrites the
                # earlier title.
                result_dict[symbol_ref.get_text(strip=True)] = combined_text

        return result_dict

    def close(self):
        """Close the browser when done.

        Safe to call more than once, and safe to call on an instance whose
        browser was never started.
        """
        driver = getattr(self, 'driver', None)
        if not driver:
            return
        try:
            driver.quit()
        finally:
            # Drop the reference so a second close() is a no-op.
            self.driver = None


if __name__ == '__main__':
    # Visible (non-headless) browser so the run can be watched.
    scraper = EspacenetScraper(headless=False)

    # Top-level CPC section letters to fetch; extend as needed.
    cpc_list = list("ABCDEFGHY")

    # Section letter -> {symbol: title} for every page that loaded.
    all_results = {}

    try:
        for cpc_symbol in cpc_list:
            url = 'https://worldwide.espacenet.com/patent/cpc-browser#!/CPC=' + cpc_symbol

            html = scraper.get_page_html(url)
            if not html:
                # Nothing retrieved for this section; move on to the next.
                continue

            print(f"Page HTML for CPC {cpc_symbol} retrieved successfully.")

            classification_data = scraper.parse_html(html)
            print(f"Classification data for CPC {cpc_symbol}: {classification_data}")

            all_results[cpc_symbol] = classification_data
            # To inspect a page by hand, write `html` to a file at this point.
    finally:
        # The browser is shut down whether or not the loop completed.
        scraper.close()
        print("Scraper closed.")

    # Summary of everything collected.
    print("All results:")
    for cpc_symbol, data in all_results.items():
        print(f"CPC {cpc_symbol}: {data}")


Navigating to: https://worldwide.espacenet.com/patent/cpc-browser#!/CPC=A (Attempt 1)
Page HTML for CPC A retrieved successfully.
Classification data for CPC A: {'A': 'HUMAN NECESSITIES', 'A01': 'AGRICULTURE FORESTRY ANIMAL HUSBANDRY HUNTING TRAPPING FISHING', 'A21': 'BAKING EDIBLE DOUGHS', 'A22': 'BUTCHERING MEAT TREATMENT PROCESSING POULTRY OR FISH', 'A23': 'FOODS OR FOODSTUFFS TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES', 'A24': "TOBACCO CIGARS CIGARETTES SIMULATED SMOKING DEVICES SMOKERS' REQUISITES", 'A41': 'WEARING APPAREL', 'A42': 'HEADWEAR', 'A43': 'FOOTWEAR', 'A44': 'HABERDASHERY JEWELLERY', 'A45': 'HAND OR TRAVELLING ARTICLES', 'A46': 'BRUSHWARE', 'A47': 'FURNITURE DOMESTIC ARTICLES OR APPLIANCES COFFEE MILLS SPICE MILLS SUCTION CLEANERS IN GENERAL', 'A61': 'MEDICAL OR VETERINARY SCIENCE HYGIENE', 'A62': 'LIFE-SAVING FIRE-FIGHTING', 'A63': 'SPORTS GAMES AMUSEMENTS', 'A99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}
Navigating to: https://worldwide.espace

In [25]:
all_results

{'A': {'A': 'HUMAN NECESSITIES',
  'A01': 'AGRICULTURE FORESTRY ANIMAL HUSBANDRY HUNTING TRAPPING FISHING',
  'A21': 'BAKING EDIBLE DOUGHS',
  'A22': 'BUTCHERING MEAT TREATMENT PROCESSING POULTRY OR FISH',
  'A23': 'FOODS OR FOODSTUFFS TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES',
  'A24': "TOBACCO CIGARS CIGARETTES SIMULATED SMOKING DEVICES SMOKERS' REQUISITES",
  'A41': 'WEARING APPAREL',
  'A42': 'HEADWEAR',
  'A43': 'FOOTWEAR',
  'A44': 'HABERDASHERY JEWELLERY',
  'A45': 'HAND OR TRAVELLING ARTICLES',
  'A46': 'BRUSHWARE',
  'A47': 'FURNITURE DOMESTIC ARTICLES OR APPLIANCES COFFEE MILLS SPICE MILLS SUCTION CLEANERS IN GENERAL',
  'A61': 'MEDICAL OR VETERINARY SCIENCE HYGIENE',
  'A62': 'LIFE-SAVING FIRE-FIGHTING',
  'A63': 'SPORTS GAMES AMUSEMENTS',
  'A99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'},
 'B': {'B': 'PERFORMING OPERATIONS TRANSPORTING',
  'B01': 'PHYSICAL OR CHEMICAL PROCESSES OR APPARATUS IN GENERAL',
  'B02': 'CRUSHING, PULVERISING, OR DISINTEG

In [26]:
import pandas as pd

def dict_to_dataframe(all_results):
    """
    Flatten the nested ``all_results`` mapping into a two-column DataFrame.

    The outer keys are not used; every ``(symbol, title)`` pair of every
    inner dictionary becomes one row, in iteration order.

    Args:
        all_results (dict): Maps a CPC symbol to a dictionary of
                            classification data (symbol -> title).

    Returns:
        pd.DataFrame: Columns 'CPC Symbol' and 'Classification Title'.
    """
    # Collect all (symbol, title) pairs across the inner dictionaries.
    pairs = [
        pair
        for section in all_results.values()
        for pair in section.items()
    ]

    return pd.DataFrame({
        'CPC Symbol': [symbol for symbol, _ in pairs],
        'Classification Title': [title for _, title in pairs],
    })

In [27]:
# Flatten the per-section scrape results into one table
# (columns 'CPC Symbol' and 'Classification Title').
classification_df = dict_to_dataframe(all_results)

In [28]:
classification_df

Unnamed: 0,CPC Symbol,Classification Title
0,A,HUMAN NECESSITIES
1,A01,AGRICULTURE FORESTRY ANIMAL HUSBANDRY HUNTING ...
2,A21,BAKING EDIBLE DOUGHS
3,A22,BUTCHERING MEAT TREATMENT PROCESSING POULTRY O...
4,A23,"FOODS OR FOODSTUFFS TREATMENT THEREOF, NOT COV..."
...,...,...
141,H99,SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN T...
142,Y,GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPME...
143,Y02,TECHNOLOGIES OR APPLICATIONS FOR MITIGATION OR...
144,Y04,INFORMATION OR COMMUNICATION TECHNOLOGIES HAVI...


In [29]:
all_results

{'A': {'A': 'HUMAN NECESSITIES',
  'A01': 'AGRICULTURE FORESTRY ANIMAL HUSBANDRY HUNTING TRAPPING FISHING',
  'A21': 'BAKING EDIBLE DOUGHS',
  'A22': 'BUTCHERING MEAT TREATMENT PROCESSING POULTRY OR FISH',
  'A23': 'FOODS OR FOODSTUFFS TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES',
  'A24': "TOBACCO CIGARS CIGARETTES SIMULATED SMOKING DEVICES SMOKERS' REQUISITES",
  'A41': 'WEARING APPAREL',
  'A42': 'HEADWEAR',
  'A43': 'FOOTWEAR',
  'A44': 'HABERDASHERY JEWELLERY',
  'A45': 'HAND OR TRAVELLING ARTICLES',
  'A46': 'BRUSHWARE',
  'A47': 'FURNITURE DOMESTIC ARTICLES OR APPLIANCES COFFEE MILLS SPICE MILLS SUCTION CLEANERS IN GENERAL',
  'A61': 'MEDICAL OR VETERINARY SCIENCE HYGIENE',
  'A62': 'LIFE-SAVING FIRE-FIGHTING',
  'A63': 'SPORTS GAMES AMUSEMENTS',
  'A99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'},
 'B': {'B': 'PERFORMING OPERATIONS TRANSPORTING',
  'B01': 'PHYSICAL OR CHEMICAL PROCESSES OR APPARATUS IN GENERAL',
  'B02': 'CRUSHING, PULVERISING, OR DISINTEG

In [30]:
# Save the table as a semicolon-separated file in the working directory.
# index=False is not passed, so the row index is written as the first column.
classification_df.to_csv("classification_df.csv" , sep=";")

TODO: keep working on a loop that eventually retrieves the entire dataset.

In [31]:
all_results['A']

{'A': 'HUMAN NECESSITIES',
 'A01': 'AGRICULTURE FORESTRY ANIMAL HUSBANDRY HUNTING TRAPPING FISHING',
 'A21': 'BAKING EDIBLE DOUGHS',
 'A22': 'BUTCHERING MEAT TREATMENT PROCESSING POULTRY OR FISH',
 'A23': 'FOODS OR FOODSTUFFS TREATMENT THEREOF, NOT COVERED BY OTHER CLASSES',
 'A24': "TOBACCO CIGARS CIGARETTES SIMULATED SMOKING DEVICES SMOKERS' REQUISITES",
 'A41': 'WEARING APPAREL',
 'A42': 'HEADWEAR',
 'A43': 'FOOTWEAR',
 'A44': 'HABERDASHERY JEWELLERY',
 'A45': 'HAND OR TRAVELLING ARTICLES',
 'A46': 'BRUSHWARE',
 'A47': 'FURNITURE DOMESTIC ARTICLES OR APPLIANCES COFFEE MILLS SPICE MILLS SUCTION CLEANERS IN GENERAL',
 'A61': 'MEDICAL OR VETERINARY SCIENCE HYGIENE',
 'A62': 'LIFE-SAVING FIRE-FIGHTING',
 'A63': 'SPORTS GAMES AMUSEMENTS',
 'A99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}

In [32]:
# Lookup table for section A: class code -> {"description": short title,
# "fields": list of related fields/disciplines}.
# NOTE(review): there is no entry for the section-level key "A", although
# ipc_to_field_C / _D / _E do carry "C" / "D" / "E" - confirm this is intended.
# NOTE(review): the name says "ipc", but the codes match the keys of the CPC
# scrape in all_results['A'] - presumably the class codes coincide; verify.
ipc_to_field_A={
    "A01": {
        "description": "Agriculture, Forestry, Animal Husbandry, Hunting, Trapping, Fishing",
        "fields": ["Environmental Science", "Biology", "Engineering"]
    },
    "A21": {
        "description": "Baking, Edible Doughs",
        "fields": ["Food Science", "Engineering", "Chemistry"]
    },
    "A22": {
        "description": "Butchering, Meat Treatment, Processing Poultry or Fish",
        "fields": ["Food Science", "Biology", "Engineering"]
    },
    "A23": {
        "description": "Foods or Foodstuffs, Treatment Not Covered by Other Classes",
        "fields": ["Food Science", "Chemistry", "Biology"]
    },
    "A24": {
        "description": "Tobacco, Cigars, Cigarettes, Simulated Smoking Devices",
        "fields": ["Medicine", "Public Health", "Engineering"]
    },
    "A41": {
        "description": "Wearing Apparel",
        "fields": ["Materials Science", "Engineering", "Art"]
    },
    "A42": {
        "description": "Headwear",
        "fields": ["Materials Science", "Engineering", "Art"]
    },
    "A43": {
        "description": "Footwear",
        "fields": ["Materials Science", "Engineering", "Ergonomics"]
    },
    "A44": {
        "description": "Haberdashery, Jewellery",
        "fields": ["Art", "Materials Science", "Design"]
    },
    "A45": {
        "description": "Hand or Travelling Articles",
        "fields": ["Engineering", "Design", "Materials Science"]
    },
    "A46": {
        "description": "Brushware",
        "fields": ["Engineering", "Materials Science"]
    },
    "A47": {
        "description": "Furniture, Domestic Articles, Coffee Mills, Suction Cleaners",
        "fields": ["Engineering", "Design", "Materials Science"]
    },
    "A61": {
        "description": "Medical or Veterinary Science, Hygiene",
        "fields": ["Medicine", "Biology", "Veterinary Medicine"]
    },
    "A62": {
        "description": "Life-Saving, Fire-Fighting",
        "fields": ["Engineering", "Medicine", "Safety Science"]
    },
    "A63": {
        "description": "Sports, Games, Amusements",
        "fields": ["Sports Science", "Psychology", "Engineering"]
    },
    "A99": {
        "description": "Subject Matter Not Otherwise Provided for in This Section",
        "fields": ["Engineering", "Medicine", "Sociology", "Design"]
    }
}


In [33]:
import pandas as pd

# One row per class code: the dictionary keys become the 'IPC' column,
# followed by the 'description' and 'fields' columns.
df_A = (
    pd.DataFrame.from_dict(ipc_to_field_A, orient='index')
    .rename_axis('IPC')
    .reset_index()
)



In [34]:
all_results['B']

{'B': 'PERFORMING OPERATIONS TRANSPORTING',
 'B01': 'PHYSICAL OR CHEMICAL PROCESSES OR APPARATUS IN GENERAL',
 'B02': 'CRUSHING, PULVERISING, OR DISINTEGRATING PREPARATORY TREATMENT OF GRAIN FOR MILLING',
 'B03': 'SEPARATION OF SOLID MATERIALS USING LIQUIDS OR USING PNEUMATIC TABLES OR JIGS MAGNETIC OR ELECTROSTATIC SEPARATION OF SOLID MATERIALS FROM SOLID MATERIALS OR FLUIDS SEPARATION BY HIGH-VOLTAGE ELECTRIC FIELDS',
 'B04': 'CENTRIFUGAL APPARATUS OR MACHINES FOR CARRYING-OUT PHYSICAL OR CHEMICAL PROCESSES',
 'B05': 'SPRAYING OR ATOMISING IN GENERAL APPLYING FLUENT MATERIALS TO SURFACES, IN GENERAL',
 'B06': 'GENERATING OR TRANSMITTING MECHANICAL VIBRATIONS IN GENERAL',
 'B07': 'SEPARATING SOLIDS FROM SOLIDS SORTING',
 'B08': 'CLEANING',
 'B09': 'DISPOSAL OF SOLID WASTE RECLAMATION OF CONTAMINATED SOIL',
 'B21': 'MECHANICAL METAL-WORKING WITHOUT ESSENTIALLY REMOVING MATERIAL PUNCHING METAL',
 'B22': 'CASTING POWDER METALLURGY',
 'B23': 'MACHINE TOOLS METAL-WORKING NOT OTHERWISE PROV

In [35]:
# Lookup table for section B: class code -> {"description": short title,
# "fields": list of related fields/disciplines}.
# NOTE(review): there is no entry for the section-level key "B", although
# ipc_to_field_C / _D / _E do carry "C" / "D" / "E" - confirm this is intended.
ipc_to_field_B = {
    "B01": {
        "description": "Physical or chemical processes or apparatus in general",
        "fields": ["Chemistry", "Engineering", "Materials Science"]
    },
    "B02": {
        "description": "Crushing, pulverising, or disintegrating; preparatory treatment of grain for milling",
        "fields": ["Engineering", "Agricultural Science", "Food Science"]
    },
    "B03": {
        "description": "Separation of solid materials using liquids or pneumatic tables/jigs, magnetic/electrostatic separation",
        "fields": ["Engineering", "Physics", "Materials Science"]
    },
    "B04": {
        "description": "Centrifugal apparatus or machines for carrying-out physical or chemical processes",
        "fields": ["Engineering", "Chemistry", "Physics"]
    },
    "B05": {
        "description": "Spraying or atomising in general; applying fluent materials to surfaces",
        "fields": ["Materials Science", "Engineering", "Chemistry"]
    },
    "B06": {
        "description": "Generating or transmitting mechanical vibrations in general",
        "fields": ["Engineering", "Physics"]
    },
    "B07": {
        "description": "Separating solids from solids; sorting",
        "fields": ["Engineering", "Computer Science", "Logistics"]
    },
    "B08": {
        "description": "Cleaning",
        "fields": ["Engineering", "Environmental Science", "Materials Science"]
    },
    "B09": {
        "description": "Disposal of solid waste; reclamation of contaminated soil",
        "fields": ["Environmental Science", "Engineering", "Geology"]
    },
    "B21": {
        "description": "Mechanical metal-working without essentially removing material; punching metal",
        "fields": ["Mechanical Engineering", "Materials Science"]
    },
    "B22": {
        "description": "Casting; powder metallurgy",
        "fields": ["Materials Science", "Engineering"]
    },
    "B23": {
        "description": "Machine tools; metal-working not otherwise provided for",
        "fields": ["Engineering", "Manufacturing"]
    },
    "B24": {
        "description": "Grinding; polishing",
        "fields": ["Engineering", "Materials Science"]
    },
    "B25": {
        "description": "Hand tools; portable power-driven tools; manipulators",
        "fields": ["Engineering", "Robotics"]
    },
    "B26": {
        "description": "Hand cutting tools; cutting; severing",
        "fields": ["Engineering", "Mechanical Engineering"]
    },
    "B27": {
        "description": "Working or preserving wood or similar material; nailing or stapling machines",
        "fields": ["Engineering", "Materials Science", "Forestry"]
    },
    "B28": {
        "description": "Working cement, clay, or stone",
        "fields": ["Civil Engineering", "Materials Science"]
    },
    "B29": {
        "description": "Working of plastics; working of substances in a plastic state",
        "fields": ["Materials Science", "Chemical Engineering"]
    },
    "B30": {
        "description": "Presses",
        "fields": ["Engineering", "Mechanical Engineering"]
    },
    "B31": {
        "description": "Making articles of paper/cardboard or material worked similarly",
        "fields": ["Engineering", "Materials Science"]
    },
    "B32": {
        "description": "Layered products",
        "fields": ["Materials Science", "Engineering"]
    },
    "B33": {
        "description": "Additive manufacturing technology",
        "fields": ["Computer Science", "Engineering", "Materials Science"]
    },
    "B41": {
        "description": "Printing, lining machines, typewriters, stamps",
        "fields": ["Engineering", "Computer Science"]
    },
    "B42": {
        "description": "Bookbinding, albums, files, special printed matter",
        "fields": ["Library Science", "Engineering"]
    },
    "B43": {
        "description": "Writing or drawing implements; bureau accessories",
        "fields": ["Art", "Engineering", "Design"]
    },
    "B44": {
        "description": "Decorative arts",
        "fields": ["Art", "Design"]
    },
    "B60": {
        "description": "Vehicles in general",
        "fields": ["Engineering", "Transportation", "Design"]
    },
    "B61": {
        "description": "Railways",
        "fields": ["Engineering", "Transportation"]
    },
    "B62": {
        "description": "Land vehicles for travelling otherwise than on rails",
        "fields": ["Automotive Engineering", "Mechanical Engineering"]
    },
    "B63": {
        "description": "Ships or other waterborne vessels; related equipment",
        "fields": ["Marine Engineering", "Mechanical Engineering"]
    },
    "B64": {
        "description": "Aircraft; aviation; cosmonautics",
        "fields": ["Aerospace Engineering", "Physics"]
    },
    "B65": {
        "description": "Conveying, packing, storing; handling thin/filamentary material",
        "fields": ["Logistics", "Engineering", "Materials Science"]
    },
    "B66": {
        "description": "Hoisting, lifting, hauling",
        "fields": ["Mechanical Engineering", "Robotics", "Logistics"]
    },
    "B67": {
        "description": "Opening, closing or cleaning bottles/jars; liquid handling",
        "fields": ["Engineering", "Chemical Engineering"]
    },
    "B68": {
        "description": "Saddlery; upholstery",
        "fields": ["Design", "Materials Science", "Engineering"]
    },
    "B81": {
        "description": "Microstructural technology",
        "fields": ["Nanotechnology", "Engineering", "Physics"]
    },
    "B82": {
        "description": "Nanotechnology",
        "fields": ["Nanotechnology", "Materials Science", "Engineering"]
    },
    "B99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Engineering", "Multidisciplinary Science"]
    }
}


In [36]:
import pandas as pd

# One row per class code: the dictionary keys become the 'IPC' column,
# followed by the 'description' and 'fields' columns.
df_B = (
    pd.DataFrame.from_dict(ipc_to_field_B, orient='index')
    .rename_axis('IPC')
    .reset_index()
)


In [37]:
all_results['C']

{'C': 'CHEMISTRY METALLURGY',
 'C01': 'INORGANIC CHEMISTRY',
 'C02': 'TREATMENT OF WATER, WASTE WATER, SEWAGE, OR SLUDGE',
 'C03': 'GLASS MINERAL OR SLAG WOOL',
 'C04': 'CEMENTS CONCRETE ARTIFICIAL STONE CERAMICS REFRACTORIES',
 'C05': 'FERTILISERS MANUFACTURE THEREOF',
 'C06': 'EXPLOSIVES MATCHES',
 'C07': 'ORGANIC CHEMISTRY',
 'C08': 'ORGANIC MACROMOLECULAR COMPOUNDS THEIR PREPARATION OR CHEMICAL WORKING-UP COMPOSITIONS BASED THEREON',
 'C09': 'DYES PAINTS POLISHES NATURAL RESINS ADHESIVES COMPOSITIONS NOT OTHERWISE PROVIDED FOR APPLICATIONS OF MATERIALS NOT OTHERWISE PROVIDED FOR',
 'C10': 'PETROLEUM, GAS OR COKE INDUSTRIES TECHNICAL GASES CONTAINING CARBON MONOXIDE FUELS LUBRICANTS PEAT',
 'C11': 'ANIMAL OR VEGETABLE OILS, FATS, FATTY SUBSTANCES OR WAXES FATTY ACIDS THEREFROM DETERGENTS CANDLES',
 'C12': 'BIOCHEMISTRY BEER SPIRITS WINE VINEGAR MICROBIOLOGY ENZYMOLOGY MUTATION OR GENETIC ENGINEERING',
 'C13': 'SUGAR INDUSTRY',
 'C14': 'SKINS HIDES PELTS LEATHER',
 'C21': 'METALLURGY

In [38]:
# Lookup table for section C: code -> {"description": short title,
# "fields": list of related fields/disciplines}. Includes an entry for the
# section-level key "C" as well as the individual class codes.
ipc_to_field_C = {
    "C": {
        "description": "Chemistry, Metallurgy",
        "fields": ["Chemistry", "Materials Science"]
    },
    "C01": {
        "description": "Inorganic Chemistry",
        "fields": ["Chemistry"]
    },
    "C02": {
        "description": "Treatment of water, waste water, sewage, or sludge",
        "fields": ["Environmental Science", "Chemical Engineering"]
    },
    "C03": {
        "description": "Glass, mineral or slag wool",
        "fields": ["Materials Science", "Engineering"]
    },
    "C04": {
        "description": "Cements, concrete, artificial stone, ceramics, refractories",
        "fields": ["Civil Engineering", "Materials Science"]
    },
    "C05": {
        "description": "Fertilisers manufacture thereof",
        "fields": ["Agricultural Science", "Chemical Engineering"]
    },
    "C06": {
        "description": "Explosives, matches",
        "fields": ["Chemical Engineering", "Materials Science", "Physics"]
    },
    "C07": {
        "description": "Organic Chemistry",
        "fields": ["Chemistry"]
    },
    "C08": {
        "description": "Organic macromolecular compounds, their preparation or chemical working-up, compositions based thereon",
        "fields": ["Polymer Science", "Chemistry"]
    },
    "C09": {
        "description": "Dyes, paints, polishes, natural resins, adhesives; applications of materials not otherwise provided for",
        "fields": ["Chemistry", "Materials Science", "Engineering"]
    },
    "C10": {
        "description": "Petroleum, gas or coke industries; technical gases containing carbon monoxide; fuels, lubricants, peat",
        "fields": ["Chemical Engineering", "Energy", "Geology"]
    },
    "C11": {
        "description": "Animal or vegetable oils, fats, fatty substances or waxes; fatty acids therefrom; detergents, candles",
        "fields": ["Biochemistry", "Chemical Engineering", "Food Science"]
    },
    "C12": {
        "description": "Biochemistry; beer, spirits, wine, vinegar; microbiology; enzymology; mutation or genetic engineering",
        "fields": ["Biochemistry", "Biology", "Chemical Engineering"]
    },
    "C13": {
        "description": "Sugar industry",
        "fields": ["Food Science", "Chemical Engineering"]
    },
    "C14": {
        "description": "Skins, hides, pelts, leather",
        "fields": ["Materials Science", "Biology", "Engineering"]
    },
    "C21": {
        "description": "Metallurgy of iron",
        "fields": ["Metallurgy", "Materials Science"]
    },
    "C22": {
        "description": "Metallurgy ferrous or non-ferrous alloys; treatment of alloys or non-ferrous metals",
        "fields": ["Metallurgy", "Materials Science", "Engineering"]
    },
    "C23": {
        "description": "Coating metallic material; coating material with metallic material; chemical surface treatment; diffusion treatment; vacuum evaporation, sputtering, ion implantation or chemical vapour deposition; inhibiting corrosion or incrustation in general",
        "fields": ["Surface Engineering", "Materials Science", "Chemical Engineering"]
    },
    "C25": {
        "description": "Electrolytic or electrophoretic processes; apparatus therefor",
        "fields": ["Electrochemistry", "Chemical Engineering"]
    },
    "C30": {
        "description": "Crystal growth",
        "fields": ["Crystallography", "Materials Science", "Physics"]
    },
    "C40": {
        "description": "Combinatorial technology",
        "fields": ["Chemistry", "Materials Science", "Computer Science"]
    },
    "C99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Chemistry", "Materials Science"]
    }
}


In [39]:
import pandas as pd

# One row per code: the dictionary keys become the 'IPC' column,
# followed by the 'description' and 'fields' columns.
df_C = (
    pd.DataFrame.from_dict(ipc_to_field_C, orient='index')
    .rename_axis('IPC')
    .reset_index()
)
# Preview the first rows.
df_C.head()


Unnamed: 0,IPC,description,fields
0,C,"Chemistry, Metallurgy","[Chemistry, Materials Science]"
1,C01,Inorganic Chemistry,[Chemistry]
2,C02,"Treatment of water, waste water, sewage, or sl...","[Environmental Science, Chemical Engineering]"
3,C03,"Glass, mineral or slag wool","[Materials Science, Engineering]"
4,C04,"Cements, concrete, artificial stone, ceramics,...","[Civil Engineering, Materials Science]"


In [40]:
all_results['D']

{'D': 'TEXTILES PAPER',
 'D01': 'NATURAL OR MAN-MADE THREADS OR FIBRES SPINNING',
 'D02': 'YARNS MECHANICAL FINISHING OF YARNS OR ROPES WARPING OR BEAMING',
 'D03': 'WEAVING',
 'D04': 'BRAIDING LACE-MAKING KNITTING TRIMMINGS NON-WOVEN FABRICS',
 'D05': 'SEWING EMBROIDERING TUFTING',
 'D06': 'TREATMENT OF TEXTILES OR THE LIKE LAUNDERING FLEXIBLE MATERIALS NOT OTHERWISE PROVIDED FOR',
 'D07': 'ROPES CABLES OTHER THAN ELECTRIC',
 'D10': 'INDEXING SCHEME ASSOCIATED WITH SUBLASSES OF SECTIOND, RELATING TO TEXTILES',
 'D21': 'PAPER-MAKING PRODUCTION OF CELLULOSE',
 'D99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}

In [41]:
# Lookup table for section D: code -> {"description": short title,
# "fields": list of related fields/disciplines}. Includes an entry for the
# section-level key "D" as well as the individual class codes.
ipc_to_field_D = {
    "D": {
        "description": "Textiles, Paper",
        "fields": ["Materials Science", "Engineering"]
    },
    "D01": {
        "description": "Natural or man-made threads or fibres; spinning",
        "fields": ["Materials Science", "Textile Engineering"]
    },
    "D02": {
        "description": "Yarns; mechanical finishing of yarns or ropes; warping or beaming",
        "fields": ["Materials Science", "Textile Engineering"]
    },
    "D03": {
        "description": "Weaving",
        "fields": ["Textile Engineering", "Materials Science"]
    },
    "D04": {
        "description": "Braiding, lace-making, knitting, trimmings, non-woven fabrics",
        "fields": ["Materials Science", "Textile Engineering"]
    },
    "D05": {
        "description": "Sewing, embroidering, tufting",
        "fields": ["Textile Engineering", "Design"]
    },
    "D06": {
        "description": "Treatment of textiles or the like; laundering; flexible materials not otherwise provided for",
        "fields": ["Materials Science", "Chemical Engineering"]
    },
    "D07": {
        "description": "Ropes or cables other than electric",
        "fields": ["Mechanical Engineering", "Materials Science"]
    },
    "D10": {
        "description": "Indexing scheme associated with subclasses of section D, relating to textiles",
        "fields": ["Materials Science", "Information Systems"]
    },
    "D21": {
        "description": "Paper-making; production of cellulose",
        "fields": ["Chemical Engineering", "Materials Science"]
    },
    "D99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Materials Science"]
    }
}


In [42]:
import pandas as pd

# One row per code: the dictionary keys become the 'IPC' column,
# followed by the 'description' and 'fields' columns.
df_D = (
    pd.DataFrame.from_dict(ipc_to_field_D, orient='index')
    .rename_axis('IPC')
    .reset_index()
)



In [43]:
all_results['E']

{'E': 'FIXED CONSTRUCTIONS',
 'E01': 'CONSTRUCTION OF ROADS, RAILWAYS, OR BRIDGES',
 'E02': 'HYDRAULIC ENGINEERING FOUNDATIONS SOIL SHIFTING',
 'E03': 'WATER SUPPLY SEWERAGE',
 'E04': 'BUILDING',
 'E05': 'LOCKS KEYS WINDOW OR DOOR FITTINGS SAFES',
 'E06': 'DOORS, WINDOWS, SHUTTERS, OR ROLLER BLINDS IN GENERAL LADDERS',
 'E21': 'EARTH OR ROCK DRILLING MINING',
 'E99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}

In [44]:
# Lookup table for section E: code -> {"description": short title,
# "fields": list of related fields/disciplines}. Includes an entry for the
# section-level key "E" as well as the individual class codes.
ipc_to_field_E = {
    "E": {
        "description": "Fixed Constructions",
        "fields": ["Civil Engineering", "Architecture"]
    },
    "E01": {
        "description": "Construction of roads, railways, or bridges",
        "fields": ["Civil Engineering", "Transportation Engineering"]
    },
    "E02": {
        "description": "Hydraulic engineering; foundations; soil shifting",
        "fields": ["Civil Engineering", "Geotechnical Engineering"]
    },
    "E03": {
        "description": "Water supply; sewerage",
        "fields": ["Civil Engineering", "Environmental Science", "Urban Planning"]
    },
    "E04": {
        "description": "Building",
        "fields": ["Architecture", "Civil Engineering"]
    },
    "E05": {
        "description": "Locks, keys, window or door fittings, safes",
        "fields": ["Mechanical Engineering", "Industrial Design", "Architecture"]
    },
    "E06": {
        "description": "Doors, windows, shutters, or roller blinds in general; ladders",
        "fields": ["Architecture", "Industrial Design"]
    },
    "E21": {
        "description": "Earth or rock drilling; mining",
        "fields": ["Mining Engineering", "Geotechnical Engineering"]
    },
    "E99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Civil Engineering"]
    }
}


In [45]:
import pandas as pd

# Tabulate the section-E lookup: one row per IPC code, with the code moved out
# of the index into an explicit 'IPC' column next to 'description' and 'fields'.
df_E = pd.DataFrame.from_dict(ipc_to_field_E, orient='index')
df_E = df_E.reset_index().rename(columns={'index': 'IPC'})



In [46]:
all_results['F']

{'F': 'MECHANICAL ENGINEERING LIGHTING HEATING WEAPONS BLASTING',
 'F01': 'MACHINES OR ENGINES IN GENERAL ENGINE PLANTS IN GENERAL STEAM ENGINES',
 'F02': 'COMBUSTION ENGINES HOT-GAS OR COMBUSTION-PRODUCT ENGINE PLANTS',
 'F03': 'MACHINES OR ENGINES FOR LIQUIDS WIND, SPRING, OR WEIGHT MOTORS PRODUCING MECHANICAL POWER OR A REACTIVE PROPULSIVE THRUST, NOT OTHERWISE PROVIDED FOR',
 'F04': 'POSITIVE - DISPLACEMENT MACHINES FOR LIQUIDS PUMPS FOR LIQUIDS OR ELASTIC FLUIDS',
 'F05': 'INDEXING SCHEMES RELATING TO ENGINES OR PUMPS IN VARIOUS SUBCLASSES OF CLASSESF01-F04',
 'F15': 'FLUID-PRESSURE ACTUATORS HYDRAULICS OR PNEUMATICS IN GENERAL',
 'F16': 'ENGINEERING ELEMENTS AND UNITS GENERAL MEASURES FOR PRODUCING AND MAINTAINING EFFECTIVE FUNCTIONING OF MACHINES OR INSTALLATIONS THERMAL INSULATION IN GENERAL',
 'F17': 'STORING OR DISTRIBUTING GASES OR LIQUIDS',
 'F21': 'LIGHTING',
 'F22': 'STEAM GENERATION',
 'F23': 'COMBUSTION APPARATUS COMBUSTION PROCESSES',
 'F24': 'HEATING RANGES VENTILATIN

In [47]:
# Literal lookup for IPC section F. Each key is a section/class code ("F",
# "F01", ...) and each value is a dict holding a readable "description" and the
# list of "fields" of study associated with that code. Long descriptions are
# split across adjacent string literals, which Python joins into one string.
ipc_to_field_F = {
    "F": {
        "description": "Mechanical Engineering, Lighting, Heating, Weapons, Blasting",
        "fields": ["Mechanical Engineering", "Applied Physics"]
    },
    "F01": {
        "description": "Machines or engines in general; engine plants in general; steam engines",
        "fields": ["Mechanical Engineering", "Energy"]
    },
    "F02": {
        "description": "Combustion engines (hot-gas or combustion-product engine plants)",
        "fields": ["Mechanical Engineering", "Chemical Engineering", "Energy"]
    },
    "F03": {
        "description": ("Machines or engines for liquids; wind, spring, or weight motors producing "
                        "mechanical power or a reactive propulsive thrust, not otherwise provided for"),
        "fields": ["Mechanical Engineering", "Energy"]
    },
    "F04": {
        "description": "Positive-displacement machines for liquids; pumps for liquids or elastic fluids",
        "fields": ["Mechanical Engineering", "Fluid Mechanics"]
    },
    "F05": {
        "description": ("Indexing schemes relating to engines or pumps in various subclasses of F01-F04"),
        "fields": ["Mechanical Engineering", "Control Systems"]
    },
    "F15": {
        "description": "Fluid-pressure actuators; hydraulics or pneumatics in general",
        "fields": ["Mechanical Engineering", "Fluid Mechanics"]
    },
    "F16": {
        "description": ("Engineering elements and units; general measures for producing and maintaining "
                        "effective functioning of machines or installations; thermal insulation in general"),
        "fields": ["Mechanical Engineering", "Materials Science", "Thermal Engineering"]
    },
    "F17": {
        "description": "Storing or distributing gases or liquids",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F21": {
        "description": "Lighting",
        "fields": ["Electrical Engineering", "Optics"]
    },
    "F22": {
        "description": "Steam generation",
        "fields": ["Mechanical Engineering", "Energy"]
    },
    "F23": {
        "description": "Combustion apparatus; combustion processes",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F24": {
        "description": "Heating ranges; ventilating",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F25": {
        "description": ("Refrigeration or cooling; combined heating and refrigeration systems; heat pump systems; "
                        "manufacture or storage of ice; liquefaction; solidification of gases"),
        "fields": ["Mechanical Engineering", "Chemical Engineering", "Thermodynamics"]
    },
    "F26": {
        "description": "Drying",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F27": {
        "description": "Furnaces; kilns; ovens; retorts",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F28": {
        "description": "Heat exchange in general",
        "fields": ["Mechanical Engineering", "Chemical Engineering"]
    },
    "F41": {
        "description": "Weapons",
        "fields": ["Mechanical Engineering", "Defense Technology"]
    },
    "F42": {
        "description": "Ammunition; blasting",
        "fields": ["Mechanical Engineering", "Defense Technology"]
    },
    "F99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Mechanical Engineering"]
    }
}


In [48]:
import pandas as pd

# Tabulate the section-F lookup: one row per IPC code, with the code moved out
# of the index into an explicit 'IPC' column next to 'description' and 'fields'.
df_F = pd.DataFrame.from_dict(ipc_to_field_F, orient='index')
df_F = df_F.reset_index().rename(columns={'index': 'IPC'})



In [49]:
all_results['G']

{'G': 'PHYSICS',
 'G01': 'MEASURING TESTING',
 'G02': 'OPTICS',
 'G03': 'PHOTOGRAPHY CINEMATOGRAPHY ANALOGOUS TECHNIQUES USING WAVES OTHER THAN OPTICAL WAVES ELECTROGRAPHY HOLOGRAPHY',
 'G04': 'HOROLOGY',
 'G05': 'CONTROLLING REGULATING',
 'G06': 'COMPUTING CALCULATING OR COUNTING',
 'G07': 'CHECKING-DEVICES',
 'G08': 'SIGNALLING',
 'G09': 'EDUCATION CRYPTOGRAPHY DISPLAY ADVERTISING SEALS',
 'G10': 'MUSICAL INSTRUMENTS ACOUSTICS',
 'G11': 'INFORMATION STORAGE',
 'G12': 'INSTRUMENT DETAILS',
 'G16': 'INFORMATION AND COMMUNICATION TECHNOLOGY [ICT] SPECIALLY ADAPTED FOR SPECIFIC APPLICATION FIELDS',
 'G21': 'NUCLEAR PHYSICS NUCLEAR ENGINEERING',
 'G99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}

In [50]:
# Literal lookup for IPC section G. Each key is a section/class code ("G",
# "G01", ...) and each value is a dict holding a readable "description" and the
# list of "fields" of study associated with that code.
ipc_to_field_G = {
    "G": {
        "description": "Physics",
        "fields": ["Physics"]
    },
    "G01": {
        "description": "Measuring; Testing",
        "fields": ["Physics", "Engineering"]
    },
    "G02": {
        "description": "Optics",
        "fields": ["Physics", "Optics", "Engineering"]
    },
    "G03": {
        "description": ("Photography, cinematography, analogous techniques using waves other than optical "
                        "waves; electrography; holography"),
        "fields": ["Optics", "Electrical Engineering", "Imaging Science"]
    },
    "G04": {
        "description": "Horology",
        "fields": ["Engineering", "Applied Physics"]
    },
    "G05": {
        "description": "Controlling; Regulating",
        "fields": ["Control Systems", "Engineering"]
    },
    "G06": {
        "description": "Computing, Calculating or Counting",
        "fields": ["Computer Science", "Mathematics"]
    },
    "G07": {
        "description": "Checking-devices",
        "fields": ["Quality Control", "Engineering"]
    },
    "G08": {
        "description": "Signalling",
        "fields": ["Electrical Engineering", "Communications"]
    },
    "G09": {
        "description": "Education, Cryptography, Display, Advertising, Seals",
        "fields": ["Computer Science", "Communications", "Design"]
    },
    "G10": {
        "description": "Musical instruments; Acoustics",
        "fields": ["Acoustics", "Physics", "Engineering"]
    },
    "G11": {
        "description": "Information storage",
        "fields": ["Computer Science", "Information Technology"]
    },
    "G12": {
        "description": "Instrument details",
        "fields": ["Engineering", "Instrumentation"]
    },
    "G16": {
        "description": ("Information and Communication Technology [ICT] specially adapted for "
                        "specific application fields"),
        "fields": ["Computer Science", "Information Technology", "Engineering"]
    },
    "G21": {
        "description": "Nuclear Physics; Nuclear Engineering",
        "fields": ["Nuclear Engineering", "Physics"]
    },
    "G99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Physics"]
    }
}


In [51]:
import pandas as pd

# Tabulate the section-G lookup: one row per IPC code, with the code moved out
# of the index into an explicit 'IPC' column next to 'description' and 'fields'.
df_G = pd.DataFrame.from_dict(ipc_to_field_G, orient='index')
df_G = df_G.reset_index().rename(columns={'index': 'IPC'})



In [52]:
all_results['H']

{'H': 'ELECTRICITY',
 'H01': 'ELECTRIC ELEMENTS',
 'H02': 'GENERATION CONVERSION OR DISTRIBUTION OF ELECTRIC POWER',
 'H03': 'ELECTRONIC CIRCUITRY',
 'H04': 'ELECTRIC COMMUNICATION TECHNIQUE',
 'H05': 'ELECTRIC TECHNIQUES NOT OTHERWISE PROVIDED FOR',
 'H10': 'SEMICONDUCTOR DEVICES ELECTRIC SOLID-STATE DEVICES NOT OTHERWISE PROVIDED FOR',
 'H99': 'SUBJECT MATTER NOT OTHERWISE PROVIDED FOR IN THIS SECTION'}

In [53]:
# Literal lookup for IPC section H. Each key is a section/class code ("H",
# "H01", ...) and each value is a dict holding a readable "description" and the
# list of "fields" of study associated with that code.
ipc_to_field_H = {
    "H": {
        "description": "Electricity",
        "fields": ["Electrical Engineering"]
    },
    "H01": {
        "description": "Electric elements",
        "fields": ["Electrical Engineering", "Materials Science"]
    },
    "H02": {
        "description": "Generation, conversion or distribution of electric power",
        "fields": ["Electrical Engineering", "Energy Engineering"]
    },
    "H03": {
        "description": "Electronic circuitry",
        "fields": ["Electrical Engineering", "Computer Engineering"]
    },
    "H04": {
        "description": "Electric communication technique",
        "fields": ["Electrical Engineering", "Communications"]
    },
    "H05": {
        "description": "Electric techniques not otherwise provided for",
        "fields": ["Electrical Engineering", "Multidisciplinary Science"]
    },
    "H10": {
        "description": "Semiconductor devices; electric solid-state devices not otherwise provided for",
        "fields": ["Electrical Engineering", "Electronics", "Materials Science"]
    },
    "H99": {
        "description": "Subject matter not otherwise provided for in this section",
        "fields": ["Multidisciplinary Science", "Electrical Engineering"]
    }
}


In [54]:
import pandas as pd

# Tabulate the section-H lookup: one row per IPC code, with the code moved out
# of the index into an explicit 'IPC' column next to 'description' and 'fields'.
df_H = pd.DataFrame.from_dict(ipc_to_field_H, orient='index')
df_H = df_H.reset_index().rename(columns={'index': 'IPC'})



In [55]:
all_results['Y']

{'Y': 'GENERAL TAGGING OF NEW TECHNOLOGICAL DEVELOPMENTS GENERAL TAGGING OF CROSS-SECTIONAL TECHNOLOGIES SPANNING OVER SEVERAL SECTIONS OF THE IPC TECHNICAL SUBJECTS COVERED BY FORMER USPC CROSS-REFERENCE ART COLLECTIONS [XRACs] AND DIGESTS',
 'Y02': 'TECHNOLOGIES OR APPLICATIONS FOR MITIGATION OR ADAPTATION AGAINST CLIMATE CHANGE',
 'Y04': 'INFORMATION OR COMMUNICATION TECHNOLOGIES HAVING AN IMPACT ON OTHER TECHNOLOGY AREAS',
 'Y10': 'TECHNICAL SUBJECTS COVERED BY FORMER USPC'}

In [56]:
# Literal lookup for section Y. Each key is a section/class code ("Y", "Y02",
# ...) and each value is a dict holding a readable "description" and the list
# of "fields" of study associated with that code. The "Y" description is a
# shortened form of the much longer scraped title.
ipc_to_field_Y = {
    "Y": {
        "description": "General tagging of new technological developments, cross-sectional technologies, and former USPC XRACs/digests",
        "fields": ["Multidisciplinary Science", "Engineering"]
    },
    "Y02": {
        "description": "Technologies or applications for mitigation or adaptation against climate change",
        "fields": ["Environmental Science", "Sustainability", "Engineering"]
    },
    "Y04": {
        "description": "Information or communication technologies having an impact on other technology areas",
        "fields": ["Computer Science", "Information Systems", "Engineering"]
    },
    "Y10": {
        "description": "Technical subjects covered by former USPC",
        "fields": ["Engineering", "Science Policy", "Technology Management"]
    }
}


In [57]:
import pandas as pd

# Tabulate the section-Y lookup: one row per code, with the code moved out of
# the index into an explicit 'IPC' column next to 'description' and 'fields'.
df_Y = pd.DataFrame.from_dict(ipc_to_field_Y, orient='index')
df_Y = df_Y.reset_index().rename(columns={'index': 'IPC'})



In [58]:
import pandas as pd

# Per-section tables, stacked in section order (A through H, then Y).
dfs = [
    df_A, df_B, df_C, df_D, df_E,
    df_F, df_G, df_H, df_Y,
]

# ignore_index=True discards each table's own row labels and numbers the
# combined rows consecutively from 0.
final_df = pd.concat(objs=dfs, ignore_index=True)



In [59]:
final_df.head()

Unnamed: 0,IPC,description,fields
0,A01,"Agriculture, Forestry, Animal Husbandry, Hunti...","[Environmental Science, Biology, Engineering]"
1,A21,"Baking, Edible Doughs","[Food Science, Engineering, Chemistry]"
2,A22,"Butchering, Meat Treatment, Processing Poultry...","[Food Science, Biology, Engineering]"
3,A23,"Foods or Foodstuffs, Treatment Not Covered by ...","[Food Science, Chemistry, Biology]"
4,A24,"Tobacco, Cigars, Cigarettes, Simulated Smoking...","[Medicine, Public Health, Engineering]"


In [60]:
# DataFrame.size is the total number of cells (rows x columns), not the number
# of rows; use len(final_df) or final_df.shape[0] for the row count.
final_df.size

432

In [78]:
# Persist the merged IPC -> field-of-study table.
# NOTE(review): with pandas' default index=True the row index is written too,
# as an unnamed first column; pass index=False if that column is unwanted.
# The list-valued 'fields' cells are written as their string representation.
final_df.to_csv('IPC_to_fieldOfStudy.csv')

In [62]:
from sentence_transformers import SentenceTransformer

# Load a lightweight model (good for small data)
model = SentenceTransformer("all-MiniLM-L6-v2")  # 384-dimensional embeddings

descriptions = final_df['description']
# Generate embeddings. encode() is documented for a string or a list of
# strings, so hand it a plain list: a pandas Series only works as long as its
# index labels happen to be 0..n-1, because items are looked up by position.
embeddings = model.encode(descriptions.tolist())
print(embeddings.shape)  # (n_samples, 384)

(144, 384)


In [72]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd

# Binarize labels: one 0/1 indicator column per distinct field of study.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(final_df["fields"])

# Identify and remove rare labels (those attached to at most one row).
# The labelled indicator frame is built once and reused for both steps.
label_frame = pd.DataFrame(y, columns=mlb.classes_)
label_counts = label_frame.sum(axis=0)
valid_labels = label_counts[label_counts > 1].index
y_filtered = label_frame[valid_labels].values
# Keep classes_ a NumPy array, as the binarizer itself stores it.
# NOTE(review): inverse_transform() calls ndarray methods on classes_, so a
# plain Python list here would break it -- confirm against the installed
# scikit-learn version.
mlb.classes_ = valid_labels.to_numpy()

# Split data (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, y_filtered, test_size=0.2, random_state=42
)

print("Training shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

Training shape: (115, 384) (115, 37)
Test shape: (29, 384) (29, 37)


In [73]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base learner: a 100-tree random forest with balanced class weights
# (handles imbalanced labels) and a fixed seed.
_balanced_forest = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    random_state=42,
)

# The multi-output wrapper fits one copy of the base learner per label column.
clf = MultiOutputClassifier(_balanced_forest)

# Train the model
clf.fit(X_train, y_train)

In [74]:
from sklearn.metrics import classification_report

# Predict the label indicators for the held-out rows.
y_pred = clf.predict(X_test)

# Per-label precision/recall/F1. target_names uses the filtered label names;
# zero_division=0 reports 0 for labels with no predicted samples.
_report_text = classification_report(
    y_test,
    y_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print(_report_text)

                           precision    recall  f1-score   support

     Agricultural Science       0.00      0.00      0.00         0
          Applied Physics       0.00      0.00      0.00         0
             Architecture       0.00      0.00      0.00         0
                      Art       0.00      0.00      0.00         0
             Biochemistry       0.00      0.00      0.00         2
                  Biology       0.00      0.00      0.00         2
     Chemical Engineering       0.00      0.00      0.00         6
                Chemistry       0.00      0.00      0.00         2
        Civil Engineering       0.00      0.00      0.00         0
           Communications       0.00      0.00      0.00         1
         Computer Science       0.00      0.00      0.00         1
          Control Systems       0.00      0.00      0.00         0
       Defense Technology       0.00      0.00      0.00         1
                   Design       0.00      0.00      0.00     

In [80]:
import pandas as pd
import ast

# Sample DataFrames: a small IPC-class table and three patents whose 'IPC'
# column holds a stringified list of full IPC codes.
classification_df = pd.DataFrame(
    {
        'IPC': ['A01', 'A21', 'A22', 'A23', 'A24', 'B60'],
        'description': [
            'Agriculture, Forestry...',
            'Baking, Edible Doughs...',
            'Butchering, Meat Treatment...',
            'Foods or Foodstuffs...',
            'Tobacco Products...',
            'Arrangements or mounting of propulsion units...',
        ],
        'fields': [
            ['Environmental Science', 'Biology', 'Engineering'],
            ['Food Science', 'Engineering', 'Chemistry'],
            ['Food Science', 'Biology', 'Engineering'],
            ['Food Science', 'Chemistry', 'Biology'],
            ['Medicine', 'Public Health', 'Engineering'],
            ['Transportation Engineering', 'Electrical Systems'],
        ],
    }
)

patents_df = pd.DataFrame(
    {
        'PatentID': [1, 2, 3],
        'IPC': [
            "['B60L3/00', 'B60L53/16', 'B60L53/51']",
            "['A21B3/00', 'A21D13/00']",
            "['A23L33/00', 'A23G9/00']",
        ],
    }
)

# IPC prefix -> fields lookup, pairing each class code with its field list.
ipc_mapping = dict(zip(classification_df['IPC'], classification_df['fields']))

def get_patent_fields(ipc_str):
    """Return the fields of study for a patent, based on its first IPC code.

    The first three characters of the first IPC code (e.g. ``'B60'`` from
    ``'B60L3/00'``) are looked up in the module-level ``ipc_mapping``.

    Args:
        ipc_str: Either the string representation of a list of IPC codes
            (e.g. ``"['B60L3/00', 'B60L53/16']"``) or an already-parsed
            list/tuple of codes.

    Returns:
        list: A fresh list of field names. ``[]`` when there are no codes,
        ``['General Technology']`` when the class prefix is not in
        ``ipc_mapping``, and ``['Unclassified']`` when the input cannot be
        interpreted as a sequence of code strings.
    """
    if isinstance(ipc_str, (list, tuple)):
        # Already parsed (e.g. the column was never stringified).
        ipc_list = ipc_str
    else:
        try:
            ipc_list = ast.literal_eval(ipc_str)
        except (SyntaxError, ValueError, TypeError):
            return ['Unclassified']

    # A lone quoted code is one code, not a sequence of characters.
    if isinstance(ipc_list, str):
        ipc_list = [ipc_list]
    if not ipc_list:
        return []

    try:
        first_ipc = ipc_list[0].strip()[:3]  # class prefix of the first code
    except (AttributeError, TypeError, KeyError, IndexError):
        # Parsed value is not a sequence of strings (number, dict, set, ...).
        return ['Unclassified']

    # Copy so callers cannot mutate the list stored inside ipc_mapping.
    return list(ipc_mapping.get(first_ipc, ['General Technology']))

# Derive the fields of study for every patent from its 'IPC' cell and store
# them in a new 'fields' column.
_patent_fields = patents_df['IPC'].apply(get_patent_fields)
patents_df['fields'] = _patent_fields

# Show results
patents_df.loc[:, ['PatentID', 'IPC', 'fields']]

Unnamed: 0,PatentID,IPC,fields
0,1,"['B60L3/00', 'B60L53/16', 'B60L53/51']","[Transportation Engineering, Electrical Systems]"
1,2,"['A21B3/00', 'A21D13/00']","[Food Science, Engineering, Chemistry]"
2,3,"['A23L33/00', 'A23G9/00']","[Food Science, Chemistry, Biology]"


originality rate 

In [81]:
import os
import time
import requests
from dotenv import load_dotenv

# Global token cache (filled in by get_access_token)
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable, which would surface as an
# opaque AttributeError on .strip(); default to "" and fail with a clear
# message instead.
CONSUMER_KEY = os.getenv("CONSUMER_KEY", "").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET", "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in a .env file"
    )

def get_access_token() -> str:
    """Return a cached OAuth access token, requesting a new one when needed.

    The token and its expiry timestamp are cached in the module-level
    ``TOKEN`` and ``TOKEN_EXPIRY``. When no unexpired token is cached, the
    client credentials are posted to ``TOKEN_URL`` (15 s timeout).

    Returns:
        str: The access token.

    Raises:
        requests.exceptions.HTTPError: If the token endpoint answers with an
            error status.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]
    # Honour the lifetime reported by the server instead of assuming one, and
    # refresh a minute early so a cached token is never used after expiry.
    # NOTE(review): OPS is believed to issue ~20-minute tokens and to send
    # "expires_in" as a string of seconds -- confirm against the OPS docs.
    try:
        lifetime = float(payload["expires_in"]) - 60
    except (KeyError, TypeError, ValueError):
        lifetime = 3500  # previous fixed assumption, kept as the fallback
    TOKEN_EXPIRY = time.time() + max(lifetime, 0)
    return TOKEN

def get_patent_biblio(publication_number: str = "CN112508743A") -> str:
    """Fetch bibliographic data for a publication from the EPO OPS API.

    Args:
        publication_number: Publication number in docdb format. Defaults to
            "CN112508743A", the document this function was originally
            hard-wired to, so existing no-argument calls behave as before.

    Returns:
        str: The XML response body.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    token = get_access_token()
    url = f"{BASE_URL}/published-data/publication/docdb/{publication_number}/biblio"
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/xml"
    }
    response = requests.get(url, headers=headers, timeout=15)
    response.raise_for_status()  # Raise HTTPError for bad responses
    return response.text

if __name__ == "__main__":
    # Fetch and dump the raw biblio XML; xml_data stays at module level so it
    # can be reused after this block. HTTP failures are reported separately
    # from any other error.
    try:
        xml_data = get_patent_biblio()
        print(xml_data)
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP Error: {http_err}")
    except Exception as err:
        print(f"An error occurred: {err}")

<?xml version="1.0" encoding="UTF-8"?><?xml-stylesheet type="text/xsl" href="../../../../../style/exchange.xsl"?>
<ops:world-patent-data xmlns="http://www.epo.org/exchange" xmlns:ops="http://ops.epo.org" xmlns:xlink="http://www.w3.org/1999/xlink">
    <exchange-documents>
        <exchange-document system="ops.epo.org" family-id="74922494" country="CN" doc-number="112508743" kind="A">
            <bibliographic-data>
                <publication-reference>
                    <document-id document-id-type="docdb">
                        <country>CN</country>
                        <doc-number>112508743</doc-number>
                        <kind>A</kind>
                        <date>20210316</date>
                    </document-id>
                    <document-id document-id-type="epodoc">
                        <doc-number>CN112508743</doc-number>
                        <date>20210316</date>
                    </document-id>
                </publication-reference>
            

In [89]:
import xml.etree.ElementTree as ET

def retrieve_citation_publication_numbers(xml_string):
    """
    Parses an EPO world patent data XML string and retrieves publication numbers
    from each citation's <document-id> element with document-id-type="docdb".
    Each publication number is constructed as: country + doc-number + kind.

    Citations without a docdb <document-id>, or whose docdb id lacks a country
    or a doc-number, are skipped so that no partial identifier is returned.
    Surrounding whitespace in the three parts is stripped.

    Args:
        xml_string (str): The XML string containing world patent data.

    Returns:
        list of str: A list of publication numbers from the citations, in
        document order.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    # The exchange elements live in the document's default namespace.
    ns = {'ex': "http://www.epo.org/exchange"}

    root = ET.fromstring(xml_string)
    publication_numbers = []

    # Citations sit under <references-cited> inside <bibliographic-data>.
    for citation in root.findall(".//ex:bibliographic-data/ex:references-cited/ex:citation", ns):
        docdb = citation.find(".//ex:document-id[@document-id-type='docdb']", ns)
        if docdb is None:
            continue
        country, doc_number, kind = (
            (docdb.findtext(f"ex:{tag}", default="", namespaces=ns) or "").strip()
            for tag in ("country", "doc-number", "kind")
        )
        # A usable publication number needs at least country and doc-number.
        if country and doc_number:
            publication_numbers.append(f"{country}{doc_number}{kind}")

    return publication_numbers

# Example usage with the provided XML input: xml_data is the biblio XML string
# assigned by the earlier get_patent_biblio() cell.

pub_numbers = retrieve_citation_publication_numbers(xml_data)
print(pub_numbers)

['CN103164540A', 'CN104318502A', 'CN105427204A', 'CN107180305A', 'CN107194568A', 'CN107423350A', 'CN109493261A']


In [90]:
import os
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from dotenv import load_dotenv

# Global token cache (filled in by get_access_token)
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable, which would surface as an
# opaque AttributeError on .strip(); default to "" and fail with a clear
# message instead.
CONSUMER_KEY = os.getenv("CONSUMER_KEY", "").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET", "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in a .env file"
    )

def get_access_token() -> str:
    """Return a cached OAuth access token, requesting a new one when needed.

    The token and its expiry timestamp are cached in the module-level
    ``TOKEN`` and ``TOKEN_EXPIRY``. When no unexpired token is cached, the
    client credentials are posted to ``TOKEN_URL`` (15 s timeout).

    Returns:
        str: The access token.

    Raises:
        requests.exceptions.HTTPError: If the token endpoint answers with an
            error status.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]
    # Honour the lifetime reported by the server instead of assuming one, and
    # refresh a minute early so a cached token is never used after expiry.
    # NOTE(review): OPS is believed to issue ~20-minute tokens and to send
    # "expires_in" as a string of seconds -- confirm against the OPS docs.
    try:
        lifetime = float(payload["expires_in"]) - 60
    except (KeyError, TypeError, ValueError):
        lifetime = 3500  # previous fixed assumption, kept as the fallback
    TOKEN_EXPIRY = time.time() + max(lifetime, 0)
    return TOKEN

def get_patent_biblio(publication_number: str) -> str:
    """
    Download the bibliographic record of one publication from the EPO OPS API.

    Args:
        publication_number (str): Publication number in docdb format
            (e.g., "CN112508743A").

    Returns:
        str: The XML response body.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    bearer = get_access_token()
    # The publication number is interpolated straight into the docdb biblio path.
    endpoint = f"{BASE_URL}/published-data/publication/docdb/{publication_number}/biblio"
    reply = requests.get(
        endpoint,
        headers={
            "Authorization": f"Bearer {bearer}",
            "Accept": "application/xml",
        },
        timeout=15,
    )
    reply.raise_for_status()
    return reply.text

def retrieve_citation_publication_numbers(xml_string: str) -> list:
    """
    Parses an EPO world patent data XML string and retrieves publication numbers
    from each citation's <document-id> element with document-id-type="docdb".
    Each publication number is constructed as: country + doc-number + kind.

    Citations without a docdb <document-id>, or whose docdb id lacks a country
    or a doc-number, are skipped so that no partial identifier is returned.
    Surrounding whitespace in the three parts is stripped.

    Args:
        xml_string (str): The XML string containing patent data.

    Returns:
        list of str: A list of citation publication numbers, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    # The exchange elements live in the document's default namespace.
    ns = {'ex': "http://www.epo.org/exchange"}

    root = ET.fromstring(xml_string)
    publication_numbers = []

    # Citations sit under <references-cited> inside <bibliographic-data>.
    for citation in root.findall(".//ex:bibliographic-data/ex:references-cited/ex:citation", ns):
        docdb = citation.find(".//ex:document-id[@document-id-type='docdb']", ns)
        if docdb is None:
            continue
        country, doc_number, kind = (
            (docdb.findtext(f"ex:{tag}", default="", namespaces=ns) or "").strip()
            for tag in ("country", "doc-number", "kind")
        )
        # A usable publication number needs at least country and doc-number.
        if country and doc_number:
            publication_numbers.append(f"{country}{doc_number}{kind}")

    return publication_numbers

def get_citations_for_patent(publication_number: str) -> list:
    """
    Look up the publications cited by a patent.

    Fetches the patent's bibliographic XML and extracts the citation
    publication numbers from it. Any failure along the way is printed and
    turned into an empty result, so one bad patent does not abort a batch.

    Args:
        publication_number (str): The patent's publication number.

    Returns:
        list: Citation publication numbers for the patent, or [] on error.
    """
    try:
        biblio_xml = get_patent_biblio(publication_number)
        return retrieve_citation_publication_numbers(biblio_xml)
    except Exception as err:
        print(f"Error processing {publication_number}: {err}")
        return []

# --- Example using a DataFrame --- #
# One patent per row, identified by its publication number.
# Replace/add other valid patent publication numbers as needed.
df = pd.DataFrame(
    {'publication_number': ["CN112508743A", "CN103164540A", "CN104318502A"]}
)

# New column 'citation_numbers': the list of citations retrieved for each patent.
_cited = df['publication_number'].apply(get_citations_for_patent)
df['citation_numbers'] = _cited

# Display the updated DataFrame
print(df)


  publication_number                                   citation_numbers
0       CN112508743A  [CN103164540A, CN104318502A, CN105427204A, CN1...
1       CN103164540A                       [CN101055585A, CN101714150A]
2       CN104318502A                     [US2004078316A1, CN101916408A]


In [93]:
import os
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from dotenv import load_dotenv

# Global token cache (filled in by get_access_token)
TOKEN = None
TOKEN_EXPIRY = 0

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable, which would surface as an
# opaque AttributeError on .strip(); default to "" and fail with a clear
# message instead.
CONSUMER_KEY = os.getenv("CONSUMER_KEY", "").strip()
CONSUMER_SECRET = os.getenv("CONSUMER_SECRET", "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in a .env file"
    )

def get_access_token() -> str:
    """Return a cached OAuth access token, requesting a new one when needed.

    The token and its expiry timestamp are cached in the module-level
    ``TOKEN`` and ``TOKEN_EXPIRY``. When no unexpired token is cached, the
    client credentials are posted to ``TOKEN_URL`` (15 s timeout).

    Returns:
        str: The access token.

    Raises:
        requests.exceptions.HTTPError: If the token endpoint answers with an
            error status.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]
    # Honour the lifetime reported by the server instead of assuming one, and
    # refresh a minute early so a cached token is never used after expiry.
    # NOTE(review): OPS is believed to issue ~20-minute tokens and to send
    # "expires_in" as a string of seconds -- confirm against the OPS docs.
    try:
        lifetime = float(payload["expires_in"]) - 60
    except (KeyError, TypeError, ValueError):
        lifetime = 3500  # previous fixed assumption, kept as the fallback
    TOKEN_EXPIRY = time.time() + max(lifetime, 0)
    return TOKEN

def get_patent_biblio(publication_number: str) -> str:
    """
    Download the bibliographic record of one publication from the EPO OPS API.

    Args:
        publication_number (str): Publication number in docdb format
            (e.g., "CN112508743A").

    Returns:
        str: The XML response body.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
    """
    bearer = get_access_token()
    # The publication number is interpolated straight into the docdb biblio path.
    endpoint = f"{BASE_URL}/published-data/publication/docdb/{publication_number}/biblio"
    reply = requests.get(
        endpoint,
        headers={
            "Authorization": f"Bearer {bearer}",
            "Accept": "application/xml",
        },
        timeout=15,
    )
    reply.raise_for_status()
    return reply.text

def retrieve_citation_publication_numbers(xml_string: str) -> list:
    """
    Parses an EPO patent XML string and retrieves citation publication numbers
    from each citation's <document-id> element with document-id-type="docdb".
    The publication number is constructed as: country + doc-number + kind.

    Citations without a docdb <document-id>, or whose docdb id lacks a country
    or a doc-number, are skipped so that no partial identifier is returned.
    Surrounding whitespace in the three parts is stripped.

    Args:
        xml_string (str): The XML string containing patent data.

    Returns:
        list of str: A list of citation publication numbers, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    # The exchange elements live in the document's default namespace.
    ns = {'ex': "http://www.epo.org/exchange"}

    root = ET.fromstring(xml_string)
    publication_numbers = []

    # Citations sit under <references-cited> inside <bibliographic-data>.
    for citation in root.findall(".//ex:bibliographic-data/ex:references-cited/ex:citation", ns):
        docdb = citation.find(".//ex:document-id[@document-id-type='docdb']", ns)
        if docdb is None:
            continue
        country, doc_number, kind = (
            (docdb.findtext(f"ex:{tag}", default="", namespaces=ns) or "").strip()
            for tag in ("country", "doc-number", "kind")
        )
        # A usable publication number needs at least country and doc-number.
        if country and doc_number:
            publication_numbers.append(f"{country}{doc_number}{kind}")

    return publication_numbers

def retrieve_ipc_classifications(xml_string: str) -> list:
    """
    Parses the given patent XML string and extracts the IPC classification texts
    from the <classifications-ipcr> element.

    Each <classification-ipcr>'s <text> is stripped of surrounding whitespace;
    entries that are missing or blank after stripping are skipped, so the
    result never contains empty strings.

    Args:
        xml_string (str): The XML string from the OPS API.

    Returns:
        list of str: A list of IPC classification texts, in document order.

    Raises:
        xml.etree.ElementTree.ParseError: If xml_string is not well-formed XML.
    """
    # The exchange elements live in the document's default namespace.
    ns = {'ex': "http://www.epo.org/exchange"}

    root = ET.fromstring(xml_string)
    ipcs = []

    # Locate all <classification-ipcr> elements under <classifications-ipcr>
    for cl in root.findall(".//ex:classifications-ipcr/ex:classification-ipcr", ns):
        # Strip before testing so whitespace-only text is treated as empty.
        text = (cl.findtext("ex:text", default="", namespaces=ns) or "").strip()
        if text:
            ipcs.append(text)

    return ipcs

def get_citations_ipc_for_patent(publication_number: str) -> list:
    """
    Fetch one cited publication and return its IPC classifications.

    Any failure while fetching or parsing is printed and turned into an empty
    result, so one bad citation does not abort a batch.

    Args:
        publication_number (str): A citation publication number.

    Returns:
        list: IPC classification texts, or [] on error.
    """
    try:
        biblio_xml = get_patent_biblio(publication_number)
        return retrieve_ipc_classifications(biblio_xml)
    except Exception as err:
        print(f"Error fetching IPC for {publication_number}: {err}")
        return []

def get_all_citations_ipc(citation_nums: list) -> list:
    """
    Pool the IPC classifications of several cited publications into one list.

    Each number is looked up with get_citations_ipc_for_patent, in order, and
    the per-citation lists are concatenated (duplicates are kept).

    Args:
        citation_nums (list): List of citation publication numbers.

    Returns:
        list: Aggregated list of IPC classification texts from the citations.
    """
    return [
        ipc_text
        for citation in citation_nums
        for ipc_text in get_citations_ipc_for_patent(citation)
    ]

# --- Example DataFrame Integration --- #
if __name__ == "__main__":
    # Seed frame: one row per patent whose citations should be analysed.
    # Replace/add other valid patent publication numbers as needed.
    df = pd.DataFrame(
        {'publication_number': ["CN112508743A", "CN103164540A", "CN104318502A"]}
    )

    # Step 1: download each patent's bibliographic XML and keep the
    # publication numbers of the documents it cites.
    df['citation_numbers'] = df['publication_number'].apply(
        lambda number: retrieve_citation_publication_numbers(get_patent_biblio(number))
    )

    # Step 2: look up every cited document and pool the IPC classifications
    # of all citations into a single list per row.
    df['citations_ipc'] = df['citation_numbers'].apply(get_all_citations_ipc)

# Last expression of the notebook cell: display the resulting DataFrame.
df


Unnamed: 0,publication_number,citation_numbers,citations_ipc
0,CN112508743A,"[CN103164540A, CN104318502A, CN105427204A, CN1...","[G06F 17/ 30 A I, G06Q 50/ ..."
1,CN103164540A,"[CN101055585A, CN101714150A]","[G06F 17/ 30 A I, G06F 17/ ..."
2,CN104318502A,"[US2004078316A1, CN101916408A]","[G06Q 10/ 10 A I, G06Q 30/ ..."


In [100]:
# Show the aggregated citation IPC list stored in the first row of df.
df['citations_ipc'].iloc[0]

['G06F  17',
 'G06Q  50',
 'G06Q  50',
 'G06Q  10',
 'G06Q  10',
 'G06F  17',
 'G06F  17',
 'G06Q  50',
 'G06Q  50',
 'G06Q  30',
 'G06Q  40']

In [99]:
import os
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from dotenv import load_dotenv

# Global token cache, read and written by get_access_token().
TOKEN = None
TOKEN_EXPIRY = 0  # time.time() value after which TOKEN must be refreshed

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable; coalesce to "" so a missing
# credential produces the explicit error below rather than an AttributeError
# from calling .strip() on None.
CONSUMER_KEY = (os.getenv("CONSUMER_KEY") or "").strip()
CONSUMER_SECRET = (os.getenv("CONSUMER_SECRET") or "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in the .env file"
    )

def get_access_token() -> str:
    """
    Get or refresh the OAuth access token.

    The token is cached in the module-level TOKEN / TOKEN_EXPIRY globals and
    reused until the cached expiry time has passed. When a new token is
    requested, the cache lifetime is taken from the ``expires_in`` field of
    the token response (minus a safety margin) instead of a fixed guess, so a
    token is never reused after the server has expired it.

    Returns:
        str: A bearer token for the OPS REST services.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the response carries no "access_token" field.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]

    # NOTE(review): OPS documents its access tokens as valid for about
    # 20 minutes; that value is used only when "expires_in" is absent or
    # unparsable - confirm against the current OPS guide.
    default_lifetime = 1200.0
    safety_margin = 60.0  # refresh a little before the real expiry
    try:
        lifetime = float(payload.get("expires_in", default_lifetime))
    except (TypeError, ValueError):
        lifetime = default_lifetime
    TOKEN_EXPIRY = time.time() + max(lifetime - safety_margin, 0.0)
    return TOKEN

def get_patent_biblio(publication_number: str) -> str:
    """
    Download the bibliographic record of one publication from the EPO OPS API.

    Args:
        publication_number (str): The publication number (e.g., "CN112508743A")

    Returns:
        str: The XML response text.

    Raises:
        requests.HTTPError: If OPS answers with an error status code.
    """
    request_headers = {
        "Authorization": f"Bearer {get_access_token()}",
        "Accept": "application/xml"
    }
    # The publication number is placed directly in the docdb biblio endpoint path.
    endpoint = f"{BASE_URL}/published-data/publication/docdb/{publication_number}/biblio"
    reply = requests.get(endpoint, headers=request_headers, timeout=15)
    reply.raise_for_status()
    return reply.text

def retrieve_citation_publication_numbers(xml_string: str) -> list:
    """
    Extract the publication numbers of the documents cited by a patent.

    For every <citation> under <bibliographic-data>/<references-cited>, the
    first <document-id> with document-id-type="docdb" is read and its
    country, doc-number and kind are concatenated (missing parts count as
    empty). Citations without a docdb document-id are skipped.

    Args:
        xml_string (str): The XML string containing patent data.

    Returns:
        list of str: A list of citation publication numbers.
    """
    namespaces = {
        'ex': "http://www.epo.org/exchange",
        'ops': "http://ops.epo.org"
    }
    part_paths = ("ex:country", "ex:doc-number", "ex:kind")

    tree = ET.fromstring(xml_string)
    numbers = []
    for cited in tree.iterfind(".//ex:bibliographic-data/ex:references-cited/ex:citation", namespaces):
        doc_id = cited.find(".//ex:document-id[@document-id-type='docdb']", namespaces)
        if doc_id is None:
            continue
        number = "".join(
            doc_id.findtext(path, default="", namespaces=namespaces)
            for path in part_paths
        )
        # A docdb entry whose three parts are all empty contributes nothing.
        if number:
            numbers.append(number)
    return numbers

def retrieve_ipc_classifications(xml_string: str) -> list:
    """
    Extract shortened IPC classification texts from an OPS bibliographic XML document.

    The <text> of every <classification-ipcr> under <classifications-ipcr> is
    read; only the part before the first '/' is kept, with surrounding
    whitespace trimmed. Entries with no text are ignored.

    Args:
        xml_string (str): The XML string from the OPS API.

    Returns:
        list of str: A list of cleaned IPC classification texts.
    """
    ns_map = {
        'ex': "http://www.epo.org/exchange",
        'ops': "http://ops.epo.org"
    }

    cleaned = []
    document = ET.fromstring(xml_string)
    for node in document.iterfind(".//ex:classifications-ipcr/ex:classification-ipcr", ns_map):
        raw = node.findtext("ex:text", default="", namespaces=ns_map)
        if not raw:
            continue
        # Keep only what precedes the first '/'.
        leading_part, _, _ = raw.strip().partition('/')
        cleaned.append(leading_part.strip())
    return cleaned

def get_citations_ipc_for_patent(publication_number: str) -> list:
    """
    Fetch one cited publication and return its cleaned IPC classifications.

    Failures are not propagated: whatever exception the download or the XML
    parsing raises is printed and an empty list is returned instead.

    Args:
        publication_number (str): A citation publication number.

    Returns:
        list: A list of cleaned IPC classification texts ([] on failure).
    """
    try:
        biblio_xml = get_patent_biblio(publication_number)
        return retrieve_ipc_classifications(biblio_xml)
    except Exception as e:
        print(f"Error fetching IPC for {publication_number}: {e}")
        return []

def get_all_citations_ipc(citation_nums: list) -> list:
    """
    Concatenate the cleaned IPC classifications of all given citations.

    The citations are processed in the order given; each one's list (possibly
    empty) is appended to the running result, so repeated codes are preserved.

    Args:
        citation_nums (list): List of citation publication numbers.

    Returns:
        list: Aggregated list of cleaned IPC classification texts from the citations.
    """
    pooled = []
    for citation in citation_nums:
        pooled += get_citations_ipc_for_patent(citation)
    return pooled

# --- Example DataFrame Integration --- #
if __name__ == "__main__":
    # Example input: one row per patent to analyse.
    # Replace/add other valid patent publication numbers as needed.
    df = pd.DataFrame({
        'publication_number': [
            "CN112508743A",
            "CN103164540A",
            "CN104318502A",
        ],
    })

    # Column 1: publication numbers cited by each patent, taken from its
    # bibliographic XML.
    df['citation_numbers'] = df['publication_number'].apply(
        lambda patent: retrieve_citation_publication_numbers(get_patent_biblio(patent))
    )

    # Column 2: IPC classifications of all those citations, pooled per row.
    df['citations_ipc'] = df['citation_numbers'].apply(get_all_citations_ipc)

# Last expression of the notebook cell: display the resulting DataFrame.
df


Unnamed: 0,publication_number,citation_numbers,citations_ipc
0,CN112508743A,"[CN103164540A, CN104318502A, CN105427204A, CN1...","[G06F 17, G06Q 50, G06Q 50, G06Q 10, G06Q ..."
1,CN103164540A,"[CN101055585A, CN101714150A]","[G06F 17, G06F 17]"
2,CN104318502A,"[US2004078316A1, CN101916408A]","[G06Q 10, G06Q 30, G06Q 40, G06Q 30, H04N ..."


In [103]:
import os
import time
import requests
import pandas as pd
import xml.etree.ElementTree as ET
from dotenv import load_dotenv

# Global token cache, read and written by get_access_token().
TOKEN = None
TOKEN_EXPIRY = 0  # time.time() value after which TOKEN must be refreshed

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable; coalesce to "" so a missing
# credential produces the explicit error below rather than an AttributeError
# from calling .strip() on None.
CONSUMER_KEY = (os.getenv("CONSUMER_KEY") or "").strip()
CONSUMER_SECRET = (os.getenv("CONSUMER_SECRET") or "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in the .env file"
    )

def get_access_token() -> str:
    """
    Get or refresh the OAuth access token.

    A cached token (module globals TOKEN / TOKEN_EXPIRY) is returned while it
    is still considered valid. Otherwise a new one is requested with the
    client-credentials grant, and its cache lifetime is derived from the
    ``expires_in`` value in the response rather than from a fixed constant,
    which could outlive the real token.

    Returns:
        str: A bearer token for the OPS REST services.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the response carries no "access_token" field.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]

    # NOTE(review): OPS documents its access tokens as valid for about
    # 20 minutes; that value is used only when "expires_in" is absent or
    # unparsable - confirm against the current OPS guide.
    default_lifetime = 1200.0
    safety_margin = 60.0  # refresh slightly before the real expiry
    try:
        lifetime = float(payload.get("expires_in", default_lifetime))
    except (TypeError, ValueError):
        lifetime = default_lifetime
    TOKEN_EXPIRY = time.time() + max(lifetime - safety_margin, 0.0)
    return TOKEN

def get_patent_biblio(publication_number: str) -> str:
    """
    Request bibliographic data for one publication from the EPO OPS API.

    Args:
        publication_number (str): The publication number (e.g., "CN112508743A")

    Returns:
        str: The XML response text.

    Raises:
        requests.HTTPError: If OPS answers with an error status code.
    """
    bearer = get_access_token()
    reply = requests.get(
        f"{BASE_URL}/published-data/publication/docdb/{publication_number}/biblio",
        headers={
            "Authorization": f"Bearer {bearer}",
            "Accept": "application/xml"
        },
        timeout=15,
    )
    # Surface HTTP errors to the caller instead of returning an error body.
    reply.raise_for_status()
    return reply.text

def retrieve_citation_publication_numbers(xml_string: str) -> list:
    """
    List the publication numbers of the citations recorded in an OPS XML document.

    Each <citation> below <bibliographic-data>/<references-cited> contributes
    at most one number, built as country + doc-number + kind from its first
    <document-id document-id-type="docdb"> element. Citations lacking such an
    element, or whose three parts are all empty, are left out.

    Args:
        xml_string (str): The XML string containing patent data.

    Returns:
        list of str: A list of citation publication numbers.
    """
    prefixes = {
        'ex': "http://www.epo.org/exchange",
        'ops': "http://ops.epo.org"
    }

    document = ET.fromstring(xml_string)
    cited_entries = document.findall(".//ex:bibliographic-data/ex:references-cited/ex:citation", prefixes)

    # First docdb document-id of every citation (None when there is none).
    docdb_ids = (
        entry.find(".//ex:document-id[@document-id-type='docdb']", prefixes)
        for entry in cited_entries
    )
    # Compare with None explicitly: an Element without children is falsy.
    assembled = (
        doc_id.findtext("ex:country", default="", namespaces=prefixes)
        + doc_id.findtext("ex:doc-number", default="", namespaces=prefixes)
        + doc_id.findtext("ex:kind", default="", namespaces=prefixes)
        for doc_id in docdb_ids
        if doc_id is not None
    )
    return [number for number in assembled if number]

def retrieve_ipc_classifications(xml_string: str) -> list:
    """
    Extract compact IPC classification codes from an OPS bibliographic XML document.

    The <text> of every <classification-ipcr> under <classifications-ipcr> is
    reduced in two steps:
      - only the part before the first '/' is kept (trimmed);
      - every space character inside that part is removed.
    Entries with no text are ignored.

    Args:
        xml_string (str): The XML string from the OPS API.

    Returns:
        list of str: A list of cleaned IPC classification texts.
    """
    prefixes = {
        'ex': "http://www.epo.org/exchange",
        'ops': "http://ops.epo.org"
    }

    document = ET.fromstring(xml_string)
    texts = [
        node.findtext("ex:text", default="", namespaces=prefixes)
        for node in document.findall(".//ex:classifications-ipcr/ex:classification-ipcr", prefixes)
    ]
    return [
        text.strip().partition('/')[0].strip().replace(" ", "")
        for text in texts
        if text
    ]

def get_citations_ipc_for_patent(publication_number: str) -> list:
    """
    Return the cleaned IPC classifications of a single cited publication.

    The bibliographic XML is downloaded and parsed; if either step raises,
    the error is printed together with the publication number and an empty
    list is returned so that callers can continue with the next citation.

    Args:
        publication_number (str): A citation publication number.

    Returns:
        list: A list of cleaned IPC classification texts ([] on failure).
    """
    try:
        biblio_xml = get_patent_biblio(publication_number)
        found = retrieve_ipc_classifications(biblio_xml)
    except Exception as e:
        print(f"Error fetching IPC for {publication_number}: {e}")
        found = []
    return found

def get_all_citations_ipc(citation_nums: list) -> list:
    """
    Gather the cleaned IPC classifications of every citation into one flat list.

    Citations are looked up one after another with
    get_citations_ipc_for_patent; their results are joined in input order and
    duplicates are not removed.

    Args:
        citation_nums (list): List of citation publication numbers.

    Returns:
        list: Aggregated list of cleaned IPC classification texts from the citations.
    """
    return [
        code
        for publication in citation_nums
        for code in get_citations_ipc_for_patent(publication)
    ]

# --- Example DataFrame Integration --- #
if __name__ == "__main__":
    # Patents to analyse, one per row.
    # Replace/add other valid patent publication numbers as needed.
    df = pd.DataFrame(
        {'publication_number': ["CN112508743A", "CN103164540A", "CN104318502A"]}
    )

    # Citations of each patent, read from its bibliographic XML.
    df['citation_numbers'] = df['publication_number'].apply(
        lambda pub_number: retrieve_citation_publication_numbers(get_patent_biblio(pub_number))
    )

    # IPC classifications of all of a row's citations, aggregated into one list.
    df['citations_ipc'] = df['citation_numbers'].apply(get_all_citations_ipc)

# Last expression of the notebook cell: display the resulting DataFrame.
df


Unnamed: 0,publication_number,citation_numbers,citations_ipc
0,CN112508743A,"[CN103164540A, CN104318502A, CN105427204A, CN1...","[G06F17, G06Q50, G06Q50, G06Q10, G06Q10, G06F1..."
1,CN103164540A,"[CN101055585A, CN101714150A]","[G06F17, G06F17]"
2,CN104318502A,"[US2004078316A1, CN101916408A]","[G06Q10, G06Q30, G06Q40, G06Q30, H04N7]"


parallel processing

In [None]:
import concurrent.futures
import os
import requests
import time
from urllib.parse import quote
import pandas as pd
from dotenv import load_dotenv

# Global token cache, read and written by get_access_token().
TOKEN = None
TOKEN_EXPIRY = 0  # time.time() value after which TOKEN must be refreshed

# Constants for API endpoints
TOKEN_URL = "https://ops.epo.org/3.2/auth/accesstoken"
BASE_URL = "https://ops.epo.org/3.2/rest-services"

# Load credentials from .env file
load_dotenv()
# os.getenv returns None for an unset variable; coalesce to "" so a missing
# credential produces the explicit error below rather than an AttributeError
# from calling .strip() on None.
CONSUMER_KEY = (os.getenv("CONSUMER_KEY") or "").strip()
CONSUMER_SECRET = (os.getenv("CONSUMER_SECRET") or "").strip()
if not (CONSUMER_KEY and CONSUMER_SECRET):
    raise RuntimeError(
        "CONSUMER_KEY and CONSUMER_SECRET must be set in the environment or in the .env file"
    )

def get_access_token() -> str:
    """
    Get or refresh the OAuth access token.

    A cached token (module globals TOKEN / TOKEN_EXPIRY) is returned while it
    is still considered valid. Otherwise a new one is requested with the
    client-credentials grant, and its cache lifetime is derived from the
    ``expires_in`` value in the response rather than from a fixed constant,
    which could outlive the real token.

    NOTE(review): process_patent calls this from worker threads and the cache
    is not guarded by a lock, so several threads may request a token at the
    same moment; each still receives a usable token.

    Returns:
        str: A bearer token for the OPS REST services.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        KeyError: If the response carries no "access_token" field.
    """
    global TOKEN, TOKEN_EXPIRY
    if TOKEN and time.time() < TOKEN_EXPIRY:
        return TOKEN
    data = {
        "grant_type": "client_credentials",
        "client_id": CONSUMER_KEY,
        "client_secret": CONSUMER_SECRET
    }
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    response = requests.post(TOKEN_URL, data=data, headers=headers, timeout=15)
    response.raise_for_status()
    payload = response.json()
    TOKEN = payload["access_token"]

    # NOTE(review): OPS documents its access tokens as valid for about
    # 20 minutes; that value is used only when "expires_in" is absent or
    # unparsable - confirm against the current OPS guide.
    default_lifetime = 1200.0
    safety_margin = 60.0  # refresh slightly before the real expiry
    try:
        lifetime = float(payload.get("expires_in", default_lifetime))
    except (TypeError, ValueError):
        lifetime = default_lifetime
    TOKEN_EXPIRY = time.time() + max(lifetime - safety_margin, 0.0)
    return TOKEN

def validate_patent_number(patent: str) -> bool:
    """
    Tell whether a value looks like a usable patent publication number.

    A value is accepted when it is a string with at least four characters
    after surrounding whitespace is removed. Anything that is not a string
    (None, a pandas NaN, a number, ...) is rejected instead of raising, so
    missing cells in a DataFrame column are handled cleanly.

    Args:
        patent (str): Candidate publication number.

    Returns:
        bool: True when the value passes the check, False otherwise.
    """
    # NaN is a truthy float, so a plain truthiness test would let it reach
    # .strip() and fail; check the type explicitly.
    return isinstance(patent, str) and len(patent.strip()) >= 4

def extract_jurisdictions_and_members(data: dict) -> dict:
    """
    Summarise a parsed OPS family response as jurisdictions and family members.

    The function walks data['ops:world-patent-data']['ops:patent-family']
    ['ops:family-member'] and, for each member, the 'document-id' entries of
    its 'publication-reference'. Only entries whose '@document-id-type' is
    'docdb' and that carry a country, a doc-number and a kind are used.
    A field value may be a plain value or a dict holding the value under '$';
    a lone dict where a list is expected is treated as a one-item list.

    Args:
        data (dict): Decoded JSON body of the family request.

    Returns:
        dict: {'jurisdictions': sorted unique country codes,
               'family_members': sorted unique country+doc-number+kind strings}
    """
    def _as_list(value):
        # Wrap a single dict so both shapes can be iterated the same way.
        return [value] if isinstance(value, dict) else value

    def _plain(value):
        # Unwrap {'$': text} containers; pass other values through unchanged.
        return value.get('$') if isinstance(value, dict) else value

    family = data.get('ops:world-patent-data', {}).get('ops:patent-family', {})

    countries = set()
    numbers = set()
    for member in _as_list(family.get('ops:family-member', []) or []):
        doc_ids = member.get('publication-reference', {}).get('document-id', []) or []
        for doc in _as_list(doc_ids):
            if doc.get('@document-id-type') != 'docdb':
                continue
            country, doc_number, kind = (
                _plain(doc.get(field)) for field in ('country', 'doc-number', 'kind')
            )
            if country and doc_number and kind:
                countries.add(country)
                numbers.add(f"{country}{doc_number}{kind}")

    return {
        'jurisdictions': sorted(countries),
        'family_members': sorted(numbers)
    }

def process_patent(patent: str) -> dict:
    """
    Fetch the patent family of one publication and summarise it.

    The family endpoint is queried with the URL-quoted publication number.
    The lookup is best-effort: an invalid number, a 403/404 answer, any other
    HTTP error and any exception raised while requesting or decoding the
    response all produce the same "no data" result.

    Args:
        patent (str): Publication number to look up.

    Returns:
        dict: The output of extract_jurisdictions_and_members on success,
        otherwise {'jurisdictions': None, 'family_members': None}.
    """
    no_data = {'jurisdictions': None, 'family_members': None}
    if not validate_patent_number(patent):
        return no_data

    try:
        bearer = get_access_token()
        reply = requests.get(
            f"{BASE_URL}/family/publication/docdb/{quote(patent)}",
            headers={"Authorization": f"Bearer {bearer}", "Accept": "application/json"},
            timeout=15,
        )
        if reply.status_code in (403, 404):
            return no_data
        reply.raise_for_status()
        family = extract_jurisdictions_and_members(reply.json())
    except Exception:
        return no_data
    return family

def process_dataframe_parallel(df: pd.DataFrame, patent_col: str, max_workers: int = 10) -> pd.DataFrame:
    """
    Look up the patent family of every row of a DataFrame using a thread pool.

    process_patent is called once per row of ``patent_col``. Results are
    matched to rows by position rather than by patent value, so repeated
    numbers and missing values (NaN) cannot cause a failed dictionary lookup.
    Two columns are added to ``df`` in place: 'family_jurisdictions' and
    'family_members'. A row whose lookup raised gets None in both.

    Args:
        df (pd.DataFrame): Frame holding the publication numbers; it is modified.
        patent_col (str): Name of the column with the publication numbers.
        max_workers (int): Size of the thread pool.

    Returns:
        pd.DataFrame: The same ``df`` object with the two new columns.

    Raises:
        ValueError: If ``patent_col`` is not a column of ``df``.
    """
    if patent_col not in df.columns:
        raise ValueError(f"Column '{patent_col}' not found in DataFrame")
    patents = df[patent_col].tolist()
    results = []

    # Use ThreadPoolExecutor for I/O-bound API calls
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit every lookup up front; the list keeps the row order.
        futures = [executor.submit(process_patent, p) for p in patents]
        # Collecting in submission order keeps results aligned with rows.
        # No sleep here: the requests are already queued, so pausing while
        # collecting would only delay the caller without spacing the calls.
        for future in futures:
            try:
                results.append(future.result())
            except Exception:
                results.append({'jurisdictions': None, 'family_members': None})

    # Build object-dtype Series on df's own index so list values stay one per cell.
    df['family_jurisdictions'] = pd.Series(
        [r['jurisdictions'] for r in results], index=df.index, dtype=object
    )
    df['family_members'] = pd.Series(
        [r['family_members'] for r in results], index=df.index, dtype=object
    )
    return df

# Example usage:
if __name__ == "__main__":
    # Load the input table, enrich it with family data, and print the
    # publication number next to the two new columns.
    df = pd.read_csv('your_patents.csv')
    processed_df = process_dataframe_parallel(
        df,
        'first publication number',
        max_workers=20,
    )
    print(processed_df[['first publication number', 'family_jurisdictions', 'family_members']])

