In [1]:
import re
import pandas as pd
import os

# S1.1.3 Patent Data Extraction RDF

Code in this file was written primarily by ChatGPT, with careful prompting and editing by VMC. LLMs excel at writing simple code such as extracting data from structured text files.

In [2]:
def read_rdf_file(file_path):
    """Read a text file and return its entire contents as one string.

    Parameters
    ----------
    file_path : str
        Path of the file to read. The file is decoded as UTF-8.

    Returns
    -------
    str
        The complete file contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def extract_patent_number(text):
    """Return the first patent number found in ``text``, or ``None``.

    Two forms are recognised, tried in this order:

    1. ``United States, US<id>``
    2. ``World Intellectual Property Organization, WO<id>``

    where ``<id>`` is a run of upper-case letters and digits. A United States
    number is preferred even when a WO number occurs earlier in the text.

    Parameters
    ----------
    text : str
        Text to search.

    Returns
    -------
    str or None
        The matched number including its ``US``/``WO`` prefix, or ``None``
        when neither form is present.
    """
    # The number must be followed by whitespace or by the end of the text.
    # Using a lookahead (rather than consuming a whitespace character) means a
    # number that is the very last token of the text is still recognised.
    patterns = (
        r"United States,\s*(US[A-Z0-9]+)(?=\s|$)",
        r"World Intellectual Property Organization,\s*(WO[A-Z0-9]+)(?=\s|$)",
    )

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)

    # Neither form was found
    return None

def extract_reaction_components(text):
    """Collect the CAS registry numbers of reactants and products in ``text``.

    A value is captured from every ``$DTYPE RXN:RCT(n):CAS_RN`` (reactant) or
    ``$DTYPE RXN:PRO(n):CAS_RN`` (product) field that is followed by a
    ``$DATUM`` line; the captured value is the run of digits and hyphens
    after ``$DATUM``.

    Parameters
    ----------
    text : str
        Text of one reaction entry (or any text containing such fields).

    Returns
    -------
    dict
        ``{'reactants': [...], 'products': [...]}`` with the captured strings
        in order of appearance; a list is empty when nothing matched.
    """
    cas_patterns = {
        'reactants': r'\$DTYPE RXN:RCT\(\d+\):CAS_RN\s*\$DATUM\s*([\d\-]+)',
        'products': r'\$DTYPE RXN:PRO\(\d+\):CAS_RN\s*\$DATUM\s*([\d\-]+)',
    }
    return {
        role: re.findall(pattern, text)
        for role, pattern in cas_patterns.items()
    }

def extract_reagents(text):
    """Collect the CAS registry numbers of reagents in ``text``.

    A value is captured from every ``$DTYPE RXN:VAR(n):RGT(m):CAS_RN`` field
    that is followed by a ``$DATUM`` line; the captured value is the run of
    digits and hyphens after ``$DATUM``.

    Parameters
    ----------
    text : str
        Text of one reaction entry (or any text containing such fields).

    Returns
    -------
    list of str
        Captured strings in order of appearance; empty when nothing matched.
    """
    reagent_field = r'\$DTYPE RXN:VAR\(\d+\):RGT\(\d+\):CAS_RN\s*\$DATUM\s*([\d\-]+)'
    return [hit.group(1) for hit in re.finditer(reagent_field, text)]


In [None]:
def rdf_to_dataframe(rdf_filename, rdf_dir="rdfs"):
    """Parse one RDF file into a DataFrame with one row per ``$RXN`` entry.

    Parameters
    ----------
    rdf_filename : str
        Name of the file inside ``rdf_dir``.
    rdf_dir : str, optional
        Folder containing the RDF files. Defaults to ``"rdfs"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``patent_number``, ``reagant_cas``, ``reactants`` and
        ``products``. ``patent_number`` is a string or ``None``; the other
        three hold lists of CAS number strings. The columns are present even
        when the file contains no ``$RXN`` entry, so the result can always be
        merged on ``patent_number``.
    """
    # 'reagant_cas' (sic) is kept as spelled: it is the column name used in the
    # workbooks this frame is merged into.
    columns = ['patent_number', 'reagant_cas', 'reactants', 'products']

    # Read the RDF file
    text = read_rdf_file(os.path.join(rdf_dir, rdf_filename))

    # Each reaction starts with a '$RXN' marker; whatever precedes the first
    # marker is not a reaction entry and is skipped.
    records = []
    for entry in text.split('$RXN')[1:]:
        components = extract_reaction_components(entry)
        records.append({
            'patent_number': extract_patent_number(entry),
            'reagant_cas': extract_reagents(entry),
            'reactants': components['reactants'],
            'products': components['products'],
        })

    # Explicit columns: without them an empty `records` list would produce a
    # frame with no columns at all.
    return pd.DataFrame(records, columns=columns)


In [4]:
def augment_excel(rdf_filename):
    """Merge the reactions parsed from an RDF file into its Excel workbook.

    The workbook is ``parsed_dataframes/<name>.xlsx``, where ``<name>`` is
    ``rdf_filename`` with every ``'.rdf'`` substring removed. The workbook is
    read, combined with the parsed reactions on ``patent_number`` (outer
    join) and written back to the same path.

    Parameters
    ----------
    rdf_filename : str
        File name passed on to ``rdf_to_dataframe``.

    Returns
    -------
    None
        The workbook on disk is overwritten.
    """
    workbook_path = 'parsed_dataframes/' + rdf_filename.replace('.rdf', '') + '.xlsx'

    reactions = rdf_to_dataframe(rdf_filename)

    procedures = pd.read_excel(workbook_path)
    # Discard reaction columns already in the workbook so the freshly parsed
    # ones replace them instead of producing _x/_y duplicates.
    procedures = procedures.drop(
        columns=['reagant_cas', 'products', 'reactants'],
        errors='ignore',
    )

    # NOTE(review): an outer merge on a non-unique key multiplies rows, so
    # re-running on an already augmented workbook may grow it - confirm intent.
    combined = pd.merge(procedures, reactions, on='patent_number', how='outer')

    # Presumably index columns left behind by earlier to_excel round-trips.
    combined = combined.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

    combined.to_excel(workbook_path)

In [5]:
def list_files_in_folder(folder_path):
    """Return the names of the regular files directly inside ``folder_path``.

    Sub-directories are left out and the search is not recursive. Names are
    returned without the folder prefix, in the order ``os.listdir`` yields
    them.

    Parameters
    ----------
    folder_path : str
        Folder to list.

    Returns
    -------
    list of str
        File names found in the folder.
    """
    regular_files = []
    for entry_name in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, entry_name)):
            regular_files.append(entry_name)
    return regular_files


# Folder holding the RDF exports, one file per workbook in parsed_dataframes/
folder_path = 'rdfs'
files = list_files_in_folder(folder_path)

# Only '.rdf' files are processed: any other file in the folder would be
# parsed as an RDF and then fail when its workbook is looked up. Sorting makes
# the processing order reproducible across runs.
rdf_files = sorted(name for name in files if name.endswith('.rdf'))

for file in rdf_files:
    augment_excel(file)