In [3]:
import pandas as pd
import numpy as np
from striprtf.striprtf import rtf_to_text
import re
import os

# S1.1.2 Patent Data Extraction RTF

Code in this file was written primarily by ChatGPT, with careful prompting and editing by VMC. LLMs excel at writing simple code such as extracting data from structured text files.

In [7]:
def extract_temperatures(text):
    """Extract the temperatures (in degrees C) mentioned in a conditions string.

    The standalone shorthand ``rt`` (room temperature) is read as 25 °C.
    A match is either a single temperature (e.g. ``"20 °C"``) or a range whose
    two ends both carry the unit and are joined directly by ``-``, ``–`` or
    ``→`` (e.g. ``"0 °C→25 °C"``); a range contributes both of its end points.

    Parameters
    ----------
    text : str
        Free-text reaction conditions.

    Returns
    -------
    list of int
        Temperatures in order of appearance; empty if none are found.
    """
    # Substitute only the standalone token "rt". A plain str.replace would also
    # rewrite the "rt" inside words such as "inert" or "tert-butanol" and
    # fabricate a 25 °C reading that is not in the source text.
    text = re.sub(r'\brt\b', '25 °C', text)

    # Integer digits followed by an optional space, an optional degree sign and
    # "C"; optionally followed by a second such value after a range separator.
    # NOTE(review): the pattern captures digits only, so a leading minus sign
    # and any decimal part are not part of the captured value.
    pattern = r'(\d+)\s?°?C(?:[-–→](\d+)\s?°?C)?'

    temperatures = []
    for start_temp, end_temp in re.findall(pattern, text):
        temperatures.append(int(start_temp))
        if end_temp:  # non-empty only when the range part matched
            temperatures.append(int(end_temp))

    return temperatures

def extract_patent_number(text):
    """Return the identifier that follows ``Patent Number:`` in ``text``.

    The identifier is the first run of uppercase letters and digits after the
    label (optional whitespace in between). Returns ``None`` when the label,
    or an identifier after it, is not present.
    """
    found = re.search(r'Patent Number:\s*([A-Z0-9]+)', text)

    # Guard clause: nothing to extract.
    if found is None:
        return None

    return found.group(1)

def extract_pressures(text):
    """Extract pressures from a conditions string, converted to MPa.

    Recognised units are MPa, kPa, psi, bar and atm. A match is either a single
    value (e.g. ``"5 MPa"``) or a range whose two ends both carry a unit and
    are joined directly by ``-`` or ``–`` (e.g. ``"1 bar-2 bar"``); a range
    contributes both of its end points, each converted using its own unit.

    Parameters
    ----------
    text : str
        Free-text reaction conditions.

    Returns
    -------
    list of float
        Pressures in MPa, in order of appearance; empty if none are found.
    """
    # Number of each unit per MPa (value / divisor -> MPa). MPa itself is
    # absent from the table and is passed through unchanged.
    divisors = {'kPa': 1000, 'psi': 145, 'bar': 10, 'atm': 9.869}

    pattern = r'(\d+\.?\d*)\s?(MPa|kPa|psi|bar|atm)(?:[-–](\d+\.?\d*)\s?(MPa|kPa|psi|bar|atm))?'

    def to_mpa(value, unit):
        # Single conversion path shared by both ends of a range, so the upper
        # bound can never be converted with the lower bound's unit (or skipped).
        number = float(value)
        if unit in divisors:
            number /= divisors[unit]
        return number

    pressures = []
    for start_value, start_unit, end_value, end_unit in re.findall(pattern, text):
        pressures.append(to_mpa(start_value, start_unit))
        if end_value:  # non-empty only when the range part matched
            pressures.append(to_mpa(end_value, end_unit))

    return pressures

In [8]:
def _parse_entry(entry):
    """Build one output row (a dict) from the plain text of a single entry.

    The experimental table is located by its
    ``Stage|Reagents|Catalysts|Solvents|Conditions|`` header and runs up to the
    last ``|`` in the entry. If the header is absent the row carries empty
    lists and ``None`` for the averaged values.
    """
    header = "Stage|Reagents|Catalysts|Solvents|Conditions|"
    header_at = entry.find(header)

    steps = []
    if header_at != -1:
        # Without this guard a missing header (find() == -1) would make the
        # slice start at an arbitrary offset into unrelated text.
        exp_sec = entry[header_at + len(header):entry.rfind("|")]
        exp_sec = exp_sec.replace("\n", ", ")
        # Split on "<digit>|"; the piece before the first such marker is dropped.
        steps = re.split(r'\d\|', exp_sec)[1:]

    reagants = []
    catalysts = []
    solvents = []
    temperatures = []
    pressures = []

    for step in steps:
        step = step.strip()
        if step[-2:] == "|,":
            step = step[:-2]

        # NOTE(review): collapsing "||" assumes doubled pipes are a separator
        # artefact rather than an empty column — confirm against the RTF export.
        array = step.replace("||", "|").split("|")

        if len(array) < 4:
            print("entry skipped")
            print(step)
            continue

        # The last piece of each ", "-separated cell is discarded (presumably
        # the remainder after the trailing separator — confirm).
        reagants.extend(array[0].split(", ")[:-1])
        catalysts.extend(array[1].split(", ")[:-1])
        solvents.extend(array[2].split(", ")[:-1])
        temperatures.extend(extract_temperatures(array[3]))
        pressures.extend(extract_pressures(array[3]))

    return {'patent_number' : extract_patent_number(entry),
            'reagants' : reagants,
            'catalysts' : catalysts,
            'solvents' : solvents,
            'temperatures_c' : np.mean(temperatures) if temperatures else None,
            'pressures_mpa' : np.mean(pressures) if pressures else None}


def rtf_to_excel(rtf_filename, rtf_dir="rtfs", out_dir="parsed_dataframes"):
    """Parse one RTF export and write the extracted rows to an Excel file.

    The file ``<rtf_dir>/<rtf_filename>`` is converted to plain text with
    ``rtf_to_text`` and split into entries on ``" . "`` (the text before the
    first separator is discarded). Each entry becomes one row holding the
    patent number, the reagents/catalysts/solvents lists and the mean
    temperature and pressure over all of its steps. The file name without its
    ``.rtf`` extension is written to a leading ``CAS`` column and used as the
    output name ``<out_dir>/<name>.xlsx``.

    Parameters
    ----------
    rtf_filename : str
        Name of the RTF file inside ``rtf_dir``.
    rtf_dir : str, optional
        Folder containing the RTF file. Defaults to ``"rtfs"``.
    out_dir : str, optional
        Folder for the Excel output; created if it does not exist.
        Defaults to ``"parsed_dataframes"``.

    Returns
    -------
    None
    """
    # Strip only a trailing ".rtf"; str.replace would also remove the substring
    # from the middle of a name.
    suffix = ".rtf"
    cas = rtf_filename[:-len(suffix)] if rtf_filename.endswith(suffix) else rtf_filename

    rtf_path = os.path.join(rtf_dir, rtf_filename)
    excel_filename = os.path.join(out_dir, cas + ".xlsx")

    # Read the RTF file
    with open(rtf_path, 'r') as file:
        rtf_content = file.read()

    # Convert to plain text
    plain_text = rtf_to_text(rtf_content)

    entries = plain_text.split(" . ")[1:]
    output = [_parse_entry(entry) for entry in entries]

    df = pd.DataFrame(output)
    df.insert(0, "CAS", cas)

    # to_excel does not create missing parent folders.
    os.makedirs(out_dir, exist_ok=True)
    df.to_excel(excel_filename)


In [9]:
def list_files_in_folder(folder_path):
    """Return the names of the entries in ``folder_path`` that are files.

    Sub-directories are left out. Names are returned without the folder
    prefix, in the order ``os.listdir`` yields them.
    """
    file_names = []
    for name in os.listdir(folder_path):
        candidate = os.path.join(folder_path, name)
        if os.path.isfile(candidate):
            file_names.append(name)
    return file_names


folder_path = 'rtfs'
files = list_files_in_folder(folder_path)

# Convert only the RTF exports: anything else that happens to sit in the
# folder (e.g. OS metadata files) is not parseable input. Sorting makes the
# processing order, and therefore the printed log, reproducible between runs.
rtf_files = sorted(name for name in files if name.lower().endswith('.rtf'))

for file in rtf_files:
    rtf_to_excel(file)

entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
entry skipped
,
