In [1]:
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI
import os, json, sys

class MOFCrystalData(BaseModel):
    """Formatted response for crystal structure from an LLM """
    
    # This model is the schema handed to `with_structured_output` in
    # MOFDataExtractor.run, so the LLM reply is parsed into exactly these fields.
    # NOTE(review): Pydantic includes the class docstring and each Field
    # `description` in the model's JSON schema, and LangChain presumably forwards
    # that schema to the model -- treat those strings as prompt text and edit
    # them as carefully as the prompts themselves.
    #
    # Every field carries a default, so a reply that omits a value still
    # validates. The numeric default 0 coincides with the "[0,0,0,0,0,0]"
    # not-found convention described in the notebook's `sys_prompt`.

    # Unit-cell edge lengths.
    a: float = Field(default=0, description="The cell parameter a in Angstrom")
    b: float = Field(default=0, description="The cell parameter b in Angstrom")
    c: float = Field(default=0, description="The cell parameter c in Angstrom")
    # Unit-cell angles.
    alpha: float = Field(default=0, description="The angle alpha in degree")
    beta: float = Field(default=0, description="The angle beta in degree")
    gamma: float = Field(default=0, description="The angle gamma in degree")
    # Kept as a string (empty when not reported) rather than an int.
    ccdc_number: str = Field(default="", description="The CCDC number of the material")

class MOFDataExtractor:
    """Small wrapper that asks a ChatOpenAI model for MOFCrystalData-shaped answers."""

    def __init__(self,model_name="gpt-4o-mini",api_key=None,temperature=0):
        """
        Build the chat model used by every call to `run`.

        Args:
            model_name (str): Model identifier forwarded to ChatOpenAI as `model`.
            api_key (str, optional): Forwarded to ChatOpenAI unchanged.
            temperature (float): Sampling temperature forwarded to ChatOpenAI.
        """
        self.llm = ChatOpenAI(model=model_name, temperature=temperature, api_key=api_key)

    def run(self, query, system_prompt=None):
        """
        Send one query to the LLM and return its structured answer.

        Args:
            query (str): Text sent as the user message.
            system_prompt (str, optional): Sent as a leading system message when
                truthy; None or an empty string means no system message.

        Returns:
            Whatever the structured-output runnable yields for the
            MOFCrystalData schema. The notebook treats it as a MOFCrystalData
            instance (it calls `.model_dump()` on it), not as a plain dict.
        """
        # Bind the response schema first, then assemble the conversation.
        schema_bound_llm = self.llm.with_structured_output(MOFCrystalData)

        # The system message, when present, must precede the user message.
        preamble = [{"role": "system", "content": system_prompt}] if system_prompt else []
        conversation = preamble + [{"role": "user", "content": query}]

        return schema_bound_llm.invoke(conversation)


In [12]:
"""
Script to extract crystallographic data from PDF text files using LLM.

This cell processes the DOIs listed in `dois`, reads the associated isotherm files,
and extracts crystallographic information for each adsorbent using the MOFDataExtractor.
The extracted data is saved as JSON files.
"""
# System-level instruction sent with every extraction request. It tells the model
# to answer only from the supplied text and to return zeros when the text holds
# no crystallographic data for the requested material.
sys_prompt = "You are an expert in extracting crystallographic information about metal-organic frameworks (MOFs). Provide accurate and concise answers based solely on the provided information. Do not include prior-knowledge or speculative details in your response. If the text does not contain the crystallographic information of the material inquired, return the cell parameters as [0,0,0,0,0,0]."

# A single extractor is reused for every adsorbent: MOFDataExtractor only stores
# the chat model and builds its message list per call, so nothing is shared
# between requests.
mof_data_extractor = MOFDataExtractor()
dois = ["10.1039c4cc09212k"]
#basepath = '/home/tdpham/Dropbox/Northwestern/work/Experimental_isotherm_project/OpenAI/steps/automated_steps/convert_pdf_to_txt/batch3'
# basepath:   <basepath>/<doi>/combined_text.txt holds the text extracted from the paper.
# isodb_path: <isodb_path>/<doi>/ holds the isotherm JSON files (plus .cif files).
basepath = '/home/tdpham/Dropbox/Northwestern/work/Experimental_isotherm_project/OpenAI/scripts/convert_pdf_to_text'
isodb_path = '/home/tdpham/Dropbox/Northwestern/work/Experimental_isotherm_project/OpenAI/steps/1_match_by_DOIs/csd/'

for doi in dois:
    # A directory named after the DOI in the current working directory causes
    # the DOI to be skipped.
    if os.path.isdir(doi):
        continue

    doi_dir = os.path.join(isodb_path, doi)
    try:
        isotherms = os.listdir(doi_dir)
    except FileNotFoundError:
        print(f"Missing {doi_dir}")
        continue

    # Get adsorbent name: every non-.cif entry is read as an isotherm JSON file.
    isotherms = [i for i in isotherms if '.cif' not in i]
    adsorbents = []  # unique names, in first-seen order
    for iso in isotherms:
        with open(os.path.join(doi_dir, iso), 'r') as f:
            iso_data = json.load(f)

        adsorbent = iso_data["adsorbent"]["name"]
        if adsorbent not in adsorbents:
            adsorbents.append(adsorbent)

    # Get text from pdf. A missing text file is reported and skipped, matching
    # the handling of a missing isotherm directory above, so one absent paper
    # does not abort the remaining DOIs.
    text_path = os.path.join(basepath, doi, 'combined_text.txt')
    try:
        with open(text_path, 'r') as f:
            text = f.read()
    except FileNotFoundError:
        print(f"Missing {text_path}")
        continue

    for adsorbent in adsorbents:
        print(adsorbent)
        task_prompt = f"Analyze the attached text, which contains information about metal-organic frameworks and answer the following questions:\n1. What is the crystal data of {adsorbent}?\n We are interested in the cell lengths (a, b and c) and cell angles (alpha, beta and gamma). This information is often mentioned in the crystallographic data section. If the text did not mention a certain parameter, infer it from the crystal system, or point group.\n2. What is the CCDC number of {adsorbent}? \n\n{text}\n\""
        # Pass the system prompt explicitly; without it the model never sees the
        # "answer from the text only / zeros when absent" instructions.
        task_output = mof_data_extractor.run(task_prompt, system_prompt=sys_prompt)

        # Adsorbent names come from the data files and may contain a path
        # separator, which would otherwise be read as a directory component.
        safe_name = adsorbent.replace(os.sep, "_")
        with open(f"{safe_name}.json", "w") as f:
            json.dump(task_output.model_dump(), f)
    print(adsorbents)

NU-700
MOF-143
['NU-700', 'MOF-143']
