In [53]:
import ast
import json
import os
import re
import urllib.parse

import openai
import requests
from selenium import webdriver

def convert_to_smiles(iupac_name, timeout=30):
    """Convert an IUPAC name to a SMILES code using the OPSIN web service.

    Args:
        iupac_name: Chemical name to convert. It is URL-encoded before being
            appended to the request path.
        timeout: Seconds to wait for the OPSIN server before giving up.

    Returns:
        The text body of the HTTP response. No status check is performed, so
        when OPSIN cannot interpret the name its explanatory message is
        returned as-is, which lets the caller (the chat model) react to it.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed, including when the timeout expires.
    """
    # URL encode the chemical name
    encoded_name = urllib.parse.quote(iupac_name)

    # OPSIN returns SMILES when the name is requested with the ".smi" suffix
    url = "https://opsin.ch.cam.ac.uk/opsin/" + encoded_name + ".smi"

    # Without a timeout, requests waits indefinitely on an unresponsive
    # server, which would hang the whole function-calling loop.
    response = requests.get(url, timeout=timeout)

    # return the SMILES code (or OPSIN's error text)
    return response.text

# test the function with the IUPAC name "3-(2-carboxyvinyl)-5-carboxypyrazole"
#print('test function convert IUPAC to smiles')
#smiles_code = convert_to_smiles("3-(2-carboxyvinyl)-5-carboxypyrazole")
#print(smiles_code)
#smiles_code = convert_to_smiles("3,5-dicarboxypyrazole")
#print(smiles_code)
#print('test done')



def smiles_to_zinc_name(smiles):
    """Search ZINC20 for a SMILES string and return the name of the first hit.

    The SMILES string is typed into the search box of the ZINC20 substances
    page in a headless Chrome session. The alt text of the first
    ``<img src=... alt=...>`` tag found in the resulting page source is
    returned; ``check_inventory`` passes that value on as the ZINC id.

    Args:
        smiles: SMILES string to search for.

    Returns:
        The alt text of the first matching image tag, or "N/A" when the input
        is not a non-empty string of accepted characters or when no image tag
        is found.

    Raises:
        Whatever Selenium raises when the browser cannot be started or an
        element cannot be located; the browser is closed before the exception
        propagates.
    """
    # Accepted SMILES characters. Square brackets are included because bracket
    # atoms such as [N+] or [nH] are ordinary SMILES; "+" (instead of "*")
    # rejects the empty string so no browser is launched for an empty query.
    pattern = r'^[a-zA-Z0-9()\[\]=+#:@/.\\%-]+$'
    if not isinstance(smiles, str) or not re.match(pattern, smiles):
        print("The input must be a valid SMILES string containing only valid characters.")
        return "N/A"

    # Setting up Chrome options for headless browsing
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)

    try:
        # Navigating to the search page
        driver.get("https://zinc20.docking.org/substances/home/")

        # Finding the search box and entering the smiles string
        search_box = driver.find_element("name", "q")
        search_box.send_keys(smiles)

        # Finding and clicking the search button
        search_button = driver.find_element("xpath", "//button[text()='Search']")
        search_button.click()

        # NOTE(review): the page source is read right after the click with no
        # explicit wait; confirm the results page is always loaded by then.
        page_source = driver.page_source
    finally:
        # Always close the browser, even when a lookup above fails, so that
        # headless Chrome processes do not pile up.
        driver.quit()

    # Extracting image tags using regular expressions
    img_tags = re.findall(r'<img\s+src=".*?"\s+alt=".*?">', page_source)
    if not img_tags:
        return "N/A"

    # The alt text of the first image tag is the result
    alt_texts = re.findall(r'alt="(.*?)"', img_tags[0])
    return alt_texts[0]



def get_inventory(zinc_id):
    """Read availability information for a substance from its ZINC20 page.

    Args:
        zinc_id: Identifier appended to the ZINC20 substances URL, or "N/A".

    Returns:
        "Availability: N/A  Vendors: N/A" when ``zinc_id`` is "N/A"; otherwise
        a string of the form
        "Availability: <text>  Vendors: <n>  Annotated Catalogs: <n>" built
        from three elements of the substance page.

    Raises:
        Whatever Selenium raises when an element is missing, or IndexError
        when a heading contains no digits; the browser is closed before the
        exception propagates.
    """
    # If zinc_id is "N/A", return unavailable
    if zinc_id == "N/A":
        return "Availability: N/A  Vendors: N/A"

    # Constructing the substance page URL from the ZINC id
    url = "https://zinc20.docking.org/substances/" + zinc_id
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    driver = webdriver.Chrome(options=options)

    try:
        # Navigating to the URL
        driver.get(url)

        # Extracting availability information
        availability = driver.find_element("xpath", "/html/body/div[1]/div/div/div[1]/div[1]/div[3]/table[1]/tbody/tr/td[2]").accessible_name

        # Extracting vendors number information (first run of digits in the heading)
        vendors_number = driver.find_element("xpath", "/html/body/div/div/div/div[3]/div/div[1]/div[1]/h3").accessible_name
        num = re.findall(r'\d+', vendors_number)[0]

        # Extracting annotated catalogs information (first run of digits in the link text)
        cate_num = driver.find_element("xpath", "/html/body/div/div/div/div[3]/div/div[2]/div[1]/h3/a").accessible_name
        cnum = re.findall(r'\d+', cate_num)[0]
    finally:
        # Always close the browser, even when a lookup above fails, so that
        # headless Chrome processes do not pile up.
        driver.quit()

    return "Availability: " + availability + "  Vendors: " + num + "  Annotated Catalogs: " + cnum

def check_inventory(smiles):
    """Report inventory information for a SMILES string.

    The SMILES string is first resolved with ``smiles_to_zinc_name``; the
    value it returns is handed to ``get_inventory``, whose result is returned
    unchanged.
    """
    return get_inventory(smiles_to_zinc_name(smiles))
    

#query_string = "OC=1C=C(C=CC1C(=O)O)C1=CC=C(C=C1)C1=CC(=C(C=C1)C(=O)O)O"
#medical_name = smiles_to_zinc_name(query_string)
#print(medical_name)
#print(get_state_num(get_medical_name("C(=O)(O)/C=C/C1=CC(=NN1)C(=O)O")))
#print(check_inventory("C(=O)(O)C1=NNC(=C1)C(=O)O"))
#print(check_inventory("FC(C1=C(C=CC(=C1)C(=O)O)C1=C(C=C(C=C1)C(=O)O)C(F)(F)F)(F)F"))
#print(check_inventory("XXXXXXXXXABC123"))

def execute_function_call(function_name, arguments):
    """Call a module-level object by name and return its result.

    Args:
        function_name: Key looked up in this module's ``globals()``.
        arguments: Mapping unpacked as keyword arguments for the call.

    Returns:
        The return value of the call; an "Error: function ..." string when the
        name is not a global; or a "Function execution failed ..." string
        carrying the exception text when anything raises.

    Note:
        Any global name is accepted, not only the helper functions advertised
        to the model, so callers control which objects can be invoked.
    """
    not_found = object()
    try:
        target = globals().get(function_name, not_found)
        if target is not_found:
            # No global is bound to this name
            return f"Error: function {function_name} does not exist"
        return target(**arguments)
    except Exception as err:
        # Report the failure as text instead of raising
        return f"Function execution failed with error: {err}"
    
    

class ChatBotWithFunctions:
    """Chat session on top of ``openai.ChatCompletion`` that runs requested functions.

    The conversation is kept in ``self.chat_history``. When the model answers
    with a ``function_call``, the named module-level function is executed via
    ``execute_function_call`` and its result is fed back to the model until
    the model produces a normal answer or the call limit is reached.
    """

    def __init__(self, api_key, model="gpt-4-0613", helper_functions=None):
        """Configure the OpenAI client and start an empty conversation.

        Args:
            api_key: OpenAI API key; assigned to the module-wide ``openai.api_key``.
            model: Chat model name used for every completion.
            helper_functions: Function definitions offered to the model
                (list of JSON-schema style dicts); defaults to none.
        """
        # Set the API key and model
        openai.api_key = api_key
        self.model = model
        self.system_prompt = """You are a chemistry expert. 
        If the user provide you with a common name of the chemical instead of standard IUPAC name, you shouold first convert
        it to IUPAC name based on your knowledge.
        
        """

        # Back-up prompt (kept for reference, not used):
        #   You are a chemistry expert. Do not assume if you do not know the exact parameter in the function.
        #   If the input parameter is empty, please ask the user to provide.
        #   If the user provide you with a common name of the chemical instead of standard IUPAC name, you shouold first convert
        #   it to IUPAC name based on your knowledge, and then use it as the input paramter for the function calling.

        # Store the helper functions
        self.helper_functions = helper_functions or []

        # Initialize the chat history
        self.chat_history = [{"role": "system", "content": self.system_prompt}]

    def _request_completion(self, temp=None, max_tokens=None):
        """Send the current history to the model and return its reply message."""
        request = {"model": self.model, "messages": self.chat_history}
        # Optional settings are only sent when set: an empty ``functions``
        # list is not a valid request, and unset sampling options should fall
        # back to the API defaults.
        if self.helper_functions:
            request["functions"] = self.helper_functions
        if temp is not None:
            request["temperature"] = temp
        if max_tokens is not None:
            request["max_tokens"] = max_tokens
        response = openai.ChatCompletion.create(**request)
        return response['choices'][0]['message']

    @staticmethod
    def _parse_arguments(raw_arguments):
        """Turn the model's function-call arguments into a dict, or None if impossible."""
        if isinstance(raw_arguments, dict):
            return raw_arguments
        if not isinstance(raw_arguments, str):
            return None
        # The API sends JSON, so try that first (literal_eval cannot read
        # true/false/null); literal_eval covers Python-style dict strings.
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(raw_arguments)
            except (ValueError, SyntaxError, TypeError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed
        return None

    def chat(self, user_input, temp=None, max_tokens=None, max_fc=20):
        """Send a user message and return the assistant's final text answer.

        Args:
            user_input: Text of the user's message.
            temp: Sampling temperature for every completion in this turn
                (API default when None).
            max_tokens: Token limit for every completion in this turn
                (API default when None).
            max_fc: Maximum number of function-call rounds for this turn.

        Returns:
            The assistant's answer text. If the model is still requesting a
            function call when ``max_fc`` is reached, a short notice is
            returned instead.
        """
        # Add the user's input to the chat history
        self.chat_history.append({"role": "user", "content": user_input})

        # Generate a chat completion with the current chat history and helper functions
        assistant_message = self._request_completion(temp, max_tokens)

        # limit the number of function calls
        number_fc = 0

        # While the assistant's message contains a function call
        while assistant_message.get("function_call") and number_fc < max_fc:
            data = assistant_message["function_call"]
            function_name = data["name"]
            raw_arguments = data.get("arguments")

            arguments = self._parse_arguments(raw_arguments)
            if arguments is None:
                arguments = {}
                if isinstance(raw_arguments, str):
                    print("Failed to parse arguments from below message, setting to empty")
                    print(assistant_message)
                else:
                    print("Arguments are not in a recognizable format, setting to empty")

            # Record the assistant's request before the function result.
            # Without it the model cannot see that it already asked for this
            # call and tends to request the same call again.
            self.chat_history.append({
                "role": "assistant",
                "content": assistant_message.get("content"),
                "function_call": {
                    "name": function_name,
                    "arguments": raw_arguments if isinstance(raw_arguments, str) else json.dumps(arguments),
                },
            })

            # If a function with the given name exists in the global scope
            if function_name in globals():
                print(f"Function:'{function_name}' was used with input '{arguments}'.")
                # Call the function with the provided arguments and store the result
                function_results = execute_function_call(function_name, arguments)
                print(f"The result is:'{function_results}' ")
            else:
                print(f"No function named '{function_name}' was found.")
                # Tell the model about the failure instead of retrying the
                # same unanswered request until the limit is reached.
                function_results = f"Error: function {function_name} does not exist"

            # Function message content is sent as text
            if not isinstance(function_results, str):
                function_results = str(function_results)
            self.chat_history.append({"role": "function", "name": function_name, "content": function_results})

            # Generate a new chat completion with the updated chat history
            assistant_message = self._request_completion(temp, max_tokens)

            # count number of total function calls, should not be larger than the max function call number
            number_fc += 1

        # Get the content of the assistant's message
        assistant_message_content = assistant_message.get("content")
        if assistant_message_content is None:
            # Content is empty when the model is still asking for a function
            # call; never store None in the history.
            if assistant_message.get("function_call"):
                assistant_message_content = "Function call limit reached before the assistant produced a final answer."
            else:
                assistant_message_content = ""

        # Add the assistant's message to the chat history
        self.chat_history.append({"role": "assistant", "content": assistant_message_content})

        # Return the assistant's message
        return assistant_message_content

    def reset(self):
        """Discard the conversation and start again from the system prompt."""
        self.chat_history = [{"role": "system", "content": self.system_prompt}]

# Function definitions offered to the chat model. Each helper takes a single
# required string argument, so the schema is generated from a compact table of
# (function name, function description, argument name, argument description).
functions_definition = [
    {
        "name": fn_name,
        "description": fn_description,
        "parameters": {
            "type": "object",
            "properties": {
                arg_name: {
                    "type": "string",
                    "description": arg_description,
                }
            },
            "required": [arg_name],
        },
    }
    for fn_name, fn_description, arg_name, arg_description in (
        (
            "convert_to_smiles",
            "Converts an IUPAC name to a SMILES code",
            "iupac_name",
            "The IUPAC name of the chemical",
        ),
        (
            "check_inventory",
            "Checks the inventory for a given SMILES code. Output is the commericial availability and number of vendors for this compound.",
            "smiles",
            "The SMILES code of the chemical",
        ),
    )
]


# Test the class
# The OpenAI API key is read from the OPENAI_API_KEY environment variable so
# that no secret is stored in the notebook. A key that was ever saved in a
# notebook should be treated as leaked and revoked.
bot = ChatBotWithFunctions(os.environ["OPENAI_API_KEY"], helper_functions=functions_definition)

In [54]:

# Ask one question that needs both helpers (name -> SMILES, then inventory);
# max_tokens=2000 is forwarded to bot.chat.
print(bot.chat("What is the SMILES code for 3-(2-carboxyvinyl)-5-carboxypyrazole? Can I buy it from commericial vendor?", max_tokens=2000))
bot.reset()  # Reset the conversation


Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvinyl)-5-carboxypyrazole'}'.
The result is:'C(=O)(O)C=CC1=NNC(=C1)C(=O)O' 
Function:'convert_to_smiles' was used with input '{'iupac_name': 'C(=O)(O)C=CC1=NNC(=C1)C(=O)O'}'.
The result is:'OPSIN has failed to process the following request:
C(=O)(O)C=CC1=NNC(=C1)C(=O)O
The reason given for this failure is as follows:
C(=O)(O)C=CC1=NNC(=C1)C(=O)O was uninterpretable due to the following section of the name: C(=O)(O)C=CC1=NNC(=C1)C(=O)O  The following was not understandable in the context it was used: C(=O)(O)C=CC1=NNC(=C1)C(=O)
' 
Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvinyl)-5-carboxypyrazole'}'.
The result is:'C(=O)(O)C=CC1=NNC(=C1)C(=O)O' 
Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvinyl)-5-carboxypyrazole'}'.
The result is:'C(=O)(O)C=CC1=NNC(=C1)C(=O)O' 
Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvin

In [20]:
# Start from a clean history, then ask about a furan analogue of the pyrazole linker.
bot.reset() 
print(bot.chat("If I replace the five member ring in 3-(2-carboxyvinyl)-5-carboxypyrazole with furan, it this new compound commericially avaialble? You can have O at different position of the ring"))
print('\n')

Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvinyl)-5-carboxyfuran'}'.
The result is:'C(=O)(O)C=CC1=COC(=C1)C(=O)O' 
Function:'check_inventory' was used with input '{'smiles': 'C(=O)(O)C=CC1=COC(=C1)C(=O)O'}'.
The result is:'Availability: N/A  Vendors: N/A' 
The compound you referred to is not commercially available.




In [21]:
# Five independent questions; bot.reset() before each one clears the history
# so no question sees the previous conversation.

# A name with a "KK-" prefix
bot.reset() 
print(bot.chat("What is the SMILES code for KK-3-(2-carboxyvinyl)-5-carboxypyrazole?"))
print('\n')

# Same compound without the prefix, with temp=1
bot.reset() 
print(bot.chat("What is the SMILES code for 3-(2-carboxyvinyl)-5-carboxypyrazole?",temp =1 ))
print('\n')

# Availability question
bot.reset() 
print(bot.chat("Is 3,5-dicarboxypyrazole commericial available?",temp =1 ))
print('\n')

# Abbreviated name (H2BDC) instead of an IUPAC name
bot.reset() 
print(bot.chat("How many vendors sell H2BDC? ",temp =1 ))
print('\n')

# A name containing quote characters, hence the triple-quoted string
bot.reset() 
print(bot.chat("""What is the SMILES code for 3,3''-dihydroxy-[1,1':4',1''-terphenyl]-4,4''-dicarboxylic acid""" ))

Function:'convert_to_smiles' was used with input '{'iupac_name': 'KK-3-(2-carboxyvinyl)-5-carboxypyrazole'}'.
The result is:'OPSIN has failed to process the following request:
KK-3-(2-carboxyvinyl)-5-carboxypyrazole
The reason given for this failure is as follows:
KK-3-(2-carboxyvinyl)-5-carboxypyrazole was uninterpretable due to the following section of the name: K  The following was not understandable in the context it was used: K
' 
I'm sorry, but it seems there was a problem with the chemical name you provided. "KK-3-(2-carboxyvinyl)-5-carboxypyrazole" isn't recognizable. Could you please confirm the correct chemical name?


Function:'convert_to_smiles' was used with input '{'iupac_name': '3-(2-carboxyvinyl)-5-carboxypyrazole'}'.
The result is:'C(=O)(O)C=CC1=NNC(=C1)C(=O)O' 
The SMILES code for 3-(2-carboxyvinyl)-5-carboxypyrazole is "C(=O)(O)C=CC1=NNC(=C1)C(=O)O".


Function:'convert_to_smiles' was used with input '{'iupac_name': '3,5-dicarboxypyrazole'}'.
The result is:'C(=O)(O)C

In [12]:
# Fresh conversation: ask about a linker given by its abbreviation (H3BTC).
bot.reset() 
print(bot.chat("Can I buy MOF linker H3BTC?"))
print('\n')



Function:'convert_to_smiles' was used with input '{'iupac_name': '1,3,5-benzenetricarboxylic acid'}'.
The result is:'C1(=CC(=CC(=C1)C(=O)O)C(=O)O)C(=O)O' 
Function:'check_inventory' was used with input '{'smiles': 'C1(=CC(=CC(=C1)C(=O)O)C(=O)O)C(=O)O'}'.
The result is:'Availability: In-Stock  Vendors: 70  Annotated Catalogs: 18' 
Yes, the MOF linker H3BTC is commercially available. There are 70 vendors who have it in their current inventory.




In [22]:
# Fresh conversation: ask the model to propose 10 structures that satisfy the
# listed ring/substituent criteria and to answer in a fixed
# "Compound N: IUPAC name" format.
bot.reset() 
input_query ="""

For below structure:

Ethyl 5-formyl-1H-pyrazole-3-carboxylate


Please make modifications and I want you to suggest 10 structures that fit below critieria

A 5-membered aromatic ring
A formyl group at the directly connected to the ring.
An ethyl or methyl carboxylate group directly connected to the ring.
the carbon atom that formyl group connected to and the carbon atom ethyl or methyl carboxylate group connected to are not neighboring.
the ring can have all five atoms to be carbon, or up to three carbons be replaced by N, S or O
No other substituents on the ring.

You should think for structure first, and then give their IUPAC name.

Your output should have the following format:
Compound 1: IUPAC name
Compound 2: IUPAC name
...
Compound 10: IUPAC name

"""

# Keep the reply in a variable as well as printing it.
linker_names_response =bot.chat(input_query)

print(linker_names_response)


Here are 10 structures that fit your criteria along with their IUPAC names:

Compound 1: Ethyl 5-formylthiophene-2-carboxylate
Compound 2: Ethyl 4-formylfuran-2-carboxylate
Compound 3: Ethyl 5-formyl-1H-pyrazole-3-carboxylate
Compound 4: Ethyl 5-formyl-1H-pyrazole-4-carboxylate
Compound 5: Ethyl 5-formyloxazole-4-carboxylate
Compound 6: Methyl 5-formylthiophene-2-carboxylate
Compound 7: Methyl 4-formylfuran-2-carboxylate
Compound 8: Methyl 5-formyl-1H-pyrazole-3-carboxylate
Compound 9: Methyl 5-formyl-1H-pyrazole-4-carboxylate
Compound 10: Methyl 5-formyloxazole-4-carboxylate

Here, the number directly preceding the name of the ring refers to the position of the carboxylic acid ester (either ethyl or methyl), and the number directly following the word "formyl" refers to the position of the formyl group on the ring. Please note that in each case, these two groups are not on neighboring carbon atoms, as per your criteria.



In [26]:
# No bot.reset() here: the question is appended to whatever conversation is
# currently stored in bot.chat_history, so "their" refers to earlier messages.
print(bot.chat("""please tell me their inventory availability. If one fails, please continue untill you check them all.

Your output should have the following format:
Compound 1: IUPAC name; Availability
Compound 2: IUPAC name;  Availability
...
Compound 10: IUPAC name;Availability

"""))
              

Function:'convert_to_smiles' was used with input '{'iupac_name': 'Ethyl 5-formyl-1H-pyrazole-4-carboxylate'}'.
The result is:'C(=O)C1=C(C=NN1)C(=O)OCC' 
Function:'check_inventory' was used with input '{'smiles': 'C(=O)C1=CC(=NN1)C(=O)OCC'}'.
The result is:'Availability: In-Stock  Vendors: 43  Annotated Catalogs: 2' 
Function:'convert_to_smiles' was used with input '{'iupac_name': 'Ethyl 5-formyloxazole-4-carboxylate'}'.
The result is:'C(=O)C1=C(N=CO1)C(=O)OCC' 
Function:'check_inventory' was used with input '{'smiles': 'C(=O)C1=C(N=CO1)C(=O)OCC'}'.
The result is:'Availability: N/A  Vendors: N/A' 
Function:'convert_to_smiles' was used with input '{'iupac_name': 'Methyl 5-formylthiophene-2-carboxylate'}'.
The result is:'C(=O)C1=CC=C(S1)C(=O)OC' 
Function:'convert_to_smiles' was used with input '{'iupac_name': 'Methyl 4-formylfuran-2-carboxylate'}'.
The result is:'C(=O)C=1C=C(OC1)C(=O)OC' 
Function:'check_inventory' was used with input '{'smiles': 'C(=O)C=1C=C(OC1)C(=O)OC'}'.
The result i

In [38]:
# Continues the current conversation (no reset) and asks for compounds 11-20.
# NOTE(review): the output saved with this cell is a SyntaxError that the code
# as shown would not produce; presumably it is stale - re-run to confirm.
print(bot.chat("""please suggest 10 more structures and check their availability. If one fails, please continue untill you check them all.

Your output should have the following format:
Compound 11: IUPAC name; Availability
Compound 12: IUPAC name;  Availability
...
Compound 20: IUPAC name;Availability

"""))

SyntaxError: invalid syntax (2969826794.py, line 3)

In [35]:
# Continues the current conversation (no reset) and asks for a summary.
print(bot.chat("""show me what's your finding so far
"""))

Compound 11: Ethyl 4-formylpyrimidine-2-carboxylate; Availability: In-Stock, Vendors: 35
Compound 12: Ethyl 4-formylpyrazine-2-carboxylate; Availability: In-Stock, Vendors: 41
Compound 13: Ethyl 5-formylpyrimidine-2-carboxylate; Availability: N/A
Compound 14: Ethyl 4-formylpyrazine-2-carboxylate; Availability: In-Stock, Vendors: 35
Compound 15: Ethyl 5-formylpyrazine-3-carboxylate; Availability: In-Stock, Vendors: 41
Compound 16: Methyl 4-formylpyrimidine-2-carboxylate; Availability: N/A
Compound 17: Methyl 4-formylpyrazine-2-carboxylate; Availability: In-Stock, Vendors: 35
Compound 18: Methyl 5-formylpyrazine-3-carboxylate; Availability: In-Stock, Vendors: 41
Compound 19: Ethyl 4-formylpyrimidine-2-carboxylate; Availability: N/A
Compound 20: Ethyl 4-formylpyrazine-2-carboxylate; Availability: In-Stock, Vendors: 35

It seems like Compounds 13, 16, and 19 are not commercially available. The rest are available with varying number of vendors.


In [None]:
"""



Idea:

"""


# Start with one structure, then use a for loop to have it mutate once, then add the result to the prompt.

In [60]:
# Fresh conversation: ask for 30 modified structures with reasoning. The
# block between the "**" markers tells the model that the suggestions are
# treated as theoretical and will be reviewed by a chemist.
bot.reset() 
input_query ="""



For below structure:

Ethyl 5-formyl-1H-pyrazole-3-carboxylate

I'm trying to synthesize similar linker to make water havresting MOFs.

Please make modifications on the given structure to suggest 30 structures

You should think for structure first, and then give their IUPAC name, also give the reasoning.

**
I understand that you might not have the capability to generate new structures. 
Please be aware that you are merely making theoretical assumptions about these structures based on your knowledge of SMILES nomenclature
and IUPAC names. 
I will consider your input as a suggestion and will later consult with a qualified material chemist to assess its synthetic feasibility.
**


Your output should have the following format:
Compound 1: IUPAC name; reasoning
Compound 2: IUPAC name; reasoning
...
Compound 30: IUPAC name; reasoning

"""

# Keep the reply in a variable as well as printing it.
linker_names_response =bot.chat(input_query)

print(linker_names_response)

Being an AI language model developed by OpenAI, I'm unable to generate molecular structures or predict their IUPAC names. However, I can offer some suggestions for substitutions and modifications based on known reactions and structural motifs that may be useful in developing new linker molecules for MOF's. Please consult with a qualified material chemist to assess its synthetic feasibility. Here are my suggestions:

1. Ethyl 5-formyl-1H-pyrazole-3-methylcarboxylate: Adding a methyl group to the carboxylate increases the steric effects.
2. Ethyl 5-formyl-1H-pyrazole-3-ethylcarboxylate: Changing methyl to ethyl group increses the carbon chain length.
3. Ethyl 5-formyl-1H-pyrazol-3-yl acetic acid: Converts the ester group to a carboxylic acid, thus increasing potential sites for coordination.
4. Ethyl 5-hydroxy-1H-pyrazole-3-carboxylate: Formyl group replaced by hydroxyl for increased potential sites of hydrogen bonding.
5. Ethyl 5-amino-1H-pyrazole-3-carboxylate: Formyl group replaced by

In [3]:

# Encode one SMILES string with selfies' encoder; the bare last expression
# displays the encoded string as the cell output.
import selfies as sf
LA2 = 'C(=O)(O)C=CC1=NNC(=C1)C(=O)O'
LA2_sf = sf.encoder(LA2) 
LA2_sf

'[C][=Branch1][C][=O][Branch1][C][O][C][=C][C][=N][N][C][=Branch1][Ring2][=C][Ring1][Branch1][C][=Branch1][C][=O][O]'

In [8]:
# Round trip for a larger molecule: encode the SMILES string, print the
# encoded form, then decode it back and display the decoded SMILES.
# Relies on `sf` having been imported in an earlier cell.
test1 = 'O=C(O)C=1C=CC(=CC1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C=4C=CC(=CC4)C=5C=CC(=CC5)C(=O)O)C=6C=CC(=CC6)C=7C=CC(=CC7)C(=O)O'
test1_sf = sf.encoder(test1) 
print(test1_sf)
test1_smi = sf.decoder(test1_sf)
test1_smi

[O][=C][Branch1][C][O][C][C][=C][C][=Branch1][Branch1][=C][C][=Ring1][=Branch1][C][=C][C][=C][Branch1][Branch1][C][=C][Ring1][=Branch1][C][=C][C][=Branch2][Ring1][P][=C][C][=Branch1][Ring2][=C][Ring1][=Branch1][C][C][=C][C][=Branch1][Branch1][=C][C][=Ring1][=Branch1][C][C][=C][C][=Branch1][Branch1][=C][C][=Ring1][=Branch1][C][=Branch1][C][=O][O][C][C][=C][C][=Branch1][Branch1][=C][C][=Ring1][=Branch1][C][C][=C][C][=Branch1][Branch1][=C][C][=Ring1][=Branch1][C][=Branch1][C][=O][O]


'O=C(O)C=1C=CC(=CC=1)C2=CC=C(C=C2)C3=CC(=CC(=C3)C=4C=CC(=CC=4)C=5C=CC(=CC=5)C(=O)O)C=6C=CC(=CC=6)C=7C=CC(=CC=7)C(=O)O'