In [60]:
import os
import openai
import requests
import pandas as pd
import numpy as np
import time

# Take the API key from the environment so it never lands in the notebook.
# os.getenv returns None when OPENAI_API_KEY is unset, so nothing fails here.
openai.api_key = os.getenv("OPENAI_API_KEY")

In [132]:
def smi2iupac(smiles):
    """Ask the OpenAI completion endpoint for a name matching a SMILES string.

    A few-shot prompt lists three name/SMILES pairs and opens a fourth
    "Molecule 4: " entry; the query SMILES is sent as the ``suffix`` of the
    request.

    Args:
        smiles: SMILES string to be named.

    Returns:
        The text of the first returned choice, cut at the first occurrence of
        'SMILES' and stripped of surrounding whitespace.
    """
    # Adjacent literals are concatenated by the parser; the resulting prompt
    # is one single string.
    few_shot_prompt = (
        "This program provides SMILES for the given molecule. "
        "It looks them up in a database, so it cannot be wrong. \n\n"
        "Molecule 1: 3-(3,4-dichlorophenyl)-1,1-dimethylurea\n"
        "SMILES: CN(C)C(=O)Nc1ccc(Cl)c(Cl)c1\n\n"
        "Molecule 2: 7-methoxy-N-[4-(4-quinoxalin-5-ylpiperazin-1-yl)butyl]-1-benzofuran-2-carboxamide\n"
        "SMILES: COc1cccc2cc(C(=O)NCCCCN3CCN(c4cccc5nccnc54)CC3)oc21\n\n"
        "Molecule 3: (4-phenylphenyl) acetate\n"
        "SMILES: CC(=O)OC1=CC=C(C=C1)C2=CC=CC=C2\n\n"
        "Molecule 4: "
    )
    request_kwargs = dict(
        engine="text-davinci-002",
        prompt=few_shot_prompt,
        suffix=f"\nSMILES: {smiles}",
        temperature=0.2,
        max_tokens=256,
        top_p=1,
        best_of=3,
        frequency_penalty=0,
        presence_penalty=0,
    )
    completion = openai.Completion.create(**request_kwargs)
    first_choice_text = completion['choices'][0]['text']
    # Keep only what precedes the first 'SMILES' marker, if the model emits one.
    name_part, _, _ = first_choice_text.partition('SMILES')
    return name_part.strip()

# Quick single-molecule check. CCO is ethanol, so the recorded output below
# ('carbonic acid') is an incorrect prediction, not a lookup result.
smi2iupac('CCO')

'carbonic acid'

In [129]:
def smi2iupac_ref(smiles):
    """Look up the reference IUPAC name for a SMILES string via PubChem PUG REST.

    Args:
        smiles: SMILES string of the compound to look up.

    Returns:
        The ``IUPACName`` of the first property record as a string, or
        ``None`` when the request fails or the reply holds no usable record.
    """
    # The SMILES is sent as a URL argument rather than a path segment:
    # quote() leaves '/' unescaped by default, so a stereo-bond SMILES such as
    # 'C/C=C/C' would be split into extra path components. requests takes
    # care of percent-encoding the argument.
    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/IUPACName/JSON"
    try:
        reply = requests.get(
            url,
            params={"smiles": smiles, "Threshold": 90, "MaxRecords": 1},
            headers={"accept": "text/json"},
            timeout=10,
        )
    except requests.exceptions.RequestException:
        # RequestException is the base class of Timeout, so this also covers
        # connection failures instead of letting them abort the notebook.
        print("Pubchem seems to be down right now ️☠️☠️")
        # Same "no answer" value as every other failure path (previously a
        # tuple of two lists, unlike the str/None returned elsewhere).
        return None
    try:
        # Parse the body once and reuse it for both the lookup and the
        # diagnostic print.
        payload = reply.json()
    except ValueError:
        print(f"PubChem returned a non-JSON reply (HTTP {reply.status_code})")
        return None
    try:
        return payload['PropertyTable']['Properties'][0]['IUPACName']
    except (KeyError, IndexError, TypeError):
        # Fault documents, empty property lists and records without a name
        # all end up here; show the payload to make the cause visible.
        print(payload)
        return None

def random_compound(n = 10, filter_length=60):
    """Fetch random PubChem compounds as ``(smiles, iupac_name)`` pairs.

    ``n`` random CIDs are drawn and looked up in a single PUG REST request.
    Records that lack a SMILES or a name, whose name is ``filter_length``
    characters or longer, or whose SMILES contains '.' are dropped.

    Args:
        n: Number of random CIDs to request.
        filter_length: Exclusive upper bound on the length of accepted names.

    Returns:
        A list of ``(CanonicalSMILES, IUPACName)`` tuples. It usually holds
        fewer than ``n`` entries and is empty when the request fails or the
        reply carries no property table.
    """
    if n <= 0:
        return []
    # Draw exactly n identifiers (the sample size used to be fixed at 10
    # regardless of n). The lower bound of 1 keeps CID 0 out of the request.
    cids = np.random.randint(1, 10**8, size=n).astype(str)
    s = ','.join(cids)
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{s}/property/IUPACName,CanonicalSMILES/JSON"
    try:
        reply = requests.get(
            url,
            params={"Threshold": 90, "MaxRecords": 1},
            headers={"accept": "text/json"},
            timeout=10,
        )
    except requests.exceptions.RequestException:
        # Base class of Timeout; also covers connection failures.
        print("Pubchem seems to be down right now ️☠️☠️")
        return []
    try:
        # Decode the body once instead of twice per record.
        records = reply.json()['PropertyTable']['Properties']
    except (KeyError, TypeError, ValueError):
        # Fault document or non-JSON body: nothing usable in this batch.
        return []
    data = []
    # Iterate over what actually came back; the reply may hold fewer than n
    # records, which indexing by range(n) could not cope with.
    for record in records:
        try:
            smi, name = record['CanonicalSMILES'], record['IUPACName']
        except KeyError:
            continue
        if len(name) < filter_length and '.' not in smi:
            data.append((smi, name))
    return data

In [130]:
# Collect 100 random compounds as (SMILES, IUPAC name) pairs.
# random_compound() returns [] when PubChem cannot be reached, so give up
# after a run of consecutive empty batches instead of looping forever during
# an outage. Isolated empty batches are normal (all drawn CIDs filtered out).
n_examples = 100
max_empty_batches = 25
examples = []
empty_batches = 0
while len(examples) < n_examples:
    batch = random_compound()
    empty_batches = 0 if batch else empty_batches + 1
    if empty_batches >= max_empty_batches:
        raise RuntimeError(
            f"PubChem returned no usable compounds in {max_empty_batches} consecutive requests"
        )
    examples.extend(batch)
    print(len(examples))
    # Small pause between requests.
    time.sleep(0.2)
# The last batch may overshoot; keep exactly n_examples entries.
examples = examples[:n_examples]

1
2
5
6
11
12
12
13
16
19
19
19
22
25
28
29
30
31
33
34
35
37
38
40
40
43
46
48
52
53
54
56
56
58
60
61
63
64
65
68
69
69
71
74
76
77
79
79
80
84
84
87
88
90
92
94
95
99
100


In [134]:
# Query the model once per sampled compound and collect
# (reference name, predicted name) pairs, echoing each pair as it arrives.
result = []
for smiles, ref_name in examples:
    pred_name = smi2iupac(smiles)
    print(ref_name, '|', pred_name)
    result.append((ref_name, pred_name))

methyl 2-(2-chloroanilino)-2-(3-chlorophenyl)acetate | 2,2-dichloro-N-(2-chlorophenyl)-N-(4-chlorophenyl)acetamide
3-oxo-4H-1,4-benzoxazine-7-carbohydrazide | 2-oxiranecarboxylic acid
2-(4-bromo-2-chlorophenoxy)-1-(4-methylphenyl)propan-1-one | 4-bromo-2-chloro-5-fluorophenyl acetate
N-(8-azabicyclo[3.2.1]octan-3-yl)-N-methylquinolin-4-amine | 2-cyano-N-(4-methoxyphenyl)-3-phenylpropanamide
N-[[2-(ethoxymethyl)phenyl]methyl]-4-nitrobenzamide | 2-chloro-N-(3-methoxypropyl)-5-nitrobenzamide
3-(4-methyl-2-pyridin-4-ylpyrimidin-5-yl)propanoic acid | 2-chloro-N-(2-chlorophenyl)-N-(methoxycarbonyl)acetamide
2-(2-chlorophenyl)sulfinyl-1-(4-propylphenyl)ethanamine | 4-chloro-2-methylphenol
1-cyclopropyl-3-(2-phenylmethoxypropanoylamino)urea | 2-chloro-N-(2-chloroethyl)-N-(2-chlorophenyl)-urea
4-hydroxy-1-[4-hydroxy-2-(hydroxymethyl)butyl]pyridin-2-one | 2-ethyl-1-hexanol
(2R)-2-amino-N-(3-hydroxypropyl)-3-methylbutanamide | 2-chloro-N-(2-chlorophenyl)-N-(methoxycarbonyl)acetamide
2-chloro-4-(2

1-benzyl-3-[(4-methoxyphenyl)carbamothioylamino]thiourea | 2-chloro-N-(2-chloro-5-fluorophenyl)-N-methylacetamide
2-(4-chlorophenyl)sulfonyl-5-nitro-1,3-thiazole | 2-chloro-5-(trifluoromethyl)benzene
1-butyl-N-[(5-chlorofuran-2-yl)methyl]piperidin-4-amine | 2-chloro-N-(2-chlorophenyl)-N-(4-methoxyphenyl)acetamide
1-ethyl-2-(2,4,6-trimethylphenyl)azepan-3-amine | 3-methyl-2-oxiranecarboxylic acid
1-methyl-4-(2-methylimidazol-1-yl)piperidine | 2-cyano-N-(4-methoxy-3-nitrophenyl)-3-phenylpropanamide
2-bromo-1-fluoro-3-[(3-fluorophenyl)sulfanylmethyl]benzene | 2-bromo-4-fluoro-5-methylbenzene
N-[(1R)-1-(2,5-dimethoxyphenyl)ethyl]-2-methylaniline | 2-methyl-2-propanol
N,N-bis[(4-nitrophenyl)methyl]-1,3-thiazol-2-amine | 2-chloro-N-(2-chlorophenyl)-N-methoxyacetamide
2-ethylbicyclo[2.2.1]heptane-2-carbaldehyde | 2-chloro-N-(2-chlorophenyl)-N-(methoxycarbonyl)acetamide


In [140]:
# One row per compound: query SMILES, reference name, predicted name, and a
# flag that is True only when the two names are exactly the same string.
data = pd.DataFrame({
    'smiles': [smi for smi, _name in examples],
    'ref_iupac': [name for _smi, name in examples],
    'pred_iupac': [pred for _ref, pred in result],
    'result': [pred == ref for ref, pred in result],
})

In [148]:
# Persist the evaluation table next to the notebook (relative path). The row
# index is written as well, since to_csv is called without index=False.
data.to_csv('gpt3-iupac.csv')