In [1]:
import pandas as pd
import numpy as np
import math
import requests
import json
from urllib.parse import quote
import chemical_conversions

print(chemical_conversions.get_smiles_from_name('benzene')) #check for Name 'benzene', should be 'c1ccccc1'
print(chemical_conversions.get_smiles_from_cas('110-82-7')) #check for CAS '110-82-7', should be 'C1CCCCC1'




c1ccccc1
C1CCCCC1


In [2]:
#pass SMILES and Name from an excel sheet to a dataframe and pass them to different arrays
file_name = 'compounds.xlsx'
dataframe = pd.read_excel(file_name) #pass data to a dataframe called "dataframe"
smiles_array = dataframe['SMILES'].to_numpy() #pass SMILES to an array called "smiles_array"
name_array = dataframe['Name'].to_numpy() #pass Name to an array called "name_array"
cas_array = dataframe['CAS'].to_numpy() #pass CAS to an array called "cas_array"

if type(smiles_array[0]) != str:
    print('There is no SMILES. Use Name instead.') #if there is no SMILES, use Name
    if type(name_array[0]) != str:
        print('There is no Name. Use CAS instead.') #if there is no Name, use CAS
        if type(cas_array[0]) != str:
            print('There is no CAS. Cannot fetch data.') #if there is no CAS, cannot fetch data
        else:
            smiles_array = []
            for i in range(len(cas_array)):
                smiles_array.append(chemical_conversions.get_smiles_from_cas(cas_array[i])) #get SMILES from CAS
            dataframe['SMILES'] = smiles_array #pass CAS-based SMILES to the dataframe
    else:
        smiles_array = []
        for i in range(len(name_array)):
            smiles_array.append(chemical_conversions.get_smiles_from_name(name_array[i])) #get SMILES from Name
        dataframe['SMILES'] = smiles_array #pass Name-based SMILES to the dataframe
dataframe

There is no SMILES. Use Name instead.
(1R,3S)-1-ethyl-3-methylcyclohexane, (1S,3R)-1-ethyl-3-methylcyclohexane
1,2,4-trimethylcyclopentane, (1S,2S)-1,2,4-trimethylcyclopentane
dec-4-ene, (E)-dec-4-ene


Unnamed: 0,SMILES,Name,CAS
0,C=CCCCC,1-hexene,
1,CC=CCCC,2-hexene,
2,C/C=C/CCC,trans-2-hexene,
3,C/C=C/CCC,(E)-2-hexene,
4,CC/C(=C\C(C)C)C(C)C,"3-Hexene, 3-ethyl-2,5-dimethyl-",
5,CCO/C=C/C#N,3-Ethoxyacrylonitrile,
6,,cis-1-Ethyl-3-methyl-cyclohexane,
7,C=C(C)C(C)/C=C/C,"1,4-Hexadiene, 2,3-dimethyl-",
8,,"Cyclopentane, 1,2,4-trimethyl-, (1.alpha.,2.be...",
9,CC/C(=C\C(C)C)C(C)C,"3-Hexene, 3-ethyl-2,5-dimethyl-",


In [3]:
# create an array of API strings with SMILES appended
api_array = []
no_smile_count = 0
total_smiles = len(smiles_array)

for i in range(len(smiles_array)):
    api = 'https://ysi.ml.nrel.gov/api?smiles=' # this is compatible with isomeric SMILES while 'https://ysi.ml.nrel.gov/api/' isn't
    if type(smiles_array[i]) != str:
        smiles_array[i] = 'NO_SMILES'
        print('No SMILES at row =', i)
        no_smile_count += 1
    api += smiles_array[i]
    api_array.append(api)

No SMILES at row = 6
No SMILES at row = 8
No SMILES at row = 10


In [4]:
#pass measured YSI and predicted YSI to the dataframe
ysi_measure_array = []
ysi_predict_array = []
no_response_count = 0
invalid_smiles_count = 0
outlier_count = 0
valid_ysi_count = 0

for i in range(len(api_array)):
    response = requests.get(api_array[i]) #use APIs to get reponses via requests
    if not response: #if the API reponses a invalid result, measured YSI and predicted YSI are set to None
        print('No API response at row =', i, '. SMILES =', smiles_array[i], '. Name =', name_array[i], '. CAS =', cas_array[i])
        no_response_count += 1
        ysi_measure_array.append(None)
        ysi_predict_array.append(None)
        continue
    raw_data = response.json()
    if raw_data['status'] == 'invalid smiles':
        print('Invalid SMILES at row =', i, '. SMILES =', smiles_array[i], '. Name =', name_array[i], '. CAS =', cas_array[i])
        invalid_smiles_count += 1
        ysi_measure_array.append(None)
        ysi_predict_array.append(None)
        continue
    if raw_data['outlier']: #if the compound is an outlier (i.e., 'outlier' = True), measured YSI and predicted YSI are set to None
        print('An outlier at row =', i, '. SMILES =', smiles_array[i], '. Name =', name_array[i], '. CAS =', cas_array[i])
        outlier_count += 1
        ysi_measure_array.append(None)
        ysi_predict_array.append(None)
        continue
    valid_ysi_count += 1
    ysi_measure_array.append(raw_data['exp_mean']) #add 'exp_mean' to an array called 'ysi_measure_array'
    ysi_predict_array.append(raw_data['mean']) #add 'mean' to an array called 'ysi_predict_array'  
print('There are', total_smiles, 'compounds in total,', valid_ysi_count, 'with valid YSIs,', no_response_count, 'without API response,', invalid_smiles_count, 'with invalid SMILES (', no_smile_count, 'with no SMILES), and', outlier_count, 'outliers.')
dataframe['measured YSI'] = ysi_measure_array
dataframe['predicted YSI'] = ysi_predict_array
dataframe

Invalid SMILES at row = 6 . SMILES = NO_SMILES . Name = cis-1-Ethyl-3-methyl-cyclohexane . CAS = nan
Invalid SMILES at row = 8 . SMILES = NO_SMILES . Name = Cyclopentane, 1,2,4-trimethyl-, (1.alpha.,2.beta.,4.alpha.)- . CAS = nan
Invalid SMILES at row = 10 . SMILES = NO_SMILES . Name = 4-Decene . CAS = nan
There are 11 compounds in total, 8 with valid YSIs, 0 without API response, 3 with invalid SMILES ( 3 with no SMILES), and 0 outliers.


Unnamed: 0,SMILES,Name,CAS,measured YSI,predicted YSI
0,C=CCCCC,1-hexene,,42.4,40.634906
1,CC=CCCC,2-hexene,,,46.461815
2,C/C=C/CCC,trans-2-hexene,,45.8,46.461815
3,C/C=C/CCC,(E)-2-hexene,,45.8,46.461815
4,CC/C(=C\C(C)C)C(C)C,"3-Hexene, 3-ethyl-2,5-dimethyl-",,,91.308211
5,CCO/C=C/C#N,3-Ethoxyacrylonitrile,,,47.105569
6,,cis-1-Ethyl-3-methyl-cyclohexane,,,
7,C=C(C)C(C)/C=C/C,"1,4-Hexadiene, 2,3-dimethyl-",,,81.963227
8,,"Cyclopentane, 1,2,4-trimethyl-, (1.alpha.,2.be...",,,
9,CC/C(=C\C(C)C)C(C)C,"3-Hexene, 3-ethyl-2,5-dimethyl-",,,91.308211


In [5]:
dataframe.to_excel('./YSI results.xlsx')