In [1]:
import pandas as pd
import numpy as np
import math
import requests
import json
from urllib.parse import quote
import chemical_conversions

# Smoke-test the local conversion helpers with two known compounds before
# using them on the real data; the expected SMILES are noted next to each call.
benzene_smiles = chemical_conversions.get_smiles_from_name('benzene')  # Name 'benzene' should give 'c1ccccc1'
print(benzene_smiles)
cyclohexane_smiles = chemical_conversions.get_smiles_from_cas('110-82-7')  # CAS '110-82-7' should give 'C1CCCCC1'
print(cyclohexane_smiles)



c1ccccc1
C1CCCCC1


# import data

In [2]:
# Load the compound table and make sure every row has a SMILES string.
# Rows without a SMILES are resolved one at a time: first from the Name column,
# then from the CAS column. Rows that already have a SMILES are left untouched.
file_name = 'compounds.xlsx'
dataframe = pd.read_excel(file_name)  # whole sheet as a dataframe called "dataframe"

# Object-dtype copy: strings can then be stored in any slot (an all-empty column
# is read as float64), and the dataframe's own buffer is never written through a view.
smiles_array = dataframe['SMILES'].to_numpy(dtype=object, copy=True)
name_array = dataframe['Name'].to_numpy()  # compound names
cas_array = dataframe['CAS'].to_numpy()  # CAS registry numbers


def _has_text(value):
    """Return True when `value` is a non-blank string (empty cells are read as NaN)."""
    return isinstance(value, str) and value.strip() != ''


for i in range(len(smiles_array)):
    if _has_text(smiles_array[i]):
        continue  # SMILES already supplied for this row
    if _has_text(name_array[i]):
        print('There is no SMILES at row = {}. Use Name instead.'.format(i))
        # NOTE(review): the helper's behaviour for unknown names is not visible here;
        # whatever it returns is stored as-is and checked again when the URLs are built.
        smiles_array[i] = chemical_conversions.get_smiles_from_name(name_array[i])
    elif _has_text(cas_array[i]):
        print('There is no SMILES or Name at row = {}. Use CAS instead.'.format(i))
        smiles_array[i] = chemical_conversions.get_smiles_from_cas(cas_array[i])
    else:
        print('There is no SMILES, Name or CAS at row = {}. Cannot fetch data.'.format(i))

dataframe['SMILES'] = smiles_array  # write the resolved SMILES back to the dataframe
dataframe

Unnamed: 0,Name,SMILES,CAS
0,R-limonene,C=C(C)[C@H]1CC=C(C)CC1,
1,limonane,CC1CCC(C(C)C)CC1,
2,sabinene,C=C1CCC2(C(C)C)CC12,
3,dihydrosabinene,CC1CCC2(C(C)C)CC12,
4,tetrahydrosabinene,CC1CCC(C1)(C)C(C)C,
5,α-pinene,CC1=CCC2CC1C2(C)C,
6,β-pinene,CC1(C)C2CCC(C1C2)=C,
7,pinane,CC1CCC2CC1C2(C)C,
8,3-carene,CC1=CCC2C(C1)C2(C)C,
9,dihydrocarene,CC1CCC2C(C1)C2(C)C,


# Build API request URLs from SMILES

In [3]:
# Build one prediction URL per compound, with the SMILES percent-encoded into
# the query string.
api = 'https://ysi.ml.nrel.gov/predict?smiles=' # this is compatible with isomeric SMILES while 'https://ysi.ml.nrel.gov/api/' isn't
missing_smiles = 'NO_SMILES'  # placeholder sent for rows that have no SMILES

# Work on a plain list: the array obtained from the dataframe may be a view or
# read-only, and a float array cannot hold the placeholder string.
smiles_array = list(smiles_array)
api_array = []
no_smile_count = 0
total_smiles = len(smiles_array)

for i in range(total_smiles):
    # The placeholder itself counts as "missing" so re-running this cell reports
    # the same number of rows without SMILES.
    if not isinstance(smiles_array[i], str) or smiles_array[i] == missing_smiles:
        smiles_array[i] = missing_smiles
        print('No SMILES at row =', i)
        no_smile_count += 1
    # Percent-encode everything: in a raw URL '#' would start a fragment and '+'
    # would be decoded as a space, so the service would receive a different
    # string. Surrounding whitespace is dropped before encoding.
    api_array.append(api + quote(smiles_array[i].strip(), safe=''))

# YSI prediction

In [4]:
# Query the YSI service once per compound and pass measured YSI and predicted
# YSI to the dataframe. Rows that cannot be scored store a status string
# ('No API response', 'invalid smiles' or 'outlier') in both columns.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled request would block forever

NO_RESPONSE = 'No API response'
INVALID_SMILES = 'invalid smiles'
OUTLIER = 'outlier'
VALID = 'valid'

# Start of the log line printed for each kind of unscored row.
log_prefix = {
    NO_RESPONSE: 'No API response at ',
    INVALID_SMILES: 'Invalid SMILES at ',
    OUTLIER: 'An outlier at ',
}


def _fetch_ysi(session, url):
    """Request one prediction and classify the result.

    Returns a tuple (outcome, measured, predicted). `outcome` is VALID,
    NO_RESPONSE, INVALID_SMILES or OUTLIER; the two values are None unless the
    outcome is VALID. Network errors, HTTP error statuses, non-JSON bodies and
    payloads without a 'mean' entry are all reported as NO_RESPONSE so that a
    single bad row cannot abort the whole run.
    """
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.RequestException:
        return NO_RESPONSE, None, None
    if not response:  # a Response is falsy for 4xx/5xx status codes
        return NO_RESPONSE, None, None
    try:
        raw_data = response.json()
    except ValueError:  # body is not valid JSON
        return NO_RESPONSE, None, None
    if not isinstance(raw_data, dict):
        return NO_RESPONSE, None, None
    if raw_data.get('status') == 'invalid smiles':
        return INVALID_SMILES, None, None
    if raw_data.get('outlier'):  # the service flagged the compound as an outlier
        return OUTLIER, None, None
    if 'mean' not in raw_data:
        return NO_RESPONSE, None, None
    # 'exp_mean' is the measured value and 'mean' the predicted one.
    return VALID, raw_data.get('exp_mean'), raw_data['mean']


ysi_measure_array = []
ysi_predict_array = []
outcome_counts = {VALID: 0, NO_RESPONSE: 0, INVALID_SMILES: 0, OUTLIER: 0}

# One session for all rows so the connection to the service is reused.
with requests.Session() as session:
    for i in range(len(api_array)):
        outcome, measured, predicted = _fetch_ysi(session, api_array[i])
        outcome_counts[outcome] += 1
        if outcome != VALID:
            print('{}row = {}. SMILES = {}. Name = {}. CAS = {}'.format(
                log_prefix[outcome], i, smiles_array[i], name_array[i], cas_array[i]))
            measured = predicted = outcome  # store the status string in both columns
        ysi_measure_array.append(measured)
        ysi_predict_array.append(predicted)

valid_ysi_count = outcome_counts[VALID]
no_response_count = outcome_counts[NO_RESPONSE]
invalid_smiles_count = outcome_counts[INVALID_SMILES]
outlier_count = outcome_counts[OUTLIER]

print('''There are: {} compounds in total,
{} with valid YSIs,
{} outliers,
{} without API response, 
and {} with invalid SMILES, 
({} with no SMILES).'''.format(total_smiles, valid_ysi_count, outlier_count, no_response_count, invalid_smiles_count, no_smile_count))

dataframe['measured YSI'] = ysi_measure_array
dataframe['predicted YSI'] = ysi_predict_array
dataframe

An outlier at row = 2. SMILES = C=C1CCC2(C(C)C)CC12. Name = sabinene. CAS = nan
An outlier at row = 3. SMILES = CC1CCC2(C(C)C)CC12. Name = dihydrosabinene. CAS = nan
An outlier at row = 8. SMILES = CC1=CCC2C(C1)C2(C)C. Name = 3-carene. CAS = nan
An outlier at row = 9. SMILES = CC1CCC2C(C1)C2(C)C. Name = dihydrocarene. CAS = nan
An outlier at row = 14. SMILES = CC1=CC[C@@H](CC1=O)C(=C)C. Name = S-carvone. CAS = nan
An outlier at row = 15. SMILES = CC1=CC[C@H](CC1=O)C(=C)C . Name = R-carvone. CAS = nan
There are: 25 compounds in total,
19 with valid YSIs,
6 outliers,
0 without API response, 
and 0 with invalid SMILES, 
(0 with no SMILES).


Unnamed: 0,Name,SMILES,CAS,measured YSI,predicted YSI
0,R-limonene,C=C(C)[C@H]1CC=C(C)CC1,,,137.106
1,limonane,CC1CCC(C(C)C)CC1,,,89.5603
2,sabinene,C=C1CCC2(C(C)C)CC12,,outlier,outlier
3,dihydrosabinene,CC1CCC2(C(C)C)CC12,,outlier,outlier
4,tetrahydrosabinene,CC1CCC(C1)(C)C(C)C,,,96.2897
5,α-pinene,CC1=CCC2CC1C2(C)C,,,149.625
6,β-pinene,CC1(C)C2CCC(C1C2)=C,,,141.759
7,pinane,CC1CCC2CC1C2(C)C,,,112.707
8,3-carene,CC1=CCC2C(C1)C2(C)C,,outlier,outlier
9,dihydrocarene,CC1CCC2C(C1)C2(C)C,,outlier,outlier


In [5]:
# Save the annotated table next to the notebook; the row index is not written.
output_path = './YSI results.xlsx'
dataframe.to_excel(output_path, index=False)