# Check that solvents.csv is well set up

In [10]:
import pandas as pd
from rdkit import Chem

In [11]:
# Load the curated solvents table and keep a handle on its SMILES column.
# (A bare `solvents_df.columns` statement used to sit between these two lines; as it
# was not the last expression of the cell its value was never shown, so it is dropped.)
solvents_df = pd.read_csv('orderly/data/solvents.csv')
solvents_smiles = solvents_df['smiles']

In [12]:
# Check that every SMILES string can be parsed and canonicalised by RDKit.
# Canonicalise and remove stereochemistry (isomericSmiles=False).
def clean_smiles(smiles):
    """Return the canonical, non-isomeric SMILES for ``smiles``.

    Parameters
    ----------
    smiles : str or missing value
        A SMILES string, or NaN/None for an empty cell.

    Returns
    -------
    The canonical SMILES written without stereo information when RDKit can
    parse the input. Missing values are returned unchanged. A string RDKit
    cannot parse is printed (so it can be fixed in the CSV) and returned
    unchanged.
    """
    # Empty CSV cells arrive as NaN; handing them to RDKit would raise instead
    # of letting the check continue over the remaining rows.
    if pd.isna(smiles):
        return smiles

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable entry: report it and keep the raw text.
        print(smiles)
        return smiles
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Apply the function to every entry of the 'smiles' Series; entries RDKit cannot
# parse are printed by clean_smiles, and the canonicalised Series is displayed.
solvents_smiles.apply(clean_smiles)



0           C1=CCCC=CCC1
1               CC=CC=CC
2        ClC(Cl)c1ccccc1
3                C=CC=CC
4                CC=CC#N
             ...        
615    CC1(C)CCC(C)(C)O1
616         Cc1ccc(C)cc1
617           Cc1ccccc1C
618         Cc1cccc(C)c1
619         Cc1ccc(C)cc1
Name: smiles, Length: 620, dtype: object

In [13]:
# Display the loaded table for a visual check of the raw values.
# NOTE(review): the saved output shows a trailing tab in the smiles cells of the
# o-xylene and m-xylene rows — confirm whether the CSV should be cleaned.
solvents_df

Unnamed: 0,solvent_name_1,solvent_name_2,solvent_name_3,cas_number,chemical_formula,smiles,source
0,"(1z,5z)-cycloocta-1,5-diene","1,5-cyclooctadiene",,111-78-4,C8H12,C1=CCCC=CCC1,https://doi.org/10.1039/C9SC01844A
1,"(2e,4e)-2,4-hexadiene","trans,trans-2,4-hexadiene",,5194-51-4,C6H10,CC=CC=CC,https://doi.org/10.1039/C9SC01844A
2,(dichloromethyl)benzene,(dichloromethyl)-benzene,,98-87-3,C7H6Cl2,ClC(Cl)c1ccccc1,https://doi.org/10.1039/C9SC01844A
3,"(e)-1,3-pentadiene",1-trans-3-pentadiene,,2004-70-8,C5H8,C=CC=CC,https://doi.org/10.1039/C9SC01844A
4,(e)-2-butenenitrile,,,627-26-9,C4H5N,CC=CC#N,https://doi.org/10.1039/C9SC01844A
...,...,...,...,...,...,...,...
615,tetramethyl tetrahydrofurane,tmthf,,15045-43-9,C8H16O,CC1(CCC(O1)(C)C)C,MOLECULES FROM https://github.com/sustainable-...
616,xylenes,xylol,,1330-20-7,C8H10,Cc1ccc(C)cc1,MOLECULES FROM https://github.com/sustainable-...
617,o-xylene,,,,,Cc1c(C)cccc1\t,MOLECULES FROM https://github.com/sustainable-...
618,m-xylene,,,,,Cc1cc(C)ccc1\t,MOLECULES FROM https://github.com/sustainable-...


# Fixing solvents csv

In [46]:
import pandas as pd
from rdkit import Chem
import numpy as np

In [2]:
# Load the full solvents table together with the two subset files that are
# compared against each other in the following cells.
solvents_df = pd.read_csv('orderly/data/solvents.csv')
df1 = pd.read_csv('orderly/data/subset_1.csv')
df2 = pd.read_csv('orderly/data/subset_2.csv')

In [3]:
# Build a flat list (row by row) of every non-missing name found in any of the
# three name columns of df1.
strings_to_remove = [
    name
    for name in df1[['solvent_name_1', 'solvent_name_2', 'solvent_name_3']].values.flatten().tolist()
    if pd.notna(name)
]

In [4]:
# Keep only the df2 rows in which none of the three names already occurs in df1.
name_hits = df2[['solvent_name_1', 'solvent_name_2', 'solvent_name_3']].isin(strings_to_remove)
df2 = df2[~name_hits.any(axis=1)]

In [6]:
# Write the filtered rows to a separate file (note the trailing underscore in the
# name), leaving subset_2.csv itself untouched; index=False omits the row index.
df2.to_csv('orderly/data/subset_2_.csv', index=False)

# Check for duplicates in the three name columns

In [3]:
# Look for duplicates in the three 'solvent_name' columns.
solvent_names = solvents_df[['solvent_name_1', 'solvent_name_2', 'solvent_name_3']]
# DataFrame.duplicated() flags a row only when all three names together repeat an
# earlier row; a single name shared between different rows or columns is not
# caught by this check.
duplicates = solvent_names.duplicated()

# print duplicate rows
print(solvent_names[duplicates])

Empty DataFrame
Columns: [solvent_name_1, solvent_name_2, solvent_name_3]
Index: []


In [4]:
# Flatten the three name columns row by row into one list, skipping empty (NaN) cells.
solvent_names_list = [
    name for name in solvent_names.values.flatten().tolist() if pd.notna(name)
]

In [5]:
# Total number of non-missing names across the three name columns.
print(len(solvent_names_list))

915


In [6]:
# Number of distinct names; it equals the total only when no name is repeated.
print(len(set(solvent_names_list)))

915


In [7]:
# Tally how often each name occurs so that repeated names can be reviewed by hand
# (nothing is removed from the data here).
count_dict = {}
for item in solvent_names_list:
    if item in count_dict:
        count_dict[item] += 1
    else:
        count_dict[item] = 1

# names that occur at least twice
duplicates = [name for name in count_dict if count_dict[name] >= 2]

In [8]:
# How many distinct names occur more than once.
len(duplicates)

0

In [9]:
# Show the repeated names themselves.
duplicates

[]

In [10]:
# check for any trailing or leading spaces
# (str.strip() removes every kind of leading/trailing whitespace, so names padded
# with tabs or newlines are printed as well; no output means every name is clean)
for s in solvent_names_list:
    if s != s.strip():
        print(s)

In [13]:
# # make lower
# # make specified columns lower case
# solvents_df[['solvent_name_1', 'solvent_name_2', 'solvent_name_3']] = solvents_df[['solvent_name_1', 'solvent_name_2', 'solvent_name_3']].applymap(lambda x: x.lower() if type(x) == str else x)

# # save to csv
# solvents_df.to_csv('orderly/data/solvents.csv', index=False)

# Use pura on each column

In [1]:
import pandas as pd

# Import pura
from pura.resolvers import resolve_identifiers
from pura.compound import CompoundIdentifierType
from pura.services import PubChem, CIR, Opsin, CAS, ChemSpider, STOUT

# Re-read the solvents table so this section starts from the file on disk.
solvents_df = pd.read_csv('orderly/data/solvents.csv')

2023-04-26 18:41:43.245303: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def apply_pura_name_to_smiles(lst, services=None):
    """Resolve chemical names to SMILES with pura.

    Parameters
    ----------
    lst : list of str
        Names to resolve.
    services : list, optional
        pura service instances to query. When omitted, a fresh
        ``[PubChem(autocomplete=True), Opsin(), CIR()]`` list is built for this
        call. The previous signature created these objects once, when the
        function was defined, and shared that list between all calls.

    Returns
    -------
    Whatever ``resolve_identifiers`` returns for these arguments.
    """
    if services is None:
        services = [PubChem(autocomplete=True), Opsin(), CIR()]

    return resolve_identifiers(
        lst,
        # Keyword spelling kept exactly as in the original call.
        # NOTE(review): presumably this matches pura's parameter name — confirm.
        input_identifer_type=CompoundIdentifierType.NAME,
        output_identifier_type=CompoundIdentifierType.SMILES,
        services=services,
        # NOTE(review): the run log in this notebook reports "Not sufficient
        # agreement" when only one SMILES came back, so 2 presumably means two
        # services must agree — confirm against pura's documentation.
        agreement=2,
        silent=True,
    )

def apply_pura_cas_to_smiles(lst):
    """Resolve CAS numbers to SMILES with pura, using the CAS service only.

    ``lst`` is the list of CAS-number strings to look up. The value returned by
    ``resolve_identifiers`` is handed back unchanged.
    """
    lookup_options = dict(
        input_identifer_type=CompoundIdentifierType.CAS_NUMBER,
        output_identifier_type=CompoundIdentifierType.SMILES,
        services=[CAS()],
        agreement=1,  # only one service is queried
        silent=True,
    )
    return resolve_identifiers(lst, **lookup_options)

In [3]:
# Collect the non-missing names column by column: all of solvent_name_1 first, then
# solvent_name_2, then solvent_name_3 (so the order is per column, not per row).
col1_lst_1 = solvents_df['solvent_name_1'].dropna().tolist()
col1_lst_2 = solvents_df['solvent_name_2'].dropna().tolist()
col1_lst_3 = solvents_df['solvent_name_3'].dropna().tolist()
names = col1_lst_1 + col1_lst_2 + col1_lst_3

In [4]:
# The uniqueness comparison used to be a bare expression that was not the last line
# of the cell, so its result was computed and silently thrown away. Assert it
# instead: the names are used as dictionary keys further down, where repeated names
# would collapse into one entry.
assert len(set(names)) == len(names), "solvent names are not unique"
print(len(names))

915


In [5]:
# Resolve every name to SMILES with the default services of the helper.
# NOTE(review): presumably this queries remote services; the saved progress log
# shows batches taking 10-20 s each.
names_pura = apply_pura_name_to_smiles(names)

Batch 0 Progress: 100%|██████████| 100/100 [00:20<00:00,  5.00it/s]
Batch 1 Progress: 100%|██████████| 100/100 [00:09<00:00, 10.31it/s]
Batch:  20%|██        | 2/10 [00:29<01:51, 13.96s/it]ERROR:pura.resolvers:<pura.services.opsin.Opsin object at 0x145a96f20>, identifier_type=<CompoundIdentifierType.NAME: 6> value='d-carvone' details=None: Not Found
ERROR:pura.resolvers:<pura.services.opsin.Opsin object at 0x145a96f20>, identifier_type=<CompoundIdentifierType.NAME: 6> value='4,n,n-trimethylaniline' details=None: Not Found
ERROR:pura.resolvers:Not sufficient agreement for identifiers=[CompoundIdentifier(identifier_type=<CompoundIdentifierType.NAME: 6>, value='d-carvone', details=None)] amount=None mass=None volume=None (outputs: [[CompoundIdentifier(identifier_type=<CompoundIdentifierType.SMILES: 2>, value='C=C(C)[C@H]1CC=C(C)C(=O)C1', details=None)]])
ERROR:pura.resolvers:Not sufficient agreement for identifiers=[CompoundIdentifier(identifier_type=<CompoundIdentifierType.NAME: 6>, valu

In [7]:
# Turn the resolver result into a mapping; the dict displayed below maps each name
# to a list of SMILES strings.
names_dict = dict(names_pura)

In [15]:
# Inspect the name -> [SMILES] mapping.
names_dict

{'1-iodopentane': ['CCCCCI'],
 '1,4-cyclohexadiene': ['C1=CCC=CC1'],
 '1,4-dimethylbenzene': ['Cc1ccc(C)cc1'],
 '1-chloro-2-methylbenzene': ['Cc1ccccc1Cl'],
 '2-aminoethanol': ['NCCO'],
 '1-bromoheptane': ['CCCCCCCBr'],
 '1,4-dibromobenzene': ['Brc1ccc(Br)cc1'],
 '1-bromonaphthalene': ['Brc1cccc2ccccc12'],
 '1,4-dichlorobutane': ['ClCCCCCl'],
 '1-dodecanol': ['CCCCCCCCCCCCO'],
 '1,3-dichlorobenzene': ['Clc1cccc(Cl)c1'],
 '1,1-diethoxymethane': ['CCOCOCC'],
 '1,5-dichloropentane': ['ClCCCCCCl'],
 '1-phenyl-1-propanone': ['CCC(=O)c1ccccc1'],
 '1-methoxy-3-methylbenzene': ['COc1cccc(C)c1'],
 '1,1,2-trichloroethane': ['ClCC(Cl)Cl'],
 '1,3-dichloropropane': ['ClCCCCl'],
 '1-ethyl-4-methylbenzene': ['CCc1ccc(C)cc1'],
 '1-chloro-3-methylbutane': ['CC(C)CCCl'],
 '1,2,3,4-tetrachlorobenzene': ['Clc1ccc(Cl)c(Cl)c1Cl'],
 '1,3,5-trimethylbenzene': ['Cc1cc(C)cc(C)c1'],
 '1-bromo-2-methylpropane': ['CC(C)CBr'],
 '1,1,1-trichloroethane': ['CC(Cl)(Cl)Cl'],
 '1-chloronaphthalene': ['Clc1cccc2ccccc12'],

In [26]:
# List the column names to locate the CAS-number column used in the next step.
solvents_df.columns

Index(['solvent_name_1', 'solvent_name_2', 'solvent_name_3', 'cas_number',
       'chemical_formula', 'smiles', 'source'],
      dtype='object')

In [28]:
# Resolve every non-missing CAS number to SMILES.
# NOTE(review): the saved error log shows lookups failing for values with trailing
# spaces ('2244-16-8  ') or embedded double quotes ('"7719-09-7"'); these look like
# formatting problems in the cas_number column — confirm and clean the CSV.
cas_names = apply_pura_cas_to_smiles(solvents_df['cas_number'].dropna().tolist())

Batch 0 Progress: 100%|██████████| 100/100 [00:03<00:00, 26.30it/s]
Batch 1 Progress: 100%|██████████| 100/100 [00:00<00:00, 132.21it/s]
Batch:  40%|████      | 2/5 [00:04<00:06,  2.02s/it]ERROR:pura.resolvers:<pura.services.cas.CAS object at 0x14835a7d0>, identifier_type=<CompoundIdentifierType.CAS_NUMBER: 7> value='2244-16-8  ' details=None: Not Found
Batch 2 Progress: 100%|██████████| 100/100 [00:00<00:00, 163.64it/s]
Batch 3 Progress: 100%|██████████| 100/100 [00:00<00:00, 158.27it/s]
Batch:  80%|████████  | 4/5 [00:05<00:01,  1.09s/it]ERROR:pura.resolvers:<pura.services.cas.CAS object at 0x14835a7d0>, identifier_type=<CompoundIdentifierType.CAS_NUMBER: 7> value='"7719-09-7"' details=None: Not Found
ERROR:pura.resolvers:<pura.services.cas.CAS object at 0x14835a7d0>, identifier_type=<CompoundIdentifierType.CAS_NUMBER: 7> value='"7719-12-2"' details=None: Not Found
ERROR:pura.resolvers:<pura.services.cas.CAS object at 0x14835a7d0>, identifier_type=<CompoundIdentifierType.CAS_NUMBER: 

In [30]:
# Turn the resolver result into a mapping keyed by the CAS string that was looked up.
cas_names_dict = dict(cas_names)

In [31]:
# Keep only the first SMILES per CAS number; an empty result becomes ''.
cas_names_dict_2 = {
    cas: (smiles_hits[0] if smiles_hits else '')
    for cas, smiles_hits in cas_names_dict.items()
}

In [17]:
# Keep only the first SMILES per name; an empty result becomes ''.
names_dict_2 = {
    name: (smiles_hits[0] if smiles_hits else '')
    for name, smiles_hits in names_dict.items()
}


In [21]:
# DataFrame.replace with a dict swaps matching cell values wherever they occur in the
# frame, so every name cell becomes its resolved SMILES ('' when nothing was resolved).
replacement_df = solvents_df.replace(names_dict_2)

In [32]:
# Same substitution for CAS numbers: each resolved CAS cell becomes its SMILES
# ('' when the lookup returned nothing).
replacement_df_2 = replacement_df.replace(cas_names_dict_2)

In [33]:
# Save the substituted table (file name ends in an underscore); index=False omits
# the row index.
replacement_df_2.to_csv('orderly/data/pura_solvents_.csv', index=False)

# Resolve pura smiles

In [42]:
# NOTE(review): this reads 'pura_solvents.csv', whereas the previous section wrote
# 'pura_solvents_.csv' (trailing underscore), and the columns shown below are named
# smiles_1..smiles_5 — presumably the file was renamed/edited by hand in between;
# confirm, since that step is not reproducible from this notebook.
pura_solvents = pd.read_csv('orderly/data/pura_solvents.csv')

In [43]:
# Inspect the candidate SMILES columns before canonicalisation.
pura_solvents

Unnamed: 0,smiles_1,smiles_2,smiles_3,smiles_4,smiles_5
0,C1=C\CC/C=C\CC/1,C1=CCCC=CCC1,,C1=CCCC=CCC1,C\1C\C=C/CC\C=C1
1,C/C=C/C=C/C,C/C=C/C=C/C,,CC=CC=CC,C/C=C/C=C/C
2,ClC(Cl)c1ccccc1,ClC(Cl)c1ccccc1,,ClC(Cl)c1ccccc1,ClC(Cl)c1ccccc1
3,C=C/C=C/C,,,C=CC=CC,C/C=C/C=C
4,C/C=C/C#N,,,CC=CC#N,C/C=C/C#N
...,...,...,...,...,...
610,CC(=O)OC(C)(C)C,,,CC(=O)OC(C)(C)C,O=C(OC(C)(C)C)C
611,CCOC(C)(C)CC,,,CCC(C)(C)OC,O(C(C)(C)CC)C
612,OCC1CCCO1,OCC1CCCO1,,OCC1CCCO1,C1CC(OC1)CO
613,CC1(C)CCOC1(C)C,,,CC1(C)CCC(C)(C)O1,CC1(CCC(O1)(C)C)C


In [44]:
# Canonicalise and remove stereochemistry (isomericSmiles=False).
def clean_smiles(smiles):
    """Return the canonical, non-isomeric SMILES for ``smiles``.

    Missing values (NaN/None) are returned unchanged. A string that RDKit
    cannot parse is printed and returned unchanged; previously the resulting
    ``None`` molecule was passed straight to ``MolToSmiles``, which raised and
    aborted the whole ``applymap``.
    """
    if pd.isna(smiles):
        return smiles

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable entry: report it and keep the raw text so that it shows up as
        # a disagreement between columns instead of stopping the run.
        print(smiles)
        return smiles
    return Chem.MolToSmiles(mol, isomericSmiles=False)

# Apply the function to every cell of every column in the DataFrame.
# NOTE(review): DataFrame.applymap is deprecated in newer pandas releases in favour
# of DataFrame.map — confirm against the pandas version pinned for this project.
df = pura_solvents.applymap(clean_smiles)




In [54]:
# Remove the smiles_5 column so it takes no part in the agreement check below.
df = df.drop(columns='smiles_5')

In [56]:
# Remove a 'final_smiles' column left over from an earlier run of the agreement
# cell. On a fresh top-to-bottom run the column does not exist yet at this point,
# so errors='ignore' is used to make this a no-op instead of raising KeyError.
df = df.drop(columns='final_smiles', errors='ignore')

In [57]:
# Inspect the canonicalised candidate columns that feed the agreement check.
df

Unnamed: 0,smiles_1,smiles_2,smiles_3,smiles_4
0,C1=CCCC=CCC1,C1=CCCC=CCC1,,C1=CCCC=CCC1
1,CC=CC=CC,CC=CC=CC,,CC=CC=CC
2,ClC(Cl)c1ccccc1,ClC(Cl)c1ccccc1,,ClC(Cl)c1ccccc1
3,C=CC=CC,,,C=CC=CC
4,CC=CC#N,,,CC=CC#N
...,...,...,...,...
610,CC(=O)OC(C)(C)C,,,CC(=O)OC(C)(C)C
611,CCOC(C)(C)CC,,,CCC(C)(C)OC
612,OCC1CCCO1,OCC1CCCO1,,OCC1CCCO1
613,CC1(C)CCOC1(C)C,,,CC1(C)CCC(C)(C)O1


In [60]:
def get_final_smiles(row):
    """Return the single SMILES all non-missing cells of ``row`` agree on.

    When the row has no non-missing value, or its non-missing values are not
    all identical, ``np.nan`` is returned.
    """
    distinct = set(row.dropna())
    if len(distinct) != 1:
        # Either nothing to go on (empty) or the sources disagree.
        return np.nan
    (agreed,) = distinct
    return agreed

# Apply the function to each row in the DataFrame to generate the 'final_smiles'
# column: the agreed SMILES, or NaN where the row is empty or its columns disagree.
df['final_smiles'] = df.apply(get_final_smiles, axis=1)

In [61]:
# Rows for which a single agreed SMILES was found.
df['final_smiles'].dropna()

0            C1=CCCC=CCC1
1                CC=CC=CC
2         ClC(Cl)c1ccccc1
3                 C=CC=CC
4                 CC=CC#N
              ...        
606                    CO
608      Cc1ccc(C(C)C)cc1
609    C=C(C)C1CC=C(C)CC1
610       CC(=O)OC(C)(C)C
612             OCC1CCCO1
Name: final_smiles, Length: 590, dtype: object

In [63]:
# need to manually resolve 22 compounds where there's disagreement
# NOTE(review): the count above is hard-coded in this comment — confirm it against
# the number of NaN entries in 'final_smiles' for the current data.
df.to_csv('orderly/data/solvents_check_agreement.csv', index=False)