# Name Resolution

In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd
from tqdm import tqdm
from collections import Counter
import pickle
import os
from rdkit import Chem

In [2]:
def merge_pickles_mol_names():
    #create one big list of all the pickled names
    folder_path = 'data/USPTO/molecule_names/'
    onlyfiles = [f for f in listdir(folder_path) if isfile(join(folder_path, f))]
    full_lst = []
    for file in tqdm(onlyfiles):
        if file[0] != '.': #We don't want to try to unpickle .DS_Store
            filepath = folder_path+file 
            unpickled_lst = pd.read_pickle(filepath)
            full_lst = full_lst + unpickled_lst
            
    return full_lst

In [3]:
names_list = merge_pickles_mol_names()

100%|██████████| 490/490 [00:01<00:00, 489.94it/s] 


In [4]:
len(names_list)

942489

In [5]:
# Count the frequency of each item in the list
item_counts = Counter(names_list)

# Sort the items by frequency in descending order
sorted_items = sorted(item_counts.items(), key=lambda x: x[1], reverse=True)

# Print the sorted items
# for item, count in sorted_items:
#     print(f"{item}: {count}")


# Verify that all the cat, solv and reagents make sense

In [7]:
# read in data
data_df = pd.read_pickle('data/USPTO/clean_test_split_cat.pkl')

In [8]:
# Get list of catalysts
catalysts = list(set(list(data_df['catalyst_0'])))
print(len(catalysts))
catalysts

576


['CCCCCC',
 'OC1CCCC1',
 '[Ca+2].[Cl-].[Cl-]',
 'COS(=O)(=O)OC',
 'O=[N+]([O-])[O-].[Ag+]',
 'CC(C)(C)[P]([Pd][P](C(C)(C)C)(C(C)(C)C)C(C)(C)C)(C(C)(C)C)C(C)(C)C',
 'O=[N+]([O-])c1cccc([N+](=O)[O-])c1',
 '[Re]',
 'COc1ccc2c(c1)CN(C)CCn1c-2c(C2CCCCC2)c2ccc(C(=O)O)cc21',
 'C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.C.O=S(=O)(O)O.O=S(=O)(O)O.O=S(=O)(O)O.O=S(=O)(O)O.O=S(=O)(O)O.O=S(=O)(O)O.O=S(=O)(O)O',
 'NCCN',
 '[Co+2]',
 'Br[Ni]Br',
 'c1cnnnc1',
 'Cc1cc(C)c(N2CCN(c3c(C)cc(C)cc3C)C2=[Ru](Cl)(Cl)(=Cc2ccccc2)[P](C2CCCCC2)(C2CCCCC2)C2CCCCC2)c(C)c1',
 '[60Co]',
 '[N-]=[N+]=[N-].[Na+]',
 'CC(C)=O',
 'CCBr',
 'COCCOCCOCCOCCOCCOCCO',
 'CC(=O)[O-].CC(=O)[O-].[Cu+2]',
 'O=N[O-].[Na+]',
 'ClC(Cl)(Cl)Cl',
 'c1ccc(P(c2ccccc2)c2ccc3ccccc3c2-c2c(P(c3ccccc3)c3ccccc3)ccc3ccccc23)cc1',
 '[F-].[Na+]',
 'C[P+](c1ccccc1)(c1ccccc1)c1ccccc1.[Br-]',
 'COc1cc(O)ccc1O',
 '[N-]=[N+]=N[Zn]N=[N+]=[N-]',
 'O=C1OC(c2ccc(O)cc2)(c2ccc(O)cc2)c2ccccc21',
 'O=C1CCC(=O)N1Br

In [9]:
# Get list of solvents
solvents = list(set(list(data_df['solvent_0'])+list(data_df['solvent_1'])))
print(len(solvents))
solvents
# O is not a solvent or a catalyst?? Is it a fair reagent to include?? There's also Pd as a solvent lol

96


['C1CCOC1',
 'CCCCCC',
 'O=C([O-])O.[Na+]',
 'COCCO',
 'OCCO',
 'CCNCC',
 'CCOC(=O)CC',
 '[K+].[OH-]',
 'C1CCNCC1',
 'N',
 'O=C([O-])[O-].[Cs+].[Cs+]',
 'c1ccccc1',
 'C1CCOC1.CCO',
 'CC(C)CO',
 'COC(C)(C)C',
 'CN(C)C=O',
 'CC(=O)OC(C)C',
 'CO.O',
 'CC(C)=O',
 'COCCOC',
 'CCO.ClCCl',
 'CCOCCO',
 'O=CO',
 'O',
 'ClC(Cl)(Cl)Cl',
 'COCCOCCOC',
 'CC1CCCO1',
 'c1ccc2ncccc2c1',
 'Cc1ccccc1',
 'CCCCO',
 'CS(C)=O',
 'CC(=O)O',
 'CCC(=O)O',
 'Clc1ccccc1Cl',
 'CCN(CC)CC',
 '[Na+].[OH-]',
 'C1COCCO1',
 'ClCCl.O=C(O)C(F)(F)F',
 'CCC(C)O',
 'OCC(F)(F)F',
 'O=[N+]([O-])c1ccccc1',
 'CCOC(C)=O',
 'CN(C)P(=O)(N(C)C)N(C)C',
 'CCCCCC.CCOC(C)=O',
 'CC(C)(C)O',
 'CCO.CCOC(C)=O',
 'c1ccc(Oc2ccccc2)cc1',
 'Cc1cc(C)nc(C)c1',
 'OCCOCCO',
 'ClCCl',
 'ClC(Cl)Cl',
 '[H][H]',
 'C1CCCCC1',
 'CC(=O)N(C)C',
 'O=S1(=O)CCCC1',
 'CC(=O)CC(C)C',
 'ClCCCl',
 'O=C(O)C(F)(F)F',
 'CCCCCO',
 'CC(Cl)Cl',
 'CO.ClCCl',
 'CCO.O',
 'COc1ccccc1',
 '[O-][Cl+3]([O-])([O-])O',
 'Cc1ccc(C)cc1',
 'Cc1ccccc1C',
 'C1COCCO1.Cl',
 'C[N+](=O)

In [10]:
# Get list of reagents
reagents = list(set(list(data_df['reagent_0'])+list(data_df['reagent_1'])))
print(len(reagents))
reagents

# CCOCC is a solvent???
# There's "C", "N", and "O" as a reagent, this seems wrong

242


['',
 'O=C(OO)c1cccc(Cl)c1',
 'CCOC(OCC)OCC',
 'C1CCOC1',
 'c1ccc(P(c2ccccc2)c2ccccc2)cc1',
 'O=C(OOC(=O)c1ccccc1)c1ccccc1',
 'CCCCCC',
 'O=S([O-])S(=O)[O-]',
 'O=S([O-])[O-]',
 'CN(C)c1ccccc1',
 'CN(C)c1ccncc1',
 'COS(=O)(=O)OC',
 '[BH4-]',
 'CC(C)[O-]',
 'O=[Se]=O',
 'CN(C)C(=N)N(C)C',
 'CCNCC',
 'CC(C)[N-]C(C)C',
 'NCCN',
 'CC(C)(C)O[K]',
 'OO',
 'F[N+]12CC[N+](CCl)(CC1)CC2',
 'Nc1ccc2c(c1)OCCO2',
 'C1CCNCC1',
 'O=C(O)C(=O)O',
 'N',
 '[OH-]',
 'CC(=O)[O-]',
 'CCCCP(CCCC)CCCC',
 '[Li]C(C)(C)C',
 'CN(C)c1ccccn1',
 'c1ccccc1',
 'CC(C)C[Al+]CC(C)C',
 'CCN(CC)c1ccccc1',
 'Cc1ccc(S(=O)(=O)Cl)cc1',
 'CC[Mg]Br',
 'Oc1ccccc1',
 'CS(=O)(=O)Cl',
 'Cc1ccc(S(=O)(=O)O)cc1',
 'CN(C)C=O',
 'CN1CCOCC1',
 'COC(C)(C)C',
 'CC(C)=O',
 'C1=CCCCC1',
 '[Pb]',
 'CCOC(=O)N=NC(=O)OCC',
 'O=C=O',
 'Cc1ccccc1S(=O)(=O)O',
 'CC(C)N=C=NC(C)C',
 'ClCl',
 'C[Si](C)(C)I',
 '[Li]C',
 'O=[Cr](=O)([O-])Cl',
 'CCCC[SnH](CCCC)CCCC',
 'c1ccc(P(C2CCCCC2)C2CCCCC2)c(-c2ccccc2P(C2CCCCC2)C2CCCCC2)c1',
 '[Cu]I',
 'O=CO',
 'Cl[Sn

# Check out the solvents csv

In [16]:
solvents = pd.read_csv('data/USPTO/solvents.csv', index_col=0)
solvents.loc[375, 'smiles'] = 'ClP(Cl)Cl'
solvents.loc[405, 'smiles'] = 'ClS(Cl)=O'
methanol = {'cosmo_name': 'methanol', 'stenutz_name': 'methanol', 'smiles': 'CO'}
solvents = solvents.append(methanol, ignore_index=True)

  solvents = solvents.append(methanol, ignore_index=True)


In [17]:
solvents

Unnamed: 0,stenutz_name,cosmo_name,cas_number,chemical_formula,molecular_weight,density,molar_volume,refractive_index,molecular_refractive_power,dielectric_constant,...,boiling_point,viscosity,partition_coefficient,vapour_pressure,sigma_1,sigma_2,sigma_3,sigma_4,sigma_5,smiles
0,(trichloromethyl)benzene,(trichloromethyl)-benzene,98-07-7,C7H5Cl3,195.0,1.173,166.6,1.557,53.67,6.900,...,220.0,1.85,3.68,0.20,0.2054,18.7984,0.0000,0.00000,3.2973,ClC(Cl)(Cl)c1ccccc1
1,"(1Z,5Z)-cycloocta-1,5-diene","1,5-cyclooctadiene",111-78-4,C8H12,108.0,0.880,122.9,1.493,35.73,2.380,...,148.0,1.00,2.98,8.87,0.0000,15.2165,0.4711,0.00000,0.9742,C\1C\C=C/CC\C=C1
2,"(2E,4E)-2,4-hexadiene","trans,trans-2,4-hexadiene",5194-51-4,C6H10,82.0,0.727,112.9,1.456,30.70,2.220,...,79.0,0.37,3.05,225.18,0.0000,14.9023,0.0451,0.00000,1.3634,C/C=C/C=C/C
3,(dichloromethyl)benzene,(dichloromethyl)-benzene,98-87-3,C7H6Cl2,161.0,1.250,128.8,1.550,41.04,6.900,...,205.0,1.99,2.94,0.24,0.8684,16.7703,0.0000,0.00810,3.8876,ClC(Cl)c1ccccc1
4,"(E)-1,3-pentadiene",1-trans-3-pentadiene,2004-70-8,C5H8,68.0,0.683,99.7,1.430,25.77,2.320,...,42.0,0.30,2.47,702.08,0.0000,12.8734,0.0397,0.00000,1.8346,C/C=C/C=C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
455,triethoxymethane,triethyl orthoformate,122-51-0,C7H16O3,148.0,0.893,166.0,1.390,39.34,4.779,...,143.0,1.08,1.79,0.56,0.0000,18.6300,2.1400,0.00000,2.3563,CCOC(OCC)OCC
456,"N,N-dimethylacetamide","n,n-dimethylacetamide",127-19-5,C4H9NO,87.0,0.942,92.5,1.438,24.31,37.780,...,165.0,0.63,-0.50,10.11,0.0200,11.4300,1.8400,0.00000,2.8469,CN(C)C(C)=O
457,diisopropyl ether,diisopropylether,108-20-3,C6H14O,102.0,0.724,141.1,1.368,31.80,4.040,...,68.0,0.47,2.51,28.72,0.0000,15.5800,0.7900,0.00000,0.7435,CC(C)OC(C)C
458,2-propanol,2-propanol,67-63-0,C3H8O,60.0,0.785,76.5,1.377,17.61,19.920,...,82.0,2.40,0.35,34.41,0.7000,8.9100,1.1400,0.28855,1.0916,CC(C)O


In [19]:
has_nan_1 = solvents['stenutz_name'].isnull().any()
has_nan_2 = solvents['cosmo_name'].isnull().any()

# Print the result
print(f"The Series has NaN values: {has_nan_1}, {has_nan_2}")

The Series has NaN values: False, False


In [20]:
def canonicalize_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, isomericSmiles=True)

# apply the function to the 'smiles' column of the dataframe
solvents['canonical_smiles'] = solvents['smiles'].apply(canonicalize_smiles)



In [5]:
for i in range(len(solvents)):
    if solvents['smiles'][i] == 'O':
        print(i)

426


In [21]:
## Check how many of the solvents in the data are actually solvents
s = data_df['solvent_0']
my_list = solvents['canonical_smiles']

# Count the number of values in the pd.series that are in my_list
in_list = s.isin(my_list).sum()

# Count the number of values in the pd.series that are not in my_list
not_in_list = (~s.isin(my_list)).sum()

# Count the number of values in the pd.series that are None or NaN
null_values = s.isnull().sum()

print(f"{in_list} values are in the list, {len(s)-in_list-null_values} values are not in the list, and {null_values} values are None or NaN")

338045 values are in the list, 5262 values are not in the list, and 144553 values are None or NaN


In [22]:
not_in_list = s[~s.isin(my_list)]

# Sort the values in descending order of frequency
freq_not_in_list = not_in_list.value_counts()

# # Print the values, starting with the most frequent
for value, count in freq_not_in_list.items():
    print(f"{value}: {count}")

O=S(=O)(O)O: 1171
[Na+].[OH-]: 941
CCCCCC.CCOC(C)=O: 463
ClCCl.O=C(O)C(F)(F)F: 367
O=C([O-])O.[Na+]: 299
N: 256
CCN(C(C)C)C(C)C: 190
CCCCCCC.CCOC(C)=O: 175
[K+].[OH-]: 155
CCO.CCOC(C)=O: 134
O=C([O-])[O-].[Na+].[Na+]: 125
[O-][Cl+3]([O-])([O-])O: 125
CCO.O: 116
C1CCOC1.CCO: 104
CS(=O)(=O)O: 76
O=P([O-])([O-])[O-]: 71
CCCCCO: 66
O=C([O-])[O-].[Cs+].[Cs+]: 64
CC(=O)OC(C)C: 57
OCC(F)(F)F: 55
CCO.ClCCl: 54
C=O: 54
[Pd]: 49
[H][H]: 49
COC(OC)N(C)C: 42
CO.ClCCl: 1
C1COCCO1.Cl: 1
CO.ClC(Cl)Cl: 1
CO.O: 1


In [12]:
my_list

0      ClC(Cl)(Cl)c1ccccc1
1         C1=C\CC/C=C\CC/1
2              C/C=C/C=C/C
3          ClC(Cl)c1ccccc1
4                C=C/C=C/C
              ...         
454            CCCCOC(C)=O
455           CCOC(OCC)OCC
456            CC(=O)N(C)C
457            CC(C)OC(C)C
458                 CC(C)O
Name: canonical_smiles, Length: 459, dtype: object

In [13]:
len(s)

487860

In [15]:
len(s.dropna())

343307

In [17]:
my_dict = {}
my_dict[['a', 'b', 'c']] = [1,2,3]

TypeError: unhashable type: 'list'

In [18]:
[1,2]+[1,2,3]

[1, 2, 1, 2, 3]