In [1]:
from csv import DictReader

# Read the compound table into memory. DictReader takes its keys from the
# first row of the file, so every entry in `compounds` is one dict per row.
# newline='' lets the csv module do its own newline handling, which is what
# its documentation requires for quoted fields containing line breaks.
with open('compounds.csv', 'r', newline='') as csv_file:
    reader = DictReader(csv_file)
    # Materialize the rows while the file is still open; the reader is lazy.
    compounds = list(reader)
# No explicit close(): leaving the with-block has already closed the file.

print(len(compounds))
# Preview at most five rows. Slicing (rather than indexing range(5)) keeps
# this from raising IndexError when the file holds fewer than five rows.
for compound in compounds[:5]:
    print(compound)

1277
{'CAS Number': '25-65-0', 'SMILES': 'CC(C)(C)O', 'Formula': 'C4H10O', 'IUPAC Name': '2-methylpropan-2-ol'}
{'CAS Number': '33-51-2', 'SMILES': 'COC1=C(C(=CC=C1)OC)O', 'Formula': 'C8H10O3', 'IUPAC Name': '2,6-dimethoxyphenol'}
{'CAS Number': '56-81-5', 'SMILES': 'C(C(CO)O)O', 'Formula': 'C3H8O3', 'IUPAC Name': 'propane-1,2,3-triol'}
{'CAS Number': '57-55-6', 'SMILES': 'CC(CO)O', 'Formula': 'C3H8O2', 'IUPAC Name': 'propane-1,2-diol'}
{'CAS Number': '60-01-5', 'SMILES': 'CCCC(=O)OCC(COC(=O)CCC)OC(=O)CCC', 'Formula': 'C15H26O6', 'IUPAC Name': '2,3-di(butanoyloxy)propyl butanoate'}


In [2]:
# Load the project-local smiles_encoder module. The relative import is tried
# first; if it cannot be resolved (e.g. this code is not running inside a
# package), fall back to a plain absolute import.
try:
    from . import smiles_encoder
except ImportError:
    # Catch only import failures. A bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide genuine errors raised while the
    # module itself is executing.
    import smiles_encoder

# Pull the SMILES column out of every compound row loaded earlier.
smiles_strings = [c['SMILES'] for c in compounds]

# The encoder is constructed from the full list of SMILES strings; the size
# of its `element_dict` is reported below.
# NOTE(review): presumably element_dict maps each distinct token to an index
# -- confirm against smiles_encoder.SmilesEncoder.
encoder = smiles_encoder.SmilesEncoder(smiles_strings)
print(f'Number of unique dictionary elements: {len(encoder.element_dict.keys())}')


Number of unique dictionary elements: 51


In [3]:
# Encode every SMILES string and report how many encodings came back.
encoded_smiles = encoder.encode_many(smiles_strings)
encoded_count = len(encoded_smiles)
print(f'Number of encoded SMILES strings: {encoded_count}\n')

# First sample: outer/inner lengths, the source string, then the encoding.
first_encoding = encoded_smiles[0]
first_dims = (len(first_encoding), len(first_encoding[0]))
print(f'First sample: {first_dims}')
print(smiles_strings[0])
print(first_encoding, '\n')

# Second sample, printed the same way but without the trailing blank line.
second_encoding = encoded_smiles[1]
second_dims = (len(second_encoding), len(second_encoding[0]))
print(f'Second sample: {second_dims}')
print(smiles_strings[1])
print(second_encoding)

# Legend for the (outer, inner) length pairs printed above.
print('(Shape [n_elements, n_dictionary_elements])\n')

Number of encoded SMILES strings: 1277

First sample: (9, 51)
CC(C)(C)O
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [4]:
# Round-trip check: decode the encodings and compare the result against the
# original list of SMILES strings. Prints True when the two lists are equal.
decoded_smiles = encoder.decode_many(encoded_smiles)
round_trip_matches = decoded_smiles == smiles_strings
print(round_trip_matches)

True
