In [29]:
import selfies as sf
from rdkit import Chem

In [30]:
original_smiles = "O=Cc1ccccc1"

# Pre-bind the results so that a failed translation leaves them as None
# instead of unbound names that only blow up (with a misleading NameError)
# in a later cell.
encoded_selfies = None
decoded_smiles = None

try:
    encoded_selfies = sf.encoder(original_smiles)  # SMILES -> SELFIES
    decoded_smiles = sf.decoder(encoded_selfies)  # SELFIES -> SMILES
except sf.EncoderError as err:
    # Report the failure instead of silently swallowing it.
    print(f"sf.encoder failed for {original_smiles!r}: {err}")
except sf.DecoderError as err:
    print(f"sf.decoder failed for {encoded_selfies!r}: {err}")

In [31]:
# SELFIES form of original_smiles, as returned by sf.encoder.
encoded_selfies

'[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]'

In [32]:
# SMILES recovered from the SELFIES string. It is written with explicit
# alternating double bonds, so it differs textually from original_smiles.
decoded_smiles

'O=CC1=CC=CC=C1'

In [33]:
# The raw strings differ, so compare RDKit's canonical form of each to check
# that the round trip yields the same canonical SMILES.
Chem.CanonSmiles(original_smiles) == Chem.CanonSmiles(decoded_smiles)

True

In [34]:
# Inspect the 'default' preset: a mapping from atom type (optionally with a
# charge suffix such as '+1' / '-1') to an integer limit.
sf.get_preset_constraints('default')

{'H': 1,
 'F': 1,
 'Cl': 1,
 'Br': 1,
 'I': 1,
 'B': 3,
 'B+1': 2,
 'B-1': 4,
 'O': 2,
 'O+1': 3,
 'O-1': 1,
 'N': 3,
 'N+1': 4,
 'N-1': 2,
 'C': 4,
 'C+1': 5,
 'C-1': 3,
 'P': 5,
 'P+1': 6,
 'P-1': 4,
 'S': 6,
 'S+1': 7,
 'S-1': 5,
 '?': 8}

In [35]:
# Decode with the constraints currently in effect: the double bond on Li and
# the triple bond on the terminal S are both kept in the output.
sf.decoder("[Li][=C][C][S][=C][C][#S]")

'[Li]=CCS=CC#S'

In [36]:
# Start from a copy of the default preset, then add a limit of 1 for Li and
# lower the limit for S to 2 before installing the table.
new_constraints = sf.get_preset_constraints("default")
new_constraints.update({'Li': 1, 'S': 2})

sf.set_semantic_constraints(new_constraints)

In [37]:
# Confirm the constraints now in effect: 'S' should read 2 and a new 'Li'
# entry should be present.
sf.get_semantic_constraints()

{'H': 1,
 'F': 1,
 'Cl': 1,
 'Br': 1,
 'I': 1,
 'B': 3,
 'B+1': 2,
 'B-1': 4,
 'O': 2,
 'O+1': 3,
 'O-1': 1,
 'N': 3,
 'N+1': 4,
 'N-1': 2,
 'C': 4,
 'C+1': 5,
 'C-1': 3,
 'P': 5,
 'P+1': 6,
 'P-1': 4,
 'S': 2,
 'S+1': 7,
 'S-1': 5,
 '?': 8,
 'Li': 1}

In [38]:
# Same SELFIES string as before, decoded under the custom Li/S limits: the
# bond orders around Li and S come out lower than with the default table.
sf.decoder("[Li][=C][C][S][=C][C][#S]")

'[Li]CCSCC=S'

In [39]:
# Called with no argument to undo the custom Li/S limits; the next decode
# reproduces the output seen before they were applied.
sf.set_semantic_constraints()

In [40]:
# After the reset, the same SELFIES string decodes as it did before the
# custom constraints were installed.
sf.decoder("[Li][=C][C][S][=C][C][#S]")

'[Li]=CCS=CC#S'

In [41]:
smiles_dataset = ["COC", "FCF", "O=O", "O=Cc1ccccc1"]
# Translate every SMILES entry into its SELFIES equivalent, keeping order.
selfies_dataset = [sf.encoder(smiles) for smiles in smiles_dataset]

selfies_dataset

['[C][O][C]',
 '[F][C][F]',
 '[O][=O]',
 '[O][=C][C][=C][C][=C][C][=C][Ring1][=Branch1]']

In [42]:
# Length of the longest entry, as measured by sf.len_selfies.
max_len = max(map(sf.len_selfies, selfies_dataset))
max_len

10

In [43]:
# Gather every symbol used in the dataset, add the "[nop]" symbol, and sort
# so the resulting list has a deterministic order.
alphabet = sorted({*sf.get_alphabet_from_selfies(selfies_dataset), "[nop]"})
alphabet

['[=Branch1]', '[=C]', '[=O]', '[C]', '[F]', '[O]', '[Ring1]', '[nop]']

In [44]:
# Symbol -> integer index, numbered in the order the symbols appear in alphabet.
vocab_stoi = dict(zip(alphabet, range(len(alphabet))))
# Integer index -> symbol: the inverse mapping of vocab_stoi.
vocab_itos = dict(zip(vocab_stoi.values(), vocab_stoi.keys()))

vocab_stoi

{'[=Branch1]': 0,
 '[=C]': 1,
 '[=O]': 2,
 '[C]': 3,
 '[F]': 4,
 '[O]': 5,
 '[Ring1]': 6,
 '[nop]': 7}

In [45]:
dimethyl_ether = selfies_dataset[0]
# Produce both encodings at once (integer labels and one-hot rows), padded
# out to max_len.
label, one_hot = sf.selfies_to_encoding(
    dimethyl_ether,
    vocab_stoi,
    pad_to_len=max_len,
)

In [46]:
# Integer-label encoding: one vocab_stoi index per position, with the
# trailing positions filled by the index of "[nop]".
label

[3, 5, 3, 7, 7, 7, 7, 7, 7, 7]

In [47]:
# One-hot encoding: one row per position and one column per vocabulary entry.
one_hot

[[0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 1]]

In [49]:
# Map the integer labels back to a SELFIES string via vocab_itos.
# NOTE: this rebinds dimethyl_ether, and the padding positions come back as
# literal "[nop]" symbols, so the result is longer than the original string.
dimethyl_ether = sf.encoding_to_selfies(label, vocab_itos, enc_type="label")
dimethyl_ether

'[C][O][C][nop][nop][nop][nop][nop][nop][nop]'

In [50]:
# Same reconstruction, this time starting from the one-hot rows; it yields
# the same padded SELFIES string as the label-based decoding.
# NOTE: this rebinds dimethyl_ether again.
dimethyl_ether = sf.encoding_to_selfies(one_hot, vocab_itos, enc_type="one_hot")
dimethyl_ether

'[C][O][C][nop][nop][nop][nop][nop][nop][nop]'

In [None]:
# Break a SELFIES string into its individual bracketed symbols; list() is
# used to collect what split_selfies yields so it can be displayed.
list(sf.split_selfies("[C][O][C]"))

['[C]', '[O]', '[C]']