In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
import math
import copy

from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torch.utils.data.dataset import Subset
from sklearn.model_selection import KFold
from torch.nn.utils.rnn import pad_sequence

import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys

# make symbol dictionary
Save the symbol dictionary as 'OdorCode-40 Symbol Dictionary'.

('chembl_smi.txt', 'tgsc_odorant_1020.txt', and 'tgsc_odorless_1020.txt' are the three files containing the SMILES used in pretraining; replace them with your own files as needed.)

In [None]:
# ---- corpus settings ----
# SMILES strings longer than this many characters are dropped
LIMIT_SMILES_LENGTH = 100
# controls how many lines of SMILES_NBDC are read
Num_NBDC_SMILES = 100000
# path of the SMILES list file on your system
SMILES_NBDC = 'all_data/webScrapping/chembl_smi.txt'

# ---- special tokens ----
# ID_symbol maps ID -> symbol; symbol_ID is the inverse mapping.
# ' ' is the padding symbol and '▪' sits at the mask ID.
ID_symbol = [' ', '[CLS]', '[BOS]', '[EOS]', '▪', 'H']
symbol_ID = {symbol: index for index, symbol in enumerate(ID_symbol)}

PAD_ID, CLS_ID, BOS_ID, EOS_ID, MSK_ID = 0, 1, 2, 3, 4

# next free symbol ID (one past the last pre-registered symbol)
sID = len(ID_symbol)

#---------------------------------------------------------------------------------------------------------#
# Keep only SMILES whose length is at most LIMIT_SMILES_LENGTH (100),
# store the kept (canonicalized) SMILES in Pretrain_SMILES_STR_LIST, and
# build the corresponding symbol_ID / ID_symbol tables
#---------------------------------------------------------------------------------------------------------#

def arabic_number(c):
  """Return True if `c` is a non-empty string made only of ASCII digits 0-9.

  A plain substring test (`c in '0123456789'`) would accept the empty string
  and judge multi-digit strings by whether they happen to be consecutive
  digits, so every character is checked individually instead.
  """
  return c != '' and all(ch in '0123456789' for ch in c)


def make_list_and_dictionary(smiles_str):
  """Filter one SMILES string and, if accepted, record it and its atom symbols.

  Accepted SMILES are appended (in the form returned by Chem.MolToSmiles) to
  the global Pretrain_SMILES_STR_LIST, and every atom symbol not yet known is
  registered in the global symbol_ID / ID_symbol tables (aromatic atoms are
  registered in lower case).

  A SMILES is rejected when:
    * it is longer than LIMIT_SMILES_LENGTH (not counted in num_skip),
    * it contains '[' immediately followed by a digit, i.e. an isotope label,
    * RDKit cannot parse it,
    * the RDKit output is not stable under a second parse/write round trip.
  The last three cases increment the global num_skip.

  Args:
    smiles_str: the SMILES string to examine.

  Returns:
    None. Results are communicated through the module-level globals.
  """
  global sID
  global num_skip

  if len(smiles_str) > LIMIT_SMILES_LENGTH :
    return

  # If the SMILES includes a number representing an isotope then return.
  # Stop one character early so smiles_str[i+1] never runs past the end
  # (a SMILES ending in '[' used to raise IndexError here).
  for i in range(len(smiles_str) - 1):
    if smiles_str[i] == '[' and arabic_number(smiles_str[i+1]):
      # print('This smiles includes isotope: ', smiles_str)
      num_skip += 1
      return

  mol = Chem.MolFromSmiles(smiles_str)
  if mol is None :
    # print('The mol is None for SMILES: ', smiles_str)
    num_skip += 1
    return

  # Write the molecule out, parse that output again and write it once more;
  # keep the SMILES only if both written forms agree.
  smiles_str1 = Chem.MolToSmiles(mol, rootedAtAtom=-1)
  mol_tmp = Chem.MolFromSmiles(smiles_str1)
  smiles_str2 = Chem.MolToSmiles(mol_tmp, rootedAtAtom=-1)
  if smiles_str1 != smiles_str2:
    num_skip += 1
    return

  # assign ID to symbols
  for a in mol.GetAtoms():
    atom = a.GetSymbol()
    if a.GetIsAromatic() :
      atom = str.lower(atom)
    if atom not in symbol_ID:
      symbol_ID[atom] = sID
      sID += 1
      ID_symbol.append(atom)

  Pretrain_SMILES_STR_LIST.append(smiles_str1)




#----------------------------------------------------------------------
#  make the dictionary of symbols
#----------------------------------------------------------------------
import pickle


def _register_symbol(symbol):
  """Give `symbol` the next free ID unless it is already in symbol_ID."""
  global sID
  if symbol in symbol_ID:
    # Already known (e.g. this cell was re-run): keep the existing ID so that
    # symbol_ID and ID_symbol stay consistent.
    return
  symbol_ID[symbol] = sID
  sID += 1
  ID_symbol.append(symbol)


Pretrain_SMILES_STR_LIST = []
# Counts SMILES rejected by make_list_and_dictionary as isotope-labelled,
# unparsable or not round-trip stable. Over-long SMILES are dropped without
# being counted.
num_skip = 0
# Counts input lines that have no usable second field.
num_malformed = 0

#-------------- read SMILES from files ---------------------
# Each line is expected to hold space-separated fields with the SMILES in the
# second one. At most Num_NBDC_SMILES lines are processed.
count = 0
with open(SMILES_NBDC,'r') as inF:
  for line in inF:
    if count >= Num_NBDC_SMILES:
      break
    count += 1
    if count % 10000 == 0:
      print(count, end=' ')
    if count % 100000 == 0:
      print()

    # Drop the line terminator so it neither counts toward the length limit
    # nor ends up inside the SMILES.
    x = line.rstrip('\r\n').split(" ")
    if len(x) < 2 or x[1] == '':
      num_malformed += 1
      continue

    smiles_str = x[1]
    make_list_and_dictionary(smiles_str)


#-------------------- register symbols to dictionary --------------------------
# structural / bond / charge / chirality characters
Symbols1 = '()[]+-.\\/=#@'
for c in Symbols1:
  _register_symbol(c)
_register_symbol('@@')

# ring-closure labels: single digits 1-9, then two-digit labels %10 .. %49
for i in range(1,10):
  _register_symbol(str(i))
for i in range(10,50):
  _register_symbol('%'+str(i))

#---------------- save Pretrain_SMILES_STR_LIST, symbol_ID, ID_symbol, sID  ---------------
# Make sure the output directory exists before writing into it.
os.makedirs('CHEMBL', exist_ok=True)

with open('CHEMBL/OdorCode-40 Pretrain_SMILES_STR_LIST', 'wb') as f:
  pickle.dump(Pretrain_SMILES_STR_LIST, f)
print('Saved Pretrain_SMILES_STR_LIST')

lists = [symbol_ID, ID_symbol, sID, ]
with open('CHEMBL/OdorCode-40 Symbol Dictionary', 'wb') as f:
  pickle.dump(lists, f)
print('Saved Symbol Dictionary')

print('Number of smiles in Pretrain_SMILES_STR_LIST: ', len(Pretrain_SMILES_STR_LIST))
print('sID = ', sID)
print('number of skipped smiles (without long smiles) :', num_skip)
if num_malformed:
  print('number of input lines without a SMILES field :', num_malformed)
print(symbol_ID)
print(ID_symbol)
print('max length of symbol = ', max([len(s) for s in ID_symbol]))

In [None]:
# Sanity check: re-parse every stored SMILES, write it back with RDKit and
# report each entry whose re-written form differs from the stored string.
for smiles_str in Pretrain_SMILES_STR_LIST:
  mol = Chem.MolFromSmiles(smiles_str)
  cannonical_smiles_str = Chem.MolToSmiles(mol)
  if cannonical_smiles_str == smiles_str:
    continue
  print(' %s  is not cannonical'%smiles_str)

# some functions 
(1) smiles_str2smiles: translate a SMILES string into a list of symbol IDs

(2) smiles2smiles_str: translate a list of symbol IDs back into a SMILES string

In [None]:
#----------------------------------#
#           smiles_str2smiles      #
#----------------------------------#
# convert a SMILES string to a list of symbol IDs

# Length (in characters) of the longest entry of ID_symbol; smiles_str2smiles
# uses it as the widest window to try when matching a symbol.
max_length_symbol = len(max(ID_symbol, key=len))

def smiles_str2smiles(smiles_str, flag=False):
  """Convert a SMILES string into a list of symbol IDs.

  At each position the longest substring (up to max_length_symbol characters)
  that is a key of symbol_ID is taken, so a multi-character entry such as
  'Na' or '@@' is emitted as one symbol rather than as its single characters.

  Args:
    smiles_str: the SMILES string to tokenize.
    flag: unused; kept so existing callers that pass it keep working.

  Returns:
    The list of symbol IDs. If some position matches no known symbol, a
    message naming the string and the position is printed and the IDs
    collected up to that position are returned (the list is then shorter
    than the input requires).
  """
  smiles = []
  i = 0
  while i < len(smiles_str):
    # try the widest window first, then shrink
    for j in range(max_length_symbol, 0, -1):
      token = smiles_str[i:i+j]
      if i+j <= len(smiles_str) and token in symbol_ID:
        smiles.append(symbol_ID[token])
        i += j
        break
    else:
      # No window matched: report where, so the offending SMILES can be found.
      print('something wrong on converting smiles_str to smiles: '
            'no known symbol at position %d of %r' % (i, smiles_str))
      break
  return smiles

#----------------------------------#
#           smiles2smiles_str      #
#----------------------------------#
def smiles2smiles_str(smiles):
  """Convert a list of symbol IDs back into a SMILES string.

  Args:
    smiles: iterable of integer symbol IDs (indices into ID_symbol).

  Returns:
    The concatenation of the symbols the IDs stand for; '' for empty input.

  Raises:
    IndexError: if an ID is outside the range of ID_symbol.
  """
  # join avoids repeated string concatenation and no longer shadows builtin `id`
  return ''.join(ID_symbol[symbol_id] for symbol_id in smiles)

# save processed SMILES 
Translate each SMILES string into a list of symbol IDs.

Save the translated SMILES (lists of symbol IDs) to the file 'OdorCode-40 Pretrain_smiles_list'.

In [None]:
import pickle

# Symbol-ID version of every entry of Pretrain_SMILES_STR_LIST
# (same order, one list of IDs per SMILES string).
pretrain_smiles_list = []

no=0
for smiles_str in Pretrain_SMILES_STR_LIST:
  smiles = smiles_str2smiles(smiles_str)
  pretrain_smiles_list.append(smiles)

  # progress: a number every 10000 items, with a line break every 100000
  no += 1
  if no%100000==0 :
    print(no)
  elif no%10000==0 :
    print(no,end=' ')

#----------------- save --------------------
# `with` closes the file even if pickling fails part-way.
with open('CHEMBL/OdorCode-40 Pretrain_smiles_list', 'wb') as f:
  pickle.dump(pretrain_smiles_list, f)
print('Saved pretrain_smiles_list')

# make input and target SMILES for 2-encoder model
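For each canonical SMILES (input), generate 5 randomly re-rooted SMILES (targets) using 'Chem.MolToSmiles(mol, rootedAtAtom)'. The root atoms are drawn at random with replacement and the results are not de-duplicated, so at most 5 of them are different.

Pair the input and target SMILES and save them to the file 'OdorCode-40 Pretrain MLM_data' (the SMILES saved in the file are represented as lists of symbol IDs).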

In [None]:
#
# generate non-canonical smiles
#
import pickle

NumTrys = 5   # number of randomly rooted SMILES generated per molecule

NumSmiles = len(pretrain_smiles_list)


# Parallel lists: canonical_smiles_list[k] is the input and smiles_list[k] the
# target of pair k, both as lists of symbol IDs.
canonical_smiles_list = []
smiles_list = []

for no in range(NumSmiles):
  if no%1000 == 0:
    print(no, end=' ')
  if no%10000 == 0:
    print('')

  x = pretrain_smiles_list[no]
  if len(x)<2 :
    continue

  x_str = Pretrain_SMILES_STR_LIST[no]

  # skip SMILES containing '.' (several disconnected parts)
  if x_str.find('.') >= 0:
    continue

  mol = Chem.MolFromSmiles(x_str)
  if mol is None:
    # cannot re-root a SMILES that RDKit fails to parse
    continue

  # generate non-canonical smiles (x2_str)
  num_atoms = len(mol.GetAtoms())
  if num_atoms == 0:
    # random.choice would raise on an empty candidate list
    continue
  pos_list = list(range(num_atoms))

  for num_try in range(NumTrys) :
    # the root atom is drawn with replacement, so targets may repeat
    pos = random.choice(pos_list)
    x2_str = Chem.MolToSmiles(mol, rootedAtAtom=pos)
    x2 = smiles_str2smiles(x2_str)

    canonical_smiles_list.append(x)
    smiles_list.append(x2)

# Preview the first pairs; capped so that a small data set does not raise
# IndexError.
for i in range(min(20, len(canonical_smiles_list))):
  print('canonical  : ', smiles2smiles_str(canonical_smiles_list[i]))
  print('?canonical : ', smiles2smiles_str(smiles_list[i]))

#----------------- save to file --------------------
pretraining_data = [canonical_smiles_list, smiles_list]
# `with` closes the file even if pickling fails part-way.
with open('CHEMBL/OdorCode-40 Pretrain MLM_data', 'wb') as f:
  pickle.dump(pretraining_data, f)
print('Saved pretraining data')