In [None]:
import copy
import math
import os
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import KFold
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, ConcatDataset
from torch.utils.data.dataset import Subset

# Make the symbol dictionary
Build the symbol dictionary and save it as 'OdorCode-40 Symbol Dictionary'.

('chembl_smi.txt', 'tgsc_odorant_1020.txt' and 'tgsc_odorless_1020.txt' are the three files containing the SMILES used for pretraining; replace them with your own files as needed.)

In [None]:
LIMIT_SMILES_LENGTH = 100      # provisional: keep only SMILES of at most 100 characters
Num_NBDC_SMILES = 100000       # provisional: upper bound on how many lines of SMILES_NBDC are read
SMILES_NBDC = 'all_data/webScrapping/chembl_smi.txt'
F_odor      = 'all_data/webScrapping/tgsc_odorant_1020.txt'           # SMILES of odorous substances (in fact exactly one entry is odorless)
F_odorless  = 'all_data/webScrapping/tgsc_odorless_1020.txt'  # SMILES of odorless substances


# Reserved symbols listed in ID order; ' ' is the padding symbol.
ID_symbol = [' ', '[CLS]', '[BOS]', '[EOS]', '▪', 'H']
# Reverse lookup (symbol -> ID), derived from the list so the two cannot drift apart.
symbol_ID = {symbol: index for index, symbol in enumerate(ID_symbol)}
PAD_ID = symbol_ID[' ']
CLS_ID = symbol_ID['[CLS]']
BOS_ID = symbol_ID['[BOS]']
EOS_ID = symbol_ID['[EOS]']
MSK_ID = symbol_ID['▪']
sID = len(ID_symbol)  # next ID to hand out when a new symbol is registered

#---------------------------------------------------------------------------------------------------------#
#  From F_odor, F_odorless and the NBDC Nikkaji database file (SMILES_NBDC; its first Num_NBDC_SMILES
#  entries), extract the SMILES that are at most LIMIT_SMILES_LENGTH characters long, yield a valid mol
#  and contain no isotope mass number such as [10B]; collect them in Pretrain_SMILES_STR_LIST and, at the
#  same time, register their element symbols in symbol_ID and ID_symbol.
#---------------------------------------------------------------------------------------------------------#

def arabic_number(c):
  """Return True if `c` is exactly one ASCII digit character ('0'-'9').

  The explicit length check matters: a plain substring test would also
  accept the empty string and multi-character runs such as '12'.
  """
  return len(c) == 1 and c in '0123456789'


#------ Collect the SMILES that satisfy the constraints and build symbol_ID / ID_symbol -----
#          While checking the constraints, smiles_str is converted to its canonical SMILES,
#          and that canonical string is what gets stored in the list.
def make_list_and_dictionary(smiles_str):
  """Validate one SMILES string and, if accepted, register it.

  On acceptance the canonical SMILES is appended to Pretrain_SMILES_STR_LIST and
  every atom symbol of the molecule (lower-cased for aromatic atoms) that is not
  yet known is added to symbol_ID / ID_symbol, advancing the global sID.

  The string is rejected (nothing is appended) when:
    * it is longer than LIMIT_SMILES_LENGTH (not counted in num_skip),
    * it contains '[' immediately followed by a digit (isotope notation),
    * RDKit cannot parse it,
    * its canonical SMILES is not stable under a second parse/write round trip.
  All rejections except the length one increment the global num_skip.

  Args:
    smiles_str: the SMILES string to examine.

  Returns:
    None; results are written to module-level state.
  """
  global sID
  global num_skip

  if len(smiles_str) > LIMIT_SMILES_LENGTH :
    return

  # '[' followed by a digit is an isotope specification: skip the whole SMILES.
  # Stopping one character early keeps smiles_str[i+1] in range when the
  # string ends with '['.
  for i in range(len(smiles_str) - 1):
    if smiles_str[i] == '[' and arabic_number(smiles_str[i+1]):
      # print('This smiles includes isotope: ', smiles_str)
      num_skip += 1
      return

  mol = Chem.MolFromSmiles(smiles_str)
  if mol is None :
    # print('The mol is None for SMILES: ', smiles_str)
    num_skip += 1
    return

  ### Added as a precaution ###
  # For some molecules (e.g. certain cis/trans isomers), repeating
  # smiles --> mol --> canonical smiles --> mol --> ... keeps flipping part of the
  # structure between cis and trans.
  #  Example: CCCc1c/c/2=c\3/[nH]/c(=C\C=c/4\c(c/c(=c\5/[nH]/c(=C\C=c\1/[nH]2)/c(c5)CCC)/[nH]4)CCC)/c(c3)CCC (see the next cell)
  # Therefore, with smiles_str0 (smiles_str) -> mol -> smiles_str1 -> mol -> smiles_str2,
  # entries where smiles_str1 != smiles_str2 are dropped from the list.

  smiles_str1 = Chem.MolToSmiles(mol, rootedAtAtom=-1)
  mol_tmp = Chem.MolFromSmiles(smiles_str1)
  if mol_tmp is None :
    # The canonical string could not be parsed back: treat it like an unstable
    # round trip instead of letting MolToSmiles fail on None.
    num_skip += 1
    return
  smiles_str2 = Chem.MolToSmiles(mol_tmp, rootedAtAtom=-1)
  if smiles_str1 != smiles_str2:
    num_skip += 1
    return

  # Assign an ID to every atom symbol that is not registered yet.
  for a in mol.GetAtoms():
    atom = a.GetSymbol()
    if a.GetIsAromatic() :
      atom = str.lower(atom)
    if atom not in symbol_ID:
      symbol_ID[atom] = sID
      sID += 1
      ID_symbol.append(atom)

  Pretrain_SMILES_STR_LIST.append(smiles_str1)




#----------------------------------------------------------------------
#  Build the list of SMILES strings used to pretrain the symbol
#  embedding, together with the symbol dictionary
#----------------------------------------------------------------------

Pretrain_SMILES_STR_LIST = []
num_skip = 0 # number of skipped SMILES (those rejected only for exceeding the length limit are not counted)


def _register_smiles_file(path, max_lines=None, show_progress=False):
  """Pass the SMILES field of each line of `path` to make_list_and_dictionary.

  Every line is split on single spaces and the second field is taken as the
  SMILES. Surrounding whitespace is stripped so that a trailing newline (when
  the SMILES is the last field) does not count toward the length limit. Lines
  without a second field, or with an empty one, are ignored instead of raising
  IndexError.

  Args:
    path: text file to read.
    max_lines: if given, only the first `max_lines` lines are processed.
    show_progress: if True, print the line number every 10,000 lines and a
      line break every 100,000 lines.
  """
  with open(path, 'r') as inF:
    for line_no, line in enumerate(inF, start=1):
      if max_lines is not None and line_no > max_lines:
        break

      fields = line.split(" ")
      if len(fields) >= 2:
        smiles_field = fields[1].strip()
        if smiles_field:
          make_list_and_dictionary(smiles_field)

      if show_progress:
        if line_no % 10000 == 0:
          print(line_no, end=' ')
        if line_no % 100000 == 0:
          print()


#---------------- process F_odor -------------------------
_register_smiles_file(F_odor)

#--------------- process F_odorless -------------------
_register_smiles_file(F_odorless)

#-------------- process SMILES_NBDC ---------------------
# Exactly the first Num_NBDC_SMILES lines are used (the previous loop stopped
# one line short because it tested the counter before reading).
_register_smiles_file(SMILES_NBDC, max_lines=Num_NBDC_SMILES, show_progress=True)


#-------------------- register the symbols that are not element symbols --------------------------
Symbols1 = '()[]+-.\\/=#@'

# IDs are handed out in list order, so the order below is significant:
# the characters of Symbols1, then '@@', then the labels 1-9 and %10-%49.
extra_symbols = list(Symbols1) + ['@@']
extra_symbols += [str(label) for label in range(1, 10)]
extra_symbols += ['%' + str(label) for label in range(10, 50)]

for c in extra_symbols:
  symbol_ID[c] = sID
  sID += 1
  ID_symbol.append(c)

#---------------- save Pretrain_SMILES_STR_LIST and the symbol dictionary (symbol_ID, ID_symbol, sID) ---------------
# `with` closes each file even if pickling raises.
with open('CHEMBL/OdorCode-40 Pretrain_SMILES_STR_LIST', 'wb') as f:
  pickle.dump(Pretrain_SMILES_STR_LIST, f)
print('Saved Pretrain_SMILES_STR_LIST')

with open('CHEMBL/OdorCode-40 Symbol Dictionary', 'wb') as f:
  lists = [symbol_ID, ID_symbol, sID, ]
  pickle.dump(lists, f)
print('Saved Symbol Dictionary')

# Summary of what was collected.
print('Number of smiles in Pretrain_SMILES_STR_LIST: ', len(Pretrain_SMILES_STR_LIST))
print('sID = ', sID)
print('number of skipped smiles (without long smiles) :', num_skip)
print(symbol_ID)
print(ID_symbol)
print('max length of symbol = ', max([len(s) for s in ID_symbol]))
print('max length of symbol = ', max([len(s) for s in ID_symbol]))

In [None]:
# Sanity check: each stored SMILES should come back unchanged from a
# parse/write round trip; report every entry that does not.
for smiles_str in Pretrain_SMILES_STR_LIST:
  mol = Chem.MolFromSmiles(smiles_str)
  cannonical_smiles_str = Chem.MolToSmiles(mol)
  if cannonical_smiles_str == smiles_str :
    continue
  print(' %s  is not cannonical'%smiles_str)

# Helper functions
(1) smiles_str2smiles: translates a SMILES string into a list of symbol IDs.

(2) smiles2smiles_str: translates a list of symbol IDs back into a SMILES string.

In [None]:
#----------------------------------#
#           smiles_str2smiles      #
#----------------------------------#
# Convert a SMILES string into a list of symbol IDs (symbols may be one, two, three or more characters long).

# Length of the longest registered symbol; upper bound for the longest-match search in smiles_str2smiles.
max_length_symbol = max(map(len, ID_symbol))

def smiles_str2smiles(smiles_str, flag=False):
  """Convert a SMILES string into a list of symbol IDs.

  The string is scanned left to right; at each position the longest substring
  (up to max_length_symbol characters) found in symbol_ID is consumed, so a
  multi-character symbol such as 'Na' becomes a single ID.

  If no registered symbol matches at some position, a message is printed and
  the IDs converted so far are returned (the result is then incomplete).

  Args:
    smiles_str: the SMILES string to convert.
    flag: unused in this function.

  Returns:
    list of int symbol IDs.

  NOTE(review): longest-match is greedy, so two adjacent one-character symbols
  are merged whenever their concatenation is itself registered (e.g. 'S' + 'c'
  if 'Sc' is in the dictionary) - confirm whether this can occur in the data.
  """
  ids = []
  pos = 0
  total = len(smiles_str)
  while pos < total:
    # Try the longest candidate first, then shrink.
    for width in range(max_length_symbol, 0, -1):
      candidate = smiles_str[pos:pos+width]
      if pos + width <= total and candidate in symbol_ID:
        ids.append(symbol_ID[candidate])
        pos += width
        break
    else:
      # No registered symbol starts here: give up on the rest of the string.
      print('something wrong on converting smiles_str to smiles')
      break
  return ids

#----------------------------------#
#           smiles2smiles_str      #
#----------------------------------#
def smiles2smiles_str(smiles):
  """Rebuild the SMILES string encoded by a list of symbol IDs.

  Each ID in `smiles` is looked up in ID_symbol and the resulting symbols are
  concatenated in order.
  """
  return ''.join(ID_symbol[symbol_id] for symbol_id in smiles)

# Save the processed SMILES
Translate each SMILES into a list of symbol IDs.

Save the translated SMILES (lists of symbol IDs) to the file 'OdorCode-40 Pretrain_smiles_list'.

In [None]:
pretrain_smiles_list = []

# Tokenise every pretraining SMILES; print the running count every 10,000
# entries and end the line every 100,000.
for no, smiles_str in enumerate(Pretrain_SMILES_STR_LIST, start=1):
  smiles = smiles_str2smiles(smiles_str)
  pretrain_smiles_list.append(smiles)

  if no%100000==0 :
    print(no)
  elif no%10000==0 :
    print(no,end=' ')

#----------------- save --------------------
# `with` closes the file even if pickling raises.
with open('CHEMBL/OdorCode-40 Pretrain_smiles_list', 'wb') as f:
  pickle.dump(pretrain_smiles_list, f)
print('Saved pretrain_smiles_list')

# Make input and target SMILES for the 2-encoder model
For each canonical SMILES (input), generate 5 re-rooted SMILES (targets) with 'Chem.MolToSmiles(mol, rootedAtAtom)'. The root atom is chosen at random, so the targets are not guaranteed to be distinct (at most 5 different targets per input).

Pair the input and target SMILES and save them to the file 'OdorCode-40 Pretrain MLM_data' (the SMILES saved in the file are represented as lists of symbol IDs).

In [None]:
#
# Generate the samples for the SMILES-identity task
#

NumTrys = 5  # number of re-rooted SMILES generated per input SMILES
PAIR_SEED = 0  # seed of the private RNG below, so the saved dataset is reproducible

NumSmiles = len(pretrain_smiles_list)   # build as much data as possible (the amount actually used can be chosen at training time)

# A dedicated generator keeps this cell reproducible without touching the
# state of the global `random` module.
pair_rng = random.Random(PAIR_SEED)

canonical_smiles_list = []
smiles_list = []

for no in range(NumSmiles):
  if no%1000 == 0:
    print(no, end=' ')
  if no%10000 == 0:
    print('')

  x = pretrain_smiles_list[no]
  if len(x)<2 :
    continue

  x_str = Pretrain_SMILES_STR_LIST[no]

  if '.' in x_str:
    continue  #  components loosely joined by '.' ==> no re-rooted variants are generated

  mol = Chem.MolFromSmiles(x_str)

  # Re-root the molecule at a randomly chosen atom.
  # NOTE(review): nothing checks that x2_str differs from x_str or from the
  # other tries, so duplicates are possible - confirm this is intended.
  num_atoms = len(mol.GetAtoms())

  for num_try in range(NumTrys) :
    pos = pair_rng.randrange(num_atoms)
    x2_str = Chem.MolToSmiles(mol, rootedAtAtom=pos)
    x2 = smiles_str2smiles(x2_str)

    canonical_smiles_list.append(x)
    smiles_list.append(x2)

# Preview the first pairs; bounded by the list length so a small dataset does
# not raise IndexError.
for i in range(min(20, len(smiles_list))):
  print('canonical  : ', smiles2smiles_str(canonical_smiles_list[i]))
  print('?canonical : ', smiles2smiles_str(smiles_list[i]))

#----------------- save --------------------
pretraining_data = [canonical_smiles_list, smiles_list]
# `with` closes the file even if pickling raises.
with open('CHEMBL/OdorCode-40 Pretrain MLM_data', 'wb') as f:
  pickle.dump(pretraining_data, f)
print('Saved pretraining data')