In [None]:
from keras.models import load_model, save_model
from keras.layers import Dense, Dropout, Flatten, Input, Concatenate, Embedding
from keras.layers import Conv1D, MaxPooling1D, Activation, BatchNormalization, Flatten
from keras.models import Model
import pandas as pd
import numpy as np
import h5py as h5
import keras
import logging
import random

# Seed every random number generator this notebook can reach. Seeding only
# Python's `random` module leaves NumPy and the Keras backend (which draws the
# initial layer weights) unseeded, so two runs would train different models.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
if hasattr(keras.utils, "set_random_seed"):
    # Not available in older Keras releases, hence the guard.
    keras.utils.set_random_seed(SEED)

# Training and test tables, read from the mounted Google Drive folder.
data = pd.read_csv("drive/MyDrive/trainDeepCCS_editrow.csv")
test = pd.read_csv("drive/MyDrive/testDeepCCS.csv")

# When the CSV files sit next to the notebook, load them like this instead:
#data = pd.read_csv("trainDeepCCS_editrow.csv")
#test = pd.read_csv("testDeepCCS.csv")


#Splitting SMILES from splitter.py

class SMILESsplitter:
    """Tokenise SMILES strings into chemical symbols and single characters."""

    def split(self, smiles):
        """
        Split a single SMILES string into a list of tokens.

        An uppercase letter followed by a lowercase letter other than 'c' is
        emitted as one two-letter token (e.g. "Cl", "Br", "Na"); every other
        character becomes its own token.

        Known limitations of this rule:
          * Two-letter symbols whose second letter is 'c' are never merged;
            they come out as two separate tokens.
          * An uppercase atom directly followed by an aromatic (lowercase)
            atom other than 'c' is merged with it, e.g. "Coccc" starts with
            the token "Co".
          * Lowercase two-letter aromatic symbols such as "as" or "se" are
            split into two characters when no uppercase letter precedes them.

        :param smiles: The SMILES string to split.
        :return: The tokens, in the order they appear in ``smiles``. An empty
            string yields an empty list.
        """
        tokens = []
        last_index = len(smiles) - 1
        for position, char in enumerate(smiles):
            # A lowercase, non-'c' letter right after an uppercase letter was
            # already emitted as the tail of the previous two-letter token.
            if (position > 0 and char.islower() and char != "c"
                    and smiles[position - 1].isupper()):
                continue

            # Look ahead only when a next character exists; without this
            # bound a one-character SMILES such as "C" raised IndexError.
            if position < last_index:
                following = smiles[position + 1]
                if char.isupper() and following.islower() and following != "c":
                    tokens.append(char + following)
                    continue

            tokens.append(char)
        return tokens

# SmilesToOneHotEncoder() (modified)

# Maximum SMILES length that can be used with DeepCCS.
MAX_SMILES_LENGTH = 250
# Lowercase spelling of the same limit.
max_smiles_length = MAX_SMILES_LENGTH

# Adducts accepted by DeepCCS.
ADDUCTS_TO_KEEP = ["M+H", "M+Na", "M-H", "M-2H"]

# Starting value for the encoded sequence length.
_max_length = max_smiles_length

# One tokenizer instance shared by the encoding code below.
smiles_splitter = SMILESsplitter()

def _pad_smiles(smiles, padding_char=" "):
        _max_length = MAX_SMILES_LENGTH
        to_pad = int((_max_length - len(smiles)) / 2)
        s_padded_left = ([padding_char] * to_pad) + smiles
        return s_padded_left + ([padding_char] * (_max_length - len(s_padded_left)))



## Encoding the training set

# Tokenise every training SMILES and centre it in a fixed-length token list.
X = data["smiles"]
splitted_smiles = [smiles_splitter.split(s) for s in X]
padded_splitted_smiles = [_pad_smiles(s) for s in splitted_smiles]
lengths = [len(s) for s in padded_splitted_smiles]
chars = [char for s in padded_splitted_smiles for char in s]

# _pad_smiles does not truncate, so a SMILES longer than the padding length
# shows up here as a second distinct length.
if len(set(lengths)) != 1:
    raise ValueError("Items in X must be all of the same length")
_max_length = lengths[0]

# Assign one-hot columns in sorted order. Iterating a set of strings directly
# can give a different order in every interpreter session, which would make
# the column layout (and any model trained on it) impossible to reproduce.
smiles_converter = {token: column for column, token in enumerate(sorted(set(chars)))}

# One-hot tensor of shape (samples, sequence length, vocabulary size).
number_of_element = len(padded_splitted_smiles)
encoded_smiles = np.zeros((number_of_element, _max_length, len(smiles_converter)))
for i, smiles in enumerate(padded_splitted_smiles):
    for j, letter in enumerate(smiles):
        encoded_smiles[i, j, smiles_converter[letter]] = 1

# One-hot encoding of the adducts
adducts = data["Adduct"]
# key=str keeps the sort well-defined even if the column holds non-string
# values such as NaN.
adduct_converter = {
    adduct: column for column, adduct in enumerate(sorted(set(adducts), key=str))
}
# `converter` ends this cell holding the adduct mapping, as it always has; the
# SMILES mapping now survives separately in `smiles_converter`.
converter = adduct_converter

# One-hot matrix of shape (samples, number of distinct adducts).
number_of_element = adducts.shape[0]
encoded_adducts = np.zeros((number_of_element, len(adduct_converter)))
for i, adduct in enumerate(adducts):
    encoded_adducts[i, adduct_converter[adduct]] = 1


# Neural network
# Regression target: the CCS_AVG column of the training table.
Y_train = data["CCS_AVG"]

# SMILES branch. The per-sample input shape is taken from the array's trailing
# dimensions; indexing a sample (`encoded_smiles[1]`) fails when the training
# set holds a single row.
smile_input_layer = Input(shape=encoded_smiles.shape[1:], name="smiles")
conv = Conv1D(64, kernel_size=4, activation="relu", kernel_initializer="normal")(smile_input_layer)

# Six further convolution + max-pooling stages stacked on the first convolution.
previous = conv
for i in range(6):
    conv = Conv1D(64, kernel_size=4, activation="relu", kernel_initializer="normal")(previous)
    # Only the final stage pools with stride 2; the others use stride 1.
    pool_strides = 2 if i == 5 else 1
    pool = MaxPooling1D(pool_size=2, strides=pool_strides)(conv)
    previous = pool

flat = Flatten()(previous)

# Adduct branch: the one-hot adduct vector is concatenated to the flattened
# convolutional features.
adduct_input_layer = Input(shape=encoded_adducts.shape[1:], name="adduct")
remix_layer = keras.layers.concatenate([flat, adduct_input_layer], axis=-1)

# Two fully connected layers followed by a single linear output unit.
previous = remix_layer
for i in range(2):
    dense_layer = Dense(384, activation="relu", kernel_initializer="normal")(previous)
    previous = dense_layer

output = Dense(1, activation="linear")(previous)

# `learning_rate` is the current name of this argument; the old `lr` spelling
# is deprecated and rejected by newer Keras releases.
opt = keras.optimizers.Adam(learning_rate=0.0001)
model = Model([smile_input_layer, adduct_input_layer], output)
model.compile(optimizer=opt, loss="mean_squared_error")

# Despite its name, `deepccs_model` receives the return value of fit() (the
# training history); the trained network itself is `model`.
deepccs_model = model.fit(
    [encoded_smiles, encoded_adducts],
    Y_train,
    epochs=25,
    batch_size=2,
    validation_split=0.1,
)

In [None]:
adducts_test = test["Adduct"]

# Read the adduct -> column mapping back from the training one-hot matrix so
# the test columns line up with the ones the model was fitted on. A mapping
# rebuilt from the test set could order the columns differently or have a
# different width.
converter = {}
for train_adduct, column in zip(data["Adduct"], encoded_adducts.argmax(axis=1)):
    converter[train_adduct] = int(column)

# An adduct the training set never contained has no column to go to.
unknown_adducts = sorted(set(adducts_test) - set(converter), key=str)
if unknown_adducts:
    raise ValueError(
        "Test adducts absent from the training set: {}".format(unknown_adducts)
    )

number_of_element = adducts_test.shape[0]

# Zero matrix with one row per test sample and the same width as the training
# matrix; a single 1 per row marks that sample's adduct. The lookup uses the
# loop variable `adduct` (indexing with the whole Series raised TypeError),
# and the filled matrix is no longer reset to zeros afterwards.
encoded_adducts_test = np.zeros((number_of_element, encoded_adducts.shape[1]))
for i, adduct in enumerate(adducts_test):
    encoded_adducts_test[i, converter[adduct]] = 1

encoded_adducts_test

In [None]:
X_test = test["smiles"]

# `padded_splitted_smiles` and `encoded_smiles` currently hold the tokens and
# the one-hot tensor of the encoding the model was built from. Read the
# token -> column mapping back from that pair rather than deriving a new
# vocabulary from the test set, which would give a different column order and
# possibly a different width than the model's input.
# Both names are replaced together at the end of this cell, so re-running the
# cell recovers the same columns from its own previous output.
converter = {}
for tokens, columns in zip(padded_splitted_smiles, encoded_smiles.argmax(axis=2)):
    for token, column in zip(tokens, columns):
        converter[token] = int(column)
vocabulary_size = encoded_smiles.shape[2]

# Tokenise and pad the test SMILES into temporaries, so a failure below leaves
# the existing tokens/tensor pair intact.
test_splitted_smiles = [smiles_splitter.split(s) for s in X_test]
test_padded_smiles = [_pad_smiles(s) for s in test_splitted_smiles]
lengths = [len(s) for s in test_padded_smiles]
chars = [char for s in test_padded_smiles for char in s]

if len(set(lengths)) != 1:
    raise ValueError("Items in X must be all of the same length")
_max_length = lengths[0]

# A token without a column in the existing encoding cannot be represented.
unknown_tokens = sorted(set(chars) - set(converter))
if unknown_tokens:
    raise ValueError(
        "Test SMILES contain tokens absent from the existing encoding: {}".format(unknown_tokens)
    )

number_of_element = len(test_padded_smiles)
test_encoded_smiles = np.zeros((number_of_element, _max_length, vocabulary_size))
for i, smiles in enumerate(test_padded_smiles):
    for j, letter in enumerate(smiles):
        test_encoded_smiles[i, j, converter[letter]] = 1

# Publish the results under the names this cell has always assigned.
splitted_smiles = test_splitted_smiles
padded_splitted_smiles = test_padded_smiles
encoded_smiles = test_encoded_smiles