In [1]:
import os
# Directory of the saved Keras model that is loaded further down.
# The default is the original author's local directory; set the INCHI_MODEL_PATH
# environment variable to point at the weights on any other machine.
PATH = os.environ.get(
    "INCHI_MODEL_PATH",
    r"C:\Users\thoma\Internship\RNN LSTM model\Weights\InorganicInChiWeights",
)
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow_addons as tfa

In [2]:
import requests

In [2]:

def generateAsciiAlphabet(start_code=32, final_code=126, start_token=None, end_token=None, pad_token='Ø', additions=[]):
    """ (int, int, char, char, char, list of chars) -> list
    Builds an ordered list of tokens (duplicates are kept, nothing is de-duplicated):
      1. the special tokens, in the order pad, start, end - each one is included only
         if it is truthy and exactly one character long, otherwise it is silently skipped;
      2. every character whose code point lies in start_code..final_code inclusive;
      3. the items of additions, in the order given.
    With the default pad_token the pad token is therefore the first item of the list.
    """
    # `additions` is only read here, never modified, so the shared default list is harmless.
    special_tokens = [tok for tok in (pad_token, start_token, end_token) if tok and len(tok) == 1]
    code_point_chars = list(map(chr, range(start_code, final_code + 1)))
    return special_tokens + code_point_chars + list(additions)

# Special tokens: start/end markers for the decoder and the padding symbol
start_token = '\t'
end_token = '\n'
pad_token = 'Ø'

# Vocabularies. The input alphabet is pad + the default ASCII range; the target alphabet
# additionally carries the start/end tokens and a list of non-ASCII characters.
# NOTE(review): 'λ' appears twice in `additions`, so it takes up two slots in
# target_alphabet (and num_decoder_tokens counts both); iupac_token_index maps it to the
# later slot. Left untouched because the vocabulary size presumably has to match the
# saved model - confirm before de-duplicating.
input_alphabet = generateAsciiAlphabet(pad_token=pad_token)
target_alphabet = generateAsciiAlphabet(
    start_token=start_token,
    end_token=end_token,
    pad_token=pad_token,
    additions=[
        'λ', '²', '⁶', 'λ', '₇', '⁵', '⁴', '⁷', '³', '⁸', '¹',
        '⁰', '⁹', '₂', '₄', '₃', '₁', '₉', '₆', '₅', '⁻', '₈',
    ],
)
num_encoder_tokens = len(input_alphabet)
num_decoder_tokens = len(target_alphabet)

# token -> position lookups (for a repeated token the last position wins)
inchi_token_index = {token: position for position, token in enumerate(input_alphabet)}
iupac_token_index = {token: position for position, token in enumerate(target_alphabet)}

# Define sampling models
# Restore the saved model from PATH and take it apart into the pieces used at inference
# time: a stand-alone encoder model, plus the decoder's embedding, LSTM and dense layers.
# NOTE(review): layers are selected by position (model.layers[3] .. model.layers[6]); the
# role labels in the trailing comments are the original author's - confirm them against
# model.summary() if the saved model is ever rebuilt.

model = keras.models.load_model(PATH)

encoder_inputs = model.input[0]  # input_1
# The output of layer 4 is unpacked into three values; only the last two (the states)
# are used to build the encoder model below.
encoder_outputs, state_h_enc, state_c_enc = model.layers[4].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
# Maps the encoder input to [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

# Read from the static output shape of layer 5 - presumably the LSTM hidden size; TODO confirm
latent_dim = model.layers[5].output_shape[0][2] # lstm_2

decoder_inputs = model.input[1]  # input_2
decoder_embedding_layer = model.layers[3] # Decoder embedding
# Layers 5 and 6 are reused by decode_sequence as the beam decoder's cell source and output layer
decoder_lstm = model.layers[5]
decoder_dense = model.layers[6]

def create_input_sequence(inchi):
    """Turn an InChI string into a constant tensor of token indices.

    Every character is looked up in ``inchi_token_index``; a character that is not a key
    of that mapping raises ``KeyError``. ``ndmin=2`` gives the index array a leading
    axis, so a non-empty string of n characters becomes an array of shape (1, n).
    """
    token_ids = list(map(inchi_token_index.__getitem__, inchi))
    return tf.constant(np.array(token_ids, ndmin=2))

def decode_sequence(inchi, beam_width=4, max_seq_length=128):
    """Beam-search decode one InChI string and print the best finished candidates.

    The string is encoded with ``encoder_model``; the resulting states seed a
    ``tfa.seq2seq.BeamSearchDecoder`` built from the decoder LSTM's cell, the decoder
    embedding layer and the decoder dense layer. The decoder is stepped manually for at
    most ``max_seq_length`` steps, stopping early once every beam reports finished.
    Of the beams that finished, at most three are printed, highest beam score first.

    Args:
        inchi: InChI string; every character must be a key of ``inchi_token_index``.
        beam_width: number of beams kept by the decoder.
        max_seq_length: maximum number of decoding steps; must be at least 1.

    Returns:
        None - the results are printed, not returned.

    Raises:
        ValueError: if ``max_seq_length`` is smaller than 1 (no step would run, so there
            would be no scores or finished flags to report).
        KeyError: if ``inchi`` contains a character outside ``inchi_token_index``.
    """
    if max_seq_length < 1:
        raise ValueError("max_seq_length must be at least 1")

    # We are only going to deal with batch size 1
    inference_batch_size = 1
    # Upper bound on the number of candidates printed per input
    max_printed = 3

    # Encode the input; the encoder model returns [state_h, state_c]
    input_seq = create_input_sequence(inchi)
    encoder_state = encoder_model.predict(input_seq)

    # Instantiate the beam decoder
    beam_decoder = tfa.seq2seq.BeamSearchDecoder(decoder_lstm.cell, embedding_fn=decoder_embedding_layer,
                                                 beam_width=beam_width, output_layer=decoder_dense)

    # Initialize the beam decoder: the encoder states are repeated once per beam.
    # The "finished" flags returned by initialize() are not needed here.
    decoder_initial_state = tfa.seq2seq.tile_batch(encoder_state, multiplier=beam_width)
    _, inputs, state = beam_decoder.initialize(
        embedding=None,
        start_tokens=tf.constant([iupac_token_index[start_token]], dtype='int32'),
        end_token=tf.constant(iupac_token_index[end_token]),
        initial_state=decoder_initial_state)

    # Token ids per beam, grown by one column per step: (bs, bw, current_length).
    # Starts empty; an integer dtype wide enough for any vocabulary index is used so the
    # array does not suggest a 255-token limit.
    predictions = np.empty((inference_batch_size, beam_width, 0), dtype=np.int32)
    for step in range(max_seq_length):
        # beam_search_outputs (BeamSearchDecoderOutput):
        #   scores - cumulative log prob., includes end token score (bs, bw)
        #   predicted_ids - predicted output tokens for current step (bs, bw)
        #   parent_ids - index of the parent beam of each beam (bs, bw)
        # state (BeamSearchDecoderState):
        #   cell_state - h and c states of the decoder cell
        #   log_probs - same as cumulative scores (bs, bw)
        #   finished - boolean tensor (bs, bw)
        #   lengths - accumulated length of each beam (bs, bw)
        # inputs: next (embedded) input for each beam
        # finished: boolean tensor that indicates which candidates are complete (bs, bw)
        beam_search_outputs, state, inputs, finished = beam_decoder.step(step, inputs, state, training=False)
        step_ids = np.expand_dims(beam_search_outputs.predicted_ids, axis=-1)
        parent_ids = beam_search_outputs.parent_ids.numpy()
        # Each surviving beam continues some parent beam: reorder the history so that
        # row b holds the prefix of beam b's parent before the new token is appended.
        for batch_idx in range(inference_batch_size):
            predictions[batch_idx] = predictions[batch_idx, parent_ids[batch_idx]]
        predictions = np.append(predictions, step_ids, axis=-1)
        # Break if all candidates have finished
        if tf.math.reduce_all(finished):
            break

    finished_array = finished.numpy()
    beam_scores = beam_search_outputs.scores.numpy()
    # NOTE(review): the decoder state's lengths are used to cut each sequence; the
    # recorded outputs end with the end token, which suggests the length counts it.
    # Confirm against the tfa beam-search implementation.
    lengths = state.lengths.numpy()

    print("-----------------")
    print("Inchi:")
    print(inchi)
    print("-----------------")
    print("\nIupac name:")
    print("---------------------------------------------")
    # Looping through item in batch
    for batch_idx in range(len(predictions)):
        # Collect (decoded text, beam score) for every beam that finished
        results_list = []
        for beam_idx in range(len(finished_array[batch_idx])):
            if not finished_array[batch_idx, beam_idx]:
                continue
            seq_len = lengths[batch_idx, beam_idx]
            token_ids = predictions[batch_idx, beam_idx, :seq_len]
            predicted = "".join(target_alphabet[token] for token in token_ids)
            results_list.append((predicted, beam_scores[batch_idx, beam_idx]))

        if not results_list:
            # Make the "nothing finished" case visible instead of printing nothing
            print("(no candidate finished within max_seq_length)")
            continue

        # Best beam score first; slicing handles fewer than max_printed results
        results_sorted = sorted(results_list, key=lambda res: res[1], reverse=True)
        for predicted, score in results_sorted[:max_printed]:
            print(repr(predicted), " beam score: ", score)


In [9]:
from openbabel import pybel as pb
# Test molecules, given as SMILES strings
smiles = [
    'c1ccc(cc1)P(c2ccccc2)C34C5[Fe]3678912(C5C6C74)C3C8C9C1(C23)P(c1ccccc1)c1ccccc1',
    '[NH3+][Co-3]([NH3+])([NH3+])([NH3+])([NH3+])[NH3+].[Cl-].[Cl-].[Cl-]',
    '[C-]#[O+].[C-]#[O+].[C-]#[O+].[N-]=O.[Co]',
    'C[Mg]Br',
    '[Li]CCCC',
]
mols = [pb.readstring("smi", smi) for smi in smiles]
# Two InChI renderings per molecule; they differ only in the extra 'M' writer option.
# NOTE(review): 'M' presumably asks Open Babel to include bonds to metal (the strings
# displayed below carry an '/r' layer) - confirm against the Open Babel InChI options.
reconnected = [mol.write("inchi", opt={'w': None, 'M': None}) for mol in mols]
inchi = [mol.write("inchi", opt={'w': None}) for mol in mols]
reconnected

['InChI=1/2C17H14P.Fe/c2*1-3-9-15(10-4-1)18(17-13-7-8-14-17)16-11-5-2-6-12-16;/h2*1-14H;/rC34H28FeP2/c1-5-13-21(14-6-1)36(22-15-7-2-8-16-22)33-29-25-26-30(33)35(25,26,29,33)27-28(35)32(35)34(35,31(27)35)37(23-17-9-3-10-18-23)24-19-11-4-12-20-24/h1-20,25-32H\n',
 'InChI=1/3ClH.Co.6H3N/h3*1H;;6*1H3/q;;;+3;;;;;;/p-3/r3ClH.CoH18N6/c;;;2-1(3,4,5,6)7/h3*1H;2-7H3/q;;;+3/p-3\n',
 'InChI=1/3CO.Co.NO/c3*1-2;;1-2/q;;;;-1\n',
 'InChI=1/CH3.BrH.Mg/h1H3;1H;/q;;+1/p-1/rCH3BrMg/c1-3-2/h1H3\n',
 'InChI=1/C4H9.Li/c1-3-4-2;/h1,3-4H2,2H3;/rC4H9Li/c1-2-3-4-5/h2-4H2,1H3\n']

In [8]:
# Decode five InChI strings carrying the 'InChI=1S' prefix, one after another,
# each with the same beam width and step limit.
for standard_inchi in (
    'InChI=1S/2C17H14P.Fe/c2*1-3-9-15(10-4-1)18(17-13-7-8-14-17)16-11-5-2-6-12-16;/h2*1-14H;',
    'InChI=1S/3ClH.Co.6H3N/h3*1H;;6*1H3/q;;;+3;;;;;;/p-3',
    'InChI=1S/3CO.Co.NO/c3*1-2;;1-2/q;;;;-1',
    'InChI=1S/CH3.BrH.Mg/h1H3;1H;/q;;+1/p-1',
    'InChI=1S/C4H9.Li/c1-3-4-2;/h1,3-4H2,2H3;',
):
    decode_sequence(standard_inchi, beam_width=4, max_seq_length=128)

-----------------
Inchi:
InChI=1S/2C17H14P.Fe/c2*1-3-9-15(10-4-1)18(17-13-7-8-14-17)16-11-5-2-6-12-16;/h2*1-14H;
-----------------

Iupac name:
---------------------------------------------
'tetrakis(tetrachloro-1,3,5,2⁻⁵,4⁻⁵,6⁻⁵-trioxatriphosphinane-2,4,6-tris(olate)\n'  beam score:  -20.42848
'tetrakis(tetrachloro-1,3,5,2⁻⁵,4⁻⁵,6⁻⁵-trioxatriphosphinane-2,4,6-tris(olate))\n'  beam score:  -22.078316
'tetrakis(tetrachloro-1,3,5,2⁻⁵,4⁻⁵,6⁻⁵,8⁻⁵-tetraoxatetraphosphocine-2,4,6,8-tetrakis(olate)\n'  beam score:  -22.676762
-----------------
Inchi:
InChI=1S/3ClH.Co.6H3N/h3*1H;;6*1H3/q;;;+3;;;;;;/p-3
-----------------

Iupac name:
---------------------------------------------
'cobalt hexahydrate trichloride\n'  beam score:  -5.152494
'trichlorocobalt hexahydrate\n'  beam score:  -5.2363367
'trichloromolybdenum tetrahydrate\n'  beam score:  -7.627126
-----------------
Inchi:
InChI=1S/3CO.Co.NO/c3*1-2;;1-2/q;;;;-1
-----------------

Iupac name:
---------------------------------------------
'mo

In [10]:
# Decode five InChI strings carrying the 'InChI=1' prefix (most of them with an '/r' layer),
# one after another, each with the same beam width and step limit.
for reconnected_inchi in (
    'InChI=1/2C17H14P.Fe/c2*1-3-9-15(10-4-1)18(17-13-7-8-14-17)16-11-5-2-6-12-16;/h2*1-14H;/rC34H28FeP2/c1-5-13-21(14-6-1)36(22-15-7-2-8-16-22)33-29-25-26-30(33)35(25,26,29,33)27-28(35)32(35)34(35,31(27)35)37(23-17-9-3-10-18-23)24-19-11-4-12-20-24/h1-20,25-32H',
    'InChI=1/3ClH.Co.6H3N/h3*1H;;6*1H3/q;;;+3;;;;;;/p-3/r3ClH.CoH18N6/c;;;2-1(3,4,5,6)7/h3*1H;2-7H3/q;;;+3/p-3',
    'InChI=1/3CO.Co.NO/c3*1-2;;1-2/q;;;;-1',
    'InChI=1/CH3.BrH.Mg/h1H3;1H;/q;;+1/p-1/rCH3BrMg/c1-3-2/h1H3',
    'InChI=1/C4H9.Li/c1-3-4-2;/h1,3-4H2,2H3;/rC4H9Li/c1-2-3-4-5/h2-4H2,1H3',
):
    decode_sequence(reconnected_inchi, beam_width=4, max_seq_length=128)

-----------------
Inchi:
InChI=1/2C17H14P.Fe/c2*1-3-9-15(10-4-1)18(17-13-7-8-14-17)16-11-5-2-6-12-16;/h2*1-14H;/rC34H28FeP2/c1-5-13-21(14-6-1)36(22-15-7-2-8-16-22)33-29-25-26-30(33)35(25,26,29,33)27-28(35)32(35)34(35,31(27)35)37(23-17-9-3-10-18-23)24-19-11-4-12-20-24/h1-20,25-32H
-----------------

Iupac name:
---------------------------------------------
'hexatecasiloxane\n'  beam score:  -9.121264
'hexatecasiloxane-1,3,5,7,9,10,12-hexaene\n'  beam score:  -18.54115
'hexatecasiloxane-1,3,5,7,9,10,12-hexaone\n'  beam score:  -19.17309
-----------------
Inchi:
InChI=1/3ClH.Co.6H3N/h3*1H;;6*1H3/q;;;+3;;;;;;/p-3/r3ClH.CoH18N6/c;;;2-1(3,4,5,6)7/h3*1H;2-7H3/q;;;+3/p-3
-----------------

Iupac name:
---------------------------------------------
'cobalt triammonium hexakis(iminomethanide) hexahydrate\n'  beam score:  -12.455928
'cobalt triammonium hexakis(iminomethanide) trihydrate diphosphate\n'  beam score:  -13.958942
'cobalt triammonium hexakis(iminomethanide) trihydrate trisulfate\n'  be