In [None]:
%env THEANO_FLAGS=device=gpu7
import numpy as np
import theano
import theano.tensor as T
import lasagne
import os
import pandas as pd
from tqdm import tqdm_notebook

#thanks Muammar 
# Padding value for token-index matrices: it is the default pad argument of as_matrix,
# and the mask layers of the network compare their inputs against -1.
PAD_ix=-1


# Problem & Dataset

* Chemistry is not a widely loved subject.
* There are many chemical compounds. The problem here is to predict a compound's common name from its formula.  
* So we try to learn the mapping molecular_formula -> common_name.
* If you want, you can replace the source and target variables to predict some other sequence.

In [None]:
# Load the tab-separated molecules table; later cells read its
# `molecular_formula` and `common_name` columns.
molecules = pd.read_csv('molecules.tsv',sep='\t')
# Show the first rows as a quick sanity check.
molecules.head()

In [None]:
def get_xy(x, y):
    """Build aligned source/target sequences from two columns of `molecules`.

    Rows where either `x` or `y` is not a string (e.g. a missing value) are
    dropped, both from the returned sequences and from the global
    `molecules` frame.

    Args:
        x: pandas Series holding the source strings.
        y: pandas Series holding the target strings, indexed like `x`.

    Returns:
        A tuple `(sources, targets)`. `sources` is a numpy array with the kept
        values of `x`. `targets` is a Series in which every kept string of `y`
        is split into characters and wrapped in "START" / "END" markers.
    """
    global molecules
    is_str = lambda s: isinstance(s, str)
    keep = x.apply(is_str) & y.apply(is_str)
    # Filter the columns that are returned, not only the global frame.
    # Previously the unfiltered columns were returned, so a non-string entry
    # either crashed `list(s)` or leaked into the source sequences.
    molecules = molecules[keep]
    return x[keep].values, y[keep].apply(lambda s: ["START"] + list(s) + ["END"])


source_seqs,target_seqs = get_xy(molecules.molecular_formula, molecules.common_name) # Replace the two columns here to learn a different mapping

In [None]:
# Preview the first five formula -> name pairs; the first and last target
# tokens (the START / END markers) are stripped for display.
preview_pairs = zip(source_seqs[:5], target_seqs[:5])
for source, target in preview_pairs:
    name_chars = target[1:-1]
    print(source, ':', "".join(name_chars))

In [None]:
# Target vocabulary: every distinct token that occurs in the target sequences.
# Sorted so that token indices are the same after every kernel restart;
# iterating a bare set of strings can yield a different order in each
# Python process, which silently changes the meaning of every index.
target_letters = sorted({token for ts in target_seqs for token in ts})
# Token -> row index in the embedding / column index in the softmax output.
target_letter_to_ix = {ph:i for i,ph in enumerate(target_letters)}

In [None]:
# Source vocabulary: every distinct character that occurs in the source strings.
# Sorted so that character indices are the same after every kernel restart;
# iterating a bare set of strings can yield a different order in each
# Python process.
source_letters = sorted({token for word in source_seqs for token in word})
# Character -> index used when encoding source strings.
source_letter_to_ix = {l:i for i,l in enumerate(source_letters)}

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of target sequence lengths; the trailing ';' suppresses the return value in the cell output.
plt.hist(list(map(len,target_seqs)),bins=25);

# Upper bound on target length: the longest target sequence, capped at 150 tokens. This can be changed.
# NOTE(review): MAX_LEN is not passed to as_matrix in the cells visible here — confirm whether truncation is actually applied.
MAX_LEN = min([150,max(list(map(len,target_seqs)))])

### Cast everything from symbols into matrix of int32. Pad with -1

In [None]:
def as_matrix(sequences,token_to_i, max_len=None,PAX_ix=PAD_ix):
    """Convert token sequences into a padded 2-D int32 index matrix.

    Args:
        sequences: sized iterable of token sequences (strings or lists).
        token_to_i: dict mapping token -> integer index. Tokens missing from
            the dict are encoded as index 0.
        max_len: number of columns; longer sequences are truncated. When
            falsy, the length of the longest sequence is used.
        PAX_ix: value written to the positions past the end of a sequence.

    Returns:
        np.ndarray of shape (len(sequences), max_len) and dtype int32.
    """
    if not max_len:
        # default=0 keeps an empty batch from raising inside max().
        max_len = max(map(len, sequences), default=0)

    # int32 rather than int8: int8 cannot hold indices above 127, so any larger
    # vocabulary overflowed. The fill value now honours PAX_ix instead of a
    # hard-coded -1 (the default is unchanged).
    matrix = np.full((len(sequences), max_len), PAX_ix, dtype='int32')
    for i, seq in enumerate(sequences):
        row_ix = [token_to_i.get(token, 0) for token in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix


In [None]:
# Sanity check: encode the first ten source strings and print the padded index matrix.
print(as_matrix(source_seqs[:10],source_letter_to_ix))

# Input variables

In [None]:
# Symbolic int32 matrices: `input_sequence` feeds the encoder input layer and
# `target_target_letters` feeds the decoder input layer.
# NOTE(review): presumably shaped (batch, time) to match the output of as_matrix — confirm.
input_sequence = T.matrix('token sequence','int32')
target_target_letters = T.matrix('target target_letters','int32')

# Build NN

You will be building a model that takes a token sequence and predicts the next token


* input sequence
* one-hot / embedding
* recurrent layer(s)
* output layer(s) that predict output probabilities


In [None]:
from lasagne.layers import InputLayer,DenseLayer,EmbeddingLayer
from lasagne.layers import RecurrentLayer,LSTMLayer,GRULayer,CustomRecurrentLayer

In [None]:

##ENCODER
# Source token ids, plus a mask that is nonzero wherever the id differs from the -1 padding value.
l_in = lasagne.layers.InputLayer(shape=(None, None),input_var=input_sequence)
l_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(input_sequence,-1)) 

# Embed every source token into a 40-dimensional vector.
l_emb = lasagne.layers.EmbeddingLayer(l_in, len(source_letters), 40)
# TODO (exercise): pick the recurrent layer type, its parameters and the only_return_final value.
l_rnn = lasagne.layers.<layer>(<params>,only_return_final=<what?>,mask_input=l_mask)

##DECODER
# Target token ids and their padding mask, built the same way as for the encoder.
transc_in = lasagne.layers.InputLayer(shape=(None, None),input_var=target_target_letters)
transc_mask = lasagne.layers.InputLayer(shape=(None, None),input_var=T.neq(target_target_letters,-1))
# Embed every target token into a 50-dimensional vector.
transc_emb = lasagne.layers.EmbeddingLayer(transc_in, len(target_letters), 50)
# TODO (exercise): pick the decoder layer; its initial hidden state is the encoder output l_rnn.
transc_rnn = lasagne.layers.<layer>(<params>,hid_init=l_rnn,mask_input=transc_mask)# WARNING: if you use an LSTM, add the other init as well


#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
transc_rnn_flat = lasagne.layers.reshape(transc_rnn, (-1,transc_rnn.output_shape[-1]))

# Softmax over the target vocabulary for every flattened position.
l_out = lasagne.layers.DenseLayer(transc_rnn_flat,len(target_letters),nonlinearity=lasagne.nonlinearities.softmax)

In [None]:
# Model weights
# Trainable parameters collected through l_out; they are handed to the optimizer below.
weights = lasagne.layers.get_all_params(l_out,trainable=True)
#print weights

In [None]:
# Symbolic output of l_out, which works on the flattened representation.
network_output = lasagne.layers.get_output(l_out)
# Un-flatten: the first two dimensions are taken from target_target_letters, the last one is inferred.
network_output = network_output.reshape([target_target_letters.shape[0],target_target_letters.shape[1],-1])
#If you use dropout do not forget to create deterministic version for evaluation

In [None]:
# Pair the prediction made at step t with the token at step t+1: drop the last
# prediction and the first target token, then flatten both.
predictions_flat = network_output[:,:-1,:].reshape([-1,len(target_letters)])
targets = target_target_letters[:,1:].ravel()

#do not count loss for '-1' tokens
# Indices of the flattened targets that are not the -1 padding value.
mask = T.nonzero(T.neq(targets,-1))

# TODO (exercise): compute the loss over the masked (non-padding) positions only.
loss = <count me! (dont forgent about mask)>

# Adam updates that minimise the mean loss with respect to the model weights.
updates = lasagne.updates.adam(loss.mean(),weights)

# Compiling it

In [None]:
#training
# Returns the loss and applies the optimizer updates on every call.
train = theano.function([input_sequence, target_target_letters], loss, updates=updates, allow_input_downcast=True)

#computing loss without training
# Same inputs and output as `train`, but no updates are applied.
compute_cost = theano.function([input_sequence, target_target_letters], loss, allow_input_downcast=True)

# generation

Simple: 
* get initial context(seed), 
* predict next token probabilities,
* sample next token, 
* add it to the context
* repeat from step 2

You'll get more detailed info on how it works in the homework section.

In [None]:
#compile the function that computes probabilities for next token given previous text.

#reshape back into original shape
# NOTE(review): network_output was already reshaped to three dimensions in an earlier cell, so this reshape looks redundant — confirm.
network_output = network_output.reshape((target_target_letters.shape[0],target_target_letters.shape[1],len(target_letters)))
#predictions for next tokens (after sequence end)
# Keep only the output at the last position of every row.
last_word_probas = network_output[:,-1]
probs = theano.function([input_sequence,target_target_letters],last_word_probas,allow_input_downcast=True)

In [None]:
#generate_target_content("3-abc")

# Model training

In [None]:
# Convert to numpy arrays so that sample_batch can index them with an array of row indices.
source_seqs = np.array(source_seqs)
target_seqs = np.array(target_seqs)

In [None]:
def sample_batch(source_seqs,target_seqs, batch_size):
    """Draw a random minibatch and encode it as padded index matrices.

    Row indices are drawn with `np.random.randint`, so a row may appear more
    than once. The same indices are used for both arrays, which keeps every
    source paired with its target.

    Returns:
        (source_matrix, target_matrix), each produced by `as_matrix`.
    """
    n_rows = len(source_seqs)
    picked = np.random.randint(0, n_rows, size=batch_size)
    return (
        as_matrix(source_seqs[picked], source_letter_to_ix),
        as_matrix(target_seqs[picked], target_letter_to_ix),
    )

In [None]:
from tqdm import tqdm

In [None]:
# Train on randomly sampled minibatches; after every epoch print the mean
# minibatch loss and five generated samples.
print("Training ...")

#number of epochs to run
n_epochs=100

# how many minibatches are there in the epoch 
batches_per_epoch = 500

#how many training sequences are processed in a single function call
batch_size=10


for epoch in tqdm_notebook(range(n_epochs)):


    # Running sum of the per-batch mean losses for this epoch.
    avg_cost = 0;
    
    for _ in tqdm_notebook(range(batches_per_epoch)):
        
        x,y = sample_batch(source_seqs,target_seqs,batch_size)
        avg_cost += train(x, y).mean()
        
        
    
        
        
    print("Epoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))
    # Show generations for five randomly chosen source strings.
    # NOTE(review): generate_target_content is not defined in the cells visible here;
    # it has to be defined before this loop reaches the end of the first epoch.
    for i in range(5):
        ind = np.random.randint(len(source_seqs))
        print (source_seqs[ind],':', ''.join(generate_target_content(source_seqs[ind],sample=True)[1:-1]))



In [None]:
# NOTE(review): generate_target_content is not defined in the cells visible here; this call
# passes `t=2` while the training loop passes `sample=True` — confirm the helper's signature.
generate_target_content(" C_{4}H_{1}", t=2)

# And now,
* try lstm/gru
* try several layers
* try mtg cards
* try your own dataset of any kind