In [None]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
import os
import pandas as pd


# Problem & Dataset

* Chemistry is not everyone's favourite subject.
* There are many chemical compounds. The task here is to produce a compound's common name given its molecular formula.  
* So we try to learn the mapping molecular_formula -> common_name.
* If you want, you can substitute other source and target columns to predict some other sequence.

In [None]:
# Load the tab-separated table of molecules from the working directory.
molecules = pd.read_csv(filepath_or_buffer='molecules.tsv', sep='\t')

def get_xy(x, y):
    """
    Build aligned (source, target) sequences from two columns of `molecules`.

    Rows where either value is not a string (e.g. NaN) are dropped, both from the
    global `molecules` frame and from the returned sequences, so the two outputs
    always have the same length and refer to the same rows.

    :param x: pandas Series with the source strings
    :param y: pandas Series with the target strings (same index as x)
    :returns: (numpy array of source strings,
               Series of target token lists wrapped in "START" ... "END")
    """
    global molecules
    is_str = lambda s: isinstance(s, str)
    keep = x.apply(is_str) & y.apply(is_str)

    # Keep the global frame consistent with what we return.
    molecules = molecules[keep]

    # Filter the columns themselves: without this, list(s) below fails on NaN
    # and the returned sources/targets include rows that were meant to be dropped.
    x, y = x[keep], y[keep]
    return x.values, y.apply(lambda s: ["START"] + list(s) + ["END"])

# Replace the two column names here to learn a different source -> target mapping.
source_seqs, target_seqs = get_xy(molecules['molecular_formula'], molecules['common_name'])

In [None]:
# Show the first five pairs; the START/END markers are stripped from the target.
for source, target in zip(source_seqs[:5], target_seqs[:5]):
    print("{} : {}".format(source, "".join(target[1:-1])))

In [None]:
# Vocabulary of target tokens (characters plus "START"/"END").
# Sorted so that the token -> index mapping is identical on every kernel restart;
# plain set iteration order depends on string hash randomisation.
target_letters = sorted({token for ts in target_seqs for token in ts})
target_letter_to_ix = {ph:i for i,ph in enumerate(target_letters)}

In [None]:
# Vocabulary of source characters.
# Sorted so that the token -> index mapping is identical on every kernel restart;
# plain set iteration order depends on string hash randomisation.
source_letters = sorted({token for word in source_seqs for token in word})
source_letter_to_ix = {l:i for i,l in enumerate(source_letters)}

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of target sequence lengths (trailing ';' hides the return value).
plt.hist([len(seq) for seq in target_seqs], bins=25);

# Length cap for names: 150 or the longest target, whichever is smaller. This can be changed
MAX_LEN = min(150, max(len(seq) for seq in target_seqs))

### Cast everything from symbols into matrix of int32. Pad with -1

In [None]:
def as_matrix(sequences,token_to_i, max_len=None,PAX_ix=-1):
    """
    Converts several sequences of tokens to a matrix, edible for a neural network.
    Crops at max_len (if given), pads shorter sequences with PAX_ix (-1 by default).

    :param sequences: iterable of token sequences (strings or lists of tokens)
    :param token_to_i: dict mapping token -> integer index; unknown tokens map to 0
    :param max_len: number of columns; if None (or 0) the longest sequence is used
    :param PAX_ix: value used to fill positions past the end of a sequence
    :returns: int32 numpy array of shape [len(sequences), max_len]
    """
    # default=0 keeps an empty batch from raising inside max().
    max_len = max_len or max(map(len, sequences), default=0)

    # Start from a matrix full of the padding value, then overwrite the real tokens.
    matrix = np.full((len(sequences), max_len), PAX_ix, dtype='int32')
    for i, seq in enumerate(sequences):
        row_ix = [token_to_i.get(token, 0) for token in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    return matrix


In [None]:
# Sanity check: integer matrix for the first ten source sequences.
print(as_matrix(sequences=source_seqs[:10], token_to_i=source_letter_to_ix))

# Input variables

In [None]:
# Symbolic int32 matrices holding the encoder input and the decoder reference tokens.
input_sequence = T.matrix(name='token sequence', dtype='int32')
target_target_letters = T.matrix(name='target target_letters', dtype='int32')

# Build NN

You will be building a model that takes a token sequence and predicts the next token.


* Input sequence
* One-hot / embedding
* Encoder recurrent layer(s)
* Decoder recurrent layer(s)
* Softmax layer to predict probabilities

In [None]:
from lasagne.layers import *

##ENCODER
# Input layer fed by the symbolic matrix input_sequence.
l_in = InputLayer(shape=(None, None),input_var=input_sequence)
# Mask layer: true wherever input_sequence differs from -1, the padding value written by as_matrix.
l_mask = InputLayer(shape=(None, None),input_var=T.neq(input_sequence,-1)) 

# Exercise placeholders: replace every <...> with real Lasagne layers / arguments.
l_emb = <embed input tokens>
# The encoder is asked for its final state only (only_return_final=True) and uses l_mask as its mask input.
l_rnn = <layer>(<params>,only_return_final=True,mask_input=l_mask)


In [None]:
##DECODER
# Input layer fed by the symbolic matrix target_target_letters.
dec_in = InputLayer(shape=(None, None),input_var=target_target_letters)
# Mask layer: true wherever target_target_letters differs from -1 (padding).
dec_mask = InputLayer(shape=(None, None),input_var=T.neq(target_target_letters,-1))

# Exercise placeholders: replace every <...> with real Lasagne layers / arguments.
dec_emb = <embed dec_in>
# The decoder's initial state is taken from the encoder output l_rnn.
dec_rnn = <layer>(<incoming>,hid_init=l_rnn,mask_input=<what?>)# WARNING! if it's lstm use cell_init, not hid_init


#flatten batch and time to be compatible with feedforward layers (will un-flatten later)
# Result has dec_rnn.output_shape[-1] columns; the leading axes are merged by the -1.
dec_rnn_flat = reshape(dec_rnn, (-1,dec_rnn.output_shape[-1]))

# Exercise placeholder: the output layer on top of dec_rnn_flat.
l_out = <a layer that predicts next token probabilities given dec_rnn_flat>

In [None]:
# Model weights: every trainable parameter reachable from the output layer.
weights = lasagne.layers.get_all_params(l_out, trainable=True)

In [None]:
# Symbolic output of the final layer l_out (not of dec_rnn_flat): the loss below
# reshapes it to len(target_letters) columns and optimises l_out's weights, so the
# output must come from l_out for both the shape and the gradients to be valid.
network_output = get_output(l_out)
# Un-flatten back to [batch, time, token] using the shape of the target matrix.
network_output = network_output.reshape([target_target_letters.shape[0],target_target_letters.shape[1],-1])
#If you use dropout do not forget to create deterministic version for evaluation

In [None]:
# Predictions for every time step except the last, flattened to rows of len(target_letters) values.
predictions_flat = network_output[:,:-1,:].reshape([-1,len(target_letters)])
# Reference tokens with the first column dropped (i.e. shifted by one step), flattened to a vector.
targets = target_target_letters[:,1:].ravel()

#do not count loss for '-1' tokens
# mask holds the positions in targets that are not the -1 padding value.
mask = T.nonzero(T.neq(targets,-1))

# Exercise placeholder: replace with a scalar loss expression.
loss = <compute me! You will require predictions_flat, targets and mask. Loss must be scalar>

# Adam update rules for the trainable weights with respect to loss.
updates = lasagne.updates.adam(loss,weights)

# Compiling it

In [None]:
# One optimisation step: returns the loss and applies the parameter updates.
train = theano.function(
    inputs=[input_sequence, target_target_letters],
    outputs=loss,
    updates=updates,
    allow_input_downcast=True,
)

# Same loss, evaluated without touching the parameters.
compute_cost = theano.function(
    inputs=[input_sequence, target_target_letters],
    outputs=loss,
    allow_input_downcast=True,
)

# Generation

We now need to implement a function that generates an output sequence given an input.

Such a function must work as follows:
```
Init:
x = input
y = ["START"]

While not_too_long:
  p(y_next|x,y) = probabilities of next letter for y
  
  y_next ~ p(y_next|x,y)
  
  y.append(y_next)
  
  if y_next == "END":
      break
```

In [None]:
#compile the function that computes probabilities for next token given previous text.
# Every right-hand side below is an exercise placeholder to be replaced with real code.

# Expected layout: [batch, tick, token].
network_output = <network output reshaped to [batch,tick,token] format>

# Expected layout: [batch_i, decoder_n_tokens], taken at the last time step.
last_word_probas = <a matrix [batch_i, decoder_n_tokens] of network output for last time step>

# Expected to be a callable returning the next-token probabilities.
probs = <a function that predicts probabilities coming after the last token

In [None]:
def generate_output(input,
                    output_prefix = ("START",),
                    END_token="END"
                    temperature=1,
                    sample=True):
    
    """
    Implement a function that generates output sequence given input.
    
    We recommend (but not require) you to use the pseudo-code above and inline instructions.
    """
    
    output = list(output_prefix)
    
    while True:
        next_y_probs = <a vector of probabilities of the next token>
        next_y_probs = <apply temperature>

        if sample:
            next_y = <token sampled with these probabilities (string character)>
        else:
            next_y = <most take likely token>
        
        assert type(next_y) is str, "please return token(string/character), not it's index"
        
        output.append(next_y)

        if next_y==END_token:
            break
            
    return output



# Model training

In [None]:
# Convert both sequence collections to numpy arrays so they can be indexed with an array of indices.
source_seqs, target_seqs = np.array(source_seqs), np.array(target_seqs)

In [None]:
def sample_batch(source_seqs,target_seqs, batch_size):
    """
    Draw batch_size random positions (with replacement) and return the matching
    source and target sequences, each converted with as_matrix.
    """
    picked = np.random.randint(0, len(source_seqs), size=batch_size)
    return (
        as_matrix(source_seqs[picked], source_letter_to_ix),
        as_matrix(target_seqs[picked], target_letter_to_ix),
    )

In [None]:
from tqdm import tqdm_notebook

# number of epochs to train for
n_epochs=100

# how many minibatches are there in the epoch 
batches_per_epoch = 500

#how many training sequences are processed in a single function call
batch_size=10


for epoch in tqdm_notebook(range(n_epochs)):

    # running sum of minibatch losses for this epoch
    avg_cost = 0
    
    for _ in tqdm_notebook(range(batches_per_epoch)):
        
        x,y = sample_batch(source_seqs,target_seqs,batch_size)
        avg_cost += train(x, y).mean()
        
    print("Epoch {} average loss = {}".format(epoch, avg_cost / batches_per_epoch))

    # Show a few generated names; generate_output is the generation function defined above.
    for i in range(5):
        ind = np.random.randint(len(source_seqs))
        print (source_seqs[ind],':', ''.join(generate_output(source_seqs[ind],sample=True)[1:-1]))



In [None]:
# Call the generation function by its defined name and keyword (temperature, not t).
generate_output(" C_{4}H_{1}", temperature=2)

## Homework part 2 - chemistry (6 pt total)

* [4pts] Complete notebook and make sure target sequence is being generated.
* [2pts] Modify the training loop to output sequences with different sampling strategies (vary the temperature t in the range $[0, +\infty)$ and try to find out which sampling strategy works best for this task)


## [bonus] [2pts]  Latex display
Swap target and source and learn name->formula, then try to reach a quality where almost any generated sequence is a valid LaTeX formula, and implement its printing using IPython magic in Jupyter. It would be good if you create a demo and pass it some chemical (or not?) names.

In [None]:
import IPython
# Render a small LaTeX expression as rich output.
z = IPython.display.Latex('$2+2$')
IPython.display.display(z)

# And now,
* try lstm/gru
* try several layers
* try mtg cards
* try your own dataset of any kind