In [34]:
import sys
sys.path.append("../code/")
from wiki_dataset import get_wiki_dataset
from __future__ import division
from __future__ import print_function
import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
import os
from chainer import training
from chainer.training import extensions
from chainer.functions import softmax 

# Translation matrix
In this notebook we use the two learned language models to learn an translation matrix. We do this by retraining a recurrent network. This model has to predict the probability distribution of the next word given the previous word embeddings of the what I call the fit language: in our case english. However the recurrent layer values will be obtained from the flow language: spanish. The embedding layer and recurrent layers will not be trained however, only a linear transformation layer between the embedding and recurrent layers. Also the linear layer from recurrent to network output has to be retrained. Since we cannot re-use it. The general idea however remains the same: to make sure that the flow patterns fits the one of the original embeddings that embedding has to be linearly transformed such that it fits in the flow of the flow language.

## Model
This the network we will be training. The l0 layer will be the linear transformation layer training, the recurrent l1-l2 layer are the recurrent flow layers. Where as the embeded layer and the output predict will be from the fit model. This model is also defined in code/model.py. Note that we have to retrain the l3 layer. 

In [3]:
class TranslationMatrixRNN(chainer.Chain):
    def __init__(self, n_units, n_vocab, train=True):
        super(TranslationMatrixRNN, self).__init__(
            embed=L.EmbedID(n_vocab, n_units),
            l0=L.Linear(n_units, n_units, nobias=True),
            l1=L.LSTM(n_units, n_units),
            l2=L.LSTM(n_units, n_units),
            l3=L.Linear(n_units, n_vocab),
        )
        self.n_vocab = n_vocab
        self.n_units = n_units

        # Our linear transformation layer starts with
        for param in self.l0.params():
            param.data[...] = np.eye(n_units)
        self.train = train

    def reset_state(self):
        self.l1.reset_state()
        self.l2.reset_state()

    def __call__(self, x):
        h0 = self.embed(x)
        h1 = self.l0(h0)
        h2 = self.l1(F.dropout(h1, train=self.train))
        h3 = self.l2(F.dropout(h2, train=self.train))
        y = self.l3(F.dropout(h3, train=self.train))
        return y


## Analyse results
Once we trained the model we can examine the results. We will need the learned translation matrix model and the word embeddings of the original two language models.

In [5]:
from models import load_rnn_model
from helpers import read_dataset
def load_tmrnn_model(model_file,n_vocab,n_units):
    """
    Loads pre-trained trasnaltion matrix rnn language model in a classifier object
    Exact training procedure is explained in language model notebook
    """
    model = L.Classifier(TranslationMatrixRNN(n_units, n_vocab ))
    model.compute_accuracy = False
    chainer.serializers.load_npz(model_file, model)
    return model

ENGLISH_FOLDER = "../result-english/"
SPANISH_FOLDER = "../result-spanish/"
TRANS_FOLDER = "../result-trans/"

model_english = os.path.join(ENGLISH_FOLDER,'650_u_45_e_1M_v_5_th_model')
model_spanish = os.path.join(SPANISH_FOLDER,'650_u_45_e_1M_v_5_th_model')
model_trans = os.path.join(TRANS_FOLDER,'model_fit_en_flow_es_30')
seq_en, voc_en = read_dataset(os.path.join(ENGLISH_FOLDER,'data.npz'))
seq_es, voc_es = read_dataset(os.path.join(SPANISH_FOLDER,'data.npz'))
print("English has vocubaly size of %d" % len(voc_en))
print("Spanish has vocubaly size of %d" % len(voc_es))
rnn_en = load_rnn_model(model_english,len(voc_en),650)
rnn_es = load_rnn_model(model_spanish,len(voc_es),650)
tmrnn = load_tmrnn_model(model_trans, len(voc_en),650)

English has vocubaly size of 15736
Spanish has vocubaly size of 16509


In [68]:
import scipy.spatial.distance
from operator import itemgetter
from heapq import nlargest
from models import load_rnn_model
import re
from chainer.functions.activation.softmax import softmax

def tokenize(line):
    line = line.replace("<br>", " ").replace(". ", " <eos> ").lower()
    for token in re.findall("[\w\<\>]+", line):
        yield token

embed_en = rnn_en.predictor.embed.W.data
embed_es = rnn_en.predictor.embed.W.data
translation = tmrnn.predictor.l0.W.data
translated_en = np.dot(embed_en, translation)

class Translator():
    def __init__(self, embedding_a, embedding_b, voc_a,voc_b):
        self.em_a = embedding_a
        self.em_b = embedding_b
        self.voc_a = voc_a
        self.voc_b = voc_b
        self.inv_voc_a = self._create_inverse_voc(voc_a)
        self.inv_voc_b = self._create_inverse_voc(voc_b)

    def closest_to(self, words_a, top_n=10):
        translations = []
        idxs = [self.inv_voc_a[word_a] for word_a in words_a]
        print(idxs)
        embeddings = self.em_a[idxs,:]
        
        dist = scipy.spatial.distance.cdist(embeddings,self.em_b,'cosine')
        
        for x in range(dist.shape[0]):
            print(words_a[x])
            result = nlargest(top_n, enumerate(dist[x,:]), itemgetter(1))
            for idx,r in result:
                print(idx,self.voc_b[idx],r)
        
        return dist.shape
        
        
    
    def _create_inverse_voc(self, voc):
        """ Creates inverse vocabulary from word to index
        """
        return dict([(word,idx) for idx,word in enumerate(voc)])
    
TOP_N = 5

def create_inverse_voc(voc):
    """ Creates inverse vocabulary from word to index
    """
    return dict([(word,idx) for idx,word in enumerate(voc)])

def map_line_to_seq(line,inverse_voc):
    """ Converts a string(sentence) to a sequence of integers
    Will also tokenize the sentence
    """
    return [inverse_voc[w] if w in inverse_voc else inverse_voc['<below_th>'] for w in tokenize(line)]

def fill_till_max(x,filler=-1,n=100):
    """ Will make an array of fixed size n, will use x to fill this array. 
    If len(x) < n will fill the rest with filler. 
    """
    return [x[i] if len(x) > i else filler for i in range(n)]

def map_seq_to_sentence(seq, voc):
    """ Maps seqs back to a readable sentence
    """
    return " ".join([voc[int(w)] for w in seq]).replace(" <eos>",".")

def generate_text(pred, voc, seeds=[],max_len=100):
    pred.reset_state()
    inverse_voc = create_inverse_voc(voc)
    
    # matrix of sentences in rows, words in columns
    text_idx = np.array([fill_till_max(map_line_to_seq(s,inverse_voc),n=max_len) for s in seeds], dtype=np.int32)
    
    # i is the to predict word column
    for i in range(2,max_len):
        # Our input is all words before the one to predict
        before = i-1
        # create a batch looking max 35 words back
        begin = max(0,before-35)
        
        #calculate probabilty
        x = pred(text_idx[:,before])
        d = softmax(x).data #convert to prob distribution
        next_words = []
        d = np.delete(d,0,1) #Ignore <below_th> keyword
        for r in range(d.shape[0]):
            top = nlargest(TOP_N, enumerate(d[r,:]), itemgetter(1))
            idx = [x[0] for x in top]
            probs = [x[1] for x in top]
            probs = np.array(probs)/np.sum(probs) #normalize to valid prob. distribution
            next_words.append(np.random.choice(idx,p=probs) + 1)
    
        for si, w in enumerate(next_words):
            # only replace -1 values, those have to be generated
            if text_idx[si,i] < 0:
                text_idx[si,i] = w
        
    
    return [map_seq_to_sentence(s, voc) for s in text_idx]

#print("Generating for english:")
#seeds = ["Sheep eat a lot of grass and produce wool. ", "The Golden Rule or law of reciprocity is the principle of treating others as one would wish"]
#tmrnn.predictor.train = False
#text = generate_text(tmrnn.predictor, voc_en, seeds)
#for t in text:
#    print(t)
trans = Translator(translated_en, embed_es, voc_en, voc_es)
print (trans.closest_to(['the','house','national','a','mountain']))

[5, 401, 487, 45, 288]
the
1309 pueblo 1.15414505531
11463 definidos 1.15229425961
5774 proveniente 1.15197452175
254 000 1.15127534051
11634 ignorancia 1.14225533182
2168 matemáticas 1.1381306797
11831 argumenta 1.13665396969
12374 vor 1.13277899351
6129 checo 1.12781306275
7996 escaleras 1.1267066339
house
1937 estrada 1.15229862184
14516 válvula 1.14469017955
10537 manufactura 1.14275755929
5037 aceite 1.1361963191
7540 proteínas 1.13199044231
14265 ciudadela 1.12970709624
6566 encabezada 1.12922698086
15072 encuestas 1.12906708268
3747 centrales 1.12775513898
9550 tailandia 1.1268144965
national
159 impacto 1.1646993011
4606 electrónica 1.16330005766
8152 jugando 1.15874897734
9852 gil 1.15713302497
3400 php 1.13882280923
7976 pau 1.13812396435
14340 permanecen 1.13673320906
10632 descendido 1.13261568912
9038 ganados 1.13054859588
6370 designación 1.1296080716
a
8153 service 1.14864263779
915 sea 1.14637861033
15562 daimler 1.14027998533
1850 fundadores 1.13646963266
4127 ganadore

435.863
