In [40]:
import sys
sys.path.append("../code/")
from wiki_dataset import get_wiki_dataset
from __future__ import division
from __future__ import print_function
import numpy as np

import chainer
import chainer.functions as F
import chainer.links as L
from chainer.datasets.ptb import get_ptb_words
from chainer import training
from chainer.training import extensions

# Idea
We want to see if we can use the general sequential patterns in textual data to learn word-to-word alignments in an unsupervised manner. Given two datasets for two languages, we will first learn a language model for each language: a recurrent neural network that tries to predict the next word given the sequence of words before it.

For each model we will try to predict the values of the next word's embedding. In the next phase we will use the language model of language a to predict the words of language b. Hence the input embeddings and output embeddings will be those of language b, whereas all other parameters will be those of language a. In this phase a transformation matrix will be added, and it is the only parameter that will be learned.

## Model
Our model is based on the https://github.com/pfnet/chainer/tree/master/examples/ptb model. However, we add an extra weight layer that can later be used to linearly transform the input matrix. In the beginning it will be an identity matrix and its parameters will not be updated. In the next phase it will be the only layer that is learned.

In [35]:
# This is a simple language model, directly copied from the tutorial
class RNNLM(chainer.Chain):
    """Recurrent language model: embedding, two LSTM layers and a linear output.

    Dropout is applied to the input of both LSTM layers and of the output
    layer. The output layer maps ``n_units`` to ``n_units``, so the network
    emits a vector of the embedding size rather than one score per word.

    Args:
        n_vocab: Number of entries in the embedding table.
        n_units: Size of the embedding and of every subsequent layer.
        train: Value forwarded to ``F.dropout`` as its ``train`` argument.
    """

    def __init__(self, n_vocab, n_units, train=True):
        super(RNNLM, self).__init__(
            embed=L.EmbedID(n_vocab, n_units),
            l1=L.LSTM(n_units, n_units),
            l2=L.LSTM(n_units, n_units),
            l3=L.Linear(n_units, n_units),
        )
        # Keep the sizes around so other code can read them from the model.
        self.n_vocab = n_vocab
        self.n_units = n_units
        self.train = train

        # Overwrite every parameter of the chain with samples from U(-0.1, 0.1).
        for parameter in self.params():
            values = parameter.data
            values[...] = np.random.uniform(-0.1, 0.1, values.shape)

    def reset_state(self):
        """Reset the recurrent state of both LSTM layers."""
        for lstm in (self.l1, self.l2):
            lstm.reset_state()

    def __call__(self, x):
        """Run one time step and return the output of the final linear layer."""
        hidden = self.embed(x)
        # Both recurrent layers see a dropped-out version of their input.
        for lstm in (self.l1, self.l2):
            hidden = lstm(F.dropout(hidden, train=self.train))
        return self.l3(F.dropout(hidden, train=self.train))

# This class builds a slightly different network that adds the layers we need to be able to learn our translation.
# First, we need the translation layer.
# Second, we need to predict the next embedding instead of the word itself; otherwise we cannot re-use the network for the other language.
# We can, however, also not train the word embedding immediately.
class TransformingRNNLM(chainer.Chain):
    """Language model with a bias-free linear transformation after the embedding.

    The network is: embedding -> linear transformation ``l0`` -> LSTM ``l1``
    -> LSTM ``l2`` -> linear output ``l3``. ``l0`` is initialized to the
    identity matrix, so it initially passes the embedding through unchanged.

    Args:
        lm: Language model providing the sizes of this network through its
            ``n_vocab`` and ``n_units`` attributes (``RNNLM`` stores both).
        train: Value forwarded to ``F.dropout`` as its ``train`` argument.
    """

    def __init__(self, lm, train=True):
        # Take the layer sizes from the given language model. The original
        # code referred to ``n_vocab``/``n_units`` without defining them.
        # NOTE(review): only the sizes of ``lm`` are used here; its weights are
        # not copied into this network -- confirm whether they should be.
        n_vocab = lm.n_vocab
        n_units = lm.n_units

        # NOTE(review): l3 maps to n_vocab here, whereas RNNLM.l3 maps to
        # n_units -- confirm which output size is intended.
        super(TransformingRNNLM, self).__init__(
            embed=L.EmbedID(n_vocab, n_units),
            l0=L.Linear(n_units, n_units, nobias=True),
            l1=L.LSTM(n_units, n_units),
            l2=L.LSTM(n_units, n_units),
            l3=L.Linear(n_units, n_vocab),
        )
        self.n_units = n_units
        self.n_vocab = n_vocab

        # Initialize with uniform distribution, except for our linear
        # transformation layer, which is overwritten right below.
        for param in self.params():
            param.data[...] = np.random.uniform(-0.1, 0.1, param.data.shape)

        # Our linear transformation layer starts out as the identity matrix.
        for param in self.l0.params():
            param.data[...] = np.eye(n_units)
        self.train = train

    def reset_state(self):
        """Reset the recurrent state of both LSTM layers."""
        self.l1.reset_state()
        self.l2.reset_state()

    def __call__(self, x):
        """Run one time step and return the output of the final linear layer."""
        h0 = self.embed(x)
        # Linear transformation of the embedding (identity at initialization).
        h1 = self.l0(h0)
        # Each layer consumes the output of the previous one; the original
        # code read h2/h3 before they were assigned and skipped h1.
        h2 = self.l1(F.dropout(h1, train=self.train))
        h3 = self.l2(F.dropout(h2, train=self.train))
        y = self.l3(F.dropout(h3, train=self.train))
        return y



In [32]:
# Instantiate the language model defined above (the class is named RNNLM;
# RNNForLM is not defined in this notebook).
x = RNNLM(86000, 500)

In [34]:
# Dataset iterator to create a batch of sequences at different positions.
# This iterator returns a pair of current words and the next words. Each
# example is a part of sequences starting from the different offsets
# equally spaced within the whole sequence.
class ParallelSequentialIterator(chainer.dataset.Iterator):
    """Iterator producing mini-batches of (current word, next word) pairs.

    The dataset is read at ``batch_size`` positions at once. The positions
    start at offsets equally spaced over the dataset and all advance by one
    item per call of ``__next__``, wrapping around at the end of the dataset.

    Args:
        dataset: Indexable sequence supporting ``len()``.
        batch_size: Number of positions read in parallel.
        repeat: If false, iteration stops once
            ``iteration * batch_size`` reaches the dataset length.
    """

    def __init__(self, dataset, batch_size, repeat=True):
        self.dataset = dataset
        self.batch_size = batch_size
        self.repeat = repeat

        # Number of completed sweeps, derived from the iteration count.
        self.epoch = 0
        # Whether the most recent call of ``__next__`` increased ``epoch``.
        self.is_new_epoch = False
        # Counts calls of ``__next__``; it is not a count of parameter updates.
        self.iteration = 0

        # Starting position of each of the parallel sequences.
        n_items = len(dataset)
        starts = []
        for i in range(batch_size):
            starts.append(i * n_items // batch_size)
        self.offsets = starts

    def __next__(self):
        """Return a list of ``(current, next)`` item pairs, one per offset."""
        n_items = len(self.dataset)
        visited = self.iteration * self.batch_size
        if visited >= n_items and not self.repeat:
            # Non-repeating iterators end after the first sweep.
            raise StopIteration

        # Read the current items, step forward, then read their successors.
        current = self.get_words()
        self.iteration += 1
        following = self.get_words()

        completed = self.iteration * self.batch_size // n_items
        self.is_new_epoch = completed > self.epoch
        if self.is_new_epoch:
            self.epoch = completed

        return [pair for pair in zip(current, following)]

    @property
    def epoch_detail(self):
        """Fractional number of sweeps done so far."""
        processed = self.iteration * self.batch_size
        return processed / len(self.dataset)

    def get_words(self):
        """Return the dataset items at the current position of every offset."""
        size = len(self.dataset)
        step = self.iteration
        return [self.dataset[(start + step) % size] for start in self.offsets]

    def serialize(self, serializer):
        """Save or restore the counters needed to resume iteration."""
        for field in ('iteration', 'epoch'):
            setattr(self, field, serializer(field, getattr(self, field)))


# Custom updater for truncated BackProp Through Time (BPTT)
class BPTTUpdater(training.StandardUpdater):
    """Updater implementing truncated backpropagation through time (BPTT).

    Args:
        train_iter: Training iterator, registered under the name 'main'.
        optimizer: Optimizer, registered under the name 'main'.
        bprop_len: Number of consecutive batches whose losses are summed
            before a single backward pass and parameter update.
        device: Device argument passed on to ``StandardUpdater``.
    """

    def __init__(self, train_iter, optimizer, bprop_len, device):
        super(BPTTUpdater, self).__init__(train_iter, optimizer, device=device)
        self.bprop_len = bprop_len

    def update_core(self):
        """Accumulate the loss over ``bprop_len`` batches, then update once."""
        # A single iterator/optimizer given to StandardUpdater is named 'main'.
        iterator = self.get_iterator('main')
        opt = self.get_optimizer('main')
        model = opt.target

        total_loss = 0
        for _ in range(self.bprop_len):
            # Each batch is a list of (current word, next word) pairs; the
            # converter turns it into two arrays on the target device.
            inputs, targets = self.converter(iterator.__next__(), self.device)
            total_loss += model(chainer.Variable(inputs),
                                chainer.Variable(targets))

        model.cleargrads()              # drop gradients of the previous update
        total_loss.backward()           # backprop through the accumulated steps
        total_loss.unchain_backward()   # cut the graph so history is truncated
        opt.update()                    # apply the parameter update


# Routine to rewrite the result dictionary of LogReport to add perplexity
# values
def compute_perplexity(result):
    """Add perplexity entries to a result dictionary, in place.

    ``result['perplexity']`` is set to ``exp(result['main/loss'])``. When the
    dictionary also holds ``'validation/main/loss'``, ``'val_perplexity'`` is
    set to the exponential of that value. Nothing is returned.
    """
    validation_key = 'validation/main/loss'
    training_loss = result['main/loss']
    result['perplexity'] = np.exp(training_loss)
    if validation_key in result:
        validation_loss = result[validation_key]
        result['val_perplexity'] = np.exp(validation_loss)



## Dataset:
First we have to divide our data into a training, test and validation set. I use roughly the same fractions as are used in the get_ptb_words() dataset. First 10% - 90% training, validation. Then split the training and validation in 10% - 90% again.

In [49]:
# Load the three PTB word sequences and print the length of each one.
train, val, test = get_ptb_words()
print(*(len(split) for split in (train, val, test)))
def retrieve_and_split(dump):
    """Load a wiki dump and cut its sequence into three consecutive parts.

    Args:
        dump: Argument passed unchanged to ``get_wiki_dataset``.

    Returns:
        A tuple ``(train, test, val, voc)`` -- note the order. ``val`` is the
        part of the sequence from 90% of its length onwards, ``test`` is the
        last tenth of what precedes it and ``train`` is everything before
        that. ``voc`` is the second value returned by ``get_wiki_dataset``.
    """
    sequence, vocabulary = get_wiki_dataset(dump)

    # The boundaries are computed back to front: the validation part starts
    # at 90% of the sequence, the test part at 90% of that boundary.
    validation_begin = int(len(sequence) * .9)
    test_begin = int(validation_begin * .9)

    training_part = sequence[:test_begin]
    test_part = sequence[test_begin:validation_begin]
    validation_part = sequence[validation_begin:]

    return training_part, test_part, validation_part, vocabulary

def train_phase1(dump,name):
    """Load and split the dataset for phase 1 and report the split sizes.

    Args:
        dump: Wiki dump to load; passed on to ``retrieve_and_split``.
        name: Label of this run, used only in the printed message.
    """
    # Use the dump that was asked for instead of a hard-coded path, and unpack
    # in the order retrieve_and_split returns: (train, test, val, voc).
    train, test, val, voc = retrieve_and_split(dump)
    # print() does not apply %-formatting by itself, so format explicitly.
    print("Going to run %s" % name)
    print("#training: %d, #val: %d, #test: %d" % (len(train), len(val), len(test)))

929589 73760 82430


TypeError: 'NoneType' object is not iterable