In [1]:
import logging
import os
import random

import matplotlib
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import apollo
from apollo import layers

apollo_root = os.environ['APOLLO_ROOT']

In [2]:
def get_hyper():
    """Return a fresh dictionary of hyperparameters for the language model.

    The two special token ids are derived from the vocabulary size:
    ``zero_symbol`` is the last id and ``unknown_symbol`` the one before it.
    """
    vocab_size = 10000
    return {
        # Vocabulary and special symbols.
        'vocab_size': vocab_size,
        'zero_symbol': vocab_size - 1,
        'unknown_symbol': vocab_size - 2,
        # Model and initialisation.
        'batch_size': 32,
        'init_range': 0.1,
        'mem_cells': 250,
        'random_seed': 22,
        # Optimisation schedule.
        'base_lr': 20,
        'gamma': 0.792,
        'stepsize': 10000,
        'momentum': 0.0,
        'weight_decay': 0,
        'clip_gradients': 0.24,
        'max_iter': 2000000,
        # Reporting, testing and checkpointing cadence.
        'display_interval': 20,
        'test_interval': 100,
        'test_iter': 20,
        'snapshot_prefix': '/tmp/lm',
        'snapshot_interval': 10000,
        'graph_interval': 1000,
        'graph_prefix': '',
    }

# Module-level hyperparameter dictionary; the functions defined in later
# cells read it as a global.
hyper = get_hyper()

# Configure the Apollo/Caffe runtime before any network is built: seed the
# RNG from the hyperparameters, select GPU mode and set the log verbosity.
apollo.Caffe.set_random_seed(hyper['random_seed'])
apollo.Caffe.set_mode_gpu()
# NOTE(review): the device index is hard-coded to 1 — confirm this matches
# the machine the notebook is run on.
apollo.Caffe.set_device(1)
apollo.Caffe.set_logging_verbosity(3)

In [6]:
def get_data():
    """Yield tokenised training sentences forever, one epoch after another.

    Each yielded item is the list of space-separated tokens (strings) from
    one non-blank line of ``train_indices.txt`` under ``apollo_root``. When
    the end of the file is reached, the epoch number is logged and reading
    restarts from the top.

    Raises:
        ValueError: if a full pass over the file produces no sentences,
            which would otherwise make the generator spin forever.
    """
    # You can download this file with bash ./data/language_model/get_lm.sh
    data_source = '%s/data/language_model/train_indices.txt' % apollo_root
    epoch = 0
    while True:
        produced_any = False
        with open(data_source, 'r') as f:
            # Iterate the file lazily instead of materialising every line
            # with readlines() on each epoch.
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # A blank line would yield [''], which cannot be
                    # converted to an integer token id downstream.
                    continue
                produced_any = True
                yield stripped.split(' ')
        if not produced_any:
            raise ValueError('no sentences found in %s' % data_source)
        logging.info('epoch %s finished' % epoch)
        epoch += 1
    
def pad_batch(sentence_batch):
    """Convert a batch of token sequences to equal-length lists of ints.

    Every token is converted with ``int``; ids at or above
    ``hyper['unknown_symbol']`` are replaced by that symbol. Shorter
    sentences are right-padded with ``hyper['zero_symbol']`` up to the
    length of the longest sentence in the batch.
    """
    longest = max(len(sentence) for sentence in sentence_batch)
    unknown_id = hyper['unknown_symbol']
    padded = []
    for sentence in sentence_batch:
        # min() clamps any id at or above the unknown symbol onto it.
        ids = [min(int(token), unknown_id) for token in sentence]
        ids.extend([hyper['zero_symbol']] * (longest - len(sentence)))
        padded.append(ids)
    return padded
    
def get_data_batch(data_iter):
    """Group sentences from ``data_iter`` into padded numpy batches, forever.

    Each yielded array is built from ``hyper['batch_size']`` consecutive
    sentences passed through ``pad_batch``.
    """
    while True:
        sentences = [next(data_iter) for _ in range(hyper['batch_size'])]
        yield np.array(pad_batch(sentences))

In [7]:
def forward(net, sentence_batches):
    """Run one training forward pass of the unrolled LSTM language model.

    Pulls the next padded batch from ``sentence_batches``, unrolls the
    network for at most 30 time steps and returns whatever
    ``net.forward_layer`` returns for the final ``SoftmaxWithLoss`` layer.

    Args:
        net: network object exposing ``forward_layer`` and ``tops``
            (an ``apollo.Net`` in this notebook).
        sentence_batches: iterator yielding 2-D arrays of token ids indexed
            as ``[sentence, position]``.

    Returns:
        The value returned by ``net.forward_layer`` for the loss layer.
    """
    sentence_batch = next(sentence_batches)
    # Cap the unroll at 30 steps; positions beyond that are not used.
    length = min(sentence_batch.shape[1], 30)

    filler = layers.Filler(type='uniform', max=hyper['init_range'],
        min=(-hyper['init_range']))
    # All-zero blob used as both the initial hidden state and the initial
    # memory cell at step 0.
    # NOTE(review): sized with hyper['batch_size'], so the batch is assumed
    # to contain exactly that many sentences.
    net.forward_layer(layers.NumpyData(name='lstm_seed',
        data=np.zeros((hyper['batch_size'], hyper['mem_cells'], 1, 1))))
    # Placeholder for the labels; the real values are written in below, once
    # the unrolled steps have been built.
    net.forward_layer(layers.NumpyData(name='label',
        data=np.zeros((hyper['batch_size'] * length, 1, 1, 1))))
    hidden_concat_bottoms = []
    for step in range(length):
        net.forward_layer(layers.DummyData(name=('word%d' % step),
            shape=[hyper['batch_size'], 1, 1, 1]))
        if step == 0:
            # First step: start from the zero seed and feed word id 0.
            prev_hidden = 'lstm_seed'
            prev_mem = 'lstm_seed'
            word = np.zeros(sentence_batch[:, 0].shape)
        else:
            # Later steps: chain from the previous step's LSTM outputs and
            # feed the previous position's token as input.
            prev_hidden = 'lstm%d_hidden' % (step - 1)
            prev_mem = 'lstm%d_mem' % (step - 1)
            word = sentence_batch[:, step - 1]
        net.tops['word%d' % step].data[:,0,0,0] = word
        # The same param_names are passed at every step; presumably this is
        # how weights are shared across time steps — confirm against apollo.
        net.forward_layer(layers.Wordvec(name=('wordvec%d' % step),
            bottoms=['word%d' % step],
            dimension=hyper['mem_cells'], vocab_size=hyper['vocab_size'],
            param_names=['wordvec_param'], weight_filler=filler))
        net.forward_layer(layers.Concat(name='lstm_concat%d' % step,
            bottoms=[prev_hidden, 'wordvec%d' % step]))
        net.forward_layer(layers.Lstm(name='lstm%d' % step,
            bottoms=['lstm_concat%d' % step, prev_mem],
            param_names=['lstm_input_value', 'lstm_input_gate',
                'lstm_forget_gate', 'lstm_output_gate'],
            tops=['lstm%d_hidden' % step, 'lstm%d_mem' % step],
            num_cells=hyper['mem_cells'], weight_filler=filler))
        # NOTE(review): the dropout ratio 0.16 is hard-coded here rather
        # than taken from hyper.
        net.forward_layer(layers.Dropout(name='dropout%d' % step,
            bottoms=['lstm%d_hidden' % step], dropout_ratio=0.16))
        hidden_concat_bottoms.append('dropout%d' % step)

    # Stack the per-step dropout outputs along dimension 0, step by step.
    net.forward_layer(layers.Concat(name='hidden_concat',
        concat_dim=0, bottoms=hidden_concat_bottoms))
    # Labels are the tokens at positions 0..length-1; transposing before
    # flattening orders them step by step, in line with the concat above.
    net.tops['label'].data[:,0,0,0] = sentence_batch[:, :length].T.flatten()
    net.forward_layer(layers.InnerProduct(name='ip', bottoms=['hidden_concat'],
        num_output=hyper['vocab_size'], weight_filler=filler))
    # The padding symbol is passed as ignore_label to the loss layer.
    loss = net.forward_layer(layers.SoftmaxWithLoss(name='softmax_loss',
        ignore_label=hyper['zero_symbol'], bottoms=['ip', 'label']))
    return loss

In [8]:
# Training network; layers are added to it by forward().
net = apollo.Net()

apollo.log.log_to_stdout() # for ipython notebook
# Endless sentence stream and the batching generator built on top of it.
sentences = get_data()
sentence_batches = get_data_batch(sentences)

# One forward pass before training starts, then reset the forward state.
# NOTE(review): presumably this instantiates the layers and parameters
# before the first update — confirm against apollo. It also consumes one
# batch from sentence_batches.
forward(net, sentence_batches)
net.reset_forward()
# Per-iteration training losses, appended to by the training loop.
train_loss_hist = []

In [None]:
# Main training loop: one forward/backward/update step per iteration, with
# periodic loss logging, snapshotting and loss-curve plotting.
for i in range(hyper['max_iter']):
    train_loss_hist.append(forward(net, sentence_batches))
    net.backward()
    # Step decay: the base rate is multiplied by gamma once for every
    # completed block of `stepsize` iterations.
    lr = (hyper['base_lr'] * (hyper['gamma'])**(i // hyper['stepsize']))
    net.update(lr=lr, momentum=hyper['momentum'],
        clip_gradients=hyper['clip_gradients'], weight_decay=hyper['weight_decay'])
    if i % hyper['display_interval'] == 0:
        # Mean over (at most) the last display_interval recorded losses.
        logging.info('Iteration %d: %s' % (i, np.mean(train_loss_hist[-hyper['display_interval']:])))
    if i % hyper['test_interval'] == 0:
        # TODO: no evaluation is wired in yet; this is where a periodic
        # test pass would run.
        pass
    if i % hyper['snapshot_interval'] == 0 and i > 0:
        filename = '%s_%d.h5' % (hyper['snapshot_prefix'], i)
        logging.info('Saving net to: %s' % filename)
        net.save(filename)
    if i % hyper['graph_interval'] == 0 and i > 0:
        sub = 100
        # Draw on a fresh figure each time so earlier curves are not
        # overplotted, and close it afterwards so figures do not accumulate
        # over a long run.
        fig, ax = plt.subplots()
        # Moving average over `sub` iterations; the slice trims the
        # partially-overlapping ends of the full convolution.
        ax.plot(np.convolve(train_loss_hist, np.ones(sub)/sub)[sub:-sub])
        filename = '%strain_loss.jpg' % hyper['graph_prefix']
        logging.info('Saving figure to: %s' % filename)
        fig.savefig(filename)
        plt.close(fig)

2015-07-05 13:17:39,582 - INFO - Iteration 0: 6.8161034584
2015-07-05 13:17:42,248 - INFO - Iteration 20: 6.49280529022
2015-07-05 13:17:44,842 - INFO - Iteration 40: 6.1886277914
2015-07-05 13:17:47,437 - INFO - Iteration 60: 6.03695037365
2015-07-05 13:17:50,030 - INFO - Iteration 80: 5.94959592819
2015-07-05 13:17:52,626 - INFO - Iteration 100: 5.79375097752
2015-07-05 13:17:55,221 - INFO - Iteration 120: 5.88647036552
2015-07-05 13:17:57,818 - INFO - Iteration 140: 5.76611590385
2015-07-05 13:18:00,416 - INFO - Iteration 160: 5.68361911774
2015-07-05 13:18:03,013 - INFO - Iteration 180: 5.67861757278
2015-07-05 13:18:05,610 - INFO - Iteration 200: 5.574786973
2015-07-05 13:18:08,204 - INFO - Iteration 220: 5.5921135664
2015-07-05 13:18:10,799 - INFO - Iteration 240: 5.50135638714
2015-07-05 13:18:13,394 - INFO - Iteration 260: 5.53342981339
2015-07-05 13:18:15,990 - INFO - Iteration 280: 5.4502014637


In [9]:
def eval_forward(net, length=30):
    """Greedily generate a sequence of word ids from the language model.

    The network is run one time step at a time with a batch size of 1. The
    first input is a random id drawn from ``random.randrange(1, 100)``; each
    later input is the highest-scoring id among the first 9000 entries of
    the previous step's softmax output. That same id is what gets recorded,
    so the returned sequence is exactly the sequence fed back into the
    network.

    Args:
        net: network object exposing ``forward_layer``, ``tops`` and
            ``reset_forward`` (an ``apollo.Net`` in this notebook).
        length: number of time steps to run. Defaults to 30.

    Returns:
        A list of ``length - 1`` word ids: the random seed word is not
        included, and the prediction made at the final step is never read.
    """
    # Only the first `candidate_limit` ids are eligible for selection.
    # NOTE(review): presumably this keeps rare and special symbols out of
    # the generated text — confirm the intended cutoff.
    candidate_limit = 9000
    output_words = []
    filler = layers.Filler(type='uniform', max=hyper['init_range'],
        min=(-hyper['init_range']))
    # Recurrent state blobs, zero-initialised and overwritten after each step.
    net.forward_layer(layers.NumpyData(name='lstm_hidden_prev',
        data=np.zeros((1, hyper['mem_cells'], 1, 1))))
    net.forward_layer(layers.NumpyData(name='lstm_mem_prev',
        data=np.zeros((1, hyper['mem_cells'], 1, 1))))
    for step in range(length):
        net.forward_layer(layers.NumpyData(name=('word'),
            data=np.zeros((1, 1, 1, 1))))
        prev_hidden = 'lstm_hidden_prev'
        prev_mem = 'lstm_mem_prev'
        if step == 0:
            net.tops['word'].data[0,0,0,0] = random.randrange(1,100)
        else:
            # Pick the next word once and use it both as the recorded output
            # and as the next input, so the two can never disagree.
            next_word = np.argmax(
                net.tops['softmax'].data.flatten()[:candidate_limit])
            output_words.append(next_word)
            net.tops['word'].data[0,0,0,0] = next_word
        net.forward_layer(layers.Wordvec(name=('wordvec'),
            bottoms=['word'],
            dimension=hyper['mem_cells'], vocab_size=hyper['vocab_size'],
            param_names=['wordvec_param'], weight_filler=filler))
        net.forward_layer(layers.Concat(name='lstm_concat',
            bottoms=[prev_hidden, 'wordvec']))
        net.forward_layer(layers.Lstm(name='lstm',
            bottoms=['lstm_concat', prev_mem],
            param_names=['lstm_input_value', 'lstm_input_gate',
                'lstm_forget_gate', 'lstm_output_gate'],
            tops=['lstm_hidden_next', 'lstm_mem_next'],
            num_cells=hyper['mem_cells'], weight_filler=filler))
        # NOTE(review): dropout is also applied while generating — confirm
        # this is intended at evaluation time.
        net.forward_layer(layers.Dropout(name='dropout',
            bottoms=['lstm_hidden_next'], dropout_ratio=0.16))

        net.forward_layer(layers.InnerProduct(name='ip', bottoms=['dropout'],
            num_output=hyper['vocab_size'], weight_filler=filler))
        net.forward_layer(layers.Softmax(name='softmax',
            ignore_label=hyper['zero_symbol'], bottoms=['ip']))
        # Carry this step's LSTM outputs over as the next step's state.
        net.tops['lstm_hidden_prev'].data_tensor.copy_from(net.tops['lstm_hidden_next'].data_tensor)
        net.tops['lstm_mem_prev'].data_tensor.copy_from(net.tops['lstm_mem_next'].data_tensor)
        net.reset_forward()
    return output_words

In [10]:
import pickle
import os

# Load the vocabulary mapping and build its inverse, which the sampling cell
# uses to turn generated word ids back into strings.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a vocab.pkl obtained from a trusted source.
# The file is opened in binary mode, which is the portable way to read a
# pickle (text mode may translate or decode its bytes).
with open('%s/data/language_model/vocab.pkl' % os.environ['APOLLO_ROOT'], 'rb') as f:
    vocab = pickle.load(f)
inv_vocab = {v: k for k, v in vocab.items()}

In [11]:
# Separate network used for text generation.
eval_net = apollo.Net()
# Run the generation pass once before loading weights.
# NOTE(review): presumably this is needed so the layers exist before the
# snapshot is loaded — confirm against apollo.
eval_forward(eval_net)
# NOTE(review): the snapshot iteration (20000) is hard-coded; the file must
# already exist from a training run that reached that iteration.
eval_net.load('%s_20000.h5' % hyper['snapshot_prefix'])

In [None]:
# Generate a sequence of word ids and print it as space-separated words.
output_words = eval_forward(eval_net)
# A parenthesized single argument prints the same text under Python 2 and
# also runs under Python 3.
print(' '.join([inv_vocab[x] for x in output_words]))