In [1]:
import os
import random

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import simplejson as json
apollo_root = os.environ['APOLLO_ROOT']

%matplotlib inline

import apollo
import logging
from apollo import layers

In [2]:
import pickle
import os
# Load the word vocabulary shipped with the language-model data set.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load a vocab.pkl that comes from a trusted source.
# The file is opened in binary mode ('rb'): pickles are binary data, text mode
# can corrupt them on platforms that translate newlines and is rejected
# outright by Python 3.
with open('%s/data/language_model/vocab.pkl' % os.environ['APOLLO_ROOT'], 'rb') as f:
    vocab = pickle.load(f)
# Inverse lookup: maps each value of `vocab` back to its key.
ivocab = {v: k for k, v in vocab.items()}

In [3]:
def get_hyper():
    """Return a freshly built dictionary of hyperparameters.

    A new dict is created on every call, so callers may modify the result
    without affecting each other. The two highest symbol ids are reserved:
    ``vocab_size - 1`` for ``zero_symbol`` and ``vocab_size - 2`` for
    ``unknown_symbol``.
    """
    vocab_size = 256
    settings = dict(
        # Model / data dimensions.
        vocab_size=vocab_size,
        batch_size=32,
        mem_cells=1000,
        init_range=0.1,
        # Reserved symbol ids, derived from the vocabulary size.
        zero_symbol=vocab_size - 1,
        unknown_symbol=vocab_size - 2,
        # Optimisation schedule.
        base_lr=20,
        gamma=0.8,
        stepsize=2500,
        momentum=0.0,
        weight_decay=0,
        clip_gradients=0.24,
        max_iter=10000,
        random_seed=22,
        # Reporting, evaluation and persistence cadence.
        display_interval=100,
        test_interval=100,
        test_iter=20,
        snapshot_prefix='/tmp/char',
        snapshot_interval=1000,
        graph_interval=1000,
        graph_prefix='',
        # Factor applied to the logits before sampling in eval_forward.
        i_temperature=1.5,
    )
    return settings

# Build the hyperparameters once; the functions defined in later cells read
# this module-level dict directly.
hyper = get_hyper()

# Configure the Apollo/Caffe backend before any network is created.
apollo.Caffe.set_random_seed(hyper['random_seed'])
apollo.Caffe.set_mode_gpu()
# NOTE(review): the GPU index is hard-coded to 1; this presumably fails on a
# single-GPU machine -- confirm and adjust for the target host.
apollo.Caffe.set_device(1)
# NOTE(review): the meaning of verbosity level 3 is not visible in this
# notebook -- check the Apollo documentation before changing it.
apollo.Caffe.set_logging_verbosity(3)

In [4]:
def get_data():
    """Yield comment records with a non-empty body, looping over the file forever.

    Every line of ``<apollo_root>/data/char_model/reddit_ml.txt`` is decoded as
    one JSON object. Records whose ``'body'`` has length zero are skipped.
    When the end of the file is reached the epoch number is logged and
    reading restarts from the first line.

    Because this is a generator, the checks below run on the first ``next()``
    call, not when ``get_data()`` itself is called.

    Yields:
        The decoded JSON object of each line that has a non-empty ``'body'``.

    Raises:
        IOError: if the data file does not exist, or if a complete pass over
            the file produced no usable record (without this check the
            generator would loop forever without yielding anything).
    """
    data_source = '%s/data/char_model/reddit_ml.txt' % apollo_root
    if not os.path.exists(data_source):
        # NOTE(review): this message points at data/character_model/ while the
        # file is read from data/char_model/ -- confirm which path is right.
        raise IOError('You must download the data with ./data/character_model/get_reddit_lm.sh')
    epoch = 0
    while True:
        yielded_any = False
        with open(data_source, 'r') as f:
            # Iterate the file object directly so the corpus is streamed line
            # by line instead of being loaded into memory all at once.
            for line in f:
                data = json.loads(line)
                if len(data['body']) == 0:
                    continue
                yielded_any = True
                yield data
        if not yielded_any:
            # An epoch without a single record would otherwise spin forever.
            raise IOError('No records with a non-empty body found in %s' % data_source)
        logging.info('epoch %s finished' % epoch)
        epoch += 1

def get_data_batch(data_iter):
    """Group items drawn from ``data_iter`` into consecutive batches, endlessly.

    Each yielded batch is a list holding the next ``hyper['batch_size']``
    items of ``data_iter``, in the order they were produced.
    """
    while True:
        yield [next(data_iter) for _ in range(hyper['batch_size'])]

In [5]:
def pad_batch(sentence_batch):
    """Encode strings as lists of symbol ids, right-padded to a common length.

    Every character is mapped to its code point. Code points at or above
    ``hyper['unknown_symbol']`` are mapped to ``hyper['unknown_symbol']`` so
    that no real character is ever encoded as ``hyper['zero_symbol']``, which
    is reserved for padding (and is the label ignored by the training loss).
    Shorter sequences are padded on the right with ``hyper['zero_symbol']`` up
    to the length of the longest string in the batch.

    Args:
        sentence_batch: a sequence of strings.

    Returns:
        A list with one equal-length list of integer ids per input string;
        an empty list when ``sentence_batch`` is empty.
    """
    if len(sentence_batch) == 0:
        return []
    pad_symbol = hyper['zero_symbol']
    unknown_symbol = hyper['unknown_symbol']
    max_len = max(len(x) for x in sentence_batch)
    result = []
    for sentence in sentence_batch:
        # Clamp to the unknown id, not the padding id, so out-of-range
        # characters still contribute to the loss.
        chars = [min(ord(c), unknown_symbol) for c in sentence]
        result.append(chars + [pad_symbol] * (max_len - len(sentence)))
    return result

In [6]:
def forward(net, sentence_batches):
    """Run one training forward pass of the unrolled character LSTM.

    Takes the next batch from ``sentence_batches``, encodes the ``'body'`` of
    every record with ``pad_batch`` and unrolls the network for at most 100
    time steps. At each step the net receives the previous character (id 0 at
    the first step) and is trained to predict the current one.

    Args:
        net: the Apollo net on which the layers are run via ``forward_layer``.
        sentence_batches: iterator yielding lists of records that each have a
            ``'body'`` entry.

    Returns:
        Whatever ``net.forward_layer`` returns for the ``SoftmaxWithLoss``
        layer (the training loop averages these values, so presumably a
        scalar loss -- confirm against the Apollo API).
    """
    batch = next(sentence_batches)
    sentence_batch = np.array(pad_batch([x['body'] for x in batch]))
    # Cap the number of unrolled time steps at 100.
    length = min(sentence_batch.shape[1], 100)
    assert length > 0

    # Uniform initialiser in [-init_range, init_range], passed to every
    # parameterised layer below.
    filler = layers.Filler(type='uniform', max=hyper['init_range'],
        min=(-hyper['init_range']))
    # All-zero blob used as both the initial hidden state and the initial
    # memory cell at step 0.
    net.forward_layer(layers.NumpyData(name='lstm_seed',
        data=np.zeros((hyper['batch_size'], hyper['mem_cells'], 1, 1))))
    # Placeholder for the targets; the real values are written further down.
    net.forward_layer(layers.NumpyData(name='label',
        data=np.zeros((hyper['batch_size'] * length, 1, 1, 1))))
    hidden_concat_bottoms = []
    for step in range(length):
        # Input blob for this step; its contents are filled in below.
        net.forward_layer(layers.DummyData(name=('word%d' % step),
            shape=[hyper['batch_size'], 1, 1, 1]))
        if step == 0:
            prev_hidden = 'lstm_seed'
            prev_mem = 'lstm_seed'
            # No previous character exists yet, so feed symbol id 0.
            word = np.zeros(sentence_batch[:, 0].shape)
        else:
            prev_hidden = 'lstm%d_hidden' % (step - 1)
            prev_mem = 'lstm%d_mem' % (step - 1)
            # Input is the previous character; the target is the current one.
            word = sentence_batch[:, step - 1]
        net.tops['word%d' % step].data[:,0,0,0] = word
        # The same param_names are used at every step -- presumably this is
        # how the weights are shared across time steps (confirm in Apollo).
        net.forward_layer(layers.Wordvec(name=('wordvec%d' % step),
            bottoms=['word%d' % step],
            dimension=hyper['mem_cells'], vocab_size=hyper['vocab_size'],
            param_names=['wordvec_param'], weight_filler=filler))
        net.forward_layer(layers.Concat(name='lstm_concat%d' % step,
            bottoms=[prev_hidden, 'wordvec%d' % step]))
        net.forward_layer(layers.Lstm(name='lstm%d' % step,
            bottoms=['lstm_concat%d' % step, prev_mem],
            param_names=['lstm_input_value', 'lstm_input_gate',
                'lstm_forget_gate', 'lstm_output_gate'],
            tops=['lstm%d_hidden' % step, 'lstm%d_mem' % step],
            num_cells=hyper['mem_cells'], weight_filler=filler))
        net.forward_layer(layers.Dropout(name='dropout%d' % step,
            bottoms=['lstm%d_hidden' % step], dropout_ratio=0.16))
        hidden_concat_bottoms.append('dropout%d' % step)

    # Stack the per-step dropout outputs along dimension 0.
    net.forward_layer(layers.Concat(name='hidden_concat',
        concat_dim=0, bottoms=hidden_concat_bottoms))
    # Transpose before flattening so the targets are ordered step by step
    # (all batch entries of step 0, then step 1, ...); this is assumed to
    # match the row order produced by the concat above -- TODO confirm.
    net.tops['label'].data[:,0,0,0] = sentence_batch[:, :length].T.flatten()
    net.forward_layer(layers.InnerProduct(name='ip', bottoms=['hidden_concat'],
        num_output=hyper['vocab_size'], weight_filler=filler))
    # Padding positions carry the zero symbol and are passed as ignore_label.
    loss = net.forward_layer(layers.SoftmaxWithLoss(name='softmax_loss',
        ignore_label=hyper['zero_symbol'], bottoms=['ip', 'label']))
    return loss

In [7]:
def eval_performance(net):
    """Sample a piece of text from the current model and print it.

    A separate evaluation net is created, the parameters of ``net`` are copied
    into it, and ``eval_forward`` is used to generate a sequence of symbol ids
    that is printed as characters.

    Args:
        net: the training net whose parameters are sampled from.
    """
    eval_net = apollo.Net()
    # The first pass runs before the copy; presumably it is needed so that
    # the layers and their parameters exist in eval_net -- its output is
    # discarded.
    eval_forward(eval_net)
    eval_net.copy_params_from(net)
    output_words = eval_forward(eval_net)
    # Parenthesised form: behaves identically as a Python 2 print statement
    # and is also valid Python 3 syntax.
    print(''.join([chr(x) for x in output_words]))

In [8]:
def softmax_choice(data):
    """Draw a random index according to the probabilities in ``data``.

    ``data`` is flattened and renormalised in float64 before sampling, so a
    distribution whose sum differs from 1 only by rounding error (typical for
    low-precision softmax output) is not rejected by ``np.random.choice``.

    Args:
        data: array-like of non-negative weights, of any shape.

    Returns:
        An index into the flattened ``data``, drawn with probability
        proportional to the corresponding entry.

    Raises:
        ValueError: if the entries do not have a finite, positive sum.
    """
    probs = np.asarray(data, dtype=np.float64).ravel()
    total = probs.sum()
    # Written as a negated conjunction so that a NaN total is rejected too.
    if not (total > 0 and np.isfinite(total)):
        raise ValueError('probabilities must have a finite, positive sum, got %r' % total)
    probs = probs / total
    return np.random.choice(np.arange(probs.size), p=probs)

In [9]:
def eval_forward(net):
    """Generate 150 symbol ids by sampling from the model one step at a time.

    The sequence starts with ``ord('.')``; each later symbol is sampled with
    ``softmax_choice`` from the softmax output of the previous step. The same
    layer names are reused at every step, and the LSTM state is carried over
    by copying the "next" tops into the "prev" tops at the end of each step.

    Args:
        net: the Apollo net on which the layers are run.

    Returns:
        List of the 150 emitted symbol ids, in generation order.
    """
    output_words = []
    filler = layers.Filler(type='uniform', max=hyper['init_range'],
        min=(-hyper['init_range']))
    # Zero initial hidden state and memory cell for a batch of one.
    net.forward_layer(layers.NumpyData(name='lstm_hidden_prev',
        data=np.zeros((1, hyper['mem_cells'], 1, 1))))
    net.forward_layer(layers.NumpyData(name='lstm_mem_prev',
        data=np.zeros((1, hyper['mem_cells'], 1, 1))))
    length = 150
    for step in range(length):
        # Input blob for the current symbol; its value is written below.
        net.forward_layer(layers.NumpyData(name=('word'),
            data=np.zeros((1, 1, 1, 1))))
        prev_hidden = 'lstm_hidden_prev'
        prev_mem = 'lstm_mem_prev'
        # NOTE(review): `word` is never read in this function; it looks like
        # a leftover and could be removed.
        word = np.zeros((1, 1, 1, 1))
        if step == 0:
            # Seed the generated text with a full stop.
            output = ord('.')
        else:
            # Sample from the distribution produced by the previous step.
            output = softmax_choice(net.tops['softmax'].data)
        output_words.append(output)
        net.tops['word'].data[0,0,0,0] = output
        net.forward_layer(layers.Wordvec(name=('wordvec'),
            bottoms=['word'],
            dimension=hyper['mem_cells'], vocab_size=hyper['vocab_size'],
            param_names=['wordvec_param'], weight_filler=filler))
        net.forward_layer(layers.Concat(name='lstm_concat',
            bottoms=[prev_hidden, 'wordvec']))
        net.forward_layer(layers.Lstm(name='lstm',
            bottoms=['lstm_concat', prev_mem],
            param_names=['lstm_input_value', 'lstm_input_gate',
                'lstm_forget_gate', 'lstm_output_gate'],
            tops=['lstm_hidden_next', 'lstm_mem_next'],
            num_cells=hyper['mem_cells'], weight_filler=filler))
        # NOTE(review): dropout is still part of the sampling graph; whether
        # Apollo disables it outside training is not visible here -- confirm.
        net.forward_layer(layers.Dropout(name='dropout',
            bottoms=['lstm_hidden_next'], dropout_ratio=0.16))

        net.forward_layer(layers.InnerProduct(name='ip', bottoms=['dropout'],
            num_output=hyper['vocab_size'], weight_filler=filler))
        # Scale the logits in place before the softmax (i_temperature is
        # presumably an inverse temperature: values above 1 sharpen the
        # distribution).
        net.tops['ip'].data[:] *= hyper['i_temperature']
        net.forward_layer(layers.Softmax(name='softmax',
            ignore_label=hyper['zero_symbol'], bottoms=['ip']))
        # Carry the recurrent state over to the next step.
        net.tops['lstm_hidden_prev'].data_tensor.copy_from(net.tops['lstm_hidden_next'].data_tensor)
        net.tops['lstm_mem_prev'].data_tensor.copy_from(net.tops['lstm_mem_next'].data_tensor)
        net.reset_forward()
    return output_words

In [None]:
# Training network; its layers are created by the forward() calls below.
net = apollo.Net()

apollo.log.log_to_stdout() # for ipython notebook
# Endless stream of records, grouped into batches of hyper['batch_size'].
sentences = get_data()
sentence_batches = get_data_batch(sentences)

# One forward pass before training starts, followed by a reset -- presumably
# to create the layers and parameters up front. Note that this consumes the
# first batch of the data stream.
forward(net, sentence_batches)
net.reset_forward()
# Loss history, one entry appended per training iteration.
train_loss_hist = []

In [None]:
# Main training loop: one forward/backward/update cycle per iteration, with
# periodic logging, text sampling, snapshots and a smoothed loss plot.
for i in range(hyper['max_iter']):
    train_loss_hist.append(forward(net, sentence_batches))
    net.backward()
    # Step decay: multiply the base rate by gamma once every `stepsize`
    # iterations.
    lr = (hyper['base_lr'] * (hyper['gamma'])**(i // hyper['stepsize']))
    net.update(lr=lr, momentum=hyper['momentum'],
        clip_gradients=hyper['clip_gradients'], weight_decay=hyper['weight_decay'])
    if i % hyper['display_interval'] == 0:
        # Mean training loss over the most recent display window.
        logging.info('Iteration %d: %s' % (i, np.mean(train_loss_hist[-hyper['display_interval']:])))
    if i % hyper['test_interval'] == 0:
        eval_performance(net)
    if i % hyper['snapshot_interval'] == 0 and i > 0:
        filename = '%s_%d.h5' % (hyper['snapshot_prefix'], i)
        logging.info('Saving net to: %s' % filename)
        net.save(filename)
    if i % hyper['graph_interval'] == 0 and i > 0:
        sub = 100
        # `plt` is matplotlib.pyplot, imported in the first cell. Clear the
        # current figure first so curves from earlier graph intervals are not
        # drawn on top of each other in the saved image.
        plt.clf()
        # Moving average of the loss over `sub` iterations, with the edges of
        # the convolution trimmed.
        plt.plot(np.convolve(train_loss_hist, np.ones(sub)/sub)[sub:-sub])
        filename = '%strain_loss.jpg' % hyper['graph_prefix']
        logging.info('Saving figure to: %s' % filename)
        plt.savefig(filename)

2015-07-14 16:31:25,870 - INFO - Iteration 0: 5.55461072922
.M���Y�$caE`i���S
I�ŝW�&,4ǝ��y9 ��'$���)8~ 	�L�k�,��D�	�挞� ��[u�eȎ�u� � p���ru%���r
D�x�]wz8�{e �s?���O�u���ƛ�'��pl��  r  e�ao '
2015-07-14 16:32:00,581 - INFO - Iteration 100: 4.48616950274
.. Itthessimeallyousthisusterelthiseredithessielloouthereni�eturexicemeteesealestichisimelestilly therenecreacliouiestirearetiseallestores, freaechet
2015-07-14 16:32:37,788 - INFO - Iteration 200: 2.5186178422
. Thinks of inperimech of you beanding "normally and like ingerestime " problemaring mabuling some rearning (andersticking don't formands and of have 
2015-07-14 16:33:14,259 - INFO - Iteration 300: 2.02955379009
. I coversultions, toologoodifientoonont/station/onenotitionasioning. Confinations. oftingterfortuations. Explaining, there tools.it/ontoro/in/gotiona
2015-07-14 16:33:50,748 - INFO - Iteration 400: 1.80005491734
.  I want a data is ration is a many not do you main function as surp of s a sparse that is with a