In [None]:
# Preprocessing Brown

import time
import numpy as np

from nltk.corpus import brown
from spacy.en import English

# Flatten every tokenised Brown sentence back into a single space-separated
# unicode string, which is the form handed to the parser below.
brown_sents = []
for sent in brown.sents():
    brown_sents.append(unicode(' '.join(sent)))

def parse(sents, n_threads=20, batch_size=1000):
    size = len(sents)
    count = 0
    start_time = time.time()
    data, idx2word, idx2tag = [], set(), set()
    parser = English()
    for parsed_sent in parser.pipe(sents, n_threads=n_threads, batch_size=batch_size):
        count += 1
        X_i,Y_i = [], []
        for token in parsed_sent:
            X_i.append(token.lemma_)
            Y_i.append(token.pos_)
            idx2word.add(token.lemma_)
            idx2tag.add(token.pos_)
        data.append((X_i,Y_i))
        if count%(size//20)==0:
            end_time = time.time()
            print '... parsed', count, 'sents' + '('+str(end_time-start_time)+')'
            start_time = end_time
    return data, list(idx2word), list(idx2tag), count

data, idx2word, idx2tag, count = parse(brown_sents)
word2idx = {w:i for i,w in enumerate(idx2word)}
tag2idx = {t:i for i,t in enumerate(idx2tag)}
VOCAB_SIZE = len(idx2word)
TAG_SIZE = len(idx2tag)
MAX_LEN = np.array([len(sent) for sent in data]).max()
print
print 'STATS:'
print 'data size:', count
print 'vocab size:', VOCAB_SIZE
print 'tag size:', TAG_SIZE

data_code = []
for X_i,Y_i in data:
    data_code.append(([word2idx[w] for w in X_i],[tag2idx[t] for t in Y_i]))
    
split = 0.8
cutoff = int(count*split)
data_train, data_test = data_code[:cutoff], data_code[cutoff:]
print 'train size:', len(data_train)
print 'test size:', len(data_test)

''' Data Stats

STATS:
data size: 57340
vocab size: 35241
tag size: 15
train size: 45872
test size: 11468

'''


''' Data Format

# original
([u'the', u'fulton', u'county', u'grand', u'jury', u'say', u'friday', u'an', u'investigation',
u'of', u'atlanta', u"'s", u'recent', u'primary', u'election', u'produce', u'``', u'no', u'evidence',
u"''", u'that', u'any', u'irregularity', u'take', u'place', u'.'], 
[u'DET', u'PROPN', u'PROPN', u'PROPN', u'PROPN', u'VERB', u'PROPN', u'DET', u'NOUN', u'ADP',
u'PROPN', u'PART', u'ADJ', u'NOUN', u'NOUN', u'VERB', u'PUNCT', u'DET', u'NOUN', u'PUNCT', 
u'ADJ', u'DET', u'NOUN', u'VERB', u'NOUN', u'PUNCT'])

# encoded
([11414, 29182, 13262, 3664, 18109, 27138, 7843, 9840, 18259, 16419, 10809, 18036, 10297, 
5613, 6999, 2859, 9415, 15909, 11514, 18019, 29312, 16179, 34814, 9580, 24917, 375], 
[5, 4, 4, 4, 4, 14, 4, 5, 1, 2, 4, 9, 13, 1, 1, 14, 8, 5, 1, 8, 13, 5, 1, 14, 1, 8])

'''

# Setup DyNet

import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
# DyNet reads its global settings once, at initialisation. Importing the
# `dynet` wrapper module initialises the library straight away, so a
# DynetParams object configured afterwards (and never passed to init) has no
# effect. Import the raw `_dynet` bindings and initialise explicitly instead.
import _dynet as dn
dp = dn.DynetParams()
dp.from_args()          # keep honouring any --dynet-* command-line flags
dp.set_mem(4000)
dp.set_autobatch(True)
dp.init()               # apply the settings above

class DataIterator:
    """Cursor-based reader that hands out consecutive slices of a dataset.

    Attributes:
        data   -- the wrapped sliceable collection
        size   -- len(data), captured at construction time
        epoch  -- number of times the cursor has wrapped back to the start
        cursor -- index at which the next batch begins
    """

    def __init__(self, data):
        self.data, self.size = data, len(data)
        self.epoch = self.cursor = 0

    def next_batch(self, n):
        """Return the next slice of up to `n` items and advance the cursor.

        If fewer than `n` items remain after the cursor, the leftover tail is
        skipped: the epoch counter is bumped and the slice is taken from the
        beginning of the data instead.
        """
        begin = self.cursor
        if begin + n > self.size:
            # Not enough items left for a full batch: start a new epoch.
            self.epoch += 1
            begin = 0
        end = begin + n
        self.cursor = end
        return self.data[begin:end]

def train(net, train, test, epochs=1):

    train = train*epochs
    train_size = len(train)
    test_freq = 10
    optimizer = dn.AdamTrainer(net.model)
    test_losses, iterations = [], []
    start_time = time.time()
    
    
# # Iterating feed    
#     gen = DataIterator(train)
#     batch_count = 0
#     while gen.epoch < epochs:
#         dn.renew_cg()
#         batch_count += 1
#         batch = gen.next_batch(50)
#         losses = []
#         for i,(X_i,Y_i) in enumerate(batch):
#             loss = net.get_loss(X_i,Y_i)
#             losses.append(loss)
#         loss = dn.esum(losses)
#         loss.backward()
#         optimizer.update()
        
#         if batch_count%test_freq==0:
#             test_loss = np.array([net.get_loss(X_i,Y_i).value() 
#                                   for X_i,Y_i in test]).sum()
#             print '... loss:', test_loss
#             test_losses.append(test_loss)
#             iterations.append(batch_count)            
            
    for i,(X_i,Y_i) in enumerate(train):
        loss = net.get_loss(X_i,Y_i)
        loss.backward()
        optimizer.update()
        end_time = time.time()
        print '... train loss:', loss.value(), '('+str(end_time-start_time)+')'
        start_time = end_time
        if i%test_freq==0:
            test_loss = np.array([net.get_loss(X_i,Y_i).value() 
                                  for X_i,Y_i in test]).sum()
            print '... loss:', test_loss
            test_losses.append(test_loss)
            iterations.append(i//test_freq)
    
    plt.plot(iterations, test_losses)
    plt.axis([0, 100, 0, len(test)*MAX_LEN])
    plt.show()
    print 'loss on test:', test_losses[-1]
    
    
class SRN:
    """Recurrent POS tagger: word embeddings -> LSTM -> softmax over tags.

    Despite the name, the recurrent layer is a dn.LSTMBuilder. The class reads
    the module-level VOCAB_SIZE, TAG_SIZE and idx2tag built by preprocessing.
    """
    
    def __init__(self, num_layers, embeddings_size, hidden_size):
        """Create the parameter collection, embeddings, LSTM and output layer.

        Args:
            num_layers: number of stacked LSTM layers.
            embeddings_size: dimension of each word embedding (LSTM input).
            hidden_size: dimension of the LSTM output.
        """
        self.model = dn.Model()
        self.embeddings = self.model.add_lookup_parameters((VOCAB_SIZE, embeddings_size))
        self.RNN = dn.LSTMBuilder(num_layers, embeddings_size, hidden_size, self.model)
        # The output layer scores tags, not words, so it has TAG_SIZE rows.
        # Sizing it by VOCAB_SIZE wasted parameters and let the argmax in
        # predict_tag fall outside the range of idx2tag.
        self.output_W = self.model.add_parameters((TAG_SIZE, hidden_size))
        self.output_b = self.model.add_parameters((TAG_SIZE,))
        
    def embed(self, X):
        """Look up the embedding expression of every word id in X."""
        return [self.embeddings[w_i] for w_i in X]
    
    def run(self, init_state, X_emb):
        """Feed the embeddings through the RNN; return one output per step."""
        states = init_state.add_inputs(X_emb)
        return [s.output() for s in states]
    
    def get_probs(self, rnn_output):
        """Return the softmax tag distribution for a single RNN output."""
        output_W = dn.parameter(self.output_W)
        output_b = dn.parameter(self.output_b)
        return dn.softmax(output_W*rnn_output + output_b)
    
    def get_loss(self, X, Y):
        """Return the summed negative log-likelihood of tags Y given words X.

        Starts a fresh computation graph, so expressions created before this
        call must not be used afterwards.
        """
        dn.renew_cg()
        X_emb = self.embed(X)
        init_state = self.RNN.initial_state()
        rnn_outputs = self.run(init_state, X_emb)
        losses = []
        for rnn_output,Y_i in zip(rnn_outputs,Y):
            probs = self.get_probs(rnn_output)
            losses.append(-dn.log(dn.pick(probs,Y_i)))
        return dn.esum(losses)
    
    def predict_tag(self, probs):
        """Map a tag distribution expression to its most probable tag string."""
        # NOTE(review): relies on .value() yielding a plain list (list.index).
        probs = probs.value()
        return idx2tag[probs.index(max(probs))]
            
    def predict(self, X):
        """Return the predicted tag string for every word id in X."""
        dn.renew_cg()
        X_emb = self.embed(X)
        init_state = self.RNN.initial_state()
        rnn_outputs = self.run(init_state, X_emb)
        return [self.predict_tag(self.get_probs(rnn_output))
                for rnn_output in rnn_outputs]

# Training
    
num_layers = 1
embeddings_size = 50
hidden_size = 50

rnn = SRN(num_layers, embeddings_size, hidden_size)
train(rnn, data_train, data_test)
print 'Example:'
print 'Input sent:', [idx2word[idx] for idx in [w_i for w_i,t_i in data_test[0]]]
print 'Predicted tags:', rnn.predict([w_i for w_i,t_i in data_test[0]])