In [1]:
!pip install dynet
!git clone https://github.com/neubig/nn4nlp-code.git

Collecting dynet
[?25l  Downloading https://files.pythonhosted.org/packages/1b/8c/767cc83241b2abe567d705f87589d8ad44cca321f7c78720269c45e0469f/dyNET-2.0.3-cp36-cp36m-manylinux1_x86_64.whl (27.8MB)
[K    100% |████████████████████████████████| 27.8MB 1.0MB/s 
Collecting cython (from dynet)
[?25l  Downloading https://files.pythonhosted.org/packages/1c/10/ffebdb9faa901c83b69ab7040a1f5f3b2c71899be141752a6d466718c491/Cython-0.28.4-cp36-cp36m-manylinux1_x86_64.whl (3.4MB)
[K    100% |████████████████████████████████| 3.4MB 5.5MB/s 
[?25hInstalling collected packages: cython, dynet
Successfully installed cython-0.28.4 dynet-2.0.3
fatal: destination path 'nn4nlp-code' already exists and is not an empty directory.


In [0]:
from __future__ import print_function
import time

from collections import defaultdict
import random
import math
import sys
import argparse

import dynet as dy
import numpy as np

In [0]:
#the parameters from mixer
NXENT = 40
NXER = 20
delta = 2

# format of files: each line is "word1|tag1 word2|tag2 ..."
train_file = "nn4nlp-code/data/tags/train.txt"
dev_file = "nn4nlp-code/data/tags/dev.txt"

w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))


def read(fname):
    """
    Read tagged file
    """
    with open(fname, "r") as f:
        for line in f:
            words, tags = [], []
            for wt in line.strip().split():
                w, t = wt.split('|')
                words.append(w2i[w])
                tags.append(t2i[t])
            yield (words, tags)


# Read the data
train = list(read(train_file))
unk_word = w2i["<unk>"]
w2i = defaultdict(lambda: unk_word, w2i)
unk_tag = t2i["<unk>"]
t2i = defaultdict(lambda: unk_tag, t2i)
nwords = len(w2i)
ntags = len(t2i)
dev = list(read(dev_file))

In [0]:
# DyNet Starts
model = dy.Model()
trainer = dy.AdamTrainer(model)

# Model parameters
EMBED_SIZE = 64
HIDDEN_SIZE = 128

# Lookup parameters for word embeddings
LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE))

# Word-level BiLSTM
LSTM = dy.BiRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE, model, dy.LSTMBuilder)

# Word-level softmax
W_sm = model.add_parameters((ntags, HIDDEN_SIZE))
b_sm = model.add_parameters(ntags)


#Baseline reward parameters
W_bl_p = model.add_parameters((1, HIDDEN_SIZE))
b_bl_p = model.add_parameters(1)

# Calculate the scores for one example
def calc_scores(words):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]

    return scores


# Calculate MLE loss for one example
def calc_loss(scores, tags):
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    return dy.esum(losses)

def calc_reinforce_loss(words, tags, delta):
    dy.renew_cg()

    # Transduce all batch elements with an LSTM
    word_reps = LSTM.transduce([LOOKUP[x] for x in words])

    # Softmax scores
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    #calculate the probability distribution 
    scores = [dy.affine_transform([b, W, x]) for x in word_reps]
    losses = [dy.pickneglogsoftmax(score, tag) for score, tag in zip(scores, tags)]
    probs = [-dy.exp(loss).as_array() for loss in losses]

    #then take samples from the probability distribution
    samples = [np.random.choice(range(len(x)), p=x) for x in probs]

    #calculate accuracy=reward
    correct = [sample == tag for sample, tag in zip(samples, tags)]
    r_i = float(sum(correct))/len(correct)
    r = dy.constant((1), r_i)
    # Reward baseline for each word
    W_bl = dy.parameter(W_bl_p)
    b_bl = dy.parameter(b_bl_p)
    r_b = [dy.affine_transform([b_bl, W_bl, dy.nobackprop(x)]) for x in word_reps]

    #we need to take the value in order to break the computation graph
    #as the reward portion is trained seperatley and not backpropogated through during the overall score
    rewards_over_baseline = [(r - dy.nobackprop(x)) for x in r_b]
    #the scores for training the baseline
    baseline_scores = [dy.square(r - x) for x in r_b]

    #then calculate the reinforce scores using reinforce
    reinforce_scores = [r_s*score for r_s, score in zip(rewards_over_baseline, scores)]

    #we want the first len(sent)-delta scores from xent then delta scores from reinforce
    #for mixer
    if len(scores) > delta:
        mixer_scores = scores[:len(scores)-delta] + reinforce_scores[delta-1:]
    else:
        mixer_scores = reinforce_scores
    return dy.esum(mixer_scores), dy.esum(baseline_scores)


# Calculate number of tags correct for one example
def calc_correct(scores, tags):
    correct = [np.argmax(score.npvalue()) == tag for score, tag in zip(scores, tags)]
    return sum(correct)

In [5]:
# Perform training
for ITER in range(NXENT+NXER):
    random.shuffle(train)
    start = time.time()
    this_sents = this_words = this_loss = this_correct = 0
    for sid in range(0, len(train)):
        this_sents += 1
        if this_sents % int(1000) == 0:
            print("train loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % (
                this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)),
                  file=sys.stderr)
        # train on the example
        if ITER < NXER:
            words, tags = train[sid]
            scores = calc_scores(words)
            loss_exp = calc_loss(scores, tags)
            this_correct += calc_correct(scores, tags)
            this_loss += loss_exp.scalar_value()
            this_words += len(words)
            loss_exp.backward()
            trainer.update()
        else:
            delta = 2*(ITER - NXENT)
            mixer_loss, baseline_loss = calc_reinforce_loss(words, tags, delta)
            this_loss += mixer_loss.scalar_value() + baseline_loss.scalar_value()
            this_words += len(words)
            mixer_loss.backward()
            baseline_loss.backward()
            trainer.update()
    # Perform evaluation 
    start = time.time()
    this_sents = this_words = this_loss = this_correct = 0
    for words, tags in dev:
        this_sents += 1
        scores = calc_scores(words)
        loss_exp = calc_loss(scores, tags)
        this_correct += calc_correct(scores, tags)
        this_loss += loss_exp.scalar_value()
        this_words += len(words)
    print("dev loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % (
        this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), file=sys.stderr)

train loss/word=0.5637, acc=86.76%, word/sec=5603.4796
train loss/word=0.4980, acc=86.88%, word/sec=5275.9078
train loss/word=0.4553, acc=87.44%, word/sec=5370.1220
train loss/word=0.4272, acc=87.92%, word/sec=5429.1631
train loss/word=0.4031, acc=88.43%, word/sec=5557.0012
train loss/word=0.3827, acc=88.90%, word/sec=5677.5937
train loss/word=0.3647, acc=89.34%, word/sec=5762.1203
train loss/word=0.3490, acc=89.73%, word/sec=5796.2541
train loss/word=0.3355, acc=90.10%, word/sec=5848.1657
train loss/word=0.3257, acc=90.36%, word/sec=5896.0950
dev loss/word=0.4211, acc=86.94%, word/sec=17244.6169
train loss/word=0.1830, acc=94.36%, word/sec=6301.6586
train loss/word=0.1803, acc=94.43%, word/sec=6317.0340
train loss/word=0.1765, acc=94.60%, word/sec=6270.0666
train loss/word=0.1752, acc=94.65%, word/sec=6257.2067
train loss/word=0.1756, acc=94.67%, word/sec=6280.3980
train loss/word=0.1735, acc=94.77%, word/sec=6279.4957
train loss/word=0.1732, acc=94.78%, word/sec=6292.9446
train loss/

train loss/word=0.0384, acc=98.95%, word/sec=6337.4868
train loss/word=0.0375, acc=98.97%, word/sec=6335.4072
train loss/word=0.0383, acc=98.96%, word/sec=6339.8847
train loss/word=0.0385, acc=98.96%, word/sec=6329.2470
train loss/word=0.0383, acc=98.96%, word/sec=6336.9445
train loss/word=0.0393, acc=98.95%, word/sec=6343.0772
dev loss/word=0.7254, acc=89.55%, word/sec=17147.3225
train loss/word=0.0226, acc=99.32%, word/sec=6343.3786
train loss/word=0.0235, acc=99.33%, word/sec=6359.5700
train loss/word=0.0241, acc=99.33%, word/sec=6369.9006
train loss/word=0.0237, acc=99.36%, word/sec=6361.7786
train loss/word=0.0239, acc=99.34%, word/sec=6358.7953
train loss/word=0.0244, acc=99.33%, word/sec=6358.7531
train loss/word=0.0247, acc=99.33%, word/sec=6361.0555
train loss/word=0.0253, acc=99.31%, word/sec=6354.9474
train loss/word=0.0254, acc=99.32%, word/sec=6358.5355
train loss/word=0.0254, acc=99.32%, word/sec=6367.3133
dev loss/word=0.8641, acc=89.46%, word/sec=17378.3417
train loss/w

train loss/word=0.0039, acc=99.89%, word/sec=6194.4832
train loss/word=0.0041, acc=99.89%, word/sec=6199.1903
dev loss/word=1.2339, acc=89.33%, word/sec=17071.3899
train loss/word=0.0021, acc=99.93%, word/sec=6235.9321
train loss/word=0.0019, acc=99.94%, word/sec=6210.4839
train loss/word=0.0020, acc=99.93%, word/sec=6204.0298
train loss/word=0.0024, acc=99.92%, word/sec=6208.1281
train loss/word=0.0025, acc=99.92%, word/sec=6221.4735
train loss/word=0.0023, acc=99.92%, word/sec=6229.6998
train loss/word=0.0026, acc=99.92%, word/sec=6230.1403
train loss/word=0.0028, acc=99.92%, word/sec=6234.9062
train loss/word=0.0028, acc=99.91%, word/sec=6233.6331
train loss/word=0.0029, acc=99.91%, word/sec=6232.2648
dev loss/word=1.4318, acc=89.06%, word/sec=16964.1641
train loss/word=0.0015, acc=99.96%, word/sec=6206.7753
train loss/word=0.0014, acc=99.96%, word/sec=6198.8095
train loss/word=0.0019, acc=99.95%, word/sec=6209.0702
train loss/word=0.0019, acc=99.95%, word/sec=6213.1301
train loss/w

train loss/word=0.0005, acc=99.98%, word/sec=5802.7664
train loss/word=0.0007, acc=99.98%, word/sec=5799.1016
train loss/word=0.0007, acc=99.98%, word/sec=5892.4475
train loss/word=0.0007, acc=99.98%, word/sec=5955.8564
train loss/word=0.0007, acc=99.98%, word/sec=6002.8348
train loss/word=0.0008, acc=99.98%, word/sec=6036.9220
train loss/word=0.0008, acc=99.98%, word/sec=6063.1508
train loss/word=0.0009, acc=99.98%, word/sec=6088.0391
train loss/word=0.0009, acc=99.98%, word/sec=6077.9788
dev loss/word=1.7989, acc=89.09%, word/sec=16930.5725
train loss/word=0.0008, acc=99.98%, word/sec=6278.4539
train loss/word=0.0006, acc=99.98%, word/sec=6257.3298
train loss/word=0.0008, acc=99.98%, word/sec=6222.6731
train loss/word=0.0008, acc=99.98%, word/sec=6083.1523
train loss/word=0.0007, acc=99.98%, word/sec=6033.9415
train loss/word=0.0007, acc=99.98%, word/sec=6060.4028
train loss/word=0.0008, acc=99.98%, word/sec=6110.9066
train loss/word=0.0008, acc=99.98%, word/sec=6139.7763
train loss/

NameError: ignored