In [13]:
!pip install dynet
!git clone https://github.com/neubig/nn4nlp-code.git

fatal: destination path 'nn4nlp-code' already exists and is not an empty directory.


In [0]:
from __future__ import print_function
import time

from collections import defaultdict
import random
import sys
import argparse

import dynet as dy
import numpy as np

In [0]:
# Boolean switches selecting which training variant of the tagger to run.
parser = argparse.ArgumentParser(description='BiLSTM variants.')
for _switch in ('--teacher', '--perceptron', '--cost', '--hinge', '--schedule'):
    parser.add_argument(_switch, action='store_true')
del _switch  # keep the notebook namespace free of the loop variable

# The notebook has no command line, so the argument list is spelled out here.
opts = ['--teacher']
args = parser.parse_args(opts)

# Expose each switch under the descriptive name used by the rest of the notebook.
(use_teacher_forcing,
 use_structure_perceptron,
 use_cost_augmented,
 use_hinge,
 use_schedule) = (args.teacher, args.perceptron, args.cost, args.hinge, args.schedule)

In [16]:
# Announce the active training variant; each slot reads "with" or "without".
print("Training BiLSTM %s teacher forcing (%s schedule), %s structured perceptron loss, %s augmented cost, %s margin."
      % tuple("with" if enabled else "without"
              for enabled in (use_teacher_forcing,
                              use_schedule,
                              use_structure_perceptron,
                              use_cost_augmented,
                              use_hinge)))

# Tagged corpora: every line has the form "word1|tag1 word2|tag2 ...".
dev_file = "nn4nlp-code/data/tags/dev.txt"
train_file = "nn4nlp-code/data/tags/train.txt"

# Growing vocabularies: looking up an unseen key assigns it the next free
# integer id (the current size of the mapping).
t2i = defaultdict(lambda: len(t2i))
w2i = defaultdict(lambda: len(w2i))


def read(fname):
    """
    Read a tagged file and yield one sentence at a time.

    Each non-blank line is expected to look like "word1|tag1 word2|tag2 ...".
    Words and tags are converted to integer ids through the module-level
    ``w2i`` and ``t2i`` mappings.

    :param fname: path of the tagged text file.
    :return: generator of ``(word_ids, tag_ids)`` pairs, two equally long lists.
    """
    with open(fname, "r") as f:
        for line in f:
            tokens = line.split()
            # A blank line would produce an empty sentence; skip it.
            if not tokens:
                continue
            words, tags = [], []
            for wt in tokens:
                # Split on the LAST '|' only, so a word that itself contains
                # '|' is kept whole instead of raising on unpacking.
                w, t = wt.rsplit('|', 1)
                words.append(w2i[w])
                tags.append(t2i[t])
            yield (words, tags)


class AlwaysTrueSampler:
    """
    Sampler that always answers True, i.e. it only ever samples from the
    true (reference) distribution. It offers the same two methods as
    ScheduleSampler.
    """

    def sample_true(self):
        """Always choose the reference."""
        return True

    def decay(self):
        """Nothing to decay; present so callers can invoke it unconditionally."""
        return None


class ScheduleSampler:
    """
    Sampler whose probability of answering True shrinks linearly.

    After ``k`` calls to ``decay`` the rate is ``start_rate - k * decay_rate``,
    clamped at ``min_rate``; once the floor is reached the rate stays there.
    """

    def __init__(self, start_rate=1, min_rate=0.2, decay_rate=0.1):
        self.start_rate = start_rate
        self.min_rate = min_rate
        self.decay_rate = decay_rate
        self.sample_rate = start_rate
        self.iter = 0
        self.reach_min = False

    def decay_func(self):
        """Recompute ``sample_rate`` from the current iteration count."""
        # Once the floor has been hit the rate is frozen.
        if self.reach_min:
            return
        rate = self.start_rate - self.iter * self.decay_rate
        if rate < self.min_rate:
            self.reach_min = True
            rate = self.min_rate
        self.sample_rate = rate

    def decay(self):
        """Advance the schedule by one step and report the new rate."""
        self.iter += 1
        self.decay_func()
        print("Sample rate is now %.2f" % self.sample_rate)

    def sample_true(self):
        """Return True with probability ``sample_rate``."""
        return random.random() < self.sample_rate


# Read the data
# Reading the training set fills w2i / t2i with every training word and tag.
train = list(read(train_file))
# Reserve an id for unknown words, then freeze the word vocabulary: from here
# on an unseen word maps to unk_word instead of being given a fresh id.
unk_word = w2i["<unk>"]
w2i = defaultdict(lambda: unk_word, w2i)
# Same for tags, plus a "<start>" id that calc_scores_with_previous_tag uses
# as the initial "previous tag".
unk_tag = t2i["<unk>"]
start_tag = t2i["<start>"]
t2i = defaultdict(lambda: unk_tag, t2i)
# The sizes must be taken BEFORE the dev set is read: a defaultdict stores the
# default value on a miss, so reading dev can add keys (all mapped to the
# unknown id) and would inflate len() if it were computed afterwards.
nwords = len(w2i)
ntags = len(t2i)
dev = list(read(dev_file))

Training BiLSTM with teacher forcing (without schedule), without structured perceptron loss, without augmented cost, without margin.


In [0]:
# DyNet Starts
model = dy.Model()
trainer = dy.AdamTrainer(model)

# Model parameters
EMBED_SIZE = 64
TAG_EMBED_SIZE = 16
HIDDEN_SIZE = 128

# Each direction gets exactly half of HIDDEN_SIZE, so it has to be even.
assert HIDDEN_SIZE % 2 == 0

# Lookup parameters for word embeddings
LOOKUP = model.add_lookup_parameters((nwords, EMBED_SIZE))

# Tag embeddings are only needed when the previous tag is fed to the forward RNN.
if use_teacher_forcing:
    TAG_LOOKUP = model.add_lookup_parameters((ntags, TAG_EMBED_SIZE))

# Decides, per token, whether the reference tag or the model's own prediction
# is fed back as the previous tag during training.
if use_schedule:
    sampler = ScheduleSampler()
else:
    sampler = AlwaysTrueSampler()

# Word-level bidirectional encoder: one forward and one backward recurrent net.
# NOTE: despite the *LSTM variable names (kept because the rest of the notebook
# refers to them), the builders are dy.SimpleRNNBuilder, not LSTM builders.
# The per-direction size uses floor division so an int, not the float that
# `HIDDEN_SIZE / 2` yields under Python 3, is passed as the hidden dimension.
if use_teacher_forcing:
    fwdLSTM = dy.SimpleRNNBuilder(1, EMBED_SIZE + TAG_EMBED_SIZE, HIDDEN_SIZE // 2, model)  # Forward RNN
else:
    fwdLSTM = dy.SimpleRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE // 2, model)  # Forward RNN

# We cannot insert previous predicted tag to the backward RNN anyway.
bwdLSTM = dy.SimpleRNNBuilder(1, EMBED_SIZE, HIDDEN_SIZE // 2, model)  # Backward RNN

# Word-level softmax
W_sm = model.add_parameters((ntags, HIDDEN_SIZE))
b_sm = model.add_parameters(ntags)


# Calculate the scores for one example
def calc_scores(words):
    """
    Score every tag for every word of one sentence with the bidirectional encoder.

    A fresh computation graph is started on every call.

    :param words: list of word ids of one sentence.
    :return: list with one score expression per word (one entry per tag).
    """
    dy.renew_cg()

    embedded = [LOOKUP[word_id] for word_id in words]

    # Run the forward net left-to-right and the backward net right-to-left.
    forward_reps = fwdLSTM.initial_state().transduce(embedded)
    backward_reps = bwdLSTM.initial_state().transduce(reversed(embedded))

    # Re-align the backward outputs with word order and join both directions.
    joined_reps = []
    for fwd_rep, bwd_rep in zip(forward_reps, reversed(backward_reps)):
        joined_reps.append(dy.concatenate([fwd_rep, bwd_rep]))

    # Output layer: scores = W * rep + b
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)
    return [dy.affine_transform([b, W, rep]) for rep in joined_reps]


def calc_scores_with_previous_tag(words, referent_tags=None):
    """
    Score tags word by word, feeding the previous tag into the forward net.

    The first step uses ``start_tag`` as previous tag. Afterwards, when
    referent tags are given, ``sampler`` decides per word whether the referent
    tag or the system's own prediction becomes the next "previous tag";
    without referent tags the system prediction is always used.

    :param words: list of word ids of one sentence.
    :param referent_tags: optional list of reference tag ids, aligned with ``words``.
    :return: list with one score expression per word.
    """
    dy.renew_cg()

    word_embs = [LOOKUP[word_id] for word_id in words]

    # The backward net only sees words, so it can be run in a single pass.
    bwd_word_reps = bwdLSTM.initial_state().transduce(reversed(word_embs))

    # Output layer parameters
    W = dy.parameter(W_sm)
    b = dy.parameter(b_sm)

    scores = []
    # The forward net is advanced one word at a time, since each input
    # depends on the tag chosen at the previous step.
    s_fwd = fwdLSTM.initial_state()
    prev_tag = start_tag

    aligned = zip(word_embs, reversed(bwd_word_reps))
    for position, (word_emb, bwd_rep) in enumerate(aligned):
        # Forward input = current word embedding + embedding of the previous tag.
        s_fwd = s_fwd.add_input(dy.concatenate([word_emb, TAG_LOOKUP[prev_tag]]))
        score = dy.affine_transform([b, W, dy.concatenate([s_fwd.output(), bwd_rep])])
        prediction = np.argmax(score.npvalue())

        # The sampler is consulted only when referent tags are available.
        if referent_tags and sampler.sample_true():
            prev_tag = referent_tags[position]
        else:
            prev_tag = prediction

        scores.append(score)

    return scores


def mle(scores, tags):
    """
    Maximum-likelihood loss of one sentence.

    :param scores: per-word score expressions.
    :param tags: reference tag ids, aligned with ``scores``.
    :return: sum over words of the negative log-softmax of the reference tag.
    """
    per_word = []
    for word_scores, gold in zip(scores, tags):
        per_word.append(dy.pickneglogsoftmax(word_scores, gold))
    return dy.esum(per_word)


def hamming_cost(predictions, reference):
    """
    Count the positions at which prediction and reference disagree.

    Only positions present in both sequences are compared.
    """
    mismatches = 0
    for guess, gold in zip(predictions, reference):
        mismatches = mismatches + (guess != gold)
    return mismatches


def calc_sequence_score(scores, tags):
    """
    Score of a whole tag sequence: the sum, over words, of the score that
    each word assigns to its tag in ``tags``.
    """
    picked = []
    for word_scores, tag in zip(scores, tags):
        picked.append(word_scores[tag])
    return dy.esum(picked)


def hamming_augmented_decode(scores, reference):
    """
    Local (per-word) decoding with the hamming cost added to the scores.

    Every tag other than the referent one gets +1 before the argmax is taken.

    :param scores: Local decoding scores, one expression per word.
    :param reference: Referent tag ids, aligned with ``scores``.
    :return: list with the cost-augmented best tag of every word.
    """

    def decode_word(word_scores, gold_tag):
        raw = word_scores.npvalue()
        # Cost 1 for every wrong tag, 0 for the referent tag.
        penalty = np.ones(raw.shape)
        penalty[gold_tag] = 0
        return np.argmax(raw + penalty)

    return [decode_word(word_scores, gold_tag)
            for word_scores, gold_tag in zip(scores, reference)]


def perceptron_loss(scores, reference):
    """
    Structured-perceptron style loss of one sentence.

    The sentence is decoded locally (with the hamming cost added when
    ``use_cost_augmented`` is set). If the decoded sequence equals the
    reference the loss is zero; otherwise it is the score of the decoded
    sequence minus the score of the reference, plus the hamming cost in the
    cost-augmented case. With ``use_hinge`` the result is passed through
    ``max(0, loss - margin)`` with ``margin = -2``.

    :param scores: per-word score expressions.
    :param reference: reference tag ids, aligned with ``scores``.
    :return: scalar loss expression.
    """
    predictions = (hamming_augmented_decode(scores, reference)
                   if use_cost_augmented
                   else [np.argmax(score.npvalue()) for score in scores])

    margin = dy.scalarInput(-2)

    # Nothing to correct when decoding already reproduces the reference.
    if predictions == reference:
        return dy.scalarInput(0)

    reference_score = calc_sequence_score(scores, reference)
    prediction_score = calc_sequence_score(scores, predictions)
    if use_cost_augmented:
        # The hamming term could be collected during decoding; it is
        # recomputed here to keep the demonstration explicit.
        hamming = dy.scalarInput(hamming_cost(predictions, reference))
        loss = prediction_score + hamming - reference_score
    else:
        loss = prediction_score - reference_score

    if use_hinge:
        loss = dy.emax([dy.scalarInput(0), loss - margin])

    return loss


# Calculate the training loss (structured perceptron or MLE) for one example
def calc_loss(scores, tags):
    """
    Loss of one sentence: the structured perceptron loss when
    ``use_structure_perceptron`` is set, the MLE loss otherwise.
    """
    loss_fn = perceptron_loss if use_structure_perceptron else mle
    return loss_fn(scores, tags)


# Calculate number of tags correct for one example
def calc_correct(scores, tags):
    """
    Count the words whose highest-scoring tag equals the reference tag.

    :param scores: per-word score expressions.
    :param tags: reference tag ids, aligned with ``scores``.
    :return: number of correctly tagged words.
    """
    hits = 0
    for word_scores, gold in zip(scores, tags):
        hits = hits + (np.argmax(word_scores.npvalue()) == gold)
    return hits

In [18]:
# Perform training
for ITER in range(100):
    random.shuffle(train)
    start = time.time()
    # Running statistics over the sentences seen so far in this epoch.
    this_sents = this_words = this_loss = this_correct = 0
    for sid, (words, tags) in enumerate(train):
        this_sents += 1
        # Progress report every 1000 sentences (written to stderr).
        if this_sents % 1000 == 0:
            print("train loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % (
                this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)),
                  file=sys.stderr)
        # With teacher forcing the reference tags are passed in, so the sampler
        # can feed them back as "previous tag".
        scores = (calc_scores_with_previous_tag(words, tags)
                  if use_teacher_forcing
                  else calc_scores(words))
        loss_exp = calc_loss(scores, tags)
        this_correct += calc_correct(scores, tags)
        this_loss += loss_exp.scalar_value()
        this_words += len(words)
        # One parameter update per sentence.
        loss_exp.backward()
        trainer.update()
    # Advance the sampling schedule once per epoch (a no-op for AlwaysTrueSampler).
    sampler.decay()
    # Perform evaluation
    start = time.time()
    this_sents = this_words = this_loss = this_correct = 0
    for words, tags in dev:
        this_sents += 1
        # No reference tags are passed here: with teacher forcing the model
        # is fed its own previous prediction.
        scores = (calc_scores_with_previous_tag(words)
                  if use_teacher_forcing
                  else calc_scores(words))
        loss_exp = calc_loss(scores, tags)
        this_correct += calc_correct(scores, tags)
        this_loss += loss_exp.scalar_value()
        this_words += len(words)
    print("dev loss/word=%.4f, acc=%.2f%%, word/sec=%.4f" % (
        this_loss / this_words, 100 * this_correct / this_words, this_words / (time.time() - start)), file=sys.stderr)

train loss/word=0.4632, acc=87.31%, word/sec=7594.7372
train loss/word=0.3942, acc=88.68%, word/sec=7676.3388
train loss/word=0.3526, acc=89.60%, word/sec=7695.7130
train loss/word=0.3245, acc=90.35%, word/sec=7686.0091
train loss/word=0.3060, acc=90.86%, word/sec=7687.8159
train loss/word=0.2890, acc=91.30%, word/sec=7707.5892
train loss/word=0.2767, acc=91.64%, word/sec=7710.1013
train loss/word=0.2661, acc=91.95%, word/sec=7711.4834
train loss/word=0.2563, acc=92.24%, word/sec=7725.5252
train loss/word=0.2491, acc=92.45%, word/sec=7722.2980
dev loss/word=0.5297, acc=87.03%, word/sec=13315.3852
train loss/word=0.1470, acc=95.50%, word/sec=7766.8397
train loss/word=0.1443, acc=95.62%, word/sec=7736.8131
train loss/word=0.1429, acc=95.71%, word/sec=7706.7104
train loss/word=0.1408, acc=95.77%, word/sec=7684.9657
train loss/word=0.1396, acc=95.83%, word/sec=7670.9107
train loss/word=0.1370, acc=95.91%, word/sec=7662.7986
train loss/word=0.1361, acc=95.95%, word/sec=7669.5796
train loss/

train loss/word=0.0378, acc=98.98%, word/sec=7705.1904
train loss/word=0.0382, acc=98.98%, word/sec=7721.7639
train loss/word=0.0386, acc=98.97%, word/sec=7732.3654
train loss/word=0.0395, acc=98.96%, word/sec=7729.7172
train loss/word=0.0402, acc=98.95%, word/sec=7732.5510
train loss/word=0.0398, acc=98.96%, word/sec=7727.4324
dev loss/word=0.8354, acc=89.48%, word/sec=13053.5478
train loss/word=0.0265, acc=99.33%, word/sec=7722.0404
train loss/word=0.0283, acc=99.28%, word/sec=7691.0748
train loss/word=0.0287, acc=99.26%, word/sec=7676.2293
train loss/word=0.0298, acc=99.23%, word/sec=7670.3747
train loss/word=0.0306, acc=99.20%, word/sec=7670.5752
train loss/word=0.0318, acc=99.17%, word/sec=7655.0285
train loss/word=0.0312, acc=99.19%, word/sec=7641.7315
train loss/word=0.0302, acc=99.21%, word/sec=7640.9490
train loss/word=0.0300, acc=99.22%, word/sec=7624.6894
train loss/word=0.0306, acc=99.21%, word/sec=7602.4867
dev loss/word=1.0215, acc=89.26%, word/sec=12823.2680
train loss/w

train loss/word=0.0124, acc=99.69%, word/sec=7689.3305
train loss/word=0.0128, acc=99.69%, word/sec=7667.4725
dev loss/word=1.5099, acc=89.04%, word/sec=12798.1914
train loss/word=0.0073, acc=99.80%, word/sec=7465.6491
train loss/word=0.0065, acc=99.83%, word/sec=7435.7393
train loss/word=0.0078, acc=99.80%, word/sec=7417.6862
train loss/word=0.0079, acc=99.79%, word/sec=7411.8514
train loss/word=0.0087, acc=99.78%, word/sec=7399.9155
train loss/word=0.0087, acc=99.77%, word/sec=7374.7635
train loss/word=0.0089, acc=99.77%, word/sec=7349.9399
train loss/word=0.0098, acc=99.76%, word/sec=7327.8066
train loss/word=0.0104, acc=99.75%, word/sec=7306.3598
train loss/word=0.0104, acc=99.75%, word/sec=7267.2731
dev loss/word=1.6517, acc=88.67%, word/sec=11871.3550
train loss/word=0.0053, acc=99.85%, word/sec=6917.0639
train loss/word=0.0059, acc=99.83%, word/sec=6945.9439
train loss/word=0.0073, acc=99.80%, word/sec=7110.3456
train loss/word=0.0075, acc=99.80%, word/sec=7246.9967
train loss/w

train loss/word=0.0075, acc=99.83%, word/sec=7270.3820
train loss/word=0.0067, acc=99.84%, word/sec=7263.0707
train loss/word=0.0061, acc=99.85%, word/sec=7299.5546
train loss/word=0.0061, acc=99.85%, word/sec=7284.7109
train loss/word=0.0065, acc=99.84%, word/sec=7254.3548
train loss/word=0.0065, acc=99.84%, word/sec=7227.7656
train loss/word=0.0066, acc=99.84%, word/sec=7201.2996
train loss/word=0.0065, acc=99.84%, word/sec=7164.6323
train loss/word=0.0067, acc=99.84%, word/sec=7189.7518
dev loss/word=1.9690, acc=88.72%, word/sec=13123.6037
train loss/word=0.0054, acc=99.88%, word/sec=7597.9591
train loss/word=0.0050, acc=99.88%, word/sec=7570.2281
train loss/word=0.0047, acc=99.88%, word/sec=7567.0992
train loss/word=0.0048, acc=99.88%, word/sec=7579.4155
train loss/word=0.0048, acc=99.88%, word/sec=7598.4475
train loss/word=0.0048, acc=99.88%, word/sec=7594.4433
train loss/word=0.0049, acc=99.88%, word/sec=7592.9215
train loss/word=0.0052, acc=99.87%, word/sec=7587.7793
train loss/

train loss/word=0.0044, acc=99.88%, word/sec=7123.3318
train loss/word=0.0048, acc=99.88%, word/sec=7192.3911
train loss/word=0.0049, acc=99.87%, word/sec=7240.8749
train loss/word=0.0053, acc=99.87%, word/sec=7281.6387
train loss/word=0.0054, acc=99.87%, word/sec=7311.2099
dev loss/word=2.3639, acc=89.27%, word/sec=13178.9161
train loss/word=0.0043, acc=99.89%, word/sec=7600.0957
train loss/word=0.0030, acc=99.91%, word/sec=7578.2556
train loss/word=0.0038, acc=99.90%, word/sec=7564.7113
train loss/word=0.0047, acc=99.88%, word/sec=7560.5085
train loss/word=0.0049, acc=99.88%, word/sec=7482.2437
train loss/word=0.0047, acc=99.89%, word/sec=7406.3525
train loss/word=0.0048, acc=99.88%, word/sec=7396.2039
train loss/word=0.0049, acc=99.88%, word/sec=7385.3635
train loss/word=0.0052, acc=99.87%, word/sec=7383.9161
train loss/word=0.0053, acc=99.87%, word/sec=7384.1588
dev loss/word=2.4927, acc=89.40%, word/sec=12880.0939
train loss/word=0.0069, acc=99.88%, word/sec=7609.6448
train loss/w

train loss/word=0.0045, acc=99.88%, word/sec=7546.1716
dev loss/word=2.5615, acc=89.13%, word/sec=13063.4829
train loss/word=0.0027, acc=99.92%, word/sec=7072.8859
train loss/word=0.0030, acc=99.90%, word/sec=7112.7696
train loss/word=0.0031, acc=99.91%, word/sec=7178.2346
train loss/word=0.0040, acc=99.90%, word/sec=7227.2347
train loss/word=0.0042, acc=99.89%, word/sec=7275.6101
train loss/word=0.0043, acc=99.89%, word/sec=7312.9829
train loss/word=0.0042, acc=99.89%, word/sec=7335.4512
train loss/word=0.0044, acc=99.89%, word/sec=7345.9218
train loss/word=0.0047, acc=99.88%, word/sec=7365.9875
train loss/word=0.0048, acc=99.88%, word/sec=7370.9827
dev loss/word=2.7159, acc=89.11%, word/sec=12892.5852
train loss/word=0.0036, acc=99.91%, word/sec=7501.3458
train loss/word=0.0033, acc=99.91%, word/sec=7536.1849
train loss/word=0.0040, acc=99.89%, word/sec=7489.4903
train loss/word=0.0047, acc=99.88%, word/sec=7458.3739
train loss/word=0.0049, acc=99.88%, word/sec=7445.6383
train loss/w

train loss/word=0.0041, acc=99.90%, word/sec=7492.8186
train loss/word=0.0040, acc=99.90%, word/sec=7482.3269
train loss/word=0.0041, acc=99.90%, word/sec=7476.4188
train loss/word=0.0044, acc=99.89%, word/sec=7492.5410
train loss/word=0.0046, acc=99.89%, word/sec=7488.3797
train loss/word=0.0048, acc=99.88%, word/sec=7497.9890
train loss/word=0.0049, acc=99.88%, word/sec=7506.1480
train loss/word=0.0048, acc=99.88%, word/sec=7512.8848
dev loss/word=3.1139, acc=88.68%, word/sec=13192.4632
train loss/word=0.0051, acc=99.88%, word/sec=7576.2522
train loss/word=0.0040, acc=99.90%, word/sec=7495.1016
train loss/word=0.0037, acc=99.91%, word/sec=7479.7192
train loss/word=0.0039, acc=99.91%, word/sec=7447.5803
train loss/word=0.0041, acc=99.90%, word/sec=7456.2976
train loss/word=0.0045, acc=99.89%, word/sec=7470.9440
train loss/word=0.0045, acc=99.89%, word/sec=7458.5436
train loss/word=0.0048, acc=99.89%, word/sec=7465.2298
train loss/word=0.0047, acc=99.89%, word/sec=7468.8643
train loss/

train loss/word=0.0042, acc=99.91%, word/sec=7582.5337
train loss/word=0.0044, acc=99.90%, word/sec=7599.9868
train loss/word=0.0045, acc=99.90%, word/sec=7602.5288
train loss/word=0.0045, acc=99.89%, word/sec=7616.1677
dev loss/word=3.0961, acc=88.79%, word/sec=13075.9041
train loss/word=0.0034, acc=99.94%, word/sec=7530.4593
train loss/word=0.0040, acc=99.91%, word/sec=7509.6847
train loss/word=0.0041, acc=99.91%, word/sec=7521.7453
train loss/word=0.0046, acc=99.90%, word/sec=7495.7599
train loss/word=0.0048, acc=99.90%, word/sec=7490.5659
train loss/word=0.0047, acc=99.90%, word/sec=7495.6058
train loss/word=0.0046, acc=99.90%, word/sec=7492.3477
train loss/word=0.0050, acc=99.89%, word/sec=7508.0384
train loss/word=0.0050, acc=99.89%, word/sec=7494.9854
train loss/word=0.0051, acc=99.89%, word/sec=7489.1986
dev loss/word=3.1609, acc=88.38%, word/sec=12677.7453
train loss/word=0.0027, acc=99.94%, word/sec=7393.1001
train loss/word=0.0030, acc=99.93%, word/sec=7350.7006
train loss/w

dev loss/word=3.1853, acc=88.98%, word/sec=13179.5839
train loss/word=0.0025, acc=99.93%, word/sec=7562.2943
train loss/word=0.0029, acc=99.93%, word/sec=7532.7526
train loss/word=0.0037, acc=99.91%, word/sec=7524.5099
train loss/word=0.0038, acc=99.91%, word/sec=7505.0532
train loss/word=0.0039, acc=99.90%, word/sec=7475.6959
train loss/word=0.0040, acc=99.90%, word/sec=7479.8612
train loss/word=0.0041, acc=99.90%, word/sec=7467.0890
train loss/word=0.0047, acc=99.89%, word/sec=7469.6812
train loss/word=0.0048, acc=99.89%, word/sec=7452.4601
train loss/word=0.0052, acc=99.88%, word/sec=7439.2911
dev loss/word=3.0740, acc=89.07%, word/sec=12453.5916
train loss/word=0.0028, acc=99.92%, word/sec=7135.6223
train loss/word=0.0036, acc=99.90%, word/sec=7163.8177
train loss/word=0.0043, acc=99.90%, word/sec=7181.0892
train loss/word=0.0045, acc=99.90%, word/sec=7176.1996
train loss/word=0.0047, acc=99.90%, word/sec=7164.9055
train loss/word=0.0048, acc=99.90%, word/sec=7149.2523
train loss/w

train loss/word=0.0044, acc=99.89%, word/sec=14665.1088
train loss/word=0.0047, acc=99.89%, word/sec=14680.9559
train loss/word=0.0050, acc=99.89%, word/sec=14690.0896
train loss/word=0.0048, acc=99.89%, word/sec=14707.9047
train loss/word=0.0053, acc=99.88%, word/sec=14727.4850
train loss/word=0.0055, acc=99.88%, word/sec=14746.5360
train loss/word=0.0054, acc=99.88%, word/sec=14723.5432
dev loss/word=3.4605, acc=88.44%, word/sec=24912.4928
train loss/word=0.0068, acc=99.84%, word/sec=14730.1221
train loss/word=0.0057, acc=99.87%, word/sec=14619.2006
train loss/word=0.0052, acc=99.88%, word/sec=14684.1350
train loss/word=0.0053, acc=99.88%, word/sec=14731.1862
train loss/word=0.0053, acc=99.87%, word/sec=14727.0209
train loss/word=0.0051, acc=99.88%, word/sec=14739.1136
train loss/word=0.0057, acc=99.87%, word/sec=14733.1856
train loss/word=0.0056, acc=99.87%, word/sec=14722.3962
train loss/word=0.0061, acc=99.86%, word/sec=14725.4973
train loss/word=0.0061, acc=99.86%, word/sec=14724

train loss/word=0.0060, acc=99.87%, word/sec=14590.5699
train loss/word=0.0062, acc=99.86%, word/sec=14555.7251
train loss/word=0.0063, acc=99.86%, word/sec=14549.6489
dev loss/word=3.4927, acc=88.40%, word/sec=25259.2535
train loss/word=0.0072, acc=99.84%, word/sec=14452.4640
train loss/word=0.0061, acc=99.87%, word/sec=14428.4546
train loss/word=0.0054, acc=99.88%, word/sec=14449.6797
train loss/word=0.0058, acc=99.88%, word/sec=14472.2351
train loss/word=0.0057, acc=99.88%, word/sec=14490.3318
train loss/word=0.0060, acc=99.88%, word/sec=14502.5378
train loss/word=0.0060, acc=99.88%, word/sec=14498.8020
train loss/word=0.0062, acc=99.87%, word/sec=14481.7849
train loss/word=0.0063, acc=99.87%, word/sec=14475.9121
train loss/word=0.0066, acc=99.87%, word/sec=14474.3579
dev loss/word=3.4491, acc=88.94%, word/sec=25206.1919
train loss/word=0.0065, acc=99.86%, word/sec=14568.2810
train loss/word=0.0049, acc=99.89%, word/sec=14568.3841
train loss/word=0.0050, acc=99.89%, word/sec=14577.9

train loss/word=0.0067, acc=99.86%, word/sec=14232.0518
train loss/word=0.0064, acc=99.86%, word/sec=14238.9554
train loss/word=0.0061, acc=99.86%, word/sec=14252.4532
train loss/word=0.0061, acc=99.86%, word/sec=14246.4934
train loss/word=0.0066, acc=99.85%, word/sec=14244.7933
train loss/word=0.0070, acc=99.85%, word/sec=14245.2206
train loss/word=0.0070, acc=99.84%, word/sec=14204.0278
train loss/word=0.0073, acc=99.84%, word/sec=14172.1441
train loss/word=0.0075, acc=99.84%, word/sec=14177.4607
train loss/word=0.0079, acc=99.84%, word/sec=14109.5385
dev loss/word=3.6755, acc=88.20%, word/sec=24684.0574
train loss/word=0.0064, acc=99.85%, word/sec=13823.4283
train loss/word=0.0050, acc=99.88%, word/sec=13858.3917
train loss/word=0.0065, acc=99.87%, word/sec=13947.5606
train loss/word=0.0067, acc=99.86%, word/sec=14027.7863
train loss/word=0.0068, acc=99.86%, word/sec=14043.8759
train loss/word=0.0068, acc=99.86%, word/sec=14110.7625
train loss/word=0.0072, acc=99.86%, word/sec=14150

train loss/word=0.0060, acc=99.88%, word/sec=14529.9658
train loss/word=0.0062, acc=99.88%, word/sec=14535.8062
train loss/word=0.0064, acc=99.87%, word/sec=14541.2361
train loss/word=0.0062, acc=99.87%, word/sec=14538.4758
train loss/word=0.0065, acc=99.87%, word/sec=14509.5073
train loss/word=0.0068, acc=99.87%, word/sec=14504.2380
dev loss/word=3.5274, acc=88.45%, word/sec=25375.5904
train loss/word=0.0054, acc=99.88%, word/sec=14374.0473
train loss/word=0.0048, acc=99.89%, word/sec=14316.0221
train loss/word=0.0058, acc=99.88%, word/sec=14318.9891
train loss/word=0.0065, acc=99.87%, word/sec=14359.2871
train loss/word=0.0064, acc=99.87%, word/sec=14343.3381
train loss/word=0.0063, acc=99.87%, word/sec=14362.9708
train loss/word=0.0063, acc=99.87%, word/sec=14360.7072
train loss/word=0.0069, acc=99.86%, word/sec=14365.8874
train loss/word=0.0075, acc=99.85%, word/sec=14361.7111
train loss/word=0.0073, acc=99.85%, word/sec=14346.0715
dev loss/word=3.6347, acc=88.58%, word/sec=25164.2

train loss/word=0.0080, acc=99.85%, word/sec=14419.8834
train loss/word=0.0083, acc=99.84%, word/sec=14411.5493
dev loss/word=3.5681, acc=88.81%, word/sec=25400.9621
train loss/word=0.0086, acc=99.84%, word/sec=14331.9410
train loss/word=0.0078, acc=99.84%, word/sec=14466.3715
train loss/word=0.0083, acc=99.84%, word/sec=14490.7881
train loss/word=0.0079, acc=99.85%, word/sec=14490.6858
train loss/word=0.0079, acc=99.84%, word/sec=14508.0567
train loss/word=0.0084, acc=99.83%, word/sec=14537.9244
train loss/word=0.0082, acc=99.84%, word/sec=14560.0745
train loss/word=0.0079, acc=99.84%, word/sec=14559.2507
train loss/word=0.0076, acc=99.84%, word/sec=14553.6222
train loss/word=0.0078, acc=99.84%, word/sec=14549.2620
dev loss/word=3.4100, acc=89.08%, word/sec=25600.3520
train loss/word=0.0055, acc=99.87%, word/sec=14610.5684
train loss/word=0.0074, acc=99.85%, word/sec=14520.0189
train loss/word=0.0066, acc=99.86%, word/sec=14516.4167
train loss/word=0.0065, acc=99.86%, word/sec=14508.0

train loss/word=0.0078, acc=99.84%, word/sec=14366.3647
train loss/word=0.0079, acc=99.84%, word/sec=14433.0275
train loss/word=0.0080, acc=99.84%, word/sec=14387.3358
train loss/word=0.0082, acc=99.84%, word/sec=14347.8089
train loss/word=0.0082, acc=99.84%, word/sec=14336.5893
train loss/word=0.0079, acc=99.84%, word/sec=14350.7419
train loss/word=0.0086, acc=99.84%, word/sec=14355.8948
train loss/word=0.0090, acc=99.83%, word/sec=14358.5909
train loss/word=0.0089, acc=99.83%, word/sec=14344.3789
dev loss/word=3.7647, acc=88.95%, word/sec=25128.8320
train loss/word=0.0056, acc=99.87%, word/sec=14377.7934
train loss/word=0.0078, acc=99.84%, word/sec=14422.7566
train loss/word=0.0070, acc=99.85%, word/sec=14425.5126
train loss/word=0.0083, acc=99.83%, word/sec=14434.3164
train loss/word=0.0084, acc=99.83%, word/sec=14407.4550
train loss/word=0.0085, acc=99.84%, word/sec=14421.9891
train loss/word=0.0089, acc=99.83%, word/sec=14448.5797
train loss/word=0.0092, acc=99.83%, word/sec=14467

train loss/word=0.0078, acc=99.84%, word/sec=14500.8083
train loss/word=0.0077, acc=99.84%, word/sec=14498.3103
train loss/word=0.0076, acc=99.84%, word/sec=14514.3354
train loss/word=0.0079, acc=99.84%, word/sec=14509.5369
train loss/word=0.0083, acc=99.83%, word/sec=14491.8680
dev loss/word=3.8522, acc=88.90%, word/sec=25289.8999
train loss/word=0.0070, acc=99.84%, word/sec=14577.0836
train loss/word=0.0055, acc=99.87%, word/sec=14493.8660
train loss/word=0.0061, acc=99.87%, word/sec=14472.3612
train loss/word=0.0069, acc=99.86%, word/sec=14466.6630
train loss/word=0.0072, acc=99.86%, word/sec=14422.5406
train loss/word=0.0079, acc=99.84%, word/sec=14348.3024
train loss/word=0.0083, acc=99.84%, word/sec=14374.3185
train loss/word=0.0088, acc=99.84%, word/sec=14379.2946
train loss/word=0.0089, acc=99.84%, word/sec=14356.9162
train loss/word=0.0088, acc=99.83%, word/sec=14366.0718
dev loss/word=3.7721, acc=88.51%, word/sec=25471.1751
train loss/word=0.0052, acc=99.88%, word/sec=14448.0

train loss/word=0.0090, acc=99.83%, word/sec=14289.8483
dev loss/word=3.5685, acc=88.70%, word/sec=24936.1445
train loss/word=0.0076, acc=99.86%, word/sec=14113.1354
train loss/word=0.0084, acc=99.85%, word/sec=14157.9884
train loss/word=0.0077, acc=99.85%, word/sec=14090.8469
train loss/word=0.0082, acc=99.83%, word/sec=14158.1339
train loss/word=0.0081, acc=99.83%, word/sec=14212.5621
train loss/word=0.0081, acc=99.84%, word/sec=14246.3358
train loss/word=0.0086, acc=99.83%, word/sec=14277.8755
train loss/word=0.0090, acc=99.83%, word/sec=14300.0359
train loss/word=0.0088, acc=99.83%, word/sec=14320.2231
train loss/word=0.0089, acc=99.83%, word/sec=14340.7172
dev loss/word=3.7219, acc=88.83%, word/sec=25031.3821
train loss/word=0.0034, acc=99.89%, word/sec=14452.6791
train loss/word=0.0052, acc=99.87%, word/sec=14545.6868
train loss/word=0.0059, acc=99.88%, word/sec=14519.0110
train loss/word=0.0063, acc=99.87%, word/sec=14513.5993
train loss/word=0.0062, acc=99.87%, word/sec=14551.3

train loss/word=0.0072, acc=99.86%, word/sec=14008.0582
train loss/word=0.0070, acc=99.86%, word/sec=14001.7855
train loss/word=0.0069, acc=99.86%, word/sec=14015.1151
train loss/word=0.0068, acc=99.86%, word/sec=14045.0392
train loss/word=0.0070, acc=99.86%, word/sec=14047.5661
train loss/word=0.0072, acc=99.86%, word/sec=14049.4039
train loss/word=0.0072, acc=99.86%, word/sec=14083.6039
train loss/word=0.0075, acc=99.85%, word/sec=14106.7749
dev loss/word=3.8634, acc=88.86%, word/sec=24698.9557
train loss/word=0.0077, acc=99.85%, word/sec=14049.6143
train loss/word=0.0070, acc=99.85%, word/sec=13983.4751
train loss/word=0.0069, acc=99.85%, word/sec=14047.9696
train loss/word=0.0082, acc=99.84%, word/sec=14082.0218
train loss/word=0.0083, acc=99.84%, word/sec=14060.5662
train loss/word=0.0085, acc=99.83%, word/sec=14086.4954
train loss/word=0.0087, acc=99.83%, word/sec=14023.6762
train loss/word=0.0085, acc=99.83%, word/sec=14035.6192
train loss/word=0.0083, acc=99.83%, word/sec=14061

train loss/word=0.0082, acc=99.84%, word/sec=14266.2422
train loss/word=0.0084, acc=99.84%, word/sec=14265.4277
train loss/word=0.0087, acc=99.83%, word/sec=14264.1806
train loss/word=0.0090, acc=99.83%, word/sec=14280.3914
dev loss/word=3.6400, acc=88.94%, word/sec=25045.6253
train loss/word=0.0072, acc=99.88%, word/sec=14274.6850
train loss/word=0.0069, acc=99.87%, word/sec=14283.8791
train loss/word=0.0067, acc=99.86%, word/sec=14320.7218
train loss/word=0.0066, acc=99.86%, word/sec=14317.7244
train loss/word=0.0067, acc=99.86%, word/sec=14313.9062
train loss/word=0.0071, acc=99.86%, word/sec=14314.5926
train loss/word=0.0075, acc=99.85%, word/sec=14278.8566
train loss/word=0.0078, acc=99.84%, word/sec=14260.9116
train loss/word=0.0080, acc=99.84%, word/sec=14225.6420
train loss/word=0.0081, acc=99.83%, word/sec=14233.2507
dev loss/word=3.5966, acc=88.79%, word/sec=24648.8546
train loss/word=0.0052, acc=99.88%, word/sec=14219.2323
train loss/word=0.0049, acc=99.89%, word/sec=14318.4