## 6.5　RNNLM のさらなる改善

In [1]:
import os
import sys
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# NOTE(review): machine-specific absolute path -- point this at the local checkout's ch06
# directory before running the notebook elsewhere.
os.chdir('/Users/yuta.shimizu/Downloads/ML/deep-learning-from-scratch-2-master/ch06')
# os.pardir is a relative entry, so it resolves against the working directory set by the
# chdir above (the order of these two statements matters). Presumably the parent directory
# is where the `dataset` and `common` packages imported below live -- TODO confirm.
sys.path.append(os.pardir)

In [3]:
from dataset import ptb
from rnnlm import Rnnlm
from common import config
from common.np import *
from common.time_layers import *
from common.optimizer import SGD
from common.util import eval_perplexity
from common.trainer import RnnlmTrainer
from common.base_model import BaseModel

In [4]:
class BetterRnnlm(BaseModel):
    """Two-layer LSTM language model with dropout and tied input/output weights.

    Layer stack, in order: TimeEmbedding -> TimeDropout -> TimeLSTM ->
    TimeDropout -> TimeLSTM -> TimeDropout -> TimeAffine, followed by a
    TimeSoftmaxWithLoss loss layer.

    The output TimeAffine layer is built from the transpose of the embedding
    matrix (weight tying), which is why ``wordvec_size`` and ``hidden_size``
    must be equal.
    """

    def __init__(self, vocab_size=10000, wordvec_size=650, hidden_size=650, dropout_ratio=0.5):
        """Create the weights and assemble the layer stack.

        Args:
            vocab_size: Number of rows of the embedding matrix and size of the
                output bias.
            wordvec_size: Width of the embedding vectors.
            hidden_size: Hidden width of both LSTM layers.
            dropout_ratio: Ratio handed to each of the three TimeDropout layers.

        Raises:
            ValueError: If ``wordvec_size`` differs from ``hidden_size``.
        """
        V, D, H = vocab_size, wordvec_size, hidden_size
        if D != H:
            # embed_W has shape (V, D) and its transpose (D, V) is applied to
            # the second LSTM's H-wide output, so the two widths must agree.
            # Fail here instead of with an opaque shape error in forward().
            raise ValueError(
                'weight tying requires wordvec_size == hidden_size, '
                'got wordvec_size=%d and hidden_size=%d' % (D, H)
            )
        rn = np.random.randn

        # Weight initialisation; the order of the rn() calls is kept so that a
        # seeded run draws the same values as before.
        embed_W = (rn(V, D) / 100).astype('f')
        lstm_Wx1 = (rn(D, 4*H) / np.sqrt(D)).astype('f')
        lstm_Wh1 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4*H).astype('f')
        lstm_Wx2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
        lstm_Wh2 = (rn(H, 4*H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4*H).astype('f')
        affine_b = np.zeros(V).astype('f')

        self.layers = [
            TimeEmbedding(embed_W),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx1, lstm_Wh1, lstm_b1, stateful=True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_Wx2, lstm_Wh2, lstm_b2, stateful=True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_W.T, affine_b)  # weight tying: reuse the embedding matrix
        ]

        self.loss_layer = TimeSoftmaxWithLoss()
        # Handles to the layers that need special treatment later:
        # LSTM layers for reset_state(), dropout layers for the train flag.
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.drop_layers = [self.layers[1], self.layers[3], self.layers[5]]

        # Flat lists of every layer's parameters and gradients, in layer order.
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads

    def predict(self, xs, train_flg=False):
        """Run ``xs`` through every layer and return the last layer's output.

        Args:
            xs: Input handed to the first (embedding) layer -- presumably an
                array of word IDs; TODO confirm against TimeEmbedding.
            train_flg: Value assigned to ``train_flg`` on every dropout layer
                before the forward pass.
        """
        for layer in self.drop_layers:
            layer.train_flg = train_flg
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs

    def forward(self, xs, ts, train_flg=True):
        """Return the loss of the model's scores for ``xs`` against targets ``ts``."""
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss

    def backward(self, dout=1):
        """Back-propagate ``dout`` through the loss layer and then the layers in reverse.

        Returns whatever the first layer's ``backward`` returns.
        """
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout

    def reset_state(self):
        """Call ``reset_state()`` on both LSTM layers."""
        for layer in self.lstm_layers:
            layer.reset_state()

In [5]:
# --- Model size -------------------------------------------------------------
# Kept equal: BetterRnnlm reuses the embedding matrix in its output layer.
wordvec_size = 650
hidden_size = 650
dropout = 0.5

# --- Mini-batch shape -------------------------------------------------------
batch_size = 20
time_size = 35

# --- Optimisation -----------------------------------------------------------
lr = 20.0        # initial SGD learning rate; the training cell divides it by 4 on a plateau
max_epoch = 40
max_grad = 0.25  # passed to trainer.fit as max_grad

In [6]:
# Load the three PTB splits; only the vocabulary mappings returned by the 'train' call are kept.
corpus, word_to_id, id_to_word = ptb.load_data('train')
corpus_val, _, _ = ptb.load_data('val')
corpus_test, _, _ = ptb.load_data('test')  # not used by the cells visible here

vocab_size = len(word_to_id)
# Offset the training corpus by one position so that ts[i] is the token that follows xs[i].
xs = corpus[:-1]
ts = corpus[1:]

print(vocab_size)

10000


In [7]:
# Build the model, its optimizer and the trainer from the hyperparameters above.
model = BetterRnnlm(vocab_size, wordvec_size, hidden_size, dropout)
optimizer = SGD(lr)
trainer = RnnlmTrainer(model, optimizer)

best_ppl = float('inf')
for epoch in range(max_epoch):
    # Train exactly one epoch per fit() call so validation can run in between.
    trainer.fit(
        xs,
        ts,
        max_epoch=1,
        batch_size=batch_size,
        time_size=time_size,
        max_grad=max_grad,
    )

    # Reset the LSTM layers before measuring validation perplexity.
    model.reset_state()
    ppl = eval_perplexity(model, corpus_val)
    print('valid perplexity: ', ppl)

    if ppl < best_ppl:
        # New best validation score: remember it and save the parameters.
        best_ppl = ppl
        model.save_params()
    else:
        # No improvement: cut the learning rate to a quarter and push it to the optimizer.
        lr = lr / 4.0
        optimizer.lr = lr

    # Reset the LSTM layers again before the next training epoch.
    model.reset_state()
    print('-' * 50)

| epoch 1 |  iter 1 / 1327 | time 4[s] | perplexity 9999.82
| epoch 1 |  iter 21 / 1327 | time 64[s] | perplexity 4625.60
| epoch 1 |  iter 41 / 1327 | time 116[s] | perplexity 2176.86
| epoch 1 |  iter 61 / 1327 | time 171[s] | perplexity 1293.87
| epoch 1 |  iter 81 / 1327 | time 229[s] | perplexity 1010.97
| epoch 1 |  iter 101 / 1327 | time 281[s] | perplexity 797.58
| epoch 1 |  iter 121 / 1327 | time 326[s] | perplexity 782.39
| epoch 1 |  iter 141 / 1327 | time 374[s] | perplexity 714.27
| epoch 1 |  iter 161 / 1327 | time 426[s] | perplexity 690.45
| epoch 1 |  iter 181 / 1327 | time 481[s] | perplexity 685.90
| epoch 1 |  iter 201 / 1327 | time 538[s] | perplexity 583.30
| epoch 1 |  iter 221 / 1327 | time 588[s] | perplexity 564.53
| epoch 1 |  iter 241 / 1327 | time 636[s] | perplexity 522.80
| epoch 1 |  iter 261 / 1327 | time 690[s] | perplexity 526.09
| epoch 1 |  iter 281 / 1327 | time 740[s] | perplexity 519.07
| epoch 1 |  iter 301 / 1327 | time 796[s] | perplexity 445

| epoch 2 |  iter 1201 / 1327 | time 8887[s] | perplexity 126.46
| epoch 2 |  iter 1221 / 1327 | time 8922[s] | perplexity 125.89
| epoch 2 |  iter 1241 / 1327 | time 8963[s] | perplexity 146.97
| epoch 2 |  iter 1261 / 1327 | time 9003[s] | perplexity 138.87
| epoch 2 |  iter 1281 / 1327 | time 9041[s] | perplexity 140.80
| epoch 2 |  iter 1301 / 1327 | time 9077[s] | perplexity 180.78
| epoch 2 |  iter 1321 / 1327 | time 9115[s] | perplexity 171.78
evaluating perplexity ...
209 / 210
valid perplexity:  148.47857309072265
--------------------------------------------------
| epoch 3 |  iter 1 / 1327 | time 2[s] | perplexity 227.87
| epoch 3 |  iter 21 / 1327 | time 50[s] | perplexity 160.54
| epoch 3 |  iter 41 / 1327 | time 102[s] | perplexity 152.68
| epoch 3 |  iter 61 / 1327 | time 143[s] | perplexity 142.76
| epoch 3 |  iter 81 / 1327 | time 183[s] | perplexity 125.89
| epoch 3 |  iter 101 / 1327 | time 233[s] | perplexity 122.81
| epoch 3 |  iter 121 / 1327 | time 279[s] | perple

| epoch 4 |  iter 1001 / 1327 | time 40605[s] | perplexity 108.24
| epoch 4 |  iter 1021 / 1327 | time 40657[s] | perplexity 129.38
| epoch 4 |  iter 1041 / 1327 | time 40714[s] | perplexity 111.78
| epoch 4 |  iter 1061 / 1327 | time 40766[s] | perplexity 106.27
| epoch 4 |  iter 1081 / 1327 | time 40811[s] | perplexity 87.65
| epoch 4 |  iter 1101 / 1327 | time 40856[s] | perplexity 91.73
| epoch 4 |  iter 1121 / 1327 | time 41645[s] | perplexity 122.60
| epoch 4 |  iter 1141 / 1327 | time 43619[s] | perplexity 118.22
| epoch 4 |  iter 1161 / 1327 | time 43678[s] | perplexity 101.52
| epoch 4 |  iter 1181 / 1327 | time 43725[s] | perplexity 107.21
| epoch 4 |  iter 1201 / 1327 | time 43778[s] | perplexity 90.95
| epoch 4 |  iter 1221 / 1327 | time 43845[s] | perplexity 91.86
| epoch 4 |  iter 1241 / 1327 | time 43890[s] | perplexity 108.68
| epoch 4 |  iter 1261 / 1327 | time 45111[s] | perplexity 102.46
| epoch 4 |  iter 1281 / 1327 | time 51095[s] | perplexity 102.43
| epoch 4 |  i

| epoch 6 |  iter 821 / 1327 | time 1741[s] | perplexity 97.39
| epoch 6 |  iter 841 / 1327 | time 1781[s] | perplexity 96.90
| epoch 6 |  iter 861 / 1327 | time 1815[s] | perplexity 96.43
| epoch 6 |  iter 881 / 1327 | time 1854[s] | perplexity 86.54
| epoch 6 |  iter 901 / 1327 | time 1890[s] | perplexity 110.78
| epoch 6 |  iter 921 / 1327 | time 1923[s] | perplexity 98.11
| epoch 6 |  iter 941 / 1327 | time 1961[s] | perplexity 104.06
| epoch 6 |  iter 961 / 1327 | time 2622[s] | perplexity 112.25
| epoch 6 |  iter 981 / 1327 | time 2678[s] | perplexity 106.18
| epoch 6 |  iter 1001 / 1327 | time 2729[s] | perplexity 91.25
| epoch 6 |  iter 1021 / 1327 | time 5939[s] | perplexity 107.60
| epoch 6 |  iter 1041 / 1327 | time 5976[s] | perplexity 94.09
| epoch 6 |  iter 1061 / 1327 | time 6015[s] | perplexity 88.79
| epoch 6 |  iter 1081 / 1327 | time 6057[s] | perplexity 73.74
| epoch 6 |  iter 1101 / 1327 | time 6097[s] | perplexity 75.16
| epoch 6 |  iter 1121 / 1327 | time 6134[s]

| epoch 8 |  iter 681 / 1327 | time 14347[s] | perplexity 72.75
| epoch 8 |  iter 701 / 1327 | time 14405[s] | perplexity 87.56
| epoch 8 |  iter 721 / 1327 | time 14463[s] | perplexity 85.70
| epoch 8 |  iter 741 / 1327 | time 14522[s] | perplexity 76.45
| epoch 8 |  iter 761 / 1327 | time 14582[s] | perplexity 69.28
| epoch 8 |  iter 781 / 1327 | time 14631[s] | perplexity 77.76
| epoch 8 |  iter 801 / 1327 | time 14691[s] | perplexity 87.02
| epoch 8 |  iter 821 / 1327 | time 14751[s] | perplexity 86.72
| epoch 8 |  iter 841 / 1327 | time 14808[s] | perplexity 85.56
| epoch 8 |  iter 861 / 1327 | time 14868[s] | perplexity 86.11
| epoch 8 |  iter 881 / 1327 | time 14915[s] | perplexity 78.10
| epoch 8 |  iter 901 / 1327 | time 14962[s] | perplexity 98.83
| epoch 8 |  iter 921 / 1327 | time 15017[s] | perplexity 89.32
| epoch 8 |  iter 941 / 1327 | time 15069[s] | perplexity 93.24
| epoch 8 |  iter 961 / 1327 | time 16453[s] | perplexity 100.74
| epoch 8 |  iter 981 / 1327 | time 165

| epoch 10 |  iter 501 / 1327 | time 973[s] | perplexity 85.78
| epoch 10 |  iter 521 / 1327 | time 1011[s] | perplexity 87.13
| epoch 10 |  iter 541 / 1327 | time 1052[s] | perplexity 87.80
| epoch 10 |  iter 561 / 1327 | time 1097[s] | perplexity 74.17
| epoch 10 |  iter 581 / 1327 | time 1134[s] | perplexity 69.84
| epoch 10 |  iter 601 / 1327 | time 1175[s] | perplexity 97.56
| epoch 10 |  iter 621 / 1327 | time 1232[s] | perplexity 90.55
| epoch 10 |  iter 641 / 1327 | time 1271[s] | perplexity 84.90
| epoch 10 |  iter 661 / 1327 | time 1311[s] | perplexity 77.10
| epoch 10 |  iter 681 / 1327 | time 1350[s] | perplexity 66.98
| epoch 10 |  iter 701 / 1327 | time 1389[s] | perplexity 79.20
| epoch 10 |  iter 721 / 1327 | time 1428[s] | perplexity 79.97
| epoch 10 |  iter 741 / 1327 | time 1472[s] | perplexity 70.77
| epoch 10 |  iter 761 / 1327 | time 1512[s] | perplexity 65.49
| epoch 10 |  iter 781 / 1327 | time 1549[s] | perplexity 70.76
| epoch 10 |  iter 801 / 1327 | time 4820

| epoch 12 |  iter 301 / 1327 | time 1447[s] | perplexity 67.65
| epoch 12 |  iter 321 / 1327 | time 1531[s] | perplexity 55.61
| epoch 12 |  iter 341 / 1327 | time 1619[s] | perplexity 79.70
| epoch 12 |  iter 361 / 1327 | time 1712[s] | perplexity 81.22
| epoch 12 |  iter 381 / 1327 | time 1818[s] | perplexity 69.38
| epoch 12 |  iter 401 / 1327 | time 1937[s] | perplexity 76.80
| epoch 12 |  iter 421 / 1327 | time 2056[s] | perplexity 66.44
| epoch 12 |  iter 441 / 1327 | time 2201[s] | perplexity 72.23
| epoch 12 |  iter 461 / 1327 | time 2411[s] | perplexity 70.58
| epoch 12 |  iter 481 / 1327 | time 2622[s] | perplexity 71.41
| epoch 12 |  iter 501 / 1327 | time 2805[s] | perplexity 79.72
| epoch 12 |  iter 521 / 1327 | time 2989[s] | perplexity 81.67
| epoch 12 |  iter 541 / 1327 | time 3153[s] | perplexity 82.27
| epoch 12 |  iter 561 / 1327 | time 3274[s] | perplexity 70.56
| epoch 12 |  iter 581 / 1327 | time 3376[s] | perplexity 66.10
| epoch 12 |  iter 601 / 1327 | time 347

| epoch 14 |  iter 121 / 1327 | time 237[s] | perplexity 61.93
| epoch 14 |  iter 141 / 1327 | time 283[s] | perplexity 67.28
| epoch 14 |  iter 161 / 1327 | time 324[s] | perplexity 79.80
| epoch 14 |  iter 181 / 1327 | time 360[s] | perplexity 84.51
| epoch 14 |  iter 201 / 1327 | time 395[s] | perplexity 79.68
| epoch 14 |  iter 221 / 1327 | time 433[s] | perplexity 80.23
| epoch 14 |  iter 241 / 1327 | time 472[s] | perplexity 76.00
| epoch 14 |  iter 261 / 1327 | time 512[s] | perplexity 79.30
| epoch 14 |  iter 281 / 1327 | time 555[s] | perplexity 77.61
| epoch 14 |  iter 301 / 1327 | time 605[s] | perplexity 63.12
| epoch 14 |  iter 321 / 1327 | time 646[s] | perplexity 52.03
| epoch 14 |  iter 341 / 1327 | time 688[s] | perplexity 74.79
| epoch 14 |  iter 361 / 1327 | time 744[s] | perplexity 77.88
| epoch 14 |  iter 381 / 1327 | time 782[s] | perplexity 65.13
| epoch 14 |  iter 401 / 1327 | time 820[s] | perplexity 72.92
| epoch 14 |  iter 421 / 1327 | time 863[s] | perplexit

| epoch 15 |  iter 1281 / 1327 | time 32908[s] | perplexity 62.90
| epoch 15 |  iter 1301 / 1327 | time 32961[s] | perplexity 78.55
| epoch 15 |  iter 1321 / 1327 | time 33016[s] | perplexity 76.77
evaluating perplexity ...
209 / 210
valid perplexity:  87.96488472604571
--------------------------------------------------
| epoch 16 |  iter 1 / 1327 | time 2[s] | perplexity 113.78
| epoch 16 |  iter 21 / 1327 | time 8742[s] | perplexity 69.16
| epoch 16 |  iter 41 / 1327 | time 16574[s] | perplexity 66.74
| epoch 16 |  iter 61 / 1327 | time 23744[s] | perplexity 65.48
| epoch 16 |  iter 81 / 1327 | time 31264[s] | perplexity 56.39
| epoch 16 |  iter 101 / 1327 | time 34104[s] | perplexity 55.80
| epoch 16 |  iter 121 / 1327 | time 34165[s] | perplexity 59.51
| epoch 16 |  iter 141 / 1327 | time 34216[s] | perplexity 64.81
| epoch 16 |  iter 161 / 1327 | time 34267[s] | perplexity 75.16
| epoch 16 |  iter 181 / 1327 | time 34332[s] | perplexity 79.59
| epoch 16 |  iter 201 / 1327 | time 3

| epoch 17 |  iter 1081 / 1327 | time 4412[s] | perplexity 50.59
| epoch 17 |  iter 1101 / 1327 | time 4458[s] | perplexity 52.60
| epoch 17 |  iter 1121 / 1327 | time 4514[s] | perplexity 70.39
| epoch 17 |  iter 1141 / 1327 | time 4572[s] | perplexity 68.21
| epoch 17 |  iter 1161 / 1327 | time 4638[s] | perplexity 57.15
| epoch 17 |  iter 1181 / 1327 | time 4696[s] | perplexity 63.66
| epoch 17 |  iter 1201 / 1327 | time 4747[s] | perplexity 52.85
| epoch 17 |  iter 1221 / 1327 | time 4794[s] | perplexity 52.36
| epoch 17 |  iter 1241 / 1327 | time 5805[s] | perplexity 64.69
| epoch 17 |  iter 1261 / 1327 | time 5865[s] | perplexity 60.78
| epoch 17 |  iter 1281 / 1327 | time 5924[s] | perplexity 61.21
| epoch 17 |  iter 1301 / 1327 | time 5988[s] | perplexity 76.22
| epoch 17 |  iter 1321 / 1327 | time 6049[s] | perplexity 73.73
evaluating perplexity ...
209 / 210
valid perplexity:  86.31994923841889
--------------------------------------------------
| epoch 18 |  iter 1 / 1327 | t

| epoch 19 |  iter 901 / 1327 | time 5692[s] | perplexity 74.28
| epoch 19 |  iter 921 / 1327 | time 5726[s] | perplexity 65.06
| epoch 19 |  iter 941 / 1327 | time 5761[s] | perplexity 70.12
| epoch 19 |  iter 961 / 1327 | time 5794[s] | perplexity 74.94
| epoch 19 |  iter 981 / 1327 | time 5828[s] | perplexity 72.15
| epoch 19 |  iter 1001 / 1327 | time 5862[s] | perplexity 61.14
| epoch 19 |  iter 1021 / 1327 | time 5897[s] | perplexity 71.52
| epoch 19 |  iter 1041 / 1327 | time 5935[s] | perplexity 62.63
| epoch 19 |  iter 1061 / 1327 | time 5969[s] | perplexity 59.11
| epoch 19 |  iter 1081 / 1327 | time 6002[s] | perplexity 48.66
| epoch 19 |  iter 1101 / 1327 | time 6035[s] | perplexity 51.04
| epoch 19 |  iter 1121 / 1327 | time 6069[s] | perplexity 67.61
| epoch 19 |  iter 1141 / 1327 | time 6103[s] | perplexity 64.31
| epoch 19 |  iter 1161 / 1327 | time 6136[s] | perplexity 54.57
| epoch 19 |  iter 1181 / 1327 | time 6170[s] | perplexity 61.48
| epoch 19 |  iter 1201 / 1327

| epoch 21 |  iter 721 / 1327 | time 1498[s] | perplexity 55.62
| epoch 21 |  iter 741 / 1327 | time 1544[s] | perplexity 49.62
| epoch 21 |  iter 761 / 1327 | time 1591[s] | perplexity 43.91
| epoch 21 |  iter 781 / 1327 | time 1673[s] | perplexity 48.74
| epoch 21 |  iter 801 / 1327 | time 1752[s] | perplexity 54.74
| epoch 21 |  iter 821 / 1327 | time 1827[s] | perplexity 54.79
| epoch 21 |  iter 841 / 1327 | time 1896[s] | perplexity 54.11
| epoch 21 |  iter 861 / 1327 | time 1990[s] | perplexity 54.09
| epoch 21 |  iter 881 / 1327 | time 2078[s] | perplexity 49.82
| epoch 21 |  iter 901 / 1327 | time 2158[s] | perplexity 63.19
| epoch 21 |  iter 921 / 1327 | time 2237[s] | perplexity 55.74
| epoch 21 |  iter 941 / 1327 | time 2378[s] | perplexity 59.80
| epoch 21 |  iter 961 / 1327 | time 2482[s] | perplexity 62.34
| epoch 21 |  iter 981 / 1327 | time 2615[s] | perplexity 58.56
| epoch 21 |  iter 1001 / 1327 | time 2722[s] | perplexity 50.62
| epoch 21 |  iter 1021 / 1327 | time 2

| epoch 23 |  iter 541 / 1327 | time 3892[s] | perplexity 57.78
| epoch 23 |  iter 561 / 1327 | time 3976[s] | perplexity 47.42
| epoch 23 |  iter 581 / 1327 | time 4077[s] | perplexity 45.33
| epoch 23 |  iter 601 / 1327 | time 4199[s] | perplexity 64.50
| epoch 23 |  iter 621 / 1327 | time 4305[s] | perplexity 58.16
| epoch 23 |  iter 641 / 1327 | time 4441[s] | perplexity 53.69
| epoch 23 |  iter 661 / 1327 | time 4576[s] | perplexity 49.03
| epoch 23 |  iter 681 / 1327 | time 4728[s] | perplexity 43.05
| epoch 23 |  iter 701 / 1327 | time 4914[s] | perplexity 51.81
| epoch 23 |  iter 721 / 1327 | time 5083[s] | perplexity 50.69
| epoch 23 |  iter 741 / 1327 | time 5257[s] | perplexity 46.30
| epoch 23 |  iter 761 / 1327 | time 5463[s] | perplexity 39.94
| epoch 23 |  iter 781 / 1327 | time 5672[s] | perplexity 45.02
| epoch 23 |  iter 801 / 1327 | time 5867[s] | perplexity 50.74
| epoch 23 |  iter 821 / 1327 | time 6068[s] | perplexity 51.14
| epoch 23 |  iter 841 / 1327 | time 633

| epoch 25 |  iter 341 / 1327 | time 4322[s] | perplexity 52.39
| epoch 25 |  iter 361 / 1327 | time 4367[s] | perplexity 52.49
| epoch 25 |  iter 381 / 1327 | time 4407[s] | perplexity 45.61
| epoch 25 |  iter 401 / 1327 | time 4450[s] | perplexity 51.08
| epoch 25 |  iter 421 / 1327 | time 4493[s] | perplexity 45.22
| epoch 25 |  iter 441 / 1327 | time 4536[s] | perplexity 46.51
| epoch 25 |  iter 461 / 1327 | time 4579[s] | perplexity 47.42
| epoch 25 |  iter 481 / 1327 | time 4622[s] | perplexity 46.91
| epoch 25 |  iter 501 / 1327 | time 4663[s] | perplexity 51.66
| epoch 25 |  iter 521 / 1327 | time 4707[s] | perplexity 54.31
| epoch 25 |  iter 541 / 1327 | time 4751[s] | perplexity 54.59
| epoch 25 |  iter 561 / 1327 | time 4798[s] | perplexity 45.42
| epoch 25 |  iter 581 / 1327 | time 4842[s] | perplexity 43.91
| epoch 25 |  iter 601 / 1327 | time 4901[s] | perplexity 61.36
| epoch 25 |  iter 621 / 1327 | time 4943[s] | perplexity 55.72
| epoch 25 |  iter 641 / 1327 | time 498

| epoch 27 |  iter 161 / 1327 | time 266[s] | perplexity 53.57
| epoch 27 |  iter 181 / 1327 | time 300[s] | perplexity 56.56
| epoch 27 |  iter 201 / 1327 | time 333[s] | perplexity 56.28
| epoch 27 |  iter 221 / 1327 | time 366[s] | perplexity 54.58
| epoch 27 |  iter 241 / 1327 | time 400[s] | perplexity 50.53
| epoch 27 |  iter 261 / 1327 | time 434[s] | perplexity 55.09
| epoch 27 |  iter 281 / 1327 | time 467[s] | perplexity 52.81
| epoch 27 |  iter 301 / 1327 | time 798[s] | perplexity 43.18
| epoch 27 |  iter 321 / 1327 | time 1129[s] | perplexity 35.65
| epoch 27 |  iter 341 / 1327 | time 1160[s] | perplexity 49.28
| epoch 27 |  iter 361 / 1327 | time 1490[s] | perplexity 50.57
| epoch 27 |  iter 381 / 1327 | time 1789[s] | perplexity 43.66
| epoch 27 |  iter 401 / 1327 | time 1824[s] | perplexity 49.92
| epoch 27 |  iter 421 / 1327 | time 1858[s] | perplexity 42.69
| epoch 27 |  iter 441 / 1327 | time 1894[s] | perplexity 44.87
| epoch 27 |  iter 461 / 1327 | time 1930[s] | p

209 / 210
valid perplexity:  80.69321684389193
--------------------------------------------------
| epoch 29 |  iter 1 / 1327 | time 1[s] | perplexity 78.73
| epoch 29 |  iter 21 / 1327 | time 49[s] | perplexity 50.07
| epoch 29 |  iter 41 / 1327 | time 98[s] | perplexity 48.21
| epoch 29 |  iter 61 / 1327 | time 148[s] | perplexity 45.24
| epoch 29 |  iter 81 / 1327 | time 202[s] | perplexity 40.41
| epoch 29 |  iter 101 / 1327 | time 257[s] | perplexity 40.12
| epoch 29 |  iter 121 / 1327 | time 304[s] | perplexity 41.63
| epoch 29 |  iter 141 / 1327 | time 358[s] | perplexity 44.79
| epoch 29 |  iter 161 / 1327 | time 407[s] | perplexity 52.84
| epoch 29 |  iter 181 / 1327 | time 456[s] | perplexity 55.32
| epoch 29 |  iter 201 / 1327 | time 507[s] | perplexity 55.30
| epoch 29 |  iter 221 / 1327 | time 560[s] | perplexity 53.01
| epoch 29 |  iter 241 / 1327 | time 640[s] | perplexity 49.54
| epoch 29 |  iter 261 / 1327 | time 677[s] | perplexity 53.31
| epoch 29 |  iter 281 / 1327 

| epoch 30 |  iter 1181 / 1327 | time 2292[s] | perplexity 41.59
| epoch 30 |  iter 1201 / 1327 | time 2329[s] | perplexity 34.59
| epoch 30 |  iter 1221 / 1327 | time 2365[s] | perplexity 34.02
| epoch 30 |  iter 1241 / 1327 | time 2401[s] | perplexity 42.31
| epoch 30 |  iter 1261 / 1327 | time 2438[s] | perplexity 39.99
| epoch 30 |  iter 1281 / 1327 | time 2473[s] | perplexity 40.07
| epoch 30 |  iter 1301 / 1327 | time 2515[s] | perplexity 49.26
| epoch 30 |  iter 1321 / 1327 | time 2557[s] | perplexity 47.85
evaluating perplexity ...
209 / 210
valid perplexity:  80.3816308243883
--------------------------------------------------
| epoch 31 |  iter 1 / 1327 | time 1[s] | perplexity 80.44
| epoch 31 |  iter 21 / 1327 | time 37[s] | perplexity 49.57
| epoch 31 |  iter 41 / 1327 | time 75[s] | perplexity 46.86
| epoch 31 |  iter 61 / 1327 | time 110[s] | perplexity 45.51
| epoch 31 |  iter 81 / 1327 | time 146[s] | perplexity 40.01
| epoch 31 |  iter 101 / 1327 | time 184[s] | perple

| epoch 32 |  iter 1001 / 1327 | time 2541[s] | perplexity 42.23
| epoch 32 |  iter 1021 / 1327 | time 2579[s] | perplexity 48.61
| epoch 32 |  iter 1041 / 1327 | time 2613[s] | perplexity 42.39
| epoch 32 |  iter 1061 / 1327 | time 2649[s] | perplexity 40.47
| epoch 32 |  iter 1081 / 1327 | time 2684[s] | perplexity 33.63
| epoch 32 |  iter 1101 / 1327 | time 2719[s] | perplexity 34.77
| epoch 32 |  iter 1121 / 1327 | time 2756[s] | perplexity 46.37
| epoch 32 |  iter 1141 / 1327 | time 2799[s] | perplexity 44.24
| epoch 32 |  iter 1161 / 1327 | time 2841[s] | perplexity 37.65
| epoch 32 |  iter 1181 / 1327 | time 2883[s] | perplexity 41.80
| epoch 32 |  iter 1201 / 1327 | time 2924[s] | perplexity 34.62
| epoch 32 |  iter 1221 / 1327 | time 2970[s] | perplexity 34.29
| epoch 32 |  iter 1241 / 1327 | time 3023[s] | perplexity 41.83
| epoch 32 |  iter 1261 / 1327 | time 3080[s] | perplexity 39.13
| epoch 32 |  iter 1281 / 1327 | time 3125[s] | perplexity 39.99
| epoch 32 |  iter 1301 /

| epoch 34 |  iter 821 / 1327 | time 5446[s] | perplexity 45.37
| epoch 34 |  iter 841 / 1327 | time 5572[s] | perplexity 45.52
| epoch 34 |  iter 861 / 1327 | time 5736[s] | perplexity 44.51
| epoch 34 |  iter 881 / 1327 | time 5867[s] | perplexity 40.95
| epoch 34 |  iter 901 / 1327 | time 5961[s] | perplexity 51.62
| epoch 34 |  iter 921 / 1327 | time 6044[s] | perplexity 46.53
| epoch 34 |  iter 941 / 1327 | time 6115[s] | perplexity 48.34
| epoch 34 |  iter 961 / 1327 | time 6229[s] | perplexity 51.61
| epoch 34 |  iter 981 / 1327 | time 6337[s] | perplexity 48.92
| epoch 34 |  iter 1001 / 1327 | time 6418[s] | perplexity 42.67
| epoch 34 |  iter 1021 / 1327 | time 6499[s] | perplexity 49.37
| epoch 34 |  iter 1041 / 1327 | time 6583[s] | perplexity 42.67
| epoch 34 |  iter 1061 / 1327 | time 6671[s] | perplexity 40.56
| epoch 34 |  iter 1081 / 1327 | time 6766[s] | perplexity 33.58
| epoch 34 |  iter 1101 / 1327 | time 6899[s] | perplexity 34.64
| epoch 34 |  iter 1121 / 1327 | t

| epoch 36 |  iter 641 / 1327 | time 1731[s] | perplexity 47.36
| epoch 36 |  iter 661 / 1327 | time 2060[s] | perplexity 42.86
| epoch 36 |  iter 681 / 1327 | time 2745[s] | perplexity 38.58
| epoch 36 |  iter 701 / 1327 | time 2780[s] | perplexity 45.71
| epoch 36 |  iter 721 / 1327 | time 2886[s] | perplexity 45.18
| epoch 36 |  iter 741 / 1327 | time 2918[s] | perplexity 41.04
| epoch 36 |  iter 761 / 1327 | time 3979[s] | perplexity 35.19
| epoch 36 |  iter 781 / 1327 | time 4281[s] | perplexity 39.98
| epoch 36 |  iter 801 / 1327 | time 4323[s] | perplexity 44.32
| epoch 36 |  iter 821 / 1327 | time 4355[s] | perplexity 45.22
| epoch 36 |  iter 841 / 1327 | time 4593[s] | perplexity 44.67
| epoch 36 |  iter 861 / 1327 | time 4850[s] | perplexity 44.56
| epoch 36 |  iter 881 / 1327 | time 4881[s] | perplexity 40.49
| epoch 36 |  iter 901 / 1327 | time 5290[s] | perplexity 51.25
| epoch 36 |  iter 921 / 1327 | time 5329[s] | perplexity 46.53
| epoch 36 |  iter 941 / 1327 | time 536

| epoch 38 |  iter 441 / 1327 | time 1821[s] | perplexity 43.73
| epoch 38 |  iter 461 / 1327 | time 1895[s] | perplexity 44.41
| epoch 38 |  iter 481 / 1327 | time 1935[s] | perplexity 43.88
| epoch 38 |  iter 501 / 1327 | time 1979[s] | perplexity 48.65
| epoch 38 |  iter 521 / 1327 | time 2018[s] | perplexity 49.64
| epoch 38 |  iter 541 / 1327 | time 2059[s] | perplexity 51.59
| epoch 38 |  iter 561 / 1327 | time 2098[s] | perplexity 42.30
| epoch 38 |  iter 581 / 1327 | time 2140[s] | perplexity 40.53
| epoch 38 |  iter 601 / 1327 | time 2184[s] | perplexity 57.04
| epoch 38 |  iter 621 / 1327 | time 2222[s] | perplexity 51.97
| epoch 38 |  iter 641 / 1327 | time 2263[s] | perplexity 47.80
| epoch 38 |  iter 661 / 1327 | time 2300[s] | perplexity 42.84
| epoch 38 |  iter 681 / 1327 | time 2336[s] | perplexity 38.33
| epoch 38 |  iter 701 / 1327 | time 2384[s] | perplexity 45.47
| epoch 38 |  iter 721 / 1327 | time 2428[s] | perplexity 44.84
| epoch 38 |  iter 741 / 1327 | time 247

| epoch 40 |  iter 241 / 1327 | time 507[s] | perplexity 49.43
| epoch 40 |  iter 261 / 1327 | time 547[s] | perplexity 52.79
| epoch 40 |  iter 281 / 1327 | time 587[s] | perplexity 51.32
| epoch 40 |  iter 301 / 1327 | time 627[s] | perplexity 41.84
| epoch 40 |  iter 321 / 1327 | time 677[s] | perplexity 34.73
| epoch 40 |  iter 341 / 1327 | time 719[s] | perplexity 48.28
| epoch 40 |  iter 361 / 1327 | time 760[s] | perplexity 48.49
| epoch 40 |  iter 381 / 1327 | time 808[s] | perplexity 42.52
| epoch 40 |  iter 401 / 1327 | time 863[s] | perplexity 47.08
| epoch 40 |  iter 421 / 1327 | time 938[s] | perplexity 41.09
| epoch 40 |  iter 441 / 1327 | time 1020[s] | perplexity 44.19
| epoch 40 |  iter 461 / 1327 | time 1079[s] | perplexity 43.01
| epoch 40 |  iter 481 / 1327 | time 1143[s] | perplexity 43.88
| epoch 40 |  iter 501 / 1327 | time 1225[s] | perplexity 48.84
| epoch 40 |  iter 521 / 1327 | time 1309[s] | perplexity 50.05
| epoch 40 |  iter 541 / 1327 | time 1387[s] | per