In [17]:
import math
from typing import Tuple

import torch
from torch import nn, Tensor
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.utils.data import dataset, TensorDataset, DataLoader

class TransformerModel(nn.Module):

    def __init__(self, ntoken: int, out_size: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Linear(ntoken, d_model) # Embedding layer converted into linear layer
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, out_size)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: Tensor) -> Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
#         src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
    
        output = self.transformer_encoder(src)
        output = self.decoder(output)
#         print(output.size())
        return output


def generate_square_subsequent_mask(sz: int) -> Tensor:
    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)

In [18]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [3]:
import os

# Change working directory to labels
work_dir = "C:/file_lists_with_labels_ff_estimator"
os.chdir(work_dir)

# Reads labels into dict of (filename: ff_value)
training_data_labels = {}
with open("training.txt") as f:
    for line in f:
        key, val = line.split()
        training_data_labels[key] = float(val)
        
test_data_labels = {}
with open("test.txt") as f:
    for line in f:
        key, val = line.split()
        test_data_labels[key] = float(val)

In [4]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Change directory to data
os.chdir("C:/rf_without_tgc")

# Iterate thru data to create lists of tensors
training_data = []
training_labels = []
for file in tqdm(os.listdir()):
    if file not in training_data_labels:
        continue
        
    file_data = pd.read_csv(file, header=None).T
    
    x_tensor = file_data.to_numpy().astype(np.float32)
    y_tensor = float(training_data_labels[file])
    
    for x in x_tensor:
        training_data.append(x)
        training_labels.append(y_tensor)
    
test_data = []
test_labels = []
for file in tqdm(os.listdir()):
    if file not in test_data_labels:
        continue
        
    file_data = pd.read_csv(file, header=None).T
    
    x_tensor = file_data.to_numpy().astype(np.float32)
    y_tensor = float(test_data_labels[file])

    for x in x_tensor:
        test_data.append(x)
        test_labels.append(y_tensor)

100%|██████████████████████████████████████████████████████████████████████████████| 2041/2041 [00:47<00:00, 42.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 2041/2041 [00:48<00:00, 41.87it/s]


In [5]:
# # Create class for DataLoader compatability
# class Data():
#     def __init__(self, x, y):
#         self.x = x
#         self.y = y
    
#     def __len__(self):
#         return len(self.x)

#     def __getitem__(self, idx):
#         X = self.x[idx]
#         y =  self.y[idx]

#         return X, y

training_data = np.array(training_data)
training_labels = np.array(training_labels)

test_data = np.array(test_data)
test_labels = np.array(test_labels)

print(training_data.shape)
print(test_data.shape)
    
# training_data = np.swapaxes(training_data, 1, 2)
# test_data = np.swapaxes(test_data, 1, 2)

# Create data tensors
training_data = torch.Tensor(training_data)
training_labels = torch.Tensor(training_labels)

test_data = torch.Tensor(test_data)
test_labels = torch.Tensor(test_labels)

# training_data = (training_data - torch.mean(training_data)) / torch.std(training_data)
# test_data = (test_data - torch.mean(test_data)) / torch.std(test_data)

train_dataset = TensorDataset(training_data, training_labels)
test_dataset = TensorDataset(test_data, test_labels)

# # Load tensors into class for torch DataLoaders
# train_data = Data(training_data, training_labels)
# test_data = Data(test_data, test_labels)

(261120, 1024)
(261120, 1024)


In [6]:
# DataLoader Parameters
train_loader_params = {
    "batch_size":  32, 
    "shuffle":     True,
    "num_workers": 0
}

# Create DataLoader for training data
train_loader = DataLoader(train_dataset, **train_loader_params)

In [22]:
# Choose device for torch computing
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
ntokens = 1024  # For ff estimation
emsize = 1024#200  # embedding dimension
d_hid = 100  # dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 4  # number of heads in nn.MultiheadAttention
dropout = 0.1  # dropout probability
model = TransformerModel(ntokens, 1, emsize, nhead, d_hid, nlayers, dropout).to(device)

In [20]:
import copy
import time

criterion = nn.MSELoss()
lr = 3  # learning rate
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def train(model: nn.Module) -> None:
    model.train()  # turn on train mode
    total_loss = 0.
    log_interval = 5000
    start_time = time.time()
#     src_mask = generate_square_subsequent_mask(bptt).to(device)

    num_batches = 261120 // 20 # Total training signals // batch_size
    for batch, (signal, target) in enumerate(train_loader):
        signal, target = signal.to(device), target.to(device)
#         signal = torch.unsqueeze(signal, dim=1)
        batch_size = signal.size(0)
#         print(signal.size())
#         print(target.size())
        output = model(signal)
        output = torch.squeeze(output)
        loss = criterion(output, target)
#         print(output.size(), "\n")
#         print(output.size(), target.size())
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % log_interval == 0 and batch > 0:
            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
#             ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    model.eval()  # turn on evaluation mode
    total_loss = 0.
    src_mask = generate_square_subsequent_mask(bptt).to(device)
    with torch.no_grad():
        for i in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, i)
            batch_size = data.size(0)
            if batch_size != bptt:
                src_mask = src_mask[:batch_size, :batch_size]
            output = model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += batch_size * criterion(output_flat, targets).item()
    return total_loss / (len(eval_data) - 1)

In [21]:
best_val_loss = float('inf')
epochs = 50
best_model = None

begin_time = time.time()

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
#     val_loss = evaluate(model, val_data)
#     val_ppl = math.exp(val_loss)
#     elapsed = time.time() - epoch_start_time
#     print('-' * 89)
#     print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
#           f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
#     print('-' * 89)

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_model = copy.deepcopy(model)

    scheduler.step()

print(time.time() - begin_time)

tensor([[-0.2683,  1.1886, -1.3740,  ...,  2.0660, -0.3293, -1.4028],
        [ 0.0208,  1.0228,  0.2272,  ...,  1.0667, -0.5378, -0.7382],
        [-2.7698, -0.1296,  2.3986,  ..., -0.4175, -0.5109, -0.4602],
        ...,
        [ 0.1341, -3.6466, -3.8215,  ...,  0.2865,  0.0856,  0.8273],
        [ 0.9595,  4.6524, -5.2989,  ...,  0.6947,  0.1482, -0.1551],
        [-1.8111, -0.6752, -1.2288,  ...,  0.5500,  0.3867, -0.2375]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-7.2525,  0.8447,  0.8540,  ...,  7.2731,  7.3079, -7.2145],
        [-7.2820,  0.8473,  0.8571,  ...,  7.2697,  7.2976, -7.2314],
        [-7.2879,  0.8496,  0.8597,  ...,  7.2656,  7.2886, -7.2372],
        ...,
        [-7.2623,  0.8608,  0.8697,  ...,  3.0070,  7.2677, -7.2170],
        [-7.2234,  0.8543,  2.9797,  ...,  7.2595,  7.2885, -7.1785],
        [-7.3048,  0.8764,  3.0152,  ...,  7.2119,  7.2357, -7.2587]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-2.

       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-1.3093, -0.7905,  0.4025,  ..., -1.3649, -0.5464,  1.0094],
        [-1.2318,  2.9140,  0.3273,  ..., -1.3993, -0.5964,  0.8691],
        [-1.2549, -0.8305,  0.3315,  ..., -1.3984, -0.6581,  0.9472],
        ...,
        [-1.2503, -0.7928,  0.3923,  ..., -0.3337, -0.5550,  0.8776],
        [-1.3002, -0.7442,  0.3961,  ..., -1.3909, -0.6157,  1.0289],
        [-1.2769, -0.7828,  0.3815,  ..., -1.3962, -0.5790,  0.8962]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-2.3456, -2.4330,  0.6848,  ..., -1.0141, -1.6358,  0.8658],
        [-2.3445, -2.4426,  0.7089,  ..., -0.9905, -1.4829,  0.8187],
        [-2.3748,  2.4172,  0.7054,  ..., -2.6955, -1.6089,  0.9349],
        ...,
        [-2.3389, -2.4247,  0.6905,  ..., -2.6712, -1.5532,  0.8581],
        [-2.3365, -2.3681,  3.5781,  ..., -2.6797, -1.6205,  0.9421],
        [-2.3497, -2.4347,  0.6820,  ..., -2.7016, -1.6272,  0.9400]],
       devic

       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[ 1.8123,  0.3053, -1.3981,  ...,  1.4301,  0.6104, -0.4281],
        [ 1.8130,  0.2927, -1.3822,  ...,  1.4017,  0.6033, -0.4255],
        [ 1.8129,  0.2976, -1.3738,  ...,  1.4051,  0.6018, -0.4187],
        ...,
        [ 1.6965,  0.2837, -1.4109,  ...,  1.4109,  0.6046, -0.4200],
        [ 1.6984,  0.3487,  2.1934,  ...,  1.4070,  0.6069, -0.4313],
        [ 1.8104,  0.2806, -1.3891,  ...,  1.3958,  0.5992, -0.4107]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[ 1.7879,  0.2297, -0.8051,  ...,  1.8532,  0.8417, -0.4988],
        [ 1.7879,  0.2663, -0.7634,  ...,  1.7955,  0.8384, -0.5051],
        [ 1.7920,  3.2026, -0.7600,  ...,  1.7817,  0.8237, -0.5110],
        ...,
        [ 1.7893,  3.1673,  2.2985,  ...,  1.8148,  0.8443, -0.5088],
        [ 1.7962,  0.3043, -0.7352,  ...,  1.8120,  0.8501, -0.5294],
        [ 1.7916,  0.2831,  2.3388,  ...,  1.7638,  0.8250, -0.5146]],
       devic

       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[ 2.8029,  5.0582,  2.9385,  ...,  0.8848,  0.3264, -0.7353],
        [ 1.4998,  5.1051,  2.9368,  ...,  0.8798,  0.3415, -0.7434],
        [ 1.4684,  5.0587,  2.9395,  ...,  0.9345,  0.3319, -0.7133],
        ...,
        [ 2.8010,  5.1053,  2.9376,  ..., -0.0261,  0.3579, -0.7720],
        [ 2.7753,  5.1068,  2.9388,  ...,  0.9452,  0.3661, -0.7347],
        [ 2.7861,  5.1064,  2.9382,  ...,  0.8670,  0.3138, -0.7292]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[ 3.3056,  5.4463,  2.7533,  ...,  1.3535,  0.5359, -1.0663],
        [ 3.3233,  5.4537,  2.7473,  ...,  1.4082,  0.5457, -1.0446],
        [ 3.3158,  5.4539,  2.7473,  ...,  1.3362,  0.5065, -1.0468],
        ...,
        [ 3.3312,  5.4524,  2.7474,  ...,  1.3666,  0.4948, -1.0405],
        [ 3.3116,  5.4530,  2.7474,  ...,  1.3385,  0.5041, -1.0347],
        [ 3.3231,  5.4530,  2.7473,  ...,  0.3551,  0.5397, -1.0363]],
       devic

       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-0.0555, -0.5282, -0.1669,  ..., -0.3465, -0.1396, -0.3844],
        [-0.0560, -0.6001, -0.1947,  ...,  0.1219, -0.2134, -0.3859],
        [-0.0541, -0.5316, -0.1611,  ...,  0.0798, -0.2439, -0.3834],
        ...,
        [-0.1905, -0.5344, -0.1628,  ...,  0.0978, -0.2192, -0.3864],
        [-0.0537, -0.5325, -0.1644,  ..., -0.3598, -0.2412, -0.3842],
        [-0.0556, -0.5545, -0.1646,  ...,  0.0985, -0.1366, -0.3874]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-0.0948, -0.8874, -0.7148,  ..., -0.3576, -0.2656, -0.1606],
        [-0.0943,  1.9656, -0.7161,  ..., -0.3465, -0.2422, -0.1604],
        [-0.0985, -0.9711, -0.7550,  ...,  0.1138, -0.2471, -0.1627],
        ...,
        [-0.0970, -0.9159, -0.7229,  ...,  0.0974, -0.2532, -0.1642],
        [-0.0916, -0.8688, -0.6987,  ...,  0.1209, -0.2174, -0.1580],
        [-0.1041, -0.9301, -0.7211,  ...,  0.1086, -0.2363, -0.1671]],
       devic

       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-0.2502,  1.9079, -0.0666,  ..., -0.0872,  0.2080,  1.5080],
        [-0.2513, -1.0528, -0.0616,  ..., -0.0935,  0.2015,  1.5073],
        [-0.2480, -1.1014, -0.0717,  ..., -0.1179,  0.1606,  1.5077],
        ...,
        [-0.3246, -1.0403, -0.0574,  ..., -0.1091,  0.1933,  1.5126],
        [-0.2466, -1.0870,  1.0223,  ..., -0.6330,  0.1778,  1.5110],
        [-0.2495, -1.0763, -0.0661,  ..., -0.1136,  0.1673,  1.5063]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)
tensor([[-0.2642,  1.9855,  0.4024,  ..., -0.0959,  0.2846,  0.9794],
        [-0.2640, -0.7947,  0.4147,  ..., -0.1044,  0.2702,  0.9788],
        [-0.2631, -0.8339,  0.4072,  ..., -0.0924,  0.2744,  0.9790],
        ...,
        [-0.2639, -0.8455,  0.4033,  ..., -0.0816,  0.2847,  0.9783],
        [-0.2634, -0.7888,  0.4232,  ..., -0.0964,  0.2845,  0.9801],
        [-0.3343, -0.8260,  0.4140,  ..., -0.1015,  0.2705,  0.9798]],
       devic

KeyboardInterrupt: 

In [11]:
from scipy import stats

# DataLoader Parameters
loader_params = {
    "batch_size":  1, 
    "shuffle":     False,
    "num_workers": 0
}

# Create test DataLoader
test_loader = DataLoader(test_dataset, **loader_params)

def test():
    model.eval()
    
    # Initialise arrays and dict
    predictions = np.array([])
    labels = np.array([])
    averaged_dict = {}
    
    with torch.no_grad():
        for i, (signal, label) in tqdm(enumerate(test_loader)):
            # Send input to device
            signal = torch.Tensor(signal).to(device)
            signal = torch.unsqueeze(signal, 0)
             
            # Get output of net, append to lists
            output = model(signal).cpu().detach().numpy()
            output = output[0][0]
            predictions = np.append(predictions, output)            
            labels = np.append(labels, label)
#             print(output, "\n")
#             print(output, label)
        
        for i in range(len(labels)):
            if labels[i] not in averaged_dict:
                averaged_dict[labels[i]] = [predictions[i]]
            else:
                averaged_dict[labels[i]].append(predictions[i])
            
    for i in averaged_dict:
        averaged_dict[i] = np.mean(averaged_dict[i])
    
    averaged_predictions = []
    ordered_labels = []
        
    for i in averaged_dict:
        ordered_labels.append(i)
        averaged_predictions.append(averaged_dict[i])
    
    print(predictions)
    print(labels)
    
#     r = scipy.stats.pearsonr(predictions, labels)
    
    r = stats.pearsonr(averaged_predictions, ordered_labels)
    
    print('Pearson r of the model is %.2f' % r[0])
    return r[0]
test()

261120it [07:03, 616.32it/s]


[7.98628712 7.98628902 7.98628807 ... 7.98629284 7.98628521 7.98629093]
[ 0.546314    0.546314    0.546314   ... 34.99910736 34.99910736
 34.99910736]
Pearson r of the model is 0.17


0.1717398998418664

In [129]:
ntokens = 1024  # For ff estimation
dropout = 0.2  # dropout probability
epochs  = 60 
emsize = 1024

data_arr = []
models_tested = 0

for d_hid in range(50, 500, 150):
    for nlayers in range(1, 7, 3):
        for nhead in range(1, 7, 3):
            r = 0.0
            fail = False
            try:
                model = TransformerModel(ntokens, 1, emsize, nhead, d_hid, nlayers, dropout).to(device)

                criterion = nn.MSELoss()
                lr = 2.  # learning rate
                optimizer = torch.optim.SGD(model.parameters(), lr=lr)
                scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.98)

                for epoch in range(1, epochs + 1):
                    train(model)
                    scheduler.step()

                r = test()

                data_arr.append((emsize, d_hid, nlayers, nhead, r))

            except Exception as e:
                print(e)
                fail = True

            if fail:
                print("Model %d:   emsize: %d   d_hid: %d   nlayers: %d   nhead: %d   FAILED" % (models_tested, emsize, d_hid, nlayers, nhead))
            else:
                print("Model %d:   emsize: %d   d_hid: %d   nlayers: %d   nhead: %d   r:%.3f" % (models_tested, emsize, d_hid, nlayers, nhead, r))

            models_tested += 1
                
data_save = np.array(data_arr)
np.save("results.npy", data_save)

| epoch   1 |  5000/13056 batches | lr 2.00 | ms/batch  3.42 | loss 92.58 | ppl 16100028925287311301350875111210690281472.00
| epoch   1 | 10000/13056 batches | lr 2.00 | ms/batch  3.24 | loss 85.33 | ppl 11455713630718390626473127553364131840.00
| epoch   2 |  5000/13056 batches | lr 1.96 | ms/batch  3.26 | loss 82.97 | ppl 1077043320158796336044229770602348544.00
| epoch   2 | 10000/13056 batches | lr 1.96 | ms/batch  3.24 | loss 81.36 | ppl 216755044073410641543040811992612864.00
| epoch   3 |  5000/13056 batches | lr 1.92 | ms/batch  3.25 | loss 81.82 | ppl 340941116277746081024917681024794624.00
| epoch   3 | 10000/13056 batches | lr 1.92 | ms/batch  3.23 | loss 81.53 | ppl 255088760811615562379166751957975040.00
| epoch   4 |  5000/13056 batches | lr 1.88 | ms/batch  3.25 | loss 81.75 | ppl 319270115387276113419745448691761152.00
| epoch   4 | 10000/13056 batches | lr 1.88 | ms/batch  3.24 | loss 81.03 | ppl 154841679147969441814124579803627520.00
| epoch   5 |  5000/13056 batche

| epoch  35 | 10000/13056 batches | lr 1.01 | ms/batch  3.24 | loss 79.92 | ppl 51109179225013272587111609212600320.00
| epoch  36 |  5000/13056 batches | lr 0.99 | ms/batch  3.27 | loss 79.89 | ppl 49737454510078287593684151318872064.00
| epoch  36 | 10000/13056 batches | lr 0.99 | ms/batch  3.22 | loss 80.04 | ppl 57521908676649670357711489112997888.00
| epoch  37 |  5000/13056 batches | lr 0.97 | ms/batch  3.21 | loss 80.52 | ppl 93344909671356342258398770313035776.00
| epoch  37 | 10000/13056 batches | lr 0.97 | ms/batch  3.27 | loss 79.71 | ppl 41460643275870174933355277343260672.00
| epoch  38 |  5000/13056 batches | lr 0.95 | ms/batch  3.24 | loss 79.94 | ppl 52404821219023020860353409739390976.00
| epoch  38 | 10000/13056 batches | lr 0.95 | ms/batch  3.25 | loss 80.20 | ppl 67851335120070213201786785029423104.00
| epoch  39 |  5000/13056 batches | lr 0.93 | ms/batch  3.30 | loss 79.49 | ppl 33228151714756309313696399605366784.00
| epoch  39 | 10000/13056 batches | lr 0.93 | ms

| epoch   9 |  5000/13056 batches | lr 1.70 | ms/batch  3.29 | loss 80.53 | ppl 94305303637320140817483904751501312.00
| epoch   9 | 10000/13056 batches | lr 1.70 | ms/batch  3.31 | loss 80.99 | ppl 148824862998500992786865649795203072.00
| epoch  10 |  5000/13056 batches | lr 1.67 | ms/batch  3.26 | loss 81.02 | ppl 153457038127688465430741183864242176.00
| epoch  10 | 10000/13056 batches | lr 1.67 | ms/batch  3.31 | loss 80.45 | ppl 86863782331745235171212561353801728.00
| epoch  11 |  5000/13056 batches | lr 1.63 | ms/batch  3.31 | loss 80.57 | ppl 98242362227085501108352044098387968.00
| epoch  11 | 10000/13056 batches | lr 1.63 | ms/batch  3.25 | loss 80.60 | ppl 101349945026809568412368357350703104.00
| epoch  12 |  5000/13056 batches | lr 1.60 | ms/batch  3.29 | loss 80.15 | ppl 64397342772006266139349853784768512.00
| epoch  12 | 10000/13056 batches | lr 1.60 | ms/batch  3.32 | loss 81.00 | ppl 151142295768526043165236100417454080.00
| epoch  13 |  5000/13056 batches | lr 1.57 

| epoch  43 | 10000/13056 batches | lr 0.86 | ms/batch  3.29 | loss 79.94 | ppl 52028956971935452545230985546956800.00
| epoch  44 |  5000/13056 batches | lr 0.84 | ms/batch  3.31 | loss 79.96 | ppl 53365920858991544981030337358331904.00
| epoch  44 | 10000/13056 batches | lr 0.84 | ms/batch  3.35 | loss 80.07 | ppl 59598661048108291190765259195416576.00
| epoch  45 |  5000/13056 batches | lr 0.82 | ms/batch  3.24 | loss 80.10 | ppl 61281770404885767692966089805791232.00
| epoch  45 | 10000/13056 batches | lr 0.82 | ms/batch  3.24 | loss 80.05 | ppl 58065043365874112992491852940705792.00
| epoch  46 |  5000/13056 batches | lr 0.81 | ms/batch  3.36 | loss 79.85 | ppl 47814531948900294887822723548971008.00
| epoch  46 | 10000/13056 batches | lr 0.81 | ms/batch  3.33 | loss 79.90 | ppl 50189786613906189497156480935133184.00
| epoch  47 |  5000/13056 batches | lr 0.79 | ms/batch  3.25 | loss 80.17 | ppl 65455057789693921785363789105856512.00
| epoch  47 | 10000/13056 batches | lr 0.79 | ms

| epoch  17 |  5000/13056 batches | lr 1.45 | ms/batch  9.76 | loss 80.57 | ppl 97789117970670999197704378986266624.00
| epoch  17 | 10000/13056 batches | lr 1.45 | ms/batch  9.83 | loss 80.33 | ppl 77010426831228102638990325807316992.00
| epoch  18 |  5000/13056 batches | lr 1.42 | ms/batch  9.88 | loss 80.29 | ppl 74085640986384864763788601778503680.00
| epoch  18 | 10000/13056 batches | lr 1.42 | ms/batch  9.81 | loss 80.29 | ppl 73697972681670137334579165078224896.00
| epoch  19 |  5000/13056 batches | lr 1.39 | ms/batch  9.75 | loss 80.07 | ppl 59413310337781618456254292529512448.00
| epoch  19 | 10000/13056 batches | lr 1.39 | ms/batch  9.88 | loss 80.55 | ppl 95738101647260735377680733896179712.00
| epoch  20 |  5000/13056 batches | lr 1.36 | ms/batch  9.83 | loss 80.33 | ppl 76819263066699203369547003959705600.00
| epoch  20 | 10000/13056 batches | lr 1.36 | ms/batch  9.78 | loss 80.49 | ppl 90406164372151162168778615496376320.00
| epoch  21 |  5000/13056 batches | lr 1.34 | ms

| epoch  51 | 10000/13056 batches | lr 0.73 | ms/batch  9.78 | loss 79.49 | ppl 33237568498846715203023484538060800.00
| epoch  52 |  5000/13056 batches | lr 0.71 | ms/batch  9.84 | loss 79.81 | ppl 45676412112873704323890702245167104.00
| epoch  52 | 10000/13056 batches | lr 0.71 | ms/batch  9.70 | loss 80.32 | ppl 76558494479293103158778861048561664.00
| epoch  53 |  5000/13056 batches | lr 0.70 | ms/batch  9.81 | loss 79.71 | ppl 41291510022327435253742374704119808.00
| epoch  53 | 10000/13056 batches | lr 0.70 | ms/batch  9.84 | loss 79.91 | ppl 50710956309786995943077853578395648.00
| epoch  54 |  5000/13056 batches | lr 0.69 | ms/batch  9.82 | loss 79.99 | ppl 54723678082258298266780390458916864.00
| epoch  54 | 10000/13056 batches | lr 0.69 | ms/batch  9.80 | loss 79.92 | ppl 51073563506006081361431565892059136.00
| epoch  55 |  5000/13056 batches | lr 0.67 | ms/batch  9.81 | loss 79.91 | ppl 50533251923709577411013853027762176.00
| epoch  55 | 10000/13056 batches | lr 0.67 | ms

| epoch  25 |  5000/13056 batches | lr 1.23 | ms/batch 10.03 | loss 80.15 | ppl 64563824361887772528669205156855808.00
| epoch  25 | 10000/13056 batches | lr 1.23 | ms/batch 10.06 | loss 79.96 | ppl 53282763103978749691021286904430592.00
| epoch  26 |  5000/13056 batches | lr 1.21 | ms/batch 10.03 | loss 79.78 | ppl 44471174904492800996111437216612352.00
| epoch  26 | 10000/13056 batches | lr 1.21 | ms/batch 10.10 | loss 80.27 | ppl 72452388795122367625672123924611072.00
| epoch  27 |  5000/13056 batches | lr 1.18 | ms/batch 10.04 | loss 80.44 | ppl 86061067874080250368395027318571008.00
| epoch  27 | 10000/13056 batches | lr 1.18 | ms/batch  9.97 | loss 80.14 | ppl 63984332559476391489249645890109440.00
| epoch  28 |  5000/13056 batches | lr 1.16 | ms/batch 10.08 | loss 79.92 | ppl 51093982867510105639894453091041280.00
| epoch  28 | 10000/13056 batches | lr 1.16 | ms/batch 10.02 | loss 80.31 | ppl 75212024666260019753818432647200768.00
| epoch  29 |  5000/13056 batches | lr 1.14 | ms

| epoch  59 | 10000/13056 batches | lr 0.62 | ms/batch 10.10 | loss 79.92 | ppl 51184972073668829639354510680784896.00
| epoch  60 |  5000/13056 batches | lr 0.61 | ms/batch 10.04 | loss 79.87 | ppl 48875528534063925655236728739332096.00
| epoch  60 | 10000/13056 batches | lr 0.61 | ms/batch 10.07 | loss 79.41 | ppl 30801729179033717992295620419780608.00
[12.21600819 12.21600819 12.21600819 ... 12.21600819 12.21600819
 12.21600819]
[ 0.546314    0.546314    0.546314   ... 34.99910736 34.99910736
 34.99910736]
Pearson r of the model is nan
Model 3:   emsize: 1024   d_hid: 50   nlayers: 4   nhead: 4   r:nan
| epoch   1 |  5000/13056 batches | lr 2.00 | ms/batch  3.32 | loss 94.68 | ppl 131958895131737298318949562272722400051200.00
| epoch   1 | 10000/13056 batches | lr 2.00 | ms/batch  3.24 | loss 86.15 | ppl 25857003276835279111533854598482624512.00
| epoch   2 |  5000/13056 batches | lr 1.96 | ms/batch  3.30 | loss 82.64 | ppl 776343421750879933145374492136046592.00
| epoch   2 | 10000

| epoch  33 |  5000/13056 batches | lr 1.05 | ms/batch  3.30 | loss 80.37 | ppl 80562021924614250950784993185497088.00
| epoch  33 | 10000/13056 batches | lr 1.05 | ms/batch  3.30 | loss 79.99 | ppl 54995778923313359199798527564709888.00
| epoch  34 |  5000/13056 batches | lr 1.03 | ms/batch  3.32 | loss 80.13 | ppl 63079125745370370607696046872068096.00
| epoch  34 | 10000/13056 batches | lr 1.03 | ms/batch  3.32 | loss 80.05 | ppl 58285201735149840781324196281057280.00
| epoch  35 |  5000/13056 batches | lr 1.01 | ms/batch  3.30 | loss 79.92 | ppl 51299421683148632643408199514324992.00
| epoch  35 | 10000/13056 batches | lr 1.01 | ms/batch  3.32 | loss 80.25 | ppl 70858761285526134518388905635479552.00
| epoch  36 |  5000/13056 batches | lr 0.99 | ms/batch  3.33 | loss 80.04 | ppl 57723550777569278057889525821079552.00
| epoch  36 | 10000/13056 batches | lr 0.99 | ms/batch  3.26 | loss 80.11 | ppl 61616159176313697251466767255994368.00
| epoch  37 |  5000/13056 batches | lr 0.97 | ms

| epoch   6 | 10000/13056 batches | lr 1.81 | ms/batch  3.35 | loss 81.45 | ppl 235648167141786993198287993517899776.00
| epoch   7 |  5000/13056 batches | lr 1.77 | ms/batch  3.33 | loss 80.51 | ppl 92638102318494461166173392954458112.00
| epoch   7 | 10000/13056 batches | lr 1.77 | ms/batch  3.39 | loss 80.99 | ppl 148716034262533469594945369145344000.00
| epoch   8 |  5000/13056 batches | lr 1.74 | ms/batch  3.34 | loss 80.70 | ppl 111726195121573854284525498628308992.00
| epoch   8 | 10000/13056 batches | lr 1.74 | ms/batch  3.33 | loss 80.67 | ppl 108688745254623352896376477277224960.00
| epoch   9 |  5000/13056 batches | lr 1.70 | ms/batch  3.35 | loss 80.59 | ppl 100268714569950625854364443408859136.00
| epoch   9 | 10000/13056 batches | lr 1.70 | ms/batch  3.37 | loss 80.95 | ppl 143305285770121025308092734329847808.00
| epoch  10 |  5000/13056 batches | lr 1.67 | ms/batch  3.39 | loss 80.67 | ppl 107853601576019180130520425264316416.00
| epoch  10 | 10000/13056 batches | lr 1.

| epoch  41 |  5000/13056 batches | lr 0.89 | ms/batch  3.39 | loss 80.07 | ppl 59575701308091368457264107656052736.00
| epoch  41 | 10000/13056 batches | lr 0.89 | ms/batch  3.35 | loss 79.88 | ppl 49054816955678962482832847941926912.00
| epoch  42 |  5000/13056 batches | lr 0.87 | ms/batch  3.36 | loss 80.02 | ppl 56509967061367084612417814624468992.00
| epoch  42 | 10000/13056 batches | lr 0.87 | ms/batch  3.37 | loss 79.98 | ppl 54564623951003792244605834305208320.00
| epoch  43 |  5000/13056 batches | lr 0.86 | ms/batch  3.34 | loss 79.77 | ppl 44218477506941014600108475089420288.00
| epoch  43 | 10000/13056 batches | lr 0.86 | ms/batch  3.38 | loss 79.59 | ppl 36800330568373742693548618179149824.00
| epoch  44 |  5000/13056 batches | lr 0.84 | ms/batch  3.34 | loss 80.00 | ppl 55383972783812229478277667058876416.00
| epoch  44 | 10000/13056 batches | lr 0.84 | ms/batch  3.34 | loss 80.22 | ppl 68730575222233848262132697510445056.00
| epoch  45 |  5000/13056 batches | lr 0.82 | ms

| epoch  14 | 10000/13056 batches | lr 1.54 | ms/batch 10.11 | loss 80.47 | ppl 88871206753989416528622349669367808.00
| epoch  15 |  5000/13056 batches | lr 1.51 | ms/batch 10.13 | loss 80.75 | ppl 116925063227904198808643172260904960.00
| epoch  15 | 10000/13056 batches | lr 1.51 | ms/batch 10.16 | loss 80.11 | ppl 61825812079595634769900445215227904.00
| epoch  16 |  5000/13056 batches | lr 1.48 | ms/batch 10.15 | loss 80.71 | ppl 112991463966542047509613830035472384.00
| epoch  16 | 10000/13056 batches | lr 1.48 | ms/batch 10.17 | loss 80.40 | ppl 82725121168878867426494616172494848.00
| epoch  17 |  5000/13056 batches | lr 1.45 | ms/batch 10.12 | loss 80.76 | ppl 118997489204452222177880352927776768.00
| epoch  17 | 10000/13056 batches | lr 1.45 | ms/batch 10.12 | loss 80.40 | ppl 82511045985238573493259738300809216.00
| epoch  18 |  5000/13056 batches | lr 1.42 | ms/batch 10.14 | loss 80.25 | ppl 71378824582467322497645710399242240.00
| epoch  18 | 10000/13056 batches | lr 1.42 |

| epoch  49 |  5000/13056 batches | lr 0.76 | ms/batch 10.19 | loss 79.74 | ppl 42641218758577933582325124874371072.00
| epoch  49 | 10000/13056 batches | lr 0.76 | ms/batch 10.12 | loss 80.03 | ppl 57138226721442619998197610407002112.00
| epoch  50 |  5000/13056 batches | lr 0.74 | ms/batch 10.14 | loss 79.72 | ppl 41887825987546660948870969151193088.00
| epoch  50 | 10000/13056 batches | lr 0.74 | ms/batch 10.13 | loss 80.11 | ppl 62007107280868613197320248772001792.00
| epoch  51 |  5000/13056 batches | lr 0.73 | ms/batch 10.13 | loss 79.98 | ppl 54379960596751163867919959239688192.00
| epoch  51 | 10000/13056 batches | lr 0.73 | ms/batch 10.15 | loss 79.81 | ppl 45816861527933888492415433320693760.00
| epoch  52 |  5000/13056 batches | lr 0.71 | ms/batch 10.18 | loss 79.80 | ppl 45235089957702282903017040885317632.00
| epoch  52 | 10000/13056 batches | lr 0.71 | ms/batch 10.05 | loss 79.89 | ppl 49847968721818895377837434146390016.00
| epoch  53 |  5000/13056 batches | lr 0.70 | ms

| epoch  22 | 10000/13056 batches | lr 1.31 | ms/batch 10.39 | loss 80.20 | ppl 67548248201209133889170088927428608.00
| epoch  23 |  5000/13056 batches | lr 1.28 | ms/batch 10.39 | loss 80.17 | ppl 65534203338069971787716557283524608.00
| epoch  23 | 10000/13056 batches | lr 1.28 | ms/batch 10.37 | loss 80.60 | ppl 101059928747022798339167307610193920.00
| epoch  24 |  5000/13056 batches | lr 1.26 | ms/batch 10.42 | loss 80.12 | ppl 62178628083514123752330543830663168.00
| epoch  24 | 10000/13056 batches | lr 1.26 | ms/batch 10.42 | loss 80.28 | ppl 73016507865961692313724255492636672.00
| epoch  25 |  5000/13056 batches | lr 1.23 | ms/batch 10.61 | loss 79.99 | ppl 54866127086056770990219923474612224.00
| epoch  25 | 10000/13056 batches | lr 1.23 | ms/batch 10.62 | loss 79.90 | ppl 49945899322581909030151481743900672.00
| epoch  26 |  5000/13056 batches | lr 1.21 | ms/batch 10.41 | loss 80.28 | ppl 73370659314820187960852544684556288.00
| epoch  26 | 10000/13056 batches | lr 1.21 | m

| epoch  57 |  5000/13056 batches | lr 0.65 | ms/batch 10.43 | loss 79.65 | ppl 39144567112611478977487985491050496.00
| epoch  57 | 10000/13056 batches | lr 0.65 | ms/batch 10.44 | loss 79.86 | ppl 48134707671292505025469489566711808.00
| epoch  58 |  5000/13056 batches | lr 0.63 | ms/batch 10.47 | loss 79.78 | ppl 44577111263787465301432376791924736.00
| epoch  58 | 10000/13056 batches | lr 0.63 | ms/batch 10.42 | loss 79.84 | ppl 47312929696509852450712096956481536.00
| epoch  59 |  5000/13056 batches | lr 0.62 | ms/batch 10.44 | loss 79.99 | ppl 55100162144621314191811776016809984.00
| epoch  59 | 10000/13056 batches | lr 0.62 | ms/batch 10.40 | loss 79.68 | ppl 40294885932915022984979982160232448.00
| epoch  60 |  5000/13056 batches | lr 0.61 | ms/batch 10.42 | loss 79.86 | ppl 48165891355445270385429628512632832.00
| epoch  60 | 10000/13056 batches | lr 0.61 | ms/batch 10.38 | loss 79.78 | ppl 44351172257323931293176658502090752.00
[10.97680855 10.97680855 10.97680855 ... 10.9768

| epoch  31 | 10000/13056 batches | lr 1.09 | ms/batch  3.30 | loss 67.89 | ppl 304030594173129697622614867968.00
| epoch  32 |  5000/13056 batches | lr 1.07 | ms/batch  3.33 | loss 68.25 | ppl 436629547909903757550708326400.00
| epoch  32 | 10000/13056 batches | lr 1.07 | ms/batch  3.31 | loss 68.12 | ppl 383658060799483702659147366400.00
| epoch  33 |  5000/13056 batches | lr 1.05 | ms/batch  3.28 | loss 68.19 | ppl 412642527301242955217879695360.00
| epoch  33 | 10000/13056 batches | lr 1.05 | ms/batch  3.31 | loss 68.20 | ppl 414769932695740034642975129600.00
| epoch  34 |  5000/13056 batches | lr 1.03 | ms/batch  3.35 | loss 68.15 | ppl 396871433575874362535053361152.00
| epoch  34 | 10000/13056 batches | lr 1.03 | ms/batch  3.33 | loss 68.03 | ppl 350726500373468419005364568064.00
| epoch  35 |  5000/13056 batches | lr 1.01 | ms/batch  3.34 | loss 67.69 | ppl 248969442922795178756071751680.00
| epoch  35 | 10000/13056 batches | lr 1.01 | ms/batch  3.35 | loss 68.02 | ppl 34658787

| epoch   6 | 10000/13056 batches | lr 1.81 | ms/batch  3.36 | loss 80.42 | ppl 84295386268945342234617920608010240.00
| epoch   7 |  5000/13056 batches | lr 1.77 | ms/batch  3.38 | loss 80.84 | ppl 128023819260217021674252742728613888.00
| epoch   7 | 10000/13056 batches | lr 1.77 | ms/batch  3.36 | loss 80.49 | ppl 90328323012813682742577570863120384.00
| epoch   8 |  5000/13056 batches | lr 1.74 | ms/batch  3.40 | loss 80.99 | ppl 149389948241998409990610029318766592.00
| epoch   8 | 10000/13056 batches | lr 1.74 | ms/batch  3.41 | loss 80.49 | ppl 90612726134177325448444048079585280.00
| epoch   9 |  5000/13056 batches | lr 1.70 | ms/batch  3.38 | loss 80.70 | ppl 112103321423756638980246827163451392.00
| epoch   9 | 10000/13056 batches | lr 1.70 | ms/batch  3.38 | loss 80.95 | ppl 143687377480498665817388969225093120.00
| epoch  10 |  5000/13056 batches | lr 1.67 | ms/batch  3.41 | loss 81.07 | ppl 161822731091587212326520515148644352.00
| epoch  10 | 10000/13056 batches | lr 1.67

| epoch  41 |  5000/13056 batches | lr 0.89 | ms/batch  3.34 | loss 79.96 | ppl 52973068788664906142727577375080448.00
| epoch  41 | 10000/13056 batches | lr 0.89 | ms/batch  3.35 | loss 80.25 | ppl 71117560646930975683491877191942144.00
| epoch  42 |  5000/13056 batches | lr 0.87 | ms/batch  3.40 | loss 80.12 | ppl 62387745021181043312363163311144960.00
| epoch  42 | 10000/13056 batches | lr 0.87 | ms/batch  3.37 | loss 79.73 | ppl 42381110559869698273416833782513664.00
| epoch  43 |  5000/13056 batches | lr 0.86 | ms/batch  3.41 | loss 80.20 | ppl 67980596591790169304525188643160064.00
| epoch  43 | 10000/13056 batches | lr 0.86 | ms/batch  3.34 | loss 79.79 | ppl 44732238577132798649035352405180416.00
| epoch  44 |  5000/13056 batches | lr 0.84 | ms/batch  3.36 | loss 79.46 | ppl 32388879466039633064583842079178752.00
| epoch  44 | 10000/13056 batches | lr 0.84 | ms/batch  3.40 | loss 80.12 | ppl 62557429900686041425036472974573568.00
| epoch  45 |  5000/13056 batches | lr 0.82 | ms

| epoch  14 | 10000/13056 batches | lr 1.54 | ms/batch 10.25 | loss 80.55 | ppl 95757706474690616019309666192326656.00
| epoch  15 |  5000/13056 batches | lr 1.51 | ms/batch 10.29 | loss 80.51 | ppl 92711134307560816445052045393657856.00
| epoch  15 | 10000/13056 batches | lr 1.51 | ms/batch 10.22 | loss 80.68 | ppl 109727694172761072411604390944702464.00
| epoch  16 |  5000/13056 batches | lr 1.48 | ms/batch 10.31 | loss 79.88 | ppl 48993812208664618810378894188740608.00
| epoch  16 | 10000/13056 batches | lr 1.48 | ms/batch 10.26 | loss 81.15 | ppl 174994567625781256496566165882011648.00
| epoch  17 |  5000/13056 batches | lr 1.45 | ms/batch 10.26 | loss 80.83 | ppl 126614968899953330094746314246979584.00
| epoch  17 | 10000/13056 batches | lr 1.45 | ms/batch 10.21 | loss 80.20 | ppl 67354185212024141223176844909477888.00
| epoch  18 |  5000/13056 batches | lr 1.42 | ms/batch 10.29 | loss 80.14 | ppl 63488240329103184837458174087790592.00
| epoch  18 | 10000/13056 batches | lr 1.42 |

| epoch  49 |  5000/13056 batches | lr 0.76 | ms/batch 10.32 | loss 80.26 | ppl 71755069938279661906497584073015296.00
| epoch  49 | 10000/13056 batches | lr 0.76 | ms/batch 10.33 | loss 79.38 | ppl 29805477690602425230616774792183808.00
| epoch  50 |  5000/13056 batches | lr 0.74 | ms/batch 10.34 | loss 80.14 | ppl 63838461558709646131415553134297088.00
| epoch  50 | 10000/13056 batches | lr 0.74 | ms/batch 10.31 | loss 80.08 | ppl 59891348242125874351931683940335616.00
| epoch  51 |  5000/13056 batches | lr 0.73 | ms/batch 10.30 | loss 79.53 | ppl 34544079526952359970884246283747328.00
| epoch  51 | 10000/13056 batches | lr 0.73 | ms/batch 10.33 | loss 80.33 | ppl 77029882570437084921260782695481344.00
| epoch  52 |  5000/13056 batches | lr 0.71 | ms/batch 10.34 | loss 80.14 | ppl 63907683004775299909691434822795264.00
| epoch  52 | 10000/13056 batches | lr 0.71 | ms/batch 10.28 | loss 79.99 | ppl 54776691421527186752943110693584896.00
| epoch  53 |  5000/13056 batches | lr 0.70 | ms

| epoch  22 | 10000/13056 batches | lr 1.31 | ms/batch 10.58 | loss 79.98 | ppl 54505453593685791474948360040873984.00
| epoch  23 |  5000/13056 batches | lr 1.28 | ms/batch 10.61 | loss 80.38 | ppl 81257696282865294017726782510202880.00
| epoch  23 | 10000/13056 batches | lr 1.28 | ms/batch 10.58 | loss 79.96 | ppl 53101113715826852116131063510597632.00
| epoch  24 |  5000/13056 batches | lr 1.26 | ms/batch 10.53 | loss 80.38 | ppl 80844869350051890012074785399373824.00
| epoch  24 | 10000/13056 batches | lr 1.26 | ms/batch 10.57 | loss 80.40 | ppl 82716639804684512851749114918993920.00
| epoch  25 |  5000/13056 batches | lr 1.23 | ms/batch 10.57 | loss 80.34 | ppl 77936048951442519951945608179220480.00
| epoch  25 | 10000/13056 batches | lr 1.23 | ms/batch 10.58 | loss 79.83 | ppl 46930245563146563858787173209735168.00
| epoch  26 |  5000/13056 batches | lr 1.21 | ms/batch 10.59 | loss 80.60 | ppl 101157702990041110768339315467485184.00
| epoch  26 | 10000/13056 batches | lr 1.21 | m

| epoch  57 |  5000/13056 batches | lr 0.65 | ms/batch 10.57 | loss 80.06 | ppl 58970771902195911450951947764367360.00
| epoch  57 | 10000/13056 batches | lr 0.65 | ms/batch 10.55 | loss 79.32 | ppl 28202046960870902177468399547842560.00
| epoch  58 |  5000/13056 batches | lr 0.63 | ms/batch 10.57 | loss 79.47 | ppl 32530013807707068546653451209146368.00
| epoch  58 | 10000/13056 batches | lr 0.63 | ms/batch 10.56 | loss 80.11 | ppl 62023859793326923030812009644949504.00
| epoch  59 |  5000/13056 batches | lr 0.62 | ms/batch 10.58 | loss 80.23 | ppl 69846242195667239186498428866658304.00
| epoch  59 | 10000/13056 batches | lr 0.62 | ms/batch 10.58 | loss 79.89 | ppl 49832391213090747348964370483773440.00
| epoch  60 |  5000/13056 batches | lr 0.61 | ms/batch 10.61 | loss 79.68 | ppl 40111420548506549633957397794914304.00
| epoch  60 | 10000/13056 batches | lr 0.61 | ms/batch 10.58 | loss 79.61 | ppl 37378216302692801882482288684433408.00
[11.61507225 11.61507225 11.61507225 ... 11.6150