# Memory Information

In [None]:
import psutil
def get_size(bytes, suffix="B"):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        bytes: Number of bytes (int or float). The name shadows the builtin
            but is kept so keyword callers keep working.
        suffix: Text appended after the unit prefix (default ``"B"``).

    Returns:
        A string such as ``"1.50KB"`` or ``"25.51GB"`` with two decimals.
        Values too large for the biggest known unit are still reported in
        that unit (e.g. ``"1024.00PB"``) instead of falling through and
        returning ``None``.
    """
    factor = 1024
    units = ["", "K", "M", "G", "T", "P"]
    # Scale down through every unit except the last one; the last unit is the
    # catch-all so the function always returns a string.
    for unit in units[:-1]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor
    return f"{bytes:.2f}{units[-1]}{suffix}"
# Banner line for the memory section of the report.
print("=" * 40, "Memory Information", "=" * 40)

# Snapshot of system memory as reported by psutil.
svmem = psutil.virtual_memory()

# One field per output line; sep="\n" keeps the printed text the same as
# issuing four separate print calls.
print(
    f"Total: {get_size(svmem.total)}",
    f"Available: {get_size(svmem.available)}",
    f"Used: {get_size(svmem.used)}",
    f"Percentage: {svmem.percent}%",
    sep="\n",
)

Total: 25.51GB
Available: 22.57GB
Used: 6.31GB
Percentage: 11.5%


# GPU Information

In [None]:
! nvidia-smi

Tue Sep  1 06:36:13 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    32W / 250W |  15155MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install -r requirements.txt



In [None]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader
from utils import transformer_params
from utils import evaluation_metrics, save_model, root_and_binary_title
from math import ceil
from loguru import logger
import numpy as np
import os
import time
from datetime import timedelta
from tqdm import tqdm

In [None]:
class GPT2ForSequenceClassification(torch.nn.Module):
  """GPT-2 encoder followed by a max-pooling classification head.

  The head is: global max pooling over the sequence -> dropout -> linear ->
  tanh -> dropout -> linear (one logit per label). Cross-entropy is used as
  the training criterion.

  Args:
    num_labels: Number of output classes.
    pretrained_name: Checkpoint name passed to ``from_pretrained``
      (default ``'gpt2'``, matching the previous hard-coded value).
  """

  def __init__(self, num_labels, pretrained_name='gpt2'):
    super().__init__()
    self.model = GPT2Model.from_pretrained(
        pretrained_name,
        config=GPT2Config.from_pretrained(pretrained_name))
    # Take the hidden width from the loaded config rather than hard-coding
    # 768, so the head always matches the encoder.
    hidden_size = self.model.config.n_embd
    self.dropout = torch.nn.Dropout(p=0.1)
    self.fc_layer = torch.nn.Linear(in_features=hidden_size, out_features=hidden_size)
    self.tanh = torch.nn.Tanh()
    self.out_layer = torch.nn.Linear(in_features=hidden_size, out_features=num_labels)
    self.criterion = torch.nn.CrossEntropyLoss()

  def forward(self, input_ids, attention_mask=None, labels=None):
    """Compute class logits and, when labels are given, the loss.

    Args:
      input_ids: Token ids, indexed as [batch_size, seq_len].
      attention_mask: Optional mask of the same shape; non-zero marks real
        tokens. When given, padded positions are excluded from pooling.
      labels: Optional class indices of shape [batch_size].

    Returns:
      ``(logits, loss)`` where ``logits`` is [batch_size, num_labels] and
      ``loss`` is the cross-entropy loss, or ``None`` if ``labels`` is None.
    """
    hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
    #[batch_size, seq_len, hidden_size]

    if attention_mask is not None:
      keep = attention_mask.to(torch.bool)
      # A row without any real token would pool to -inf and produce NaNs
      # downstream; for such rows fall back to pooling over every position.
      keep = keep | ~keep.any(dim=1, keepdim=True)
      # Padded positions must never win the max, otherwise pad-token hidden
      # states leak into the sentence representation.
      hidden_states = hidden_states.masked_fill(~keep.unsqueeze(-1), float('-inf'))

    # Global max pooling over the sequence dimension.
    global_max_pooling_out, _ = torch.max(hidden_states, dim=1)
    global_max_pooling_out = self.dropout(global_max_pooling_out)
    #[batch_size, hidden_size]

    fc_layer_out = self.fc_layer(global_max_pooling_out)
    fc_layer_out = self.tanh(fc_layer_out)
    #[batch_size, hidden_size]

    fc_layer_out = self.dropout(fc_layer_out)
    logits = self.out_layer(fc_layer_out)
    #[batch_size, num_labels]

    # Labels are optional so the model can also be used for pure inference.
    loss = self.criterion(logits, labels) if labels is not None else None

    return logits, loss


In [None]:
def load_transformer(name, binary):
  """Build the GPT-2 classifier and its tokenizer.

  Args:
    name: Transformer name. Only ``'gpt2'`` is supported because the model
      and tokenizer checkpoints below are fixed to ``'gpt2'``.
    binary: If truthy, build a 2-class head; otherwise a 5-class head.

  Returns:
    A dict with keys ``'model'`` and ``'tokenizer'``.

  Raises:
    ValueError: If ``name`` is not a supported transformer name.
  """
  # Previously `name` was silently ignored, so a typo still loaded GPT-2 and
  # the caller's "invalid name" handling could never trigger.
  if name != 'gpt2':
    raise ValueError(f"Unsupported transformer name: {name!r}")

  num_classes = 2 if binary else 5
  model = GPT2ForSequenceClassification(num_classes)
  tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
  # GPT-2 ships without a padding token, so one is registered here.
  # NOTE(review): '.' is also a regular text token; the common convention is
  # to reuse the EOS token for padding -- confirm '.' is intentional.
  tokenizer.add_special_tokens({'pad_token': '.'})

  return {'model': model,
          'tokenizer': tokenizer}

In [None]:
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
def train_step(model, inputs, labels, optimizer):
    """Run one optimisation step on a single batch.

    Clears the gradients, runs the forward pass, back-propagates the loss and
    applies the optimizer update.

    Args:
        model: Callable returning ``(logits, loss)`` when invoked with
            input ids, ``attention_mask`` and ``labels``.
        inputs: Mapping providing ``'input_ids'`` and ``'attention_mask'``.
        labels: Targets forwarded to the model.
        optimizer: Optimizer holding the parameters to update.

    Returns:
        The ``(logits, loss)`` pair produced by the forward pass.
    """
    optimizer.zero_grad()

    batch_logits, batch_loss = model(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=labels,
    )

    # Back-propagate, then update the weights.
    batch_loss.backward()
    optimizer.step()

    return batch_logits, batch_loss

In [None]:
def eval_step(model, inputs, labels):
    """Run a forward pass on one batch without updating any weights.

    Args:
        model: Callable returning ``(logits, loss)`` when invoked with
            input ids, ``attention_mask`` and ``labels``.
        inputs: Mapping providing ``'input_ids'`` and ``'attention_mask'``.
        labels: Targets forwarded to the model.

    Returns:
        The ``(logits, loss)`` pair produced by the model.
    """
    batch_logits, batch_loss = model(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        labels=labels,
    )
    return batch_logits, batch_loss

In [None]:
def train_epoch(model, tokenizer, train_dataset, optimizer, batch_size):
    """Train the model for one pass over ``train_dataset``.

    Args:
        model: Model passed to ``train_step``; put into training mode here.
        tokenizer: Callable that turns a batch of texts into padded tensors.
        train_dataset: Dataset yielding ``(text, sentiment)`` pairs.
        optimizer: Optimizer used by ``train_step``.
        batch_size: Number of examples per batch.

    Returns:
        ``(accuracy, mean_loss)`` over the whole dataset, where ``mean_loss``
        is the average loss per example.
    """
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size,
                              shuffle=True)

    correct_count = 0
    total_loss = 0.0

    model.train()
    with tqdm(total=len(train_loader), desc='train', unit='batch') as pbar:
        for text, sentiment in train_loader:
            text = tokenizer(text, padding=True, return_tensors='pt').to(device)
            sentiment = sentiment.to(device)

            logits, loss = train_step(model, text, sentiment, optimizer)

            preds = torch.argmax(logits, dim=1)
            correct_count += (preds == sentiment).sum().item()
            # The model's criterion averages over the batch, so weight each
            # batch loss by its size before dividing by the dataset size;
            # otherwise the result is shrunk by roughly a factor of
            # batch_size and the short final batch is over-weighted.
            total_loss += loss.item() * sentiment.size(0)
            pbar.update(1)

    num_examples = len(train_dataset)
    return correct_count / num_examples, total_loss / num_examples

In [None]:
def eval_epoch(model, tokenizer, eval_dataset, batch_size, split):
    """Evaluate the model on ``eval_dataset`` without updating weights.

    Args:
        model: Model passed to ``eval_step``; put into eval mode here.
        tokenizer: Callable that turns a batch of texts into padded tensors.
        eval_dataset: Dataset yielding ``(text, sentiment)`` pairs.
        batch_size: Number of examples per batch.
        split: Split name; used as the progress-bar label and forwarded to
            ``evaluation_metrics``.

    Returns:
        ``(accuracy, mean_loss, metrics_score)`` where ``mean_loss`` is the
        average loss per example and ``metrics_score`` is whatever
        ``evaluation_metrics`` returns for the collected predictions.
    """
    # Evaluation does not depend on example order, so do not shuffle; this
    # also avoids consuming global RNG state between training epochs.
    eval_loader = DataLoader(dataset=eval_dataset,
                            batch_size=batch_size,
                            shuffle=False)

    correct_count = 0
    total_loss = 0.0
    y_pred = list()
    y_true = list()

    model.eval()
    with torch.no_grad():
        with tqdm(total=len(eval_loader), desc=split, unit='batch') as pbar:
            for text, sentiment in eval_loader:
                text = tokenizer(text, padding=True, return_tensors='pt').to(device)
                sentiment = sentiment.to(device)

                logits, loss = eval_step(model, text, sentiment)

                preds = torch.argmax(logits, dim=1)
                y_pred += preds.cpu().numpy().tolist()
                y_true += sentiment.cpu().numpy().tolist()

                correct_count += (preds == sentiment).sum().item()
                # The criterion returns a per-batch mean; weight it by the
                # batch size so the final value is a true per-example mean.
                total_loss += loss.item() * sentiment.size(0)
                pbar.update(1)

    metrics_score = evaluation_metrics(y_true, y_pred, split=split)
    num_examples = len(eval_dataset)
    return correct_count / num_examples, total_loss / num_examples, metrics_score

In [None]:
def train(name, root, binary, epochs=25, patience=3, save=False):
    """Fine-tune the classifier and log train/dev/test metrics every epoch.

    Args:
        name: Transformer name forwarded to ``load_transformer`` and
            ``transformer_params``.
        root: Forwarded to ``SSTDataset`` and ``root_and_binary_title``.
        binary: Forwarded to ``load_transformer``, ``SSTDataset`` and
            ``root_and_binary_title``.
        epochs: Maximum number of training epochs.
        patience: Number of consecutive epochs without dev-loss improvement
            after which training stops early.
        save: If truthy, save the model whenever dev accuracy improves.

    Raises:
        ValueError: Re-raised (after logging) when ``load_transformer``
            rejects ``name``.
    """
    #load model and tokenizer..
    try:
        transformer_container = load_transformer(name, binary)
    except ValueError:
        logger.error("Invalid transformer name!")
        # Propagate instead of os._exit(): killing the process takes the
        # notebook kernel down with it and skips all cleanup.
        raise
    model = transformer_container['model'].to(device)
    tokenizer = transformer_container['tokenizer']

    #load batch_size and learning rate..
    params_container = transformer_params(name)
    batch_size = params_container['batch_size']
    learning_rate = params_container['learning_rate']

    #load train, dev and test datasets..
    train_dataset = SSTDataset(root=root, binary=binary, split='train')
    dev_dataset = SSTDataset(root=root, binary=binary, split='dev')
    test_dataset = SSTDataset(root=root, binary=binary, split='test')

    #Initialize optimizer..
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    #Initialize training variables..
    best_acc = 0.0
    best_loss = np.inf
    stopping_step = 0
    best_model_name = None

    total_train_seconds = 0
    for epoch in range(epochs):

        start = time.time()
        train_acc, train_loss = train_epoch(model, tokenizer, train_dataset, optimizer, batch_size)
        end = time.time()
        total_train_seconds += (end - start)
        logger.info(f"epoch: {epoch+1}, transformer: {name}, train_loss: {train_loss:.4f}, train_acc: {train_acc*100:.2f}")

        dev_acc, dev_loss, _ = eval_epoch(model, tokenizer, dev_dataset, batch_size, 'dev')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, dev_loss: {dev_loss:.4f}, dev_acc: {dev_acc*100:.2f}")

        # The test split is evaluated for reporting only; it must not drive
        # checkpoint selection or early stopping (see below).
        test_acc, test_loss, test_evaluation_metrics = eval_epoch(model, tokenizer, test_dataset,
                                                                  batch_size, 'test')
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_loss: {test_loss:.4f}, test_acc: {test_acc*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, "
                    f"test_precision: {test_evaluation_metrics['test_precision']*100:.2f}, "
                    f"test_recall: {test_evaluation_metrics['test_recall']*100:.2f}, "
                    f"test_f1_score: {test_evaluation_metrics['test_f1_score']*100:.2f}, "
                    f"test_accuracy_score: {test_evaluation_metrics['test_accuracy']*100:.2f}")
        logger.info(f"epoch: {epoch+1}, transformer: {name}, test_confusion_matrix: \n"
                    f"{test_evaluation_metrics['test_confusion_matrix']}")

        logger.info(f"Total training time elapsed: {timedelta(seconds=total_train_seconds)}")
        logger.info(f"Mean time per train epoch: {timedelta(seconds=total_train_seconds/(epoch+1))}")

        #save best model and delete previous ones...
        # Model selection uses the dev split; selecting on test accuracy
        # would leak the test set into the reported results.
        if save and dev_acc > best_acc:
            best_acc = dev_acc
            phrase_type, label = root_and_binary_title(root, binary)
            model_name = "{}_{}_{}_{}.pickle".format(name, phrase_type, label, epoch)
            save_model(model, model_name, best_model_name)
            # Remember the latest checkpoint so the next save can refer to it.
            # NOTE(review): presumably save_model removes the file named by
            # its third argument -- confirm against utils.save_model.
            best_model_name = model_name

        # Early stopping on the dev loss (not the test loss, for the same
        # leakage reason as above).
        if dev_loss < best_loss:
            best_loss = dev_loss
            stopping_step = 0
        else:
            stopping_step += 1

        if stopping_step >= patience:
            logger.info("EarlyStopping!")
            # Leave the loop normally instead of os._exit(1), which would
            # kill the kernel and signal failure for a regular early stop.
            break


In [None]:
# Launch the run; keyword arguments spell out what each setting controls.
train(
    name='gpt2',
    root=True,
    binary=False,
    epochs=30,
    patience=300,
    save=False,
)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2020-09-01 06:36:24.502 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: train!
2020-09-01 06:36:31.084 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: dev!
2020-09-01 06:36:35.735 | INFO     | dataset:__init__:17 - Preparing dataset config root: True, binary: False, split: test!
train:   0%|          | 1/267 [00:00<00:52,  5.11batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:   1%|          | 3/267 [00:00<00:54,  4.87batch/s]

torch.Size([32, 54, 768])


train:   1%|▏         | 4/267 [00:00<00:51,  5.14batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 42, 768])


train:   2%|▏         | 6/267 [00:01<00:51,  5.10batch/s]

torch.Size([32, 54, 768])


train:   3%|▎         | 7/267 [00:01<00:50,  5.18batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 58, 768])


train:   3%|▎         | 9/267 [00:01<00:50,  5.10batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


train:   4%|▍         | 11/267 [00:02<00:50,  5.04batch/s]

torch.Size([32, 51, 768])


train:   4%|▍         | 12/267 [00:02<00:48,  5.21batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])


train:   5%|▌         | 14/267 [00:02<00:48,  5.18batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 52, 768])


train:   6%|▌         | 16/267 [00:03<00:47,  5.23batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 42, 768])


train:   7%|▋         | 18/267 [00:03<00:49,  5.08batch/s]

torch.Size([32, 57, 768])


train:   7%|▋         | 19/267 [00:03<00:45,  5.44batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 48, 768])


train:   8%|▊         | 21/267 [00:04<00:44,  5.56batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 46, 768])


train:   9%|▊         | 23/267 [00:04<00:45,  5.34batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


train:   9%|▉         | 25/267 [00:04<00:44,  5.44batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 50, 768])


train:  10%|█         | 27/267 [00:05<00:44,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  11%|█         | 29/267 [00:05<00:47,  5.05batch/s]

torch.Size([32, 57, 768])


train:  11%|█         | 30/267 [00:05<00:45,  5.25batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 45, 768])


train:  12%|█▏        | 32/267 [00:06<00:44,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])


train:  13%|█▎        | 34/267 [00:06<00:44,  5.25batch/s]

torch.Size([32, 51, 768])


train:  13%|█▎        | 35/267 [00:06<00:42,  5.40batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 52, 768])


train:  14%|█▍        | 37/267 [00:07<00:44,  5.16batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  15%|█▍        | 39/267 [00:07<00:43,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  15%|█▌        | 41/267 [00:07<00:42,  5.35batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  16%|█▌        | 43/267 [00:08<00:40,  5.49batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  17%|█▋        | 45/267 [00:08<00:39,  5.66batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 53, 768])


train:  18%|█▊        | 47/267 [00:08<00:41,  5.24batch/s]

torch.Size([32, 52, 768])


train:  18%|█▊        | 48/267 [00:09<00:41,  5.32batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 59, 768])


train:  19%|█▊        | 50/267 [00:09<00:44,  4.89batch/s]

torch.Size([32, 57, 768])


train:  19%|█▉        | 51/267 [00:09<00:44,  4.87batch/s]

torch.Size([32, 53, 768])


train:  19%|█▉        | 52/267 [00:09<00:44,  4.78batch/s]

torch.Size([32, 57, 768])


train:  20%|█▉        | 53/267 [00:10<00:44,  4.85batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 38, 768])


train:  21%|██        | 55/267 [00:10<00:39,  5.34batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  21%|██▏       | 57/267 [00:10<00:42,  4.99batch/s]

torch.Size([32, 58, 768])


train:  22%|██▏       | 58/267 [00:11<00:40,  5.20batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 38, 768])


train:  22%|██▏       | 60/267 [00:11<00:39,  5.22batch/s]

torch.Size([32, 56, 768])


train:  23%|██▎       | 61/267 [00:11<00:39,  5.23batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  24%|██▎       | 63/267 [00:12<00:38,  5.34batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 53, 768])


train:  24%|██▍       | 65/267 [00:12<00:38,  5.23batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:  25%|██▌       | 67/267 [00:12<00:37,  5.29batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 46, 768])


train:  26%|██▌       | 69/267 [00:13<00:37,  5.34batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 64, 768])


train:  27%|██▋       | 71/267 [00:13<00:39,  4.94batch/s]

torch.Size([32, 52, 768])


train:  27%|██▋       | 72/267 [00:13<00:37,  5.17batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 40, 768])


train:  28%|██▊       | 74/267 [00:14<00:34,  5.61batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 60, 768])


train:  28%|██▊       | 76/267 [00:14<00:35,  5.36batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 48, 768])


train:  29%|██▉       | 78/267 [00:14<00:33,  5.70batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 59, 768])


train:  30%|██▉       | 80/267 [00:15<00:36,  5.16batch/s]

torch.Size([32, 52, 768])


train:  31%|███       | 82/267 [00:15<00:36,  5.06batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 53, 768])


train:  31%|███       | 83/267 [00:15<00:34,  5.30batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 44, 768])


train:  32%|███▏      | 85/267 [00:16<00:33,  5.47batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  33%|███▎      | 87/267 [00:16<00:32,  5.49batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 41, 768])


train:  33%|███▎      | 89/267 [00:16<00:33,  5.36batch/s]

torch.Size([32, 53, 768])


train:  34%|███▎      | 90/267 [00:17<00:32,  5.37batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  34%|███▍      | 92/267 [00:17<00:32,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 43, 768])


train:  35%|███▌      | 94/267 [00:17<00:33,  5.19batch/s]

torch.Size([32, 56, 768])


train:  36%|███▌      | 95/267 [00:18<00:32,  5.27batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:  36%|███▋      | 97/267 [00:18<00:33,  5.10batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 58, 768])


train:  37%|███▋      | 99/267 [00:18<00:33,  5.04batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:  38%|███▊      | 101/267 [00:19<00:31,  5.29batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  39%|███▊      | 103/267 [00:19<00:29,  5.52batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:  39%|███▉      | 105/267 [00:19<00:28,  5.77batch/s]

torch.Size([32, 34, 768])
torch.Size([32, 51, 768])


train:  40%|████      | 107/267 [00:20<00:29,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 55, 768])


train:  41%|████      | 109/267 [00:20<00:29,  5.38batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 54, 768])


train:  42%|████▏     | 111/267 [00:21<00:29,  5.34batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 48, 768])


train:  42%|████▏     | 113/267 [00:21<00:28,  5.48batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train:  43%|████▎     | 115/267 [00:21<00:26,  5.71batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 42, 768])


train:  44%|████▍     | 117/267 [00:22<00:26,  5.61batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  45%|████▍     | 119/267 [00:22<00:28,  5.27batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:  45%|████▌     | 121/267 [00:22<00:27,  5.34batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  46%|████▌     | 123/267 [00:23<00:28,  5.05batch/s]

torch.Size([32, 59, 768])


train:  46%|████▋     | 124/267 [00:23<00:28,  5.06batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])


train:  47%|████▋     | 126/267 [00:23<00:28,  5.02batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 55, 768])


train:  48%|████▊     | 128/267 [00:24<00:27,  5.14batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  48%|████▊     | 129/267 [00:24<00:26,  5.22batch/s]

torch.Size([32, 64, 768])


train:  49%|████▉     | 131/267 [00:24<00:26,  5.13batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  50%|████▉     | 133/267 [00:25<00:25,  5.32batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  51%|█████     | 135/267 [00:25<00:25,  5.11batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  51%|█████▏    | 137/267 [00:26<00:25,  5.13batch/s]

torch.Size([32, 53, 768])


train:  52%|█████▏    | 138/267 [00:26<00:24,  5.34batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  52%|█████▏    | 140/267 [00:26<00:23,  5.51batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 39, 768])


train:  53%|█████▎    | 142/267 [00:26<00:22,  5.48batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  54%|█████▍    | 144/267 [00:27<00:23,  5.26batch/s]

torch.Size([32, 56, 768])


train:  54%|█████▍    | 145/267 [00:27<00:22,  5.32batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  55%|█████▌    | 147/267 [00:27<00:23,  5.03batch/s]

torch.Size([32, 60, 768])


train:  55%|█████▌    | 148/267 [00:28<00:22,  5.25batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  56%|█████▌    | 150/267 [00:28<00:21,  5.54batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 48, 768])


train:  57%|█████▋    | 152/267 [00:28<00:20,  5.56batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  58%|█████▊    | 154/267 [00:29<00:20,  5.52batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  58%|█████▊    | 156/267 [00:29<00:21,  5.25batch/s]

torch.Size([32, 54, 768])


train:  59%|█████▉    | 157/267 [00:29<00:22,  5.00batch/s]

torch.Size([32, 60, 768])


train:  59%|█████▉    | 158/267 [00:29<00:21,  5.13batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:  60%|█████▉    | 160/267 [00:30<00:21,  5.09batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  61%|██████    | 162/267 [00:30<00:19,  5.36batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  61%|██████▏   | 164/267 [00:31<00:18,  5.43batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  62%|██████▏   | 166/267 [00:31<00:17,  5.63batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 58, 768])


train:  63%|██████▎   | 168/267 [00:31<00:18,  5.31batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 36, 768])


train:  64%|██████▎   | 170/267 [00:32<00:16,  5.75batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 50, 768])


train:  64%|██████▍   | 172/267 [00:32<00:17,  5.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  65%|██████▌   | 174/267 [00:32<00:16,  5.48batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 37, 768])


train:  66%|██████▌   | 176/267 [00:33<00:16,  5.56batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


train:  67%|██████▋   | 178/267 [00:33<00:17,  5.18batch/s]

torch.Size([32, 55, 768])


train:  67%|██████▋   | 179/267 [00:33<00:16,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  68%|██████▊   | 181/267 [00:34<00:16,  5.21batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 56, 768])


train:  69%|██████▊   | 183/267 [00:34<00:15,  5.46batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 51, 768])


train:  69%|██████▉   | 185/267 [00:35<00:15,  5.16batch/s]

torch.Size([32, 54, 768])


train:  70%|██████▉   | 186/267 [00:35<00:15,  5.12batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 58, 768])


train:  70%|███████   | 188/267 [00:35<00:15,  4.99batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 48, 768])


train:  71%|███████   | 190/267 [00:35<00:14,  5.39batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 39, 768])


train:  72%|███████▏  | 192/267 [00:36<00:14,  5.33batch/s]

torch.Size([32, 54, 768])


train:  72%|███████▏  | 193/267 [00:36<00:14,  5.23batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  73%|███████▎  | 195/267 [00:36<00:13,  5.15batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  74%|███████▍  | 197/267 [00:37<00:13,  5.25batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  75%|███████▍  | 199/267 [00:37<00:13,  5.04batch/s]

torch.Size([32, 56, 768])


train:  75%|███████▍  | 200/267 [00:37<00:13,  5.00batch/s]

torch.Size([32, 53, 768])


train:  75%|███████▌  | 201/267 [00:38<00:12,  5.13batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 52, 768])


train:  76%|███████▌  | 203/267 [00:38<00:12,  5.15batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 44, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.24batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 39, 768])


train:  78%|███████▊  | 207/267 [00:39<00:11,  5.34batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  78%|███████▊  | 209/267 [00:39<00:11,  5.18batch/s]

torch.Size([32, 54, 768])


train:  79%|███████▊  | 210/267 [00:39<00:11,  5.08batch/s]

torch.Size([32, 53, 768])


train:  79%|███████▉  | 211/267 [00:40<00:10,  5.30batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 43, 768])


train:  80%|███████▉  | 213/267 [00:40<00:09,  5.57batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.32batch/s]

torch.Size([32, 56, 768])


train:  81%|████████  | 216/267 [00:40<00:09,  5.43batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  82%|████████▏ | 218/267 [00:41<00:09,  5.36batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 50, 768])


train:  82%|████████▏ | 220/267 [00:41<00:09,  5.17batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


train:  83%|████████▎ | 222/267 [00:42<00:08,  5.03batch/s]

torch.Size([32, 53, 768])


train:  84%|████████▎ | 223/267 [00:42<00:08,  4.90batch/s]

torch.Size([32, 57, 768])


train:  84%|████████▍ | 224/267 [00:42<00:08,  5.14batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.28batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 55, 768])


train:  85%|████████▌ | 228/267 [00:43<00:07,  5.37batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 58, 768])


train:  86%|████████▌ | 230/267 [00:43<00:07,  5.02batch/s]

torch.Size([32, 54, 768])


train:  87%|████████▋ | 231/267 [00:43<00:07,  5.03batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  87%|████████▋ | 233/267 [00:44<00:06,  4.98batch/s]

torch.Size([32, 59, 768])


train:  88%|████████▊ | 234/267 [00:44<00:06,  5.00batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.20batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  89%|████████▉ | 238/267 [00:45<00:05,  5.42batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  90%|████████▉ | 240/267 [00:45<00:05,  5.22batch/s]

torch.Size([32, 54, 768])


train:  90%|█████████ | 241/267 [00:45<00:05,  5.11batch/s]

torch.Size([32, 54, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.12batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])


train:  91%|█████████▏| 244/267 [00:46<00:04,  5.10batch/s]

torch.Size([32, 54, 768])


train:  92%|█████████▏| 245/267 [00:46<00:04,  5.18batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 59, 768])


train:  93%|█████████▎| 247/267 [00:46<00:04,  4.94batch/s]

torch.Size([32, 53, 768])


train:  93%|█████████▎| 248/267 [00:47<00:03,  5.11batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:  94%|█████████▎| 250/267 [00:47<00:03,  5.32batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


train:  94%|█████████▍| 252/267 [00:47<00:02,  5.18batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


train:  95%|█████████▌| 254/267 [00:48<00:02,  5.16batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 46, 768])


train:  96%|█████████▌| 256/267 [00:48<00:02,  5.49batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 45, 768])


train:  97%|█████████▋| 258/267 [00:49<00:01,  5.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:  97%|█████████▋| 260/267 [00:49<00:01,  5.41batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  98%|█████████▊| 262/267 [00:49<00:00,  5.03batch/s]

torch.Size([32, 60, 768])


train:  99%|█████████▊| 263/267 [00:50<00:00,  5.15batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 56, 768])


train:  99%|█████████▉| 265/267 [00:50<00:00,  5.03batch/s]

torch.Size([32, 51, 768])


train: 100%|█████████▉| 266/267 [00:50<00:00,  4.98batch/s]

torch.Size([32, 54, 768])


train: 100%|██████████| 267/267 [00:50<00:00,  5.26batch/s]
2020-09-01 06:37:27.103 | INFO     | __main__:train:39 - epoch: 1, transformer: gpt2, train_loss: 0.0489, train_acc: 28.17
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 41, 768])


dev:  11%|█▏        | 4/35 [00:00<00:01, 16.84batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 50, 768])
torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 16.40batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])
torch.Size([32, 54, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.20batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])
torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 16.30batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 43, 768])
torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 15.84batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])
torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 16.22batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])
torch.Size([32, 43, 768])
torch.Size([32, 43, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 16.49batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])
torch.Size([32, 43, 768])
torch.Size([32, 45, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.64batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 38, 768])
torch.Size([32, 41, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.40batch/s]
  _warn_prf(average, modifier, msg_start, len(result))
2020-09-01 06:37:29.262 | INFO     | __main__:train:42 - epoch: 1, transformer: gpt2, dev_loss: 0.0465, dev_acc: 39.69
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 58, 768])
torch.Size([13, 47, 768])
torch.Size([32, 44, 768])
torch.Size([32, 42, 768])


test:   6%|▌         | 4/70 [00:00<00:03, 17.31batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 17.11batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 38, 768])
torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.18batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 46, 768])
torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 16.39batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])
torch.Size([32, 35, 768])
torch.Size([32, 49, 768])


test:  29%|██▊       | 20/70 [00:01<00:03, 16.01batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 56, 768])
torch.Size([32, 41, 768])
torch.Size([32, 49, 768])


test:  37%|███▋      | 26/70 [00:01<00:02, 17.30batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 43, 768])
torch.Size([32, 41, 768])
torch.Size([32, 34, 768])


test:  40%|████      | 28/70 [00:01<00:02, 17.04batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 51, 768])
torch.Size([32, 54, 768])
torch.Size([32, 46, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 15.84batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 53, 768])
torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.07batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])
torch.Size([32, 58, 768])
torch.Size([32, 43, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 15.56batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 62, 768])
torch.Size([32, 56, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.01batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 44, 768])
torch.Size([32, 55, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.01batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.04batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 46, 768])
torch.Size([32, 49, 768])
torch.Size([32, 56, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.08batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 48, 768])
torch.Size([32, 39, 768])
torch.Size([32, 52, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 15.75batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])
torch.Size([32, 44, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 15.89batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 41, 768])


test:  97%|█████████▋| 68/70 [00:04<00:00, 16.20batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 46, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.35batch/s]
2020-09-01 06:37:33.563 | INFO     | __main__:train:46 - epoch: 1, transformer: gpt2, test_loss: 0.0461, test_acc: 39.41
2020-09-01 06:37:33.564 | INFO     | __main__:train:47 - epoch: 1, transformer: gpt2, test_precision: 16.42, test_recall: 31.02, test_f1_score: 20.93, test_accuracy_score: 39.41
2020-09-01 06:37:33.565 | INFO     | __main__:train:52 - epoch: 1, transformer: gpt2, test_confusion_matrix: 
[[  0 210   1  68   0]
 [  0 412   0 221   0]
 [  0 175   0 214   0]
 [  0  50   1 459   0]
 [  0   9   1 389   0]]
2020-09-01 06:37:33.565 | INFO     | __main__:train:55 - Total training time elapsed: 0:00:50.809495
2020-09-01 06:37:33.566 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:50.809495
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([2, 28, 768])
torch.Size([32, 59, 768])


train:   1%|          | 2/267 [00:00<00:55,  4.81batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:   1%|▏         | 4/267 [00:00<00:51,  5.15batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


train:   2%|▏         | 6/267 [00:01<00:49,  5.28batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 37, 768])


train:   3%|▎         | 8/267 [00:01<00:49,  5.20batch/s]

torch.Size([32, 59, 768])


train:   3%|▎         | 9/267 [00:01<00:47,  5.38batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 58, 768])


train:   4%|▍         | 11/267 [00:02<00:47,  5.34batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])


train:   5%|▍         | 13/267 [00:02<00:45,  5.54batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 50, 768])


train:   6%|▌         | 15/267 [00:02<00:49,  5.12batch/s]

torch.Size([32, 59, 768])


train:   6%|▌         | 16/267 [00:03<00:49,  5.08batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 57, 768])


train:   7%|▋         | 18/267 [00:03<00:50,  4.95batch/s]

torch.Size([32, 53, 768])


train:   7%|▋         | 19/267 [00:03<00:48,  5.11batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


train:   8%|▊         | 21/267 [00:04<00:48,  5.11batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 48, 768])


train:   9%|▊         | 23/267 [00:04<00:49,  4.98batch/s]

torch.Size([32, 59, 768])


train:   9%|▉         | 24/267 [00:04<00:46,  5.23batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 40, 768])


train:  10%|▉         | 26/267 [00:04<00:44,  5.45batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


train:  10%|█         | 28/267 [00:05<00:41,  5.75batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 53, 768])


train:  11%|█         | 30/267 [00:05<00:43,  5.51batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 50, 768])


train:  12%|█▏        | 32/267 [00:06<00:43,  5.46batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  13%|█▎        | 34/267 [00:06<00:41,  5.57batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 37, 768])


train:  13%|█▎        | 36/267 [00:06<00:40,  5.66batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  14%|█▍        | 38/267 [00:07<00:43,  5.30batch/s]

torch.Size([32, 54, 768])


train:  15%|█▍        | 39/267 [00:07<00:42,  5.33batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  15%|█▌        | 41/267 [00:07<00:40,  5.58batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  16%|█▌        | 43/267 [00:08<00:41,  5.45batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


train:  17%|█▋        | 45/267 [00:08<00:40,  5.53batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  18%|█▊        | 47/267 [00:08<00:41,  5.36batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  18%|█▊        | 49/267 [00:09<00:39,  5.55batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  19%|█▉        | 51/267 [00:09<00:40,  5.33batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  20%|█▉        | 53/267 [00:09<00:41,  5.16batch/s]

torch.Size([32, 59, 768])


train:  20%|██        | 54/267 [00:10<00:41,  5.08batch/s]

torch.Size([32, 54, 768])


train:  21%|██        | 55/267 [00:10<00:42,  5.01batch/s]

torch.Size([32, 55, 768])


train:  21%|██        | 56/267 [00:10<00:41,  5.14batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  22%|██▏       | 58/267 [00:10<00:41,  5.08batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 56, 768])


train:  22%|██▏       | 60/267 [00:11<00:39,  5.21batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  23%|██▎       | 62/267 [00:11<00:41,  4.97batch/s]

torch.Size([32, 60, 768])


train:  24%|██▎       | 63/267 [00:11<00:40,  5.10batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  24%|██▍       | 65/267 [00:12<00:38,  5.20batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  25%|██▌       | 67/267 [00:12<00:39,  5.06batch/s]

torch.Size([32, 53, 768])


train:  25%|██▌       | 68/267 [00:12<00:38,  5.21batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  26%|██▌       | 70/267 [00:13<00:37,  5.23batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  27%|██▋       | 72/267 [00:13<00:36,  5.35batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 43, 768])


train:  28%|██▊       | 74/267 [00:13<00:35,  5.47batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  28%|██▊       | 76/267 [00:14<00:35,  5.39batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  29%|██▉       | 78/267 [00:14<00:35,  5.39batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  30%|██▉       | 80/267 [00:15<00:36,  5.16batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 50, 768])


train:  31%|███       | 82/267 [00:15<00:34,  5.32batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  31%|███▏      | 84/267 [00:15<00:34,  5.31batch/s]

torch.Size([32, 54, 768])


train:  32%|███▏      | 85/267 [00:16<00:33,  5.44batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 57, 768])


train:  33%|███▎      | 87/267 [00:16<00:35,  5.06batch/s]

torch.Size([32, 56, 768])


train:  33%|███▎      | 88/267 [00:16<00:35,  5.05batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 39, 768])


train:  34%|███▎      | 90/267 [00:16<00:32,  5.50batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 58, 768])


train:  34%|███▍      | 92/267 [00:17<00:34,  5.10batch/s]

torch.Size([32, 54, 768])


train:  35%|███▍      | 93/267 [00:17<00:34,  5.05batch/s]

torch.Size([32, 53, 768])


train:  35%|███▌      | 94/267 [00:17<00:33,  5.09batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  36%|███▌      | 96/267 [00:18<00:31,  5.45batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 64, 768])


train:  37%|███▋      | 98/267 [00:18<00:32,  5.13batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 64, 768])


train:  37%|███▋      | 100/267 [00:18<00:32,  5.18batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 43, 768])


train:  38%|███▊      | 102/267 [00:19<00:31,  5.25batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  39%|███▉      | 104/267 [00:19<00:29,  5.48batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


train:  40%|███▉      | 106/267 [00:19<00:27,  5.83batch/s]

torch.Size([32, 34, 768])
torch.Size([32, 56, 768])


train:  40%|████      | 108/267 [00:20<00:28,  5.52batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 34, 768])


train:  41%|████      | 110/267 [00:20<00:26,  5.89batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])


train:  42%|████▏     | 112/267 [00:21<00:28,  5.39batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 55, 768])


train:  43%|████▎     | 114/267 [00:21<00:29,  5.17batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 36, 768])


train:  43%|████▎     | 116/267 [00:21<00:27,  5.41batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 57, 768])


train:  44%|████▍     | 118/267 [00:22<00:28,  5.29batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 40, 768])


train:  45%|████▍     | 120/267 [00:22<00:25,  5.66batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 60, 768])


train:  46%|████▌     | 122/267 [00:22<00:27,  5.19batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 35, 768])


train:  46%|████▋     | 124/267 [00:23<00:25,  5.67batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  47%|████▋     | 126/267 [00:23<00:26,  5.35batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 58, 768])


train:  48%|████▊     | 128/267 [00:24<00:28,  4.95batch/s]

torch.Size([32, 58, 768])


train:  48%|████▊     | 129/267 [00:24<00:26,  5.19batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  49%|████▉     | 131/267 [00:24<00:24,  5.48batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  50%|████▉     | 133/267 [00:25<00:24,  5.37batch/s]

torch.Size([32, 53, 768])


train:  50%|█████     | 134/267 [00:25<00:25,  5.31batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


train:  51%|█████     | 136/267 [00:25<00:25,  5.22batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


train:  52%|█████▏    | 138/267 [00:25<00:24,  5.16batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  52%|█████▏    | 140/267 [00:26<00:24,  5.09batch/s]

torch.Size([32, 53, 768])


train:  53%|█████▎    | 141/267 [00:26<00:24,  5.23batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  54%|█████▎    | 143/267 [00:26<00:23,  5.36batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 36, 768])


train:  54%|█████▍    | 145/267 [00:27<00:22,  5.34batch/s]

torch.Size([32, 58, 768])


train:  55%|█████▍    | 146/267 [00:27<00:22,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 56, 768])


train:  55%|█████▌    | 148/267 [00:27<00:23,  5.13batch/s]

torch.Size([32, 54, 768])


train:  56%|█████▌    | 149/267 [00:28<00:22,  5.30batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 37, 768])


train:  57%|█████▋    | 151/267 [00:28<00:21,  5.36batch/s]

torch.Size([32, 53, 768])


train:  57%|█████▋    | 152/267 [00:28<00:22,  5.22batch/s]

torch.Size([32, 54, 768])


train:  57%|█████▋    | 153/267 [00:28<00:22,  5.17batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


train:  58%|█████▊    | 155/267 [00:29<00:21,  5.30batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  59%|█████▉    | 157/267 [00:29<00:20,  5.28batch/s]

torch.Size([32, 53, 768])


train:  59%|█████▉    | 158/267 [00:29<00:20,  5.37batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


train:  60%|█████▉    | 160/267 [00:30<00:19,  5.49batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


train:  61%|██████    | 162/267 [00:30<00:18,  5.59batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  61%|██████▏   | 164/267 [00:30<00:19,  5.28batch/s]

torch.Size([32, 59, 768])


train:  62%|██████▏   | 165/267 [00:31<00:19,  5.15batch/s]

torch.Size([32, 54, 768])


train:  62%|██████▏   | 166/267 [00:31<00:19,  5.09batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 50, 768])


train:  63%|██████▎   | 168/267 [00:31<00:19,  5.12batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 39, 768])


train:  64%|██████▎   | 170/267 [00:32<00:18,  5.24batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 52, 768])


train:  64%|██████▍   | 172/267 [00:32<00:17,  5.28batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  65%|██████▌   | 174/267 [00:32<00:17,  5.39batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  66%|██████▌   | 176/267 [00:33<00:16,  5.53batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 55, 768])


train:  67%|██████▋   | 178/267 [00:33<00:16,  5.34batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


train:  67%|██████▋   | 180/267 [00:33<00:16,  5.14batch/s]

torch.Size([32, 54, 768])


train:  68%|██████▊   | 181/267 [00:34<00:16,  5.15batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


train:  69%|██████▊   | 183/267 [00:34<00:16,  5.09batch/s]

torch.Size([32, 54, 768])


train:  69%|██████▉   | 184/267 [00:34<00:15,  5.29batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 38, 768])


train:  70%|██████▉   | 186/267 [00:34<00:14,  5.67batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 59, 768])


train:  70%|███████   | 188/267 [00:35<00:14,  5.32batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 36, 768])


train:  71%|███████   | 190/267 [00:35<00:13,  5.62batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:  72%|███████▏  | 192/267 [00:36<00:13,  5.65batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.45batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 41, 768])


train:  73%|███████▎  | 196/267 [00:36<00:12,  5.54batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  74%|███████▍  | 198/267 [00:37<00:12,  5.35batch/s]

torch.Size([32, 54, 768])


train:  75%|███████▍  | 199/267 [00:37<00:12,  5.41batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.14batch/s]

torch.Size([32, 56, 768])


train:  76%|███████▌  | 202/267 [00:37<00:13,  4.98batch/s]

torch.Size([32, 57, 768])


train:  76%|███████▌  | 203/267 [00:38<00:12,  5.24batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 41, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.37batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  78%|███████▊  | 207/267 [00:38<00:11,  5.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 49, 768])


train:  78%|███████▊  | 209/267 [00:39<00:10,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 55, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.12batch/s]

torch.Size([32, 56, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.20batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  80%|████████  | 214/267 [00:40<00:10,  4.97batch/s]

torch.Size([32, 58, 768])


train:  81%|████████  | 215/267 [00:40<00:10,  5.14batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 56, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.24batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 42, 768])


train:  82%|████████▏ | 219/267 [00:41<00:09,  5.10batch/s]

torch.Size([32, 60, 768])


train:  82%|████████▏ | 220/267 [00:41<00:09,  5.04batch/s]

torch.Size([32, 53, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.18batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  84%|████████▎ | 223/267 [00:41<00:08,  5.29batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 34, 768])


train:  84%|████████▍ | 225/267 [00:42<00:07,  5.85batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 47, 768])


train:  85%|████████▌ | 227/267 [00:42<00:06,  5.72batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  86%|████████▌ | 229/267 [00:43<00:07,  5.33batch/s]

torch.Size([32, 57, 768])


train:  86%|████████▌ | 230/267 [00:43<00:06,  5.41batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.56batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.48batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.57batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 41, 768])


train:  89%|████████▉ | 238/267 [00:44<00:05,  5.73batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 42, 768])


train:  90%|████████▉ | 240/267 [00:44<00:04,  5.79batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 60, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.58batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 35, 768])


train:  91%|█████████▏| 244/267 [00:45<00:03,  5.95batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 54, 768])


train:  92%|█████████▏| 246/267 [00:46<00:03,  5.70batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 40, 768])


train:  93%|█████████▎| 248/267 [00:46<00:03,  5.80batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 50, 768])


train:  94%|█████████▎| 250/267 [00:46<00:03,  5.36batch/s]

torch.Size([32, 54, 768])


train:  94%|█████████▍| 251/267 [00:46<00:03,  5.26batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.33batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.15batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  96%|█████████▋| 257/267 [00:48<00:01,  5.17batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.28batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.49batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])


train:  99%|█████████▊| 263/267 [00:49<00:00,  5.29batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.54batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 49, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.34batch/s]
2020-09-01 06:38:23.534 | INFO     | __main__:train:39 - epoch: 2, transformer: gpt2, train_loss: 0.0422, train_acc: 41.98
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


dev:  11%|█▏        | 4/35 [00:00<00:01, 17.40batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 38, 768])
torch.Size([32, 46, 768])
torch.Size([32, 43, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 17.60batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 45, 768])
torch.Size([32, 40, 768])
torch.Size([32, 54, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.72batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 17.01batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])
torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 17.41batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 40, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 17.32batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 46, 768])
torch.Size([32, 44, 768])
torch.Size([32, 35, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 17.47batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])
torch.Size([32, 43, 768])
torch.Size([32, 50, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.10batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 17.19batch/s]
2020-09-01 06:38:25.583 | INFO     | __main__:train:42 - epoch: 2, transformer: gpt2, dev_loss: 0.0397, dev_acc: 45.69
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])
torch.Size([13, 41, 768])
torch.Size([32, 52, 768])


test:   6%|▌         | 4/70 [00:00<00:03, 17.42batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 48, 768])
torch.Size([32, 36, 768])
torch.Size([32, 62, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 16.24batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 40, 768])
torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.00batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 53, 768])
torch.Size([32, 40, 768])
torch.Size([32, 50, 768])


test:  23%|██▎       | 16/70 [00:01<00:03, 15.52batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 56, 768])
torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


test:  29%|██▊       | 20/70 [00:01<00:03, 15.89batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])
torch.Size([32, 49, 768])
torch.Size([32, 52, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 15.95batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


test:  40%|████      | 28/70 [00:01<00:02, 15.98batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 48, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.98batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])
torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.32batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 51, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.55batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.25batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 53, 768])
torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.18batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 58, 768])
torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 43, 768])
torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.54batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])
torch.Size([32, 49, 768])
torch.Size([32, 39, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.82batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])
torch.Size([32, 53, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 16.66batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


test:  97%|█████████▋| 68/70 [00:04<00:00, 16.47batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 40, 768])
torch.Size([32, 55, 768])
torch.Size([32, 42, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.47batch/s]
2020-09-01 06:38:29.850 | INFO     | __main__:train:46 - epoch: 2, transformer: gpt2, test_loss: 0.0391, test_acc: 44.66
2020-09-01 06:38:29.851 | INFO     | __main__:train:47 - epoch: 2, transformer: gpt2, test_precision: 34.43, test_recall: 35.27, test_f1_score: 27.40, test_accuracy_score: 44.66
2020-09-01 06:38:29.851 | INFO     | __main__:train:52 - epoch: 2, transformer: gpt2, test_confusion_matrix: 
[[  0 253   0  26   0]
 [  0 526  17  90   0]
 [  0 229   5 153   2]
 [  0  72   9 409  20]
 [  0  12   1 339  47]]
2020-09-01 06:38:29.852 | INFO     | __main__:train:55 - Total training time elapsed: 0:01:40.776264
2020-09-01 06:38:29.853 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:50.388132
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([2, 39, 768])
torch.Size([32, 52, 768])


train:   1%|          | 2/267 [00:00<00:51,  5.13batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:   1%|▏         | 4/267 [00:00<00:53,  4.92batch/s]

torch.Size([32, 60, 768])


train:   2%|▏         | 5/267 [00:01<00:54,  4.80batch/s]

torch.Size([32, 59, 768])


train:   2%|▏         | 6/267 [00:01<00:51,  5.06batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 48, 768])


train:   3%|▎         | 8/267 [00:01<00:51,  5.07batch/s]

torch.Size([32, 55, 768])


train:   3%|▎         | 9/267 [00:01<00:50,  5.11batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


train:   4%|▍         | 11/267 [00:02<00:46,  5.49batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:   5%|▍         | 13/267 [00:02<00:45,  5.59batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 43, 768])


train:   6%|▌         | 15/267 [00:02<00:43,  5.84batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 52, 768])


train:   6%|▋         | 17/267 [00:03<00:45,  5.54batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:   7%|▋         | 19/267 [00:03<00:43,  5.74batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 42, 768])


train:   8%|▊         | 21/267 [00:03<00:44,  5.59batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:   9%|▊         | 23/267 [00:04<00:44,  5.46batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:   9%|▉         | 25/267 [00:04<00:47,  5.11batch/s]

torch.Size([32, 56, 768])


train:  10%|▉         | 26/267 [00:04<00:45,  5.30batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  10%|█         | 28/267 [00:05<00:45,  5.21batch/s]

torch.Size([32, 54, 768])


train:  11%|█         | 29/267 [00:05<00:46,  5.11batch/s]

torch.Size([32, 54, 768])


train:  11%|█         | 30/267 [00:05<00:44,  5.33batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  12%|█▏        | 32/267 [00:05<00:44,  5.24batch/s]

torch.Size([32, 54, 768])


train:  12%|█▏        | 33/267 [00:06<00:43,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 60, 768])


train:  13%|█▎        | 35/267 [00:06<00:45,  5.09batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  14%|█▍        | 37/267 [00:06<00:43,  5.27batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 43, 768])


train:  15%|█▍        | 39/267 [00:07<00:42,  5.43batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


train:  15%|█▌        | 41/267 [00:07<00:39,  5.67batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 43, 768])


train:  16%|█▌        | 43/267 [00:08<00:41,  5.43batch/s]

torch.Size([32, 56, 768])


train:  16%|█▋        | 44/267 [00:08<00:41,  5.44batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  17%|█▋        | 46/267 [00:08<00:43,  5.06batch/s]

torch.Size([32, 58, 768])


train:  18%|█▊        | 47/267 [00:08<00:42,  5.20batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 40, 768])


train:  18%|█▊        | 49/267 [00:09<00:42,  5.14batch/s]

torch.Size([32, 59, 768])


train:  19%|█▊        | 50/267 [00:09<00:41,  5.22batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  19%|█▉        | 52/267 [00:09<00:39,  5.47batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 55, 768])


train:  20%|██        | 54/267 [00:10<00:40,  5.22batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  21%|██        | 56/267 [00:10<00:39,  5.31batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 60, 768])


train:  22%|██▏       | 58/267 [00:10<00:40,  5.16batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  22%|██▏       | 60/267 [00:11<00:38,  5.43batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 54, 768])


train:  23%|██▎       | 62/267 [00:11<00:38,  5.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


train:  24%|██▍       | 64/267 [00:12<00:39,  5.11batch/s]

torch.Size([32, 54, 768])


train:  24%|██▍       | 65/267 [00:12<00:37,  5.34batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  25%|██▌       | 67/267 [00:12<00:37,  5.40batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  26%|██▌       | 69/267 [00:12<00:38,  5.16batch/s]

torch.Size([32, 53, 768])


train:  26%|██▌       | 70/267 [00:13<00:38,  5.10batch/s]

torch.Size([32, 53, 768])


train:  27%|██▋       | 72/267 [00:13<00:37,  5.18batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 54, 768])
torch.Size([32, 39, 768])


train:  28%|██▊       | 74/267 [00:13<00:36,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  28%|██▊       | 76/267 [00:14<00:36,  5.19batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 41, 768])


train:  29%|██▉       | 78/267 [00:14<00:35,  5.34batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


train:  30%|██▉       | 80/267 [00:15<00:33,  5.50batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 59, 768])


train:  31%|███       | 82/267 [00:15<00:34,  5.29batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


train:  31%|███▏      | 84/267 [00:15<00:33,  5.55batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


train:  32%|███▏      | 86/267 [00:16<00:31,  5.72batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 49, 768])


train:  33%|███▎      | 88/267 [00:16<00:33,  5.40batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  34%|███▎      | 90/267 [00:16<00:33,  5.34batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  34%|███▍      | 92/267 [00:17<00:33,  5.28batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 40, 768])


train:  35%|███▌      | 94/267 [00:17<00:31,  5.51batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 64, 768])


train:  36%|███▌      | 96/267 [00:18<00:33,  5.11batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


train:  37%|███▋      | 98/267 [00:18<00:34,  4.91batch/s]

torch.Size([32, 59, 768])


train:  37%|███▋      | 99/267 [00:18<00:33,  5.06batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


train:  38%|███▊      | 101/267 [00:18<00:30,  5.44batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 41, 768])


train:  39%|███▊      | 103/267 [00:19<00:31,  5.27batch/s]

torch.Size([32, 57, 768])


train:  39%|███▉      | 104/267 [00:19<00:30,  5.31batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  40%|███▉      | 106/267 [00:19<00:29,  5.38batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


train:  40%|████      | 108/267 [00:20<00:29,  5.40batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 48, 768])


train:  41%|████      | 110/267 [00:20<00:30,  5.15batch/s]

torch.Size([32, 57, 768])


train:  42%|████▏     | 111/267 [00:20<00:31,  4.97batch/s]

torch.Size([32, 58, 768])


train:  42%|████▏     | 112/267 [00:21<00:31,  4.94batch/s]

torch.Size([32, 54, 768])


train:  42%|████▏     | 113/267 [00:21<00:29,  5.21batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 41, 768])


train:  43%|████▎     | 115/267 [00:21<00:27,  5.52batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:  44%|████▍     | 117/267 [00:21<00:27,  5.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  45%|████▍     | 119/267 [00:22<00:26,  5.60batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 50, 768])


train:  45%|████▌     | 121/267 [00:22<00:25,  5.62batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  46%|████▌     | 123/267 [00:23<00:25,  5.57batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 58, 768])


train:  47%|████▋     | 125/267 [00:23<00:26,  5.31batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


train:  48%|████▊     | 127/267 [00:23<00:26,  5.22batch/s]

torch.Size([32, 57, 768])


train:  48%|████▊     | 128/267 [00:24<00:26,  5.17batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


train:  49%|████▊     | 130/267 [00:24<00:26,  5.20batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  49%|████▉     | 132/267 [00:24<00:24,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


train:  50%|█████     | 134/267 [00:25<00:24,  5.43batch/s]

torch.Size([32, 52, 768])


train:  51%|█████     | 135/267 [00:25<00:24,  5.32batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 58, 768])


train:  51%|█████▏    | 137/267 [00:25<00:25,  5.18batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


train:  52%|█████▏    | 139/267 [00:26<00:23,  5.52batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 38, 768])


train:  53%|█████▎    | 141/267 [00:26<00:22,  5.62batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 59, 768])


train:  54%|█████▎    | 143/267 [00:26<00:22,  5.51batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 42, 768])


train:  54%|█████▍    | 145/267 [00:27<00:21,  5.75batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 56, 768])


train:  55%|█████▌    | 147/267 [00:27<00:22,  5.36batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


train:  56%|█████▌    | 149/267 [00:27<00:20,  5.62batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 43, 768])


train:  57%|█████▋    | 151/267 [00:28<00:21,  5.50batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  57%|█████▋    | 153/267 [00:28<00:20,  5.66batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  58%|█████▊    | 155/267 [00:28<00:20,  5.42batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  59%|█████▉    | 157/267 [00:29<00:21,  5.24batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])


train:  60%|█████▉    | 159/267 [00:29<00:19,  5.66batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 59, 768])


train:  60%|██████    | 161/267 [00:30<00:19,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


train:  61%|██████    | 163/267 [00:30<00:19,  5.21batch/s]

torch.Size([32, 57, 768])


train:  61%|██████▏   | 164/267 [00:30<00:19,  5.28batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:  62%|██████▏   | 166/267 [00:31<00:18,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  63%|██████▎   | 168/267 [00:31<00:17,  5.65batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


train:  64%|██████▎   | 170/267 [00:31<00:17,  5.63batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])


train:  64%|██████▍   | 172/267 [00:32<00:17,  5.44batch/s]

torch.Size([32, 54, 768])


train:  65%|██████▍   | 173/267 [00:32<00:17,  5.33batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 37, 768])


train:  66%|██████▌   | 175/267 [00:32<00:16,  5.58batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  66%|██████▋   | 177/267 [00:33<00:17,  5.22batch/s]

torch.Size([32, 54, 768])


train:  67%|██████▋   | 178/267 [00:33<00:16,  5.30batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:  67%|██████▋   | 180/267 [00:33<00:15,  5.67batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 42, 768])


train:  68%|██████▊   | 182/267 [00:33<00:14,  5.89batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 55, 768])


train:  69%|██████▉   | 184/267 [00:34<00:15,  5.40batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 58, 768])


train:  70%|██████▉   | 186/267 [00:34<00:15,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 55, 768])


train:  70%|███████   | 188/267 [00:35<00:15,  5.23batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  71%|███████   | 190/267 [00:35<00:14,  5.37batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 42, 768])


train:  72%|███████▏  | 192/267 [00:35<00:13,  5.39batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.23batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


train:  73%|███████▎  | 196/267 [00:36<00:13,  5.43batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


train:  74%|███████▍  | 198/267 [00:36<00:12,  5.61batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  75%|███████▍  | 200/267 [00:37<00:12,  5.26batch/s]

torch.Size([32, 54, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.48batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  76%|███████▌  | 203/267 [00:37<00:11,  5.36batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  77%|███████▋  | 205/267 [00:38<00:12,  5.12batch/s]

torch.Size([32, 56, 768])


train:  77%|███████▋  | 206/267 [00:38<00:11,  5.24batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 46, 768])


train:  78%|███████▊  | 208/267 [00:38<00:11,  5.20batch/s]

torch.Size([32, 54, 768])


train:  78%|███████▊  | 209/267 [00:39<00:11,  5.09batch/s]

torch.Size([32, 56, 768])


train:  79%|███████▊  | 210/267 [00:39<00:10,  5.33batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.03batch/s]

torch.Size([32, 57, 768])


train:  80%|███████▉  | 213/267 [00:39<00:10,  5.19batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 46, 768])


train:  81%|████████  | 215/267 [00:40<00:10,  5.04batch/s]

torch.Size([32, 59, 768])


train:  81%|████████  | 216/267 [00:40<00:10,  5.03batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  82%|████████▏ | 218/267 [00:40<00:09,  5.25batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  82%|████████▏ | 220/267 [00:41<00:08,  5.43batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 32, 768])


train:  83%|████████▎ | 222/267 [00:41<00:07,  5.84batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  84%|████████▍ | 224/267 [00:41<00:07,  5.68batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.54batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 53, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.38batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  86%|████████▌ | 230/267 [00:42<00:06,  5.31batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.45batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.37batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  89%|████████▉ | 238/267 [00:44<00:05,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  90%|████████▉ | 240/267 [00:44<00:05,  5.30batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.47batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 60, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  5.23batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  92%|█████████▏| 246/267 [00:45<00:03,  5.49batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 56, 768])


train:  93%|█████████▎| 248/267 [00:46<00:03,  5.33batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


train:  94%|█████████▎| 250/267 [00:46<00:03,  5.58batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  94%|█████████▍| 252/267 [00:47<00:02,  5.50batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  95%|█████████▌| 254/267 [00:47<00:02,  5.44batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 56, 768])


train:  96%|█████████▌| 256/267 [00:47<00:02,  5.29batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  97%|█████████▋| 258/267 [00:48<00:01,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 41, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.60batch/s]

torch.Size([32, 64, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.06batch/s]

torch.Size([32, 54, 768])


train:  98%|█████████▊| 262/267 [00:48<00:00,  5.28batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 58, 768])


train:  99%|█████████▉| 264/267 [00:49<00:00,  5.03batch/s]

torch.Size([32, 53, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.01batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 43, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.35batch/s]
2020-09-01 06:39:19.725 | INFO     | __main__:train:39 - epoch: 3, transformer: gpt2, train_loss: 0.0383, train_acc: 46.57
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 36, 768])


dev:  11%|█▏        | 4/35 [00:00<00:01, 17.42batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 16.78batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 41, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.73batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])
torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 16.84batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])
torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 16.79batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])
torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 16.97batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 16.85batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 53, 768])
torch.Size([32, 45, 768])
torch.Size([32, 54, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.70batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 44, 768])
torch.Size([32, 38, 768])
torch.Size([32, 48, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 17.04batch/s]
2020-09-01 06:39:21.793 | INFO     | __main__:train:42 - epoch: 3, transformer: gpt2, dev_loss: 0.0382, dev_acc: 46.59
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 37, 768])
torch.Size([13, 51, 768])
torch.Size([32, 53, 768])
torch.Size([32, 50, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 15.71batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 52, 768])
torch.Size([32, 53, 768])
torch.Size([32, 43, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 15.58batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 56, 768])
torch.Size([32, 41, 768])
torch.Size([32, 43, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.30batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 44, 768])
torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 17.15batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 44, 768])
torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


test:  29%|██▊       | 20/70 [00:01<00:02, 17.35batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])
torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.54batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.49batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])
torch.Size([32, 53, 768])
torch.Size([32, 52, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.13batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 45, 768])
torch.Size([32, 52, 768])
torch.Size([32, 46, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 15.70batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 54, 768])
torch.Size([32, 51, 768])


test:  57%|█████▋    | 40/70 [00:02<00:02, 14.98batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 62, 768])
torch.Size([32, 52, 768])


test:  60%|██████    | 42/70 [00:02<00:01, 15.49batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 41, 768])
torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


test:  66%|██████▌   | 46/70 [00:02<00:01, 16.12batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])
torch.Size([32, 41, 768])


test:  71%|███████▏  | 50/70 [00:03<00:01, 16.57batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])
torch.Size([32, 55, 768])
torch.Size([32, 41, 768])


test:  77%|███████▋  | 54/70 [00:03<00:00, 16.50batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 53, 768])
torch.Size([32, 51, 768])


test:  83%|████████▎ | 58/70 [00:03<00:00, 16.38batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])
torch.Size([32, 44, 768])
torch.Size([32, 48, 768])


test:  89%|████████▊ | 62/70 [00:03<00:00, 16.48batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 55, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


test:  94%|█████████▍| 66/70 [00:04<00:00, 16.41batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 49, 768])
torch.Size([32, 36, 768])
torch.Size([32, 42, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.57batch/s]
2020-09-01 06:39:26.035 | INFO     | __main__:train:46 - epoch: 3, transformer: gpt2, test_loss: 0.0373, test_acc: 47.83
2020-09-01 06:39:26.036 | INFO     | __main__:train:47 - epoch: 3, transformer: gpt2, test_precision: 39.05, test_recall: 39.93, test_f1_score: 35.62, test_accuracy_score: 47.83
2020-09-01 06:39:26.037 | INFO     | __main__:train:52 - epoch: 3, transformer: gpt2, test_confusion_matrix: 
[[  0 238  10  31   0]
 [  0 472  41 120   0]
 [  0 180  37 165   7]
 [  0  31  18 399  62]
 [  0   4   2 244 149]]
2020-09-01 06:39:26.037 | INFO     | __main__:train:55 - Total training time elapsed: 0:02:30.647017
2020-09-01 06:39:26.038 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:50.215672
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 45, 768])
torch.Size([2, 23, 768])
torch.Size([32, 44, 768])


train:   1%|          | 2/267 [00:00<00:46,  5.69batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:   1%|▏         | 4/267 [00:00<00:47,  5.55batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:   2%|▏         | 6/267 [00:01<00:47,  5.44batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 38, 768])


train:   3%|▎         | 8/267 [00:01<00:49,  5.26batch/s]

torch.Size([32, 60, 768])


train:   3%|▎         | 9/267 [00:01<00:49,  5.19batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 60, 768])


train:   4%|▍         | 11/267 [00:02<00:49,  5.13batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:   5%|▍         | 13/267 [00:02<00:49,  5.12batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:   6%|▌         | 15/267 [00:02<00:47,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:   6%|▋         | 17/267 [00:03<00:47,  5.32batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])


train:   7%|▋         | 19/267 [00:03<00:46,  5.37batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:   8%|▊         | 21/267 [00:03<00:44,  5.54batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:   9%|▊         | 23/267 [00:04<00:44,  5.49batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:   9%|▉         | 25/267 [00:04<00:42,  5.71batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 56, 768])


train:  10%|█         | 27/267 [00:05<00:43,  5.47batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:  11%|█         | 29/267 [00:05<00:45,  5.19batch/s]

torch.Size([32, 55, 768])


train:  11%|█         | 30/267 [00:05<00:43,  5.40batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


train:  12%|█▏        | 32/267 [00:05<00:43,  5.45batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 38, 768])


train:  13%|█▎        | 34/267 [00:06<00:40,  5.73batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  13%|█▎        | 36/267 [00:06<00:40,  5.73batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 54, 768])


train:  14%|█▍        | 38/267 [00:07<00:40,  5.68batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 49, 768])


train:  15%|█▍        | 40/267 [00:07<00:41,  5.43batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 56, 768])


train:  16%|█▌        | 42/267 [00:07<00:42,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  16%|█▋        | 44/267 [00:08<00:43,  5.17batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  17%|█▋        | 46/267 [00:08<00:41,  5.38batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 47, 768])


train:  18%|█▊        | 48/267 [00:08<00:40,  5.41batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  19%|█▊        | 50/267 [00:09<00:39,  5.55batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 51, 768])


train:  19%|█▉        | 52/267 [00:09<00:38,  5.56batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 41, 768])


train:  20%|██        | 54/267 [00:09<00:37,  5.72batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 57, 768])


train:  21%|██        | 56/267 [00:10<00:38,  5.51batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])


train:  22%|██▏       | 58/267 [00:10<00:38,  5.45batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  22%|██▏       | 60/267 [00:11<00:36,  5.63batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 57, 768])


train:  23%|██▎       | 62/267 [00:11<00:37,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  24%|██▍       | 64/267 [00:11<00:36,  5.52batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  25%|██▍       | 66/267 [00:12<00:36,  5.54batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  25%|██▌       | 68/267 [00:12<00:37,  5.26batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  26%|██▌       | 70/267 [00:12<00:35,  5.50batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 48, 768])


train:  27%|██▋       | 72/267 [00:13<00:37,  5.15batch/s]

torch.Size([32, 60, 768])


train:  27%|██▋       | 73/267 [00:13<00:38,  5.10batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])


train:  28%|██▊       | 75/267 [00:13<00:36,  5.25batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  29%|██▉       | 77/267 [00:14<00:37,  5.06batch/s]

torch.Size([32, 55, 768])


train:  29%|██▉       | 78/267 [00:14<00:37,  5.09batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  30%|██▉       | 80/267 [00:14<00:37,  5.03batch/s]

torch.Size([32, 54, 768])


train:  30%|███       | 81/267 [00:15<00:36,  5.05batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  31%|███       | 83/267 [00:15<00:35,  5.22batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 52, 768])


train:  32%|███▏      | 85/267 [00:15<00:35,  5.11batch/s]

torch.Size([32, 53, 768])


train:  32%|███▏      | 86/267 [00:16<00:34,  5.24batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  33%|███▎      | 88/267 [00:16<00:32,  5.57batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 54, 768])


train:  34%|███▎      | 90/267 [00:16<00:32,  5.50batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 40, 768])


train:  34%|███▍      | 92/267 [00:17<00:31,  5.50batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 45, 768])


train:  35%|███▌      | 94/267 [00:17<00:32,  5.33batch/s]

torch.Size([32, 54, 768])


train:  36%|███▌      | 95/267 [00:17<00:31,  5.50batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 52, 768])


train:  36%|███▋      | 97/267 [00:18<00:33,  5.09batch/s]

torch.Size([32, 59, 768])


train:  37%|███▋      | 98/267 [00:18<00:32,  5.22batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  37%|███▋      | 100/267 [00:18<00:30,  5.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 36, 768])


train:  38%|███▊      | 102/267 [00:18<00:29,  5.57batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  39%|███▉      | 104/267 [00:19<00:28,  5.73batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 41, 768])


train:  40%|███▉      | 106/267 [00:19<00:27,  5.88batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 49, 768])


train:  40%|████      | 108/267 [00:20<00:28,  5.65batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 58, 768])


train:  41%|████      | 110/267 [00:20<00:27,  5.69batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 46, 768])


train:  42%|████▏     | 112/267 [00:20<00:28,  5.38batch/s]

torch.Size([32, 56, 768])


train:  42%|████▏     | 113/267 [00:20<00:27,  5.57batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 52, 768])


train:  43%|████▎     | 115/267 [00:21<00:27,  5.62batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 55, 768])


train:  44%|████▍     | 117/267 [00:21<00:27,  5.49batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  45%|████▍     | 119/267 [00:22<00:26,  5.62batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 56, 768])


train:  45%|████▌     | 121/267 [00:22<00:27,  5.40batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  46%|████▌     | 123/267 [00:22<00:26,  5.34batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  47%|████▋     | 125/267 [00:23<00:25,  5.51batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  48%|████▊     | 127/267 [00:23<00:25,  5.47batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 53, 768])


train:  48%|████▊     | 129/267 [00:23<00:26,  5.18batch/s]

torch.Size([32, 54, 768])


train:  49%|████▊     | 130/267 [00:24<00:25,  5.40batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 41, 768])


train:  49%|████▉     | 132/267 [00:24<00:25,  5.35batch/s]

torch.Size([32, 54, 768])


train:  50%|████▉     | 133/267 [00:24<00:24,  5.56batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


train:  51%|█████     | 135/267 [00:24<00:23,  5.66batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


train:  51%|█████▏    | 137/267 [00:25<00:23,  5.44batch/s]

torch.Size([32, 54, 768])
torch.Size([32, 50, 768])


train:  52%|█████▏    | 139/267 [00:25<00:24,  5.27batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  53%|█████▎    | 141/267 [00:26<00:24,  5.20batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  53%|█████▎    | 142/267 [00:26<00:24,  5.16batch/s]

torch.Size([32, 64, 768])


train:  54%|█████▍    | 144/267 [00:26<00:23,  5.13batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 41, 768])


train:  55%|█████▍    | 146/267 [00:27<00:22,  5.48batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  55%|█████▌    | 148/267 [00:27<00:21,  5.58batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  56%|█████▌    | 150/267 [00:27<00:20,  5.62batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 43, 768])


train:  57%|█████▋    | 152/267 [00:28<00:21,  5.28batch/s]

torch.Size([32, 60, 768])


train:  57%|█████▋    | 153/267 [00:28<00:22,  5.16batch/s]

torch.Size([32, 54, 768])


train:  58%|█████▊    | 154/267 [00:28<00:22,  4.98batch/s]

torch.Size([32, 58, 768])


train:  58%|█████▊    | 155/267 [00:28<00:22,  4.96batch/s]

torch.Size([32, 54, 768])


train:  58%|█████▊    | 156/267 [00:28<00:22,  5.02batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:  59%|█████▉    | 158/267 [00:29<00:20,  5.29batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 57, 768])


train:  60%|█████▉    | 160/267 [00:29<00:20,  5.24batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


train:  61%|██████    | 162/267 [00:30<00:19,  5.27batch/s]

torch.Size([32, 54, 768])


train:  61%|██████    | 163/267 [00:30<00:20,  5.20batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  62%|██████▏   | 165/267 [00:30<00:19,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.28batch/s]

torch.Size([32, 56, 768])


train:  63%|██████▎   | 168/267 [00:31<00:18,  5.24batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 38, 768])


train:  64%|██████▎   | 170/267 [00:31<00:16,  5.83batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 51, 768])


train:  64%|██████▍   | 172/267 [00:31<00:17,  5.39batch/s]

torch.Size([32, 53, 768])


train:  65%|██████▍   | 173/267 [00:32<00:17,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  66%|██████▌   | 175/267 [00:32<00:16,  5.59batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])


train:  66%|██████▋   | 177/267 [00:32<00:17,  5.20batch/s]

torch.Size([32, 59, 768])


train:  67%|██████▋   | 178/267 [00:33<00:16,  5.31batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


train:  67%|██████▋   | 180/267 [00:33<00:14,  5.88batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 46, 768])


train:  68%|██████▊   | 182/267 [00:33<00:15,  5.38batch/s]

torch.Size([32, 57, 768])


train:  69%|██████▊   | 183/267 [00:33<00:16,  5.24batch/s]

torch.Size([32, 54, 768])


train:  69%|██████▉   | 184/267 [00:34<00:15,  5.23batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


train:  70%|██████▉   | 186/267 [00:34<00:15,  5.38batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:  70%|███████   | 188/267 [00:34<00:14,  5.36batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 34, 768])


train:  71%|███████   | 190/267 [00:35<00:13,  5.66batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 59, 768])


train:  72%|███████▏  | 192/267 [00:35<00:14,  5.25batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.44batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  73%|███████▎  | 196/267 [00:36<00:13,  5.24batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:  74%|███████▍  | 198/267 [00:36<00:12,  5.48batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:  75%|███████▍  | 200/267 [00:37<00:12,  5.45batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 56, 768])


train:  76%|███████▌  | 202/267 [00:37<00:12,  5.20batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  76%|███████▋  | 204/267 [00:37<00:11,  5.28batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 56, 768])


train:  77%|███████▋  | 206/267 [00:38<00:12,  4.98batch/s]

torch.Size([32, 57, 768])


train:  78%|███████▊  | 207/267 [00:38<00:11,  5.23batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 46, 768])


train:  78%|███████▊  | 209/267 [00:38<00:11,  5.27batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.20batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.11batch/s]

torch.Size([32, 64, 768])


train:  80%|████████  | 214/267 [00:39<00:10,  4.91batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])


train:  81%|████████  | 216/267 [00:40<00:09,  5.11batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])


train:  82%|████████▏ | 218/267 [00:40<00:09,  5.12batch/s]

torch.Size([32, 53, 768])


train:  82%|████████▏ | 219/267 [00:40<00:08,  5.34batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  84%|████████▎ | 223/267 [00:41<00:07,  5.56batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  84%|████████▍ | 225/267 [00:41<00:07,  5.67batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  85%|████████▌ | 227/267 [00:42<00:07,  5.42batch/s]

torch.Size([32, 55, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.57batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  86%|████████▌ | 230/267 [00:42<00:06,  5.74batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 43, 768])


train:  87%|████████▋ | 232/267 [00:43<00:05,  5.91batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 48, 768])


train:  88%|████████▊ | 234/267 [00:43<00:05,  5.64batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  88%|████████▊ | 236/267 [00:43<00:05,  5.43batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


train:  89%|████████▉ | 238/267 [00:44<00:05,  5.15batch/s]

torch.Size([32, 56, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.24batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  90%|█████████ | 241/267 [00:44<00:05,  5.06batch/s]

torch.Size([32, 59, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.17batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  5.23batch/s]

torch.Size([32, 53, 768])


train:  92%|█████████▏| 245/267 [00:45<00:04,  5.01batch/s]

torch.Size([32, 59, 768])


train:  92%|█████████▏| 246/267 [00:45<00:04,  5.04batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 40, 768])


train:  93%|█████████▎| 248/267 [00:46<00:03,  5.15batch/s]

torch.Size([32, 56, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.13batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  94%|█████████▍| 251/267 [00:46<00:03,  5.26batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.78batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 50, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.52batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 59, 768])


train:  96%|█████████▋| 257/267 [00:47<00:01,  5.63batch/s]

torch.Size([32, 34, 768])
torch.Size([32, 58, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.37batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.42batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  99%|█████████▊| 263/267 [00:48<00:00,  5.48batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 47, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.29batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 49, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.37batch/s]
2020-09-01 06:40:15.746 | INFO     | __main__:train:39 - epoch: 4, transformer: gpt2, train_loss: 0.0357, train_acc: 49.87
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


dev:  11%|█▏        | 4/35 [00:00<00:01, 16.17batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])
torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 15.87batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 58, 768])
torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.16batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 45, 768])
torch.Size([32, 50, 768])
torch.Size([32, 46, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 16.08batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])
torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 16.29batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 54, 768])
torch.Size([32, 43, 768])
torch.Size([32, 43, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 16.87batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 46, 768])
torch.Size([32, 43, 768])
torch.Size([32, 44, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 16.94batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.94batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 39, 768])
torch.Size([32, 46, 768])
torch.Size([32, 43, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.84batch/s]
2020-09-01 06:40:17.837 | INFO     | __main__:train:42 - epoch: 4, transformer: gpt2, dev_loss: 0.0382, dev_acc: 46.96
test:   3%|▎         | 2/70 [00:00<00:03, 18.84batch/s]

torch.Size([32, 40, 768])
torch.Size([13, 48, 768])
torch.Size([32, 42, 768])
torch.Size([32, 37, 768])


test:   9%|▊         | 6/70 [00:00<00:03, 18.62batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 38, 768])
torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 18.56batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 43, 768])
torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 17.02batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 52, 768])
torch.Size([32, 39, 768])
torch.Size([32, 39, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 17.20batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])
torch.Size([32, 52, 768])
torch.Size([32, 62, 768])


test:  29%|██▊       | 20/70 [00:01<00:03, 15.85batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 51, 768])
torch.Size([32, 44, 768])
torch.Size([32, 39, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 15.98batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 53, 768])
torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])
torch.Size([32, 55, 768])
torch.Size([32, 40, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 15.84batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 56, 768])
torch.Size([32, 52, 768])
torch.Size([32, 36, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.31batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])
torch.Size([32, 41, 768])
torch.Size([32, 41, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.83batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])
torch.Size([32, 43, 768])
torch.Size([32, 58, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.63batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])
torch.Size([32, 56, 768])
torch.Size([32, 51, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.71batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 40, 768])
torch.Size([32, 39, 768])
torch.Size([32, 52, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.37batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])
torch.Size([32, 56, 768])
torch.Size([32, 50, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.46batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.43batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])
torch.Size([32, 53, 768])
torch.Size([32, 47, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 16.41batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])
torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.83batch/s]
2020-09-01 06:40:22.013 | INFO     | __main__:train:46 - epoch: 4, transformer: gpt2, test_loss: 0.0373, test_acc: 47.96
2020-09-01 06:40:22.014 | INFO     | __main__:train:47 - epoch: 4, transformer: gpt2, test_precision: 58.57, test_recall: 41.83, test_f1_score: 38.72, test_accuracy_score: 47.96
2020-09-01 06:40:22.015 | INFO     | __main__:train:52 - epoch: 4, transformer: gpt2, test_confusion_matrix: 
[[  1 224  19  35   0]
 [  0 395 101 136   1]
 [  0 112  72 193  12]
 [  0  15  14 376 105]
 [  0   4   3 176 216]]
2020-09-01 06:40:22.016 | INFO     | __main__:train:55 - Total training time elapsed: 0:03:20.354207
2020-09-01 06:40:22.017 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:50.088552
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])
torch.Size([32, 37, 768])
torch.Size([2, 53, 768])


train:   0%|          | 1/267 [00:00<00:44,  5.95batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 64, 768])


train:   1%|          | 3/267 [00:00<00:51,  5.17batch/s]

torch.Size([32, 54, 768])


train:   1%|▏         | 4/267 [00:00<00:50,  5.25batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 39, 768])


train:   2%|▏         | 6/267 [00:01<00:48,  5.39batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 60, 768])


train:   3%|▎         | 8/267 [00:01<00:51,  5.04batch/s]

torch.Size([32, 54, 768])


train:   3%|▎         | 9/267 [00:01<00:50,  5.15batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:   4%|▍         | 11/267 [00:02<00:46,  5.47batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 60, 768])


train:   5%|▍         | 13/267 [00:02<00:49,  5.08batch/s]

torch.Size([32, 53, 768])


train:   5%|▌         | 14/267 [00:02<00:47,  5.28batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 49, 768])


train:   6%|▌         | 16/267 [00:03<00:48,  5.19batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])


train:   7%|▋         | 18/267 [00:03<00:45,  5.51batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:   7%|▋         | 20/267 [00:03<00:44,  5.58batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 53, 768])


train:   8%|▊         | 22/267 [00:04<00:46,  5.31batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:   9%|▉         | 24/267 [00:04<00:44,  5.48batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 52, 768])


train:  10%|▉         | 26/267 [00:04<00:46,  5.19batch/s]

torch.Size([32, 54, 768])
torch.Size([32, 45, 768])


train:  10%|█         | 28/267 [00:05<00:43,  5.54batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 44, 768])


train:  11%|█         | 30/267 [00:05<00:43,  5.44batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 43, 768])


train:  12%|█▏        | 32/267 [00:06<00:42,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 56, 768])


train:  13%|█▎        | 34/267 [00:06<00:41,  5.57batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 43, 768])


train:  13%|█▎        | 36/267 [00:06<00:41,  5.57batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  14%|█▍        | 38/267 [00:07<00:41,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 40, 768])


train:  15%|█▍        | 40/267 [00:07<00:41,  5.44batch/s]

torch.Size([32, 54, 768])


train:  15%|█▌        | 41/267 [00:07<00:42,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


train:  16%|█▌        | 43/267 [00:08<00:41,  5.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 59, 768])


train:  17%|█▋        | 45/267 [00:08<00:41,  5.33batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 42, 768])


train:  18%|█▊        | 47/267 [00:08<00:41,  5.27batch/s]

torch.Size([32, 55, 768])


train:  18%|█▊        | 48/267 [00:08<00:40,  5.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 56, 768])


train:  19%|█▊        | 50/267 [00:09<00:41,  5.24batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 56, 768])


train:  19%|█▉        | 52/267 [00:09<00:41,  5.19batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


train:  20%|██        | 54/267 [00:10<00:38,  5.60batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 58, 768])


train:  21%|██        | 56/267 [00:10<00:40,  5.21batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  22%|██▏       | 58/267 [00:10<00:41,  5.01batch/s]

torch.Size([32, 55, 768])


train:  22%|██▏       | 59/267 [00:11<00:42,  4.89batch/s]

torch.Size([32, 58, 768])


train:  22%|██▏       | 60/267 [00:11<00:41,  4.95batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  23%|██▎       | 62/267 [00:11<00:38,  5.37batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  24%|██▍       | 64/267 [00:12<00:37,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


train:  25%|██▍       | 66/267 [00:12<00:37,  5.36batch/s]

torch.Size([32, 53, 768])


train:  25%|██▌       | 67/267 [00:12<00:36,  5.48batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 41, 768])


train:  26%|██▌       | 69/267 [00:12<00:37,  5.35batch/s]

torch.Size([32, 56, 768])


train:  26%|██▌       | 70/267 [00:13<00:37,  5.21batch/s]

torch.Size([32, 54, 768])


train:  27%|██▋       | 71/267 [00:13<00:39,  5.02batch/s]

torch.Size([32, 58, 768])


train:  27%|██▋       | 72/267 [00:13<00:39,  4.91batch/s]

torch.Size([32, 57, 768])


train:  27%|██▋       | 73/267 [00:13<00:39,  4.91batch/s]

torch.Size([32, 54, 768])


train:  28%|██▊       | 74/267 [00:13<00:38,  4.96batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 40, 768])


train:  28%|██▊       | 76/267 [00:14<00:35,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  29%|██▉       | 78/267 [00:14<00:36,  5.25batch/s]

torch.Size([32, 55, 768])


train:  30%|██▉       | 79/267 [00:14<00:34,  5.48batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


train:  30%|███       | 81/267 [00:15<00:34,  5.34batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


train:  31%|███       | 83/267 [00:15<00:36,  5.01batch/s]

torch.Size([32, 59, 768])


train:  31%|███▏      | 84/267 [00:15<00:34,  5.28batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 39, 768])


train:  32%|███▏      | 86/267 [00:16<00:32,  5.49batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  33%|███▎      | 88/267 [00:16<00:34,  5.23batch/s]

torch.Size([32, 59, 768])


train:  33%|███▎      | 89/267 [00:16<00:33,  5.28batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:  34%|███▍      | 91/267 [00:17<00:30,  5.72batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 43, 768])


train:  35%|███▍      | 93/267 [00:17<00:31,  5.56batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  36%|███▌      | 95/267 [00:17<00:33,  5.20batch/s]

torch.Size([32, 59, 768])


train:  36%|███▌      | 96/267 [00:18<00:33,  5.17batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


train:  37%|███▋      | 98/267 [00:18<00:31,  5.36batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 54, 768])


train:  37%|███▋      | 100/267 [00:18<00:30,  5.46batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 44, 768])


train:  38%|███▊      | 102/267 [00:19<00:30,  5.40batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 35, 768])


train:  39%|███▉      | 104/267 [00:19<00:29,  5.59batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 57, 768])


train:  40%|███▉      | 106/267 [00:19<00:30,  5.33batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  40%|████      | 108/267 [00:20<00:29,  5.35batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  41%|████      | 110/267 [00:20<00:28,  5.57batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  42%|████▏     | 112/267 [00:20<00:28,  5.40batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  43%|████▎     | 114/267 [00:21<00:28,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 58, 768])


train:  43%|████▎     | 116/267 [00:21<00:28,  5.28batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


train:  44%|████▍     | 118/267 [00:22<00:29,  5.08batch/s]

torch.Size([32, 54, 768])


train:  45%|████▍     | 119/267 [00:22<00:29,  5.06batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 40, 768])


train:  45%|████▌     | 121/267 [00:22<00:27,  5.24batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


train:  46%|████▌     | 123/267 [00:23<00:27,  5.16batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])


train:  47%|████▋     | 125/267 [00:23<00:25,  5.47batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 53, 768])


train:  48%|████▊     | 127/267 [00:23<00:26,  5.35batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  48%|████▊     | 129/267 [00:24<00:26,  5.25batch/s]

torch.Size([32, 54, 768])


train:  49%|████▊     | 130/267 [00:24<00:25,  5.35batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:  49%|████▉     | 132/267 [00:24<00:24,  5.57batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 44, 768])


train:  50%|█████     | 134/267 [00:25<00:25,  5.26batch/s]

torch.Size([32, 59, 768])


train:  51%|█████     | 135/267 [00:25<00:23,  5.52batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 53, 768])


train:  51%|█████▏    | 137/267 [00:25<00:25,  5.18batch/s]

torch.Size([32, 56, 768])


train:  52%|█████▏    | 138/267 [00:25<00:24,  5.17batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:  52%|█████▏    | 140/267 [00:26<00:23,  5.32batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  53%|█████▎    | 142/267 [00:26<00:21,  5.75batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 48, 768])


train:  54%|█████▍    | 144/267 [00:26<00:22,  5.38batch/s]

torch.Size([32, 54, 768])


train:  54%|█████▍    | 145/267 [00:27<00:21,  5.76batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 59, 768])


train:  55%|█████▌    | 147/267 [00:27<00:23,  5.07batch/s]

torch.Size([32, 60, 768])


train:  55%|█████▌    | 148/267 [00:27<00:22,  5.37batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 51, 768])


train:  56%|█████▌    | 150/267 [00:28<00:21,  5.54batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 45, 768])


train:  57%|█████▋    | 152/267 [00:28<00:20,  5.52batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 37, 768])


train:  58%|█████▊    | 154/267 [00:28<00:20,  5.53batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 48, 768])


train:  58%|█████▊    | 156/267 [00:29<00:20,  5.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:  59%|█████▉    | 158/267 [00:29<00:20,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  60%|█████▉    | 160/267 [00:29<00:19,  5.38batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  61%|██████    | 162/267 [00:30<00:20,  5.09batch/s]

torch.Size([32, 60, 768])


train:  61%|██████    | 163/267 [00:30<00:20,  5.17batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  62%|██████▏   | 165/267 [00:30<00:20,  5.09batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 50, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.35batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])


train:  63%|██████▎   | 168/267 [00:31<00:18,  5.30batch/s]

torch.Size([32, 64, 768])


train:  64%|██████▎   | 170/267 [00:31<00:18,  5.23batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 49, 768])


train:  64%|██████▍   | 172/267 [00:32<00:18,  5.14batch/s]

torch.Size([32, 53, 768])


train:  65%|██████▍   | 173/267 [00:32<00:17,  5.26batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  66%|██████▌   | 175/267 [00:32<00:17,  5.23batch/s]

torch.Size([32, 55, 768])


train:  66%|██████▌   | 176/267 [00:33<00:17,  5.15batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 57, 768])


train:  67%|██████▋   | 178/267 [00:33<00:17,  5.17batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:  67%|██████▋   | 180/267 [00:33<00:16,  5.18batch/s]

torch.Size([32, 56, 768])


train:  68%|██████▊   | 181/267 [00:33<00:16,  5.25batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


train:  69%|██████▊   | 183/267 [00:34<00:15,  5.60batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 54, 768])


train:  69%|██████▉   | 185/267 [00:34<00:15,  5.42batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  70%|███████   | 187/267 [00:35<00:14,  5.61batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  71%|███████   | 189/267 [00:35<00:14,  5.44batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  72%|███████▏  | 191/267 [00:35<00:13,  5.45batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


train:  72%|███████▏  | 193/267 [00:36<00:13,  5.45batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  73%|███████▎  | 195/267 [00:36<00:13,  5.32batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 37, 768])


train:  74%|███████▍  | 197/267 [00:36<00:12,  5.45batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  75%|███████▍  | 199/267 [00:37<00:12,  5.43batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.44batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:  76%|███████▌  | 203/267 [00:38<00:11,  5.38batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 58, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.22batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  78%|███████▊  | 207/267 [00:38<00:11,  5.21batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  78%|███████▊  | 209/267 [00:39<00:10,  5.33batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 43, 768])


train:  80%|███████▉  | 213/267 [00:39<00:09,  5.66batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 47, 768])


train:  81%|████████  | 215/267 [00:40<00:08,  5.79batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 52, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.55batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 38, 768])


train:  82%|████████▏ | 219/267 [00:40<00:08,  5.82batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.54batch/s]

torch.Size([32, 54, 768])


train:  83%|████████▎ | 222/267 [00:41<00:08,  5.55batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  84%|████████▍ | 224/267 [00:41<00:07,  5.68batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 42, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.77batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.54batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  86%|████████▌ | 230/267 [00:42<00:06,  5.65batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.62batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  88%|████████▊ | 234/267 [00:43<00:05,  5.62batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 57, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.26batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  89%|████████▉ | 238/267 [00:44<00:05,  5.31batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 54, 768])


train:  90%|████████▉ | 240/267 [00:44<00:05,  5.25batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 35, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.59batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  5.50batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 52, 768])


train:  92%|█████████▏| 246/267 [00:45<00:03,  5.49batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])


train:  93%|█████████▎| 248/267 [00:46<00:03,  5.08batch/s]

torch.Size([32, 59, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.09batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 41, 768])


train:  94%|█████████▍| 251/267 [00:46<00:02,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 43, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.40batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 56, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.29batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  96%|█████████▋| 257/267 [00:47<00:01,  5.45batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.69batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 48, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.56batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  99%|█████████▊| 263/267 [00:49<00:00,  5.47batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.45batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 53, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.37batch/s]
2020-09-01 06:41:11.765 | INFO     | __main__:train:39 - epoch: 5, transformer: gpt2, train_loss: 0.0337, train_acc: 53.10
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 51, 768])


dev:  11%|█▏        | 4/35 [00:00<00:02, 14.96batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 51, 768])
torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 15.89batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])
torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.38batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])
torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 16.95batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 38, 768])
torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 17.05batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 51, 768])
torch.Size([32, 36, 768])
torch.Size([32, 54, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 16.49batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 51, 768])
torch.Size([32, 38, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 16.65batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 53, 768])
torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.93batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.97batch/s]
2020-09-01 06:41:13.841 | INFO     | __main__:train:42 - epoch: 5, transformer: gpt2, dev_loss: 0.0368, dev_acc: 47.77
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 48, 768])
torch.Size([13, 43, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 15.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 52, 768])
torch.Size([32, 45, 768])
torch.Size([32, 53, 768])


test:  11%|█▏        | 8/70 [00:00<00:04, 14.96batch/s]

torch.Size([32, 62, 768])
torch.Size([32, 53, 768])
torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 15.59batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 43, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


test:  23%|██▎       | 16/70 [00:01<00:03, 16.06batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


test:  29%|██▊       | 20/70 [00:01<00:02, 16.68batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])
torch.Size([32, 46, 768])
torch.Size([32, 46, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.99batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


test:  40%|████      | 28/70 [00:01<00:02, 17.26batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 39, 768])
torch.Size([32, 44, 768])
torch.Size([32, 49, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.68batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 53, 768])
torch.Size([32, 49, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.06batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 15.51batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 50, 768])
torch.Size([32, 49, 768])
torch.Size([32, 41, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.37batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])
torch.Size([32, 40, 768])
torch.Size([32, 43, 768])


test:  71%|███████▏  | 50/70 [00:03<00:01, 17.68batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 45, 768])
torch.Size([32, 37, 768])
torch.Size([32, 42, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.95batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])
torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 17.04batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])
torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.25batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 53, 768])
torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 15.65batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 55, 768])
torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.50batch/s]
2020-09-01 06:41:18.101 | INFO     | __main__:train:46 - epoch: 5, transformer: gpt2, test_loss: 0.0355, test_acc: 51.95
2020-09-01 06:41:18.102 | INFO     | __main__:train:47 - epoch: 5, transformer: gpt2, test_precision: 51.99, test_recall: 46.55, test_f1_score: 46.83, test_accuracy_score: 51.95
2020-09-01 06:41:18.102 | INFO     | __main__:train:52 - epoch: 5, transformer: gpt2, test_confusion_matrix: 
[[ 71 193   9   6   0]
 [ 58 496  52  25   2]
 [ 11 222  72  79   5]
 [  0  70  61 314  65]
 [  0  13  22 169 195]]
2020-09-01 06:41:18.103 | INFO     | __main__:train:55 - Total training time elapsed: 0:04:10.100981
2020-09-01 06:41:18.104 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:50.020196
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 41, 768])
torch.Size([32, 58, 768])
torch.Size([2, 14, 768])


train:   0%|          | 1/267 [00:00<00:48,  5.48batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:   1%|          | 3/267 [00:00<00:47,  5.56batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:   2%|▏         | 5/267 [00:00<00:46,  5.61batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 60, 768])


train:   3%|▎         | 7/267 [00:01<00:48,  5.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:   3%|▎         | 9/267 [00:01<00:47,  5.46batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:   4%|▍         | 11/267 [00:01<00:45,  5.68batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 38, 768])


train:   5%|▍         | 13/267 [00:02<00:44,  5.70batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:   6%|▌         | 15/267 [00:02<00:44,  5.64batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 58, 768])


train:   6%|▋         | 17/267 [00:03<00:46,  5.33batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 57, 768])


train:   7%|▋         | 19/267 [00:03<00:49,  5.05batch/s]

torch.Size([32, 54, 768])


train:   7%|▋         | 20/267 [00:03<00:48,  5.05batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 43, 768])


train:   8%|▊         | 22/267 [00:04<00:46,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:   9%|▉         | 24/267 [00:04<00:44,  5.48batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 44, 768])


train:  10%|▉         | 26/267 [00:04<00:42,  5.63batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 57, 768])


train:  10%|█         | 28/267 [00:05<00:43,  5.44batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])


train:  11%|█         | 30/267 [00:05<00:43,  5.47batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:  12%|█▏        | 32/267 [00:05<00:43,  5.46batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  13%|█▎        | 34/267 [00:06<00:43,  5.39batch/s]

torch.Size([32, 54, 768])


train:  13%|█▎        | 35/267 [00:06<00:42,  5.51batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  14%|█▍        | 37/267 [00:06<00:43,  5.29batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  15%|█▍        | 39/267 [00:07<00:42,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


train:  15%|█▌        | 41/267 [00:07<00:42,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  16%|█▌        | 43/267 [00:07<00:40,  5.57batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 52, 768])


train:  17%|█▋        | 45/267 [00:08<00:40,  5.49batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  18%|█▊        | 47/267 [00:08<00:42,  5.22batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])


train:  18%|█▊        | 49/267 [00:09<00:41,  5.30batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 50, 768])


train:  19%|█▉        | 51/267 [00:09<00:40,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  20%|█▉        | 53/267 [00:09<00:40,  5.24batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  21%|██        | 55/267 [00:10<00:41,  5.14batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  21%|██▏       | 57/267 [00:10<00:38,  5.39batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 50, 768])


train:  22%|██▏       | 59/267 [00:10<00:39,  5.25batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  23%|██▎       | 61/267 [00:11<00:40,  5.11batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  24%|██▎       | 63/267 [00:11<00:39,  5.11batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])


train:  24%|██▍       | 65/267 [00:12<00:37,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  25%|██▌       | 67/267 [00:12<00:37,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 39, 768])


train:  26%|██▌       | 69/267 [00:12<00:34,  5.72batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  27%|██▋       | 71/267 [00:13<00:35,  5.55batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 60, 768])


train:  27%|██▋       | 73/267 [00:13<00:36,  5.37batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  28%|██▊       | 75/267 [00:13<00:34,  5.58batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 50, 768])


train:  29%|██▉       | 77/267 [00:14<00:34,  5.57batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 52, 768])


train:  30%|██▉       | 79/267 [00:14<00:34,  5.41batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  30%|███       | 81/267 [00:15<00:35,  5.27batch/s]

torch.Size([32, 53, 768])


train:  31%|███       | 82/267 [00:15<00:35,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])


train:  31%|███▏      | 84/267 [00:15<00:33,  5.46batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  32%|███▏      | 86/267 [00:15<00:33,  5.46batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  33%|███▎      | 88/267 [00:16<00:33,  5.30batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  34%|███▎      | 90/267 [00:16<00:35,  5.02batch/s]

torch.Size([32, 59, 768])


train:  34%|███▍      | 91/267 [00:16<00:35,  4.97batch/s]

torch.Size([32, 54, 768])


train:  34%|███▍      | 92/267 [00:17<00:35,  4.95batch/s]

torch.Size([32, 54, 768])


train:  35%|███▍      | 93/267 [00:17<00:34,  4.99batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  36%|███▌      | 95/267 [00:17<00:34,  5.01batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 58, 768])


train:  36%|███▋      | 97/267 [00:18<00:32,  5.25batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 58, 768])


train:  37%|███▋      | 99/267 [00:18<00:33,  5.01batch/s]

torch.Size([32, 54, 768])


train:  37%|███▋      | 100/267 [00:18<00:32,  5.14batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  38%|███▊      | 102/267 [00:19<00:30,  5.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])


train:  39%|███▉      | 104/267 [00:19<00:31,  5.20batch/s]

torch.Size([32, 54, 768])


train:  39%|███▉      | 105/267 [00:19<00:31,  5.11batch/s]

torch.Size([32, 55, 768])


train:  40%|███▉      | 106/267 [00:19<00:31,  5.10batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


train:  40%|████      | 108/267 [00:20<00:30,  5.27batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 56, 768])


train:  41%|████      | 110/267 [00:20<00:29,  5.27batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 39, 768])


train:  42%|████▏     | 112/267 [00:21<00:29,  5.28batch/s]

torch.Size([32, 56, 768])


train:  42%|████▏     | 113/267 [00:21<00:28,  5.36batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:  43%|████▎     | 115/267 [00:21<00:27,  5.46batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:  44%|████▍     | 117/267 [00:21<00:28,  5.29batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 36, 768])


train:  45%|████▍     | 119/267 [00:22<00:27,  5.32batch/s]

torch.Size([32, 57, 768])


train:  45%|████▍     | 120/267 [00:22<00:28,  5.24batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


train:  46%|████▌     | 122/267 [00:22<00:28,  5.10batch/s]

torch.Size([32, 53, 768])


train:  46%|████▌     | 123/267 [00:23<00:28,  5.06batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 47, 768])


train:  47%|████▋     | 125/267 [00:23<00:26,  5.34batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  48%|████▊     | 127/267 [00:23<00:25,  5.39batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  48%|████▊     | 129/267 [00:24<00:24,  5.60batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  49%|████▉     | 131/267 [00:24<00:24,  5.50batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  50%|████▉     | 133/267 [00:24<00:24,  5.51batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


train:  51%|█████     | 135/267 [00:25<00:24,  5.43batch/s]

torch.Size([32, 53, 768])


train:  51%|█████     | 136/267 [00:25<00:23,  5.60batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  52%|█████▏    | 138/267 [00:25<00:22,  5.66batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  52%|█████▏    | 140/267 [00:26<00:22,  5.66batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 42, 768])


train:  53%|█████▎    | 142/267 [00:26<00:21,  5.83batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  54%|█████▍    | 144/267 [00:26<00:22,  5.45batch/s]

torch.Size([32, 54, 768])


train:  54%|█████▍    | 145/267 [00:27<00:23,  5.15batch/s]

torch.Size([32, 59, 768])
torch.Size([32, 64, 768])


train:  55%|█████▌    | 147/267 [00:27<00:25,  4.76batch/s]

torch.Size([32, 59, 768])


train:  55%|█████▌    | 148/267 [00:27<00:25,  4.72batch/s]

torch.Size([32, 57, 768])


train:  56%|█████▌    | 149/267 [00:27<00:24,  4.86batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 55, 768])


train:  57%|█████▋    | 151/267 [00:28<00:22,  5.10batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  57%|█████▋    | 153/267 [00:28<00:21,  5.28batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  58%|█████▊    | 155/267 [00:29<00:20,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 48, 768])


train:  59%|█████▉    | 157/267 [00:29<00:20,  5.40batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 36, 768])


train:  60%|█████▉    | 159/267 [00:29<00:18,  5.85batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 55, 768])


train:  60%|██████    | 161/267 [00:30<00:19,  5.49batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  61%|██████    | 163/267 [00:30<00:19,  5.44batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  62%|██████▏   | 165/267 [00:30<00:18,  5.47batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.39batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  63%|██████▎   | 169/267 [00:31<00:17,  5.64batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 50, 768])


train:  64%|██████▍   | 171/267 [00:31<00:16,  5.66batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 59, 768])


train:  65%|██████▍   | 173/267 [00:32<00:17,  5.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  66%|██████▌   | 175/267 [00:32<00:17,  5.39batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  66%|██████▋   | 177/267 [00:33<00:16,  5.37batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  67%|██████▋   | 179/267 [00:33<00:16,  5.44batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  68%|██████▊   | 181/267 [00:33<00:17,  5.04batch/s]

torch.Size([32, 59, 768])


train:  68%|██████▊   | 182/267 [00:34<00:16,  5.05batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  69%|██████▉   | 184/267 [00:34<00:15,  5.20batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 53, 768])


train:  70%|██████▉   | 186/267 [00:34<00:16,  4.96batch/s]

torch.Size([32, 58, 768])


train:  70%|███████   | 187/267 [00:35<00:16,  4.85batch/s]

torch.Size([32, 58, 768])


train:  70%|███████   | 188/267 [00:35<00:15,  5.09batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  71%|███████   | 190/267 [00:35<00:14,  5.24batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  72%|███████▏  | 192/267 [00:36<00:14,  5.28batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.42batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


train:  73%|███████▎  | 196/267 [00:36<00:12,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  74%|███████▍  | 198/267 [00:37<00:12,  5.66batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 53, 768])


train:  75%|███████▍  | 200/267 [00:37<00:12,  5.24batch/s]

torch.Size([32, 55, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.17batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  76%|███████▌  | 203/267 [00:38<00:11,  5.47batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 57, 768])


train:  77%|███████▋  | 205/267 [00:38<00:12,  4.98batch/s]

torch.Size([32, 60, 768])


train:  77%|███████▋  | 206/267 [00:38<00:12,  4.92batch/s]

torch.Size([32, 56, 768])


train:  78%|███████▊  | 207/267 [00:38<00:12,  4.92batch/s]

torch.Size([32, 54, 768])


train:  78%|███████▊  | 208/267 [00:39<00:11,  4.93batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 48, 768])


train:  79%|███████▊  | 210/267 [00:39<00:11,  5.07batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.09batch/s]

torch.Size([32, 54, 768])


train:  80%|███████▉  | 213/267 [00:40<00:10,  5.21batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.44batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  81%|████████▏ | 217/267 [00:40<00:08,  5.69batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


train:  82%|████████▏ | 219/267 [00:41<00:08,  5.63batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


train:  83%|████████▎ | 221/267 [00:41<00:07,  5.76batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 48, 768])


train:  84%|████████▎ | 223/267 [00:41<00:07,  5.59batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 59, 768])


train:  84%|████████▍ | 225/267 [00:42<00:08,  5.15batch/s]

torch.Size([32, 53, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.39batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 43, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.30batch/s]

torch.Size([32, 56, 768])


train:  86%|████████▌ | 229/267 [00:42<00:07,  5.37batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


train:  87%|████████▋ | 231/267 [00:43<00:06,  5.32batch/s]

torch.Size([32, 54, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])


train:  88%|████████▊ | 234/267 [00:43<00:05,  5.53batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  88%|████████▊ | 236/267 [00:44<00:06,  5.16batch/s]

torch.Size([32, 60, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.44batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 46, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.37batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  90%|█████████ | 241/267 [00:45<00:04,  5.32batch/s]

torch.Size([32, 53, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.34batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 44, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  5.61batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 36, 768])


train:  92%|█████████▏| 246/267 [00:46<00:03,  5.75batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  93%|█████████▎| 248/267 [00:46<00:03,  5.69batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  94%|█████████▎| 250/267 [00:46<00:03,  5.45batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  94%|█████████▍| 252/267 [00:47<00:02,  5.27batch/s]

torch.Size([32, 52, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.67batch/s]

torch.Size([32, 34, 768])
torch.Size([32, 46, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.70batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 64, 768])


train:  96%|█████████▋| 257/267 [00:48<00:01,  5.40batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 42, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.43batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.68batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 38, 768])


train:  99%|█████████▊| 263/267 [00:49<00:00,  5.63batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 37, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.61batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 56, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.35batch/s]
2020-09-01 06:42:07.989 | INFO     | __main__:train:39 - epoch: 6, transformer: gpt2, train_loss: 0.0322, train_acc: 54.90
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 58, 768])


dev:   6%|▌         | 2/35 [00:00<00:02, 16.49batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])
torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


dev:  17%|█▋        | 6/35 [00:00<00:01, 16.35batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])
torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


dev:  29%|██▊       | 10/35 [00:00<00:01, 17.53batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 41, 768])
torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


dev:  40%|████      | 14/35 [00:00<00:01, 17.05batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 46, 768])
torch.Size([32, 58, 768])
torch.Size([32, 53, 768])


dev:  51%|█████▏    | 18/35 [00:01<00:01, 16.37batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 40, 768])
torch.Size([32, 51, 768])
torch.Size([32, 40, 768])


dev:  63%|██████▎   | 22/35 [00:01<00:00, 16.57batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])
torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


dev:  74%|███████▍  | 26/35 [00:01<00:00, 17.03batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])
torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


dev:  86%|████████▌ | 30/35 [00:01<00:00, 16.66batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])
torch.Size([32, 40, 768])
torch.Size([32, 46, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 17.05batch/s]
2020-09-01 06:42:10.054 | INFO     | __main__:train:42 - epoch: 6, transformer: gpt2, dev_loss: 0.0373, dev_acc: 49.86
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])
torch.Size([13, 45, 768])
torch.Size([32, 49, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 15.58batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 42, 768])
torch.Size([32, 56, 768])
torch.Size([32, 44, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 16.02batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])
torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.01batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 50, 768])
torch.Size([32, 49, 768])
torch.Size([32, 41, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 16.55batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 47, 768])
torch.Size([32, 42, 768])
torch.Size([32, 50, 768])


test:  29%|██▊       | 20/70 [00:01<00:02, 16.78batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 42, 768])
torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.52batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 39, 768])
torch.Size([32, 53, 768])
torch.Size([32, 48, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.92batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 41, 768])
torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.66batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])
torch.Size([32, 56, 768])
torch.Size([32, 52, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.53batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.80batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 40, 768])
torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.70batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])
torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.79batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])
torch.Size([32, 37, 768])
torch.Size([32, 53, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.05batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 51, 768])
torch.Size([32, 37, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.72batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 41, 768])
torch.Size([32, 48, 768])
torch.Size([32, 55, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.41batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 53, 768])
torch.Size([32, 42, 768])
torch.Size([32, 50, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 16.77batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 39, 768])
torch.Size([32, 49, 768])
torch.Size([32, 56, 768])


test:  97%|█████████▋| 68/70 [00:04<00:00, 15.71batch/s]

torch.Size([32, 62, 768])
torch.Size([32, 53, 768])
torch.Size([32, 46, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.53batch/s]
2020-09-01 06:42:14.308 | INFO     | __main__:train:46 - epoch: 6, transformer: gpt2, test_loss: 0.0353, test_acc: 51.09
2020-09-01 06:42:14.308 | INFO     | __main__:train:47 - epoch: 6, transformer: gpt2, test_precision: 50.62, test_recall: 49.84, test_f1_score: 49.47, test_accuracy_score: 51.09
2020-09-01 06:42:14.309 | INFO     | __main__:train:52 - epoch: 6, transformer: gpt2, test_confusion_matrix: 
[[137 103  24  15   0]
 [124 323 104  78   4]
 [ 21 119  93 148   8]
 [  2  20  35 352 101]
 [  1   4   9 161 224]]
2020-09-01 06:42:14.310 | INFO     | __main__:train:55 - Total training time elapsed: 0:04:59.985760
2020-09-01 06:42:14.310 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.997627
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 58, 768])
torch.Size([2, 34, 768])
torch.Size([32, 51, 768])


train:   1%|          | 2/267 [00:00<00:51,  5.17batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:   1%|▏         | 4/267 [00:00<00:51,  5.08batch/s]

torch.Size([32, 55, 768])


train:   2%|▏         | 5/267 [00:00<00:50,  5.18batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


train:   3%|▎         | 7/267 [00:01<00:46,  5.56batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])


train:   3%|▎         | 9/267 [00:01<00:46,  5.53batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 40, 768])


train:   4%|▍         | 11/267 [00:02<00:47,  5.44batch/s]

torch.Size([32, 53, 768])


train:   4%|▍         | 12/267 [00:02<00:47,  5.31batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 51, 768])


train:   5%|▌         | 14/267 [00:02<00:47,  5.30batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:   6%|▌         | 16/267 [00:03<00:50,  4.98batch/s]

torch.Size([32, 59, 768])


train:   6%|▋         | 17/267 [00:03<00:50,  4.95batch/s]

torch.Size([32, 54, 768])


train:   7%|▋         | 18/267 [00:03<00:47,  5.21batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:   7%|▋         | 20/267 [00:03<00:45,  5.42batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 59, 768])


train:   8%|▊         | 22/267 [00:04<00:48,  5.08batch/s]

torch.Size([32, 53, 768])


train:   9%|▊         | 23/267 [00:04<00:48,  5.06batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:   9%|▉         | 25/267 [00:04<00:45,  5.30batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 59, 768])


train:  10%|█         | 27/267 [00:05<00:48,  4.92batch/s]

torch.Size([32, 57, 768])


train:  10%|█         | 28/267 [00:05<00:48,  4.91batch/s]

torch.Size([32, 54, 768])


train:  11%|█         | 29/267 [00:05<00:47,  5.05batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  12%|█▏        | 31/267 [00:05<00:45,  5.19batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


train:  12%|█▏        | 33/267 [00:06<00:45,  5.19batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 56, 768])


train:  13%|█▎        | 35/267 [00:06<00:45,  5.11batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


train:  14%|█▍        | 37/267 [00:07<00:41,  5.52batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  15%|█▍        | 39/267 [00:07<00:38,  5.90batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 58, 768])


train:  15%|█▌        | 41/267 [00:07<00:40,  5.60batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])


train:  16%|█▌        | 43/267 [00:08<00:42,  5.30batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


train:  17%|█▋        | 45/267 [00:08<00:42,  5.25batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 55, 768])


train:  18%|█▊        | 47/267 [00:08<00:44,  4.92batch/s]

torch.Size([32, 60, 768])


train:  18%|█▊        | 48/267 [00:09<00:43,  5.07batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  19%|█▊        | 50/267 [00:09<00:40,  5.42batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 46, 768])


train:  19%|█▉        | 52/267 [00:09<00:40,  5.32batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  20%|██        | 54/267 [00:10<00:41,  5.07batch/s]

torch.Size([32, 59, 768])


train:  21%|██        | 55/267 [00:10<00:40,  5.26batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  21%|██▏       | 57/267 [00:10<00:40,  5.25batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


train:  22%|██▏       | 59/267 [00:11<00:38,  5.44batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 50, 768])


train:  23%|██▎       | 61/267 [00:11<00:39,  5.16batch/s]

torch.Size([32, 56, 768])


train:  23%|██▎       | 62/267 [00:11<00:40,  5.08batch/s]

torch.Size([32, 54, 768])
torch.Size([32, 48, 768])


train:  24%|██▍       | 64/267 [00:12<00:38,  5.25batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  25%|██▍       | 66/267 [00:12<00:36,  5.45batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:  25%|██▌       | 68/267 [00:12<00:35,  5.59batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 56, 768])


train:  26%|██▌       | 70/267 [00:13<00:36,  5.46batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  27%|██▋       | 72/267 [00:13<00:36,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 41, 768])


train:  28%|██▊       | 74/267 [00:14<00:35,  5.42batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 58, 768])


train:  28%|██▊       | 76/267 [00:14<00:35,  5.36batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 58, 768])


train:  29%|██▉       | 78/267 [00:14<00:35,  5.36batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 55, 768])


train:  30%|██▉       | 80/267 [00:15<00:36,  5.10batch/s]

torch.Size([32, 54, 768])


train:  30%|███       | 81/267 [00:15<00:35,  5.24batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  31%|███       | 83/267 [00:15<00:33,  5.44batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  32%|███▏      | 85/267 [00:16<00:35,  5.13batch/s]

torch.Size([32, 56, 768])


train:  32%|███▏      | 86/267 [00:16<00:35,  5.16batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])


train:  33%|███▎      | 88/267 [00:16<00:33,  5.33batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 35, 768])


train:  34%|███▎      | 90/267 [00:16<00:30,  5.80batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 39, 768])


train:  34%|███▍      | 92/267 [00:17<00:29,  5.91batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


train:  35%|███▌      | 94/267 [00:17<00:30,  5.66batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 39, 768])


train:  36%|███▌      | 96/267 [00:18<00:30,  5.54batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


train:  37%|███▋      | 98/267 [00:18<00:31,  5.43batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 41, 768])


train:  37%|███▋      | 100/267 [00:18<00:30,  5.55batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  38%|███▊      | 102/267 [00:19<00:30,  5.37batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 44, 768])


train:  39%|███▉      | 104/267 [00:19<00:29,  5.47batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  40%|███▉      | 106/267 [00:19<00:29,  5.49batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 39, 768])


train:  40%|████      | 108/267 [00:20<00:29,  5.42batch/s]

torch.Size([32, 54, 768])


train:  41%|████      | 109/267 [00:20<00:29,  5.42batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


train:  42%|████▏     | 111/267 [00:20<00:29,  5.23batch/s]

torch.Size([32, 53, 768])


train:  42%|████▏     | 112/267 [00:21<00:30,  5.14batch/s]

torch.Size([32, 54, 768])


train:  42%|████▏     | 113/267 [00:21<00:29,  5.21batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  43%|████▎     | 115/267 [00:21<00:29,  5.18batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  44%|████▍     | 117/267 [00:21<00:28,  5.32batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 54, 768])


train:  45%|████▍     | 119/267 [00:22<00:27,  5.42batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 44, 768])


train:  45%|████▌     | 121/267 [00:22<00:26,  5.54batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  46%|████▌     | 123/267 [00:23<00:25,  5.59batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  47%|████▋     | 125/267 [00:23<00:24,  5.72batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  48%|████▊     | 127/267 [00:23<00:26,  5.24batch/s]

torch.Size([32, 60, 768])


train:  48%|████▊     | 128/267 [00:23<00:25,  5.40batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  49%|████▊     | 130/267 [00:24<00:26,  5.19batch/s]

torch.Size([32, 57, 768])


train:  49%|████▉     | 131/267 [00:24<00:25,  5.31batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 58, 768])


train:  50%|████▉     | 133/267 [00:24<00:26,  5.10batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  51%|█████     | 135/267 [00:25<00:25,  5.17batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  51%|█████▏    | 137/267 [00:25<00:24,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  52%|█████▏    | 139/267 [00:26<00:25,  5.00batch/s]

torch.Size([32, 59, 768])


train:  52%|█████▏    | 140/267 [00:26<00:25,  5.04batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 38, 768])


train:  53%|█████▎    | 142/267 [00:26<00:23,  5.25batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 60, 768])


train:  54%|█████▍    | 144/267 [00:27<00:23,  5.17batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 43, 768])


train:  55%|█████▍    | 146/267 [00:27<00:21,  5.50batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 41, 768])


train:  55%|█████▌    | 148/267 [00:27<00:21,  5.61batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  56%|█████▌    | 150/267 [00:28<00:21,  5.43batch/s]

torch.Size([32, 53, 768])


train:  57%|█████▋    | 151/267 [00:28<00:20,  5.58batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 41, 768])


train:  57%|█████▋    | 153/267 [00:28<00:19,  5.80batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 49, 768])


train:  58%|█████▊    | 155/267 [00:29<00:20,  5.59batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  59%|█████▉    | 157/267 [00:29<00:20,  5.40batch/s]

torch.Size([32, 54, 768])


train:  59%|█████▉    | 158/267 [00:29<00:20,  5.26batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 46, 768])


train:  60%|█████▉    | 160/267 [00:29<00:20,  5.28batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])


train:  61%|██████    | 162/267 [00:30<00:20,  5.12batch/s]

torch.Size([32, 58, 768])


train:  61%|██████    | 163/267 [00:30<00:20,  5.14batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  62%|██████▏   | 165/267 [00:30<00:19,  5.21batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 53, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.55batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 45, 768])


train:  63%|██████▎   | 169/267 [00:31<00:17,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  64%|██████▍   | 171/267 [00:32<00:18,  5.23batch/s]

torch.Size([32, 54, 768])


train:  64%|██████▍   | 172/267 [00:32<00:18,  5.16batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 40, 768])


train:  65%|██████▌   | 174/267 [00:32<00:17,  5.44batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  66%|██████▌   | 176/267 [00:32<00:16,  5.54batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 36, 768])


train:  67%|██████▋   | 178/267 [00:33<00:15,  5.73batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  67%|██████▋   | 180/267 [00:33<00:15,  5.52batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  68%|██████▊   | 182/267 [00:34<00:15,  5.41batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  69%|██████▉   | 184/267 [00:34<00:14,  5.61batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 47, 768])


train:  70%|██████▉   | 186/267 [00:34<00:14,  5.68batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 45, 768])


train:  70%|███████   | 188/267 [00:35<00:14,  5.47batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  71%|███████   | 190/267 [00:35<00:13,  5.70batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 51, 768])


train:  72%|███████▏  | 192/267 [00:35<00:13,  5.49batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.54batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  73%|███████▎  | 196/267 [00:36<00:12,  5.50batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])


train:  74%|███████▍  | 198/267 [00:36<00:12,  5.60batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


train:  75%|███████▍  | 200/267 [00:37<00:12,  5.51batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


train:  76%|███████▌  | 202/267 [00:37<00:11,  5.66batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 46, 768])


train:  76%|███████▋  | 204/267 [00:38<00:11,  5.37batch/s]

torch.Size([32, 56, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.31batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 40, 768])


train:  78%|███████▊  | 207/267 [00:38<00:11,  5.38batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  78%|███████▊  | 209/267 [00:38<00:10,  5.31batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  79%|███████▉  | 211/267 [00:39<00:11,  5.07batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 43, 768])


train:  80%|███████▉  | 213/267 [00:39<00:09,  5.51batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 51, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.40batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 60, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.21batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:  82%|████████▏ | 219/267 [00:40<00:08,  5.47batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 45, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.72batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 59, 768])


train:  83%|████████▎ | 222/267 [00:41<00:08,  5.31batch/s]

torch.Size([32, 64, 768])


train:  84%|████████▍ | 224/267 [00:41<00:08,  4.98batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.13batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.16batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  86%|████████▌ | 230/267 [00:42<00:06,  5.41batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.29batch/s]

torch.Size([32, 56, 768])


train:  87%|████████▋ | 233/267 [00:43<00:06,  5.16batch/s]

torch.Size([32, 54, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.34batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.26batch/s]

torch.Size([32, 53, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.22batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.03batch/s]

torch.Size([32, 56, 768])


train:  90%|████████▉ | 240/267 [00:44<00:05,  5.25batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.26batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  5.24batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


train:  92%|█████████▏| 246/267 [00:46<00:04,  5.13batch/s]

torch.Size([32, 54, 768])


train:  93%|█████████▎| 247/267 [00:46<00:03,  5.16batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 57, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.11batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  94%|█████████▍| 251/267 [00:46<00:03,  5.28batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 36, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.69batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.88batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 45, 768])


train:  96%|█████████▋| 257/267 [00:48<00:01,  5.47batch/s]

torch.Size([32, 56, 768])


train:  97%|█████████▋| 258/267 [00:48<00:01,  5.61batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 57, 768])


train:  97%|█████████▋| 260/267 [00:48<00:01,  5.08batch/s]

torch.Size([32, 57, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.28batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 64, 768])


train:  99%|█████████▊| 263/267 [00:49<00:00,  5.09batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.37batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 55, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.35batch/s]
2020-09-01 06:43:04.234 | INFO     | __main__:train:39 - epoch: 7, transformer: gpt2, train_loss: 0.0306, train_acc: 57.65
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


dev:  11%|█▏        | 4/35 [00:00<00:01, 15.78batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 58, 768])
torch.Size([32, 45, 768])


dev:  23%|██▎       | 8/35 [00:00<00:01, 16.45batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])
torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


dev:  34%|███▍      | 12/35 [00:00<00:01, 16.19batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 49, 768])
torch.Size([32, 51, 768])
torch.Size([32, 36, 768])


dev:  46%|████▌     | 16/35 [00:00<00:01, 16.62batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


dev:  57%|█████▋    | 20/35 [00:01<00:00, 17.42batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])
torch.Size([32, 36, 768])
torch.Size([32, 52, 768])


dev:  69%|██████▊   | 24/35 [00:01<00:00, 16.73batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])
torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


dev:  80%|████████  | 28/35 [00:01<00:00, 16.69batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


dev:  91%|█████████▏| 32/35 [00:01<00:00, 16.52batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.89batch/s]
2020-09-01 06:43:06.319 | INFO     | __main__:train:42 - epoch: 7, transformer: gpt2, dev_loss: 0.0364, dev_acc: 49.23
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 46, 768])
torch.Size([13, 36, 768])
torch.Size([32, 52, 768])
torch.Size([32, 43, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 16.22batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 55, 768])
torch.Size([32, 53, 768])
torch.Size([32, 55, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 16.22batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 45, 768])
torch.Size([32, 52, 768])
torch.Size([32, 46, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.64batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 16.93batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


test:  29%|██▊       | 20/70 [00:01<00:03, 16.62batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.12batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 53, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.68batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 45, 768])
torch.Size([32, 53, 768])
torch.Size([32, 56, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 15.82batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 46, 768])
torch.Size([32, 43, 768])
torch.Size([32, 42, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.99batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 43, 768])
torch.Size([32, 52, 768])
torch.Size([32, 37, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.47batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])
torch.Size([32, 56, 768])
torch.Size([32, 40, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.60batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 52, 768])
torch.Size([32, 56, 768])
torch.Size([32, 42, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.43batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])
torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])
torch.Size([32, 53, 768])
torch.Size([32, 48, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.35batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 39, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.53batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])
torch.Size([32, 52, 768])
torch.Size([32, 44, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 15.72batch/s]

torch.Size([32, 62, 768])
torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


test:  97%|█████████▋| 68/70 [00:04<00:00, 16.19batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 53, 768])
torch.Size([32, 42, 768])
torch.Size([32, 42, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.61batch/s]
2020-09-01 06:43:10.552 | INFO     | __main__:train:46 - epoch: 7, transformer: gpt2, test_loss: 0.0342, test_acc: 53.26
2020-09-01 06:43:10.553 | INFO     | __main__:train:47 - epoch: 7, transformer: gpt2, test_precision: 52.67, test_recall: 50.12, test_f1_score: 50.57, test_accuracy_score: 53.26
2020-09-01 06:43:10.554 | INFO     | __main__:train:52 - epoch: 7, transformer: gpt2, test_confusion_matrix: 
[[ 90 154  24  10   1]
 [ 63 418  98  48   6]
 [  9 144 117 108  11]
 [  0  31  59 295 125]
 [  0   7  16 119 257]]
2020-09-01 06:43:10.555 | INFO     | __main__:train:55 - Total training time elapsed: 0:05:49.908388
2020-09-01 06:43:10.556 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.986913
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([2, 24, 768])
torch.Size([32, 47, 768])


train:   1%|          | 2/267 [00:00<00:49,  5.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


train:   1%|▏         | 4/267 [00:00<00:51,  5.10batch/s]

torch.Size([32, 57, 768])


train:   2%|▏         | 5/267 [00:00<00:52,  4.95batch/s]

torch.Size([32, 58, 768])


train:   2%|▏         | 6/267 [00:01<00:51,  5.12batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 37, 768])


train:   3%|▎         | 8/267 [00:01<00:46,  5.54batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 44, 768])


train:   4%|▎         | 10/267 [00:01<00:44,  5.72batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 35, 768])


train:   4%|▍         | 12/267 [00:02<00:44,  5.72batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 60, 768])


train:   5%|▌         | 14/267 [00:02<00:44,  5.68batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 56, 768])


train:   6%|▌         | 16/267 [00:02<00:46,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:   7%|▋         | 18/267 [00:03<00:42,  5.80batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 36, 768])


train:   7%|▋         | 20/267 [00:03<00:40,  6.03batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 42, 768])


train:   8%|▊         | 22/267 [00:03<00:40,  5.99batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 59, 768])


train:   9%|▉         | 24/267 [00:04<00:43,  5.60batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 39, 768])


train:  10%|▉         | 26/267 [00:04<00:44,  5.47batch/s]

torch.Size([32, 54, 768])


train:  10%|█         | 27/267 [00:04<00:42,  5.60batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 51, 768])


train:  11%|█         | 29/267 [00:05<00:42,  5.56batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 44, 768])


train:  12%|█▏        | 31/267 [00:05<00:43,  5.41batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 43, 768])


train:  12%|█▏        | 33/267 [00:05<00:43,  5.41batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])


train:  13%|█▎        | 35/267 [00:06<00:43,  5.30batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 59, 768])


train:  14%|█▍        | 37/267 [00:06<00:43,  5.31batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  15%|█▍        | 39/267 [00:07<00:44,  5.14batch/s]

torch.Size([32, 60, 768])


train:  15%|█▍        | 40/267 [00:07<00:43,  5.25batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


train:  16%|█▌        | 42/267 [00:07<00:41,  5.45batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  16%|█▋        | 44/267 [00:08<00:43,  5.09batch/s]

torch.Size([32, 58, 768])


train:  17%|█▋        | 45/267 [00:08<00:41,  5.32batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train:  18%|█▊        | 47/267 [00:08<00:42,  5.23batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 57, 768])


train:  18%|█▊        | 49/267 [00:09<00:43,  5.00batch/s]

torch.Size([32, 54, 768])


train:  19%|█▊        | 50/267 [00:09<00:43,  4.99batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 40, 768])


train:  19%|█▉        | 52/267 [00:09<00:40,  5.25batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


train:  20%|██        | 54/267 [00:09<00:41,  5.11batch/s]

torch.Size([32, 54, 768])


train:  21%|██        | 55/267 [00:10<00:41,  5.13batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  21%|██▏       | 57/267 [00:10<00:38,  5.45batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  22%|██▏       | 59/267 [00:10<00:39,  5.23batch/s]

torch.Size([32, 59, 768])


train:  22%|██▏       | 60/267 [00:11<00:38,  5.44batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 45, 768])


train:  23%|██▎       | 62/267 [00:11<00:38,  5.36batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  24%|██▍       | 64/267 [00:11<00:37,  5.47batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:  25%|██▍       | 66/267 [00:12<00:37,  5.31batch/s]

torch.Size([32, 54, 768])


train:  25%|██▌       | 67/267 [00:12<00:37,  5.39batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 46, 768])


train:  26%|██▌       | 69/267 [00:12<00:36,  5.37batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 59, 768])


train:  27%|██▋       | 71/267 [00:13<00:37,  5.21batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  27%|██▋       | 73/267 [00:13<00:36,  5.24batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  28%|██▊       | 75/267 [00:13<00:36,  5.21batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  29%|██▉       | 77/267 [00:14<00:36,  5.15batch/s]

torch.Size([32, 55, 768])


train:  29%|██▉       | 78/267 [00:14<00:36,  5.13batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  30%|██▉       | 80/267 [00:14<00:34,  5.41batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 37, 768])


train:  31%|███       | 82/267 [00:15<00:33,  5.47batch/s]

torch.Size([32, 52, 768])


train:  31%|███       | 83/267 [00:15<00:34,  5.29batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 36, 768])


train:  32%|███▏      | 85/267 [00:15<00:33,  5.48batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  33%|███▎      | 87/267 [00:16<00:32,  5.55batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 54, 768])


train:  33%|███▎      | 89/267 [00:16<00:31,  5.59batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 46, 768])


train:  34%|███▍      | 91/267 [00:16<00:31,  5.64batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 42, 768])


train:  35%|███▍      | 93/267 [00:17<00:30,  5.62batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  36%|███▌      | 95/267 [00:17<00:32,  5.34batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  36%|███▋      | 97/267 [00:17<00:32,  5.17batch/s]

torch.Size([32, 57, 768])


train:  37%|███▋      | 98/267 [00:18<00:32,  5.28batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 40, 768])


train:  37%|███▋      | 100/267 [00:18<00:30,  5.46batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 44, 768])


train:  38%|███▊      | 102/267 [00:18<00:30,  5.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:  39%|███▉      | 104/267 [00:19<00:30,  5.26batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


train:  40%|███▉      | 106/267 [00:19<00:31,  5.05batch/s]

torch.Size([32, 58, 768])


train:  40%|████      | 107/267 [00:19<00:31,  5.00batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


train:  41%|████      | 109/267 [00:20<00:29,  5.29batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


train:  42%|████▏     | 111/267 [00:20<00:29,  5.31batch/s]

torch.Size([32, 54, 768])


train:  42%|████▏     | 112/267 [00:20<00:28,  5.53batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 41, 768])


train:  43%|████▎     | 114/267 [00:21<00:26,  5.75batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 55, 768])


train:  43%|████▎     | 116/267 [00:21<00:27,  5.55batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 41, 768])


train:  44%|████▍     | 118/267 [00:21<00:27,  5.51batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])


train:  45%|████▍     | 120/267 [00:22<00:26,  5.53batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


train:  46%|████▌     | 122/267 [00:22<00:26,  5.47batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  46%|████▋     | 124/267 [00:22<00:27,  5.29batch/s]

torch.Size([32, 55, 768])


train:  47%|████▋     | 125/267 [00:23<00:27,  5.21batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 41, 768])


train:  48%|████▊     | 127/267 [00:23<00:26,  5.27batch/s]

torch.Size([32, 54, 768])


train:  48%|████▊     | 128/267 [00:23<00:25,  5.46batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  49%|████▊     | 130/267 [00:24<00:25,  5.43batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])


train:  49%|████▉     | 132/267 [00:24<00:26,  5.16batch/s]

torch.Size([32, 56, 768])


train:  50%|████▉     | 133/267 [00:24<00:25,  5.27batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 50, 768])


train:  51%|█████     | 135/267 [00:25<00:25,  5.19batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  51%|█████▏    | 137/267 [00:25<00:25,  5.20batch/s]

torch.Size([32, 53, 768])


train:  52%|█████▏    | 138/267 [00:25<00:25,  5.16batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  52%|█████▏    | 140/267 [00:26<00:24,  5.20batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  53%|█████▎    | 142/267 [00:26<00:24,  5.09batch/s]

torch.Size([32, 59, 768])


train:  54%|█████▎    | 143/267 [00:26<00:23,  5.17batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  54%|█████▍    | 145/267 [00:26<00:22,  5.36batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  55%|█████▌    | 147/267 [00:27<00:21,  5.47batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  56%|█████▌    | 149/267 [00:27<00:23,  5.09batch/s]

torch.Size([32, 59, 768])


train:  56%|█████▌    | 150/267 [00:27<00:22,  5.20batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  57%|█████▋    | 152/267 [00:28<00:21,  5.33batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 60, 768])


train:  58%|█████▊    | 154/267 [00:28<00:21,  5.17batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 37, 768])


train:  58%|█████▊    | 156/267 [00:29<00:19,  5.61batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  59%|█████▉    | 157/267 [00:29<00:19,  5.59batch/s]

torch.Size([32, 64, 768])


train:  60%|█████▉    | 159/267 [00:29<00:21,  4.93batch/s]

torch.Size([32, 60, 768])


train:  60%|█████▉    | 160/267 [00:29<00:21,  5.06batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  61%|██████    | 162/267 [00:30<00:20,  5.22batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  61%|██████▏   | 164/267 [00:30<00:19,  5.24batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  62%|██████▏   | 166/267 [00:30<00:19,  5.12batch/s]

torch.Size([32, 54, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.37batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 56, 768])


train:  63%|██████▎   | 169/267 [00:31<00:18,  5.27batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  64%|██████▍   | 171/267 [00:31<00:17,  5.38batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 54, 768])


train:  65%|██████▍   | 173/267 [00:32<00:17,  5.51batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 42, 768])


train:  66%|██████▌   | 175/267 [00:32<00:16,  5.56batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


train:  66%|██████▋   | 177/267 [00:32<00:16,  5.43batch/s]

torch.Size([32, 53, 768])


train:  67%|██████▋   | 178/267 [00:33<00:16,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  67%|██████▋   | 180/267 [00:33<00:15,  5.46batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  68%|██████▊   | 182/267 [00:33<00:16,  5.30batch/s]

torch.Size([32, 53, 768])


train:  69%|██████▊   | 183/267 [00:34<00:15,  5.33batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  69%|██████▉   | 185/267 [00:34<00:14,  5.68batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 50, 768])


train:  70%|███████   | 187/267 [00:34<00:15,  5.32batch/s]

torch.Size([32, 54, 768])


train:  70%|███████   | 188/267 [00:35<00:15,  5.10batch/s]

torch.Size([32, 57, 768])


train:  71%|███████   | 189/267 [00:35<00:14,  5.21batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


train:  72%|███████▏  | 191/267 [00:35<00:14,  5.32batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  72%|███████▏  | 193/267 [00:35<00:13,  5.54batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  73%|███████▎  | 195/267 [00:36<00:13,  5.49batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  74%|███████▍  | 197/267 [00:36<00:12,  5.52batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 50, 768])


train:  75%|███████▍  | 199/267 [00:37<00:12,  5.30batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.22batch/s]

torch.Size([32, 53, 768])


train:  76%|███████▌  | 202/267 [00:37<00:12,  5.30batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  76%|███████▋  | 204/267 [00:38<00:11,  5.26batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 41, 768])


train:  77%|███████▋  | 206/267 [00:38<00:10,  5.56batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  78%|███████▊  | 208/267 [00:38<00:10,  5.52batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  79%|███████▊  | 210/267 [00:39<00:10,  5.19batch/s]

torch.Size([32, 58, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.25batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  80%|███████▉  | 213/267 [00:39<00:10,  5.08batch/s]

torch.Size([32, 57, 768])


train:  80%|████████  | 214/267 [00:39<00:10,  5.03batch/s]

torch.Size([32, 54, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.24batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.18batch/s]

torch.Size([32, 56, 768])


train:  82%|████████▏ | 218/267 [00:40<00:09,  5.40batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  82%|████████▏ | 220/267 [00:41<00:08,  5.48batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 42, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.60batch/s]

torch.Size([32, 64, 768])


train:  84%|████████▎ | 223/267 [00:41<00:08,  5.36batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 39, 768])


train:  84%|████████▍ | 225/267 [00:41<00:07,  5.69batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


train:  85%|████████▌ | 227/267 [00:42<00:07,  5.57batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:  86%|████████▌ | 229/267 [00:42<00:07,  5.19batch/s]

torch.Size([32, 58, 768])


train:  86%|████████▌ | 230/267 [00:42<00:06,  5.37batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 54, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.32batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.15batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 43, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.18batch/s]

torch.Size([32, 56, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.36batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.28batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 46, 768])


train:  90%|█████████ | 241/267 [00:44<00:04,  5.43batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 59, 768])


train:  91%|█████████ | 243/267 [00:45<00:04,  5.16batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 46, 768])


train:  92%|█████████▏| 245/267 [00:45<00:04,  5.45batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 50, 768])


train:  93%|█████████▎| 247/267 [00:46<00:03,  5.56batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.63batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 39, 768])


train:  94%|█████████▍| 251/267 [00:46<00:02,  5.81batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.49batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.29batch/s]

torch.Size([32, 54, 768])


train:  96%|█████████▌| 256/267 [00:47<00:02,  5.45batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 54, 768])


train:  97%|█████████▋| 258/267 [00:48<00:01,  5.15batch/s]

torch.Size([32, 55, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.07batch/s]

torch.Size([32, 54, 768])


train:  97%|█████████▋| 260/267 [00:48<00:01,  5.39batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 51, 768])


train:  98%|█████████▊| 262/267 [00:48<00:00,  5.46batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train:  99%|█████████▉| 264/267 [00:49<00:00,  5.40batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train: 100%|█████████▉| 266/267 [00:49<00:00,  5.37batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.37batch/s]
2020-09-01 06:44:00.311 | INFO     | __main__:train:39 - epoch: 8, transformer: gpt2, train_loss: 0.0292, train_acc: 59.53
dev:   6%|▌         | 2/35 [00:00<00:02, 16.02batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 48, 768])
torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


dev:  17%|█▋        | 6/35 [00:00<00:01, 16.06batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 38, 768])
torch.Size([32, 46, 768])
torch.Size([32, 46, 768])


dev:  29%|██▊       | 10/35 [00:00<00:01, 16.71batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


dev:  40%|████      | 14/35 [00:00<00:01, 16.20batch/s]

torch.Size([32, 54, 768])
torch.Size([32, 50, 768])
torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


dev:  51%|█████▏    | 18/35 [00:01<00:01, 16.57batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])
torch.Size([32, 43, 768])
torch.Size([32, 45, 768])


dev:  63%|██████▎   | 22/35 [00:01<00:00, 17.09batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 41, 768])
torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


dev:  74%|███████▍  | 26/35 [00:01<00:00, 17.36batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])
torch.Size([32, 53, 768])
torch.Size([32, 58, 768])


dev:  86%|████████▌ | 30/35 [00:01<00:00, 16.57batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])
torch.Size([32, 47, 768])
torch.Size([32, 40, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.95batch/s]
2020-09-01 06:44:02.389 | INFO     | __main__:train:42 - epoch: 8, transformer: gpt2, dev_loss: 0.0370, dev_acc: 48.41
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 46, 768])
torch.Size([13, 45, 768])
torch.Size([32, 53, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 15.43batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 48, 768])
torch.Size([32, 44, 768])
torch.Size([32, 58, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 15.79batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 39, 768])
torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.34batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])
torch.Size([32, 47, 768])
torch.Size([32, 37, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 16.61batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])
torch.Size([32, 53, 768])
torch.Size([32, 55, 768])


test:  29%|██▊       | 20/70 [00:01<00:02, 17.03batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 43, 768])
torch.Size([32, 42, 768])
torch.Size([32, 54, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.74batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])
torch.Size([32, 44, 768])
torch.Size([32, 42, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.83batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])
torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.34batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])
torch.Size([32, 56, 768])
torch.Size([32, 40, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.05batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])
torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.01batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])
torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 15.44batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])
torch.Size([32, 62, 768])


test:  66%|██████▌   | 46/70 [00:02<00:01, 15.79batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 37, 768])
torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


test:  71%|███████▏  | 50/70 [00:03<00:01, 16.14batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


test:  77%|███████▋  | 54/70 [00:03<00:00, 16.28batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 50, 768])
torch.Size([32, 52, 768])
torch.Size([32, 35, 768])


test:  83%|████████▎ | 58/70 [00:03<00:00, 16.75batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


test:  89%|████████▊ | 62/70 [00:03<00:00, 16.35batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 56, 768])
torch.Size([32, 53, 768])
torch.Size([32, 37, 768])


test:  94%|█████████▍| 66/70 [00:04<00:00, 16.53batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 39, 768])
torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.63batch/s]
2020-09-01 06:44:06.616 | INFO     | __main__:train:46 - epoch: 8, transformer: gpt2, test_loss: 0.0359, test_acc: 50.54
2020-09-01 06:44:06.616 | INFO     | __main__:train:47 - epoch: 8, transformer: gpt2, test_precision: 51.83, test_recall: 49.01, test_f1_score: 49.60, test_accuracy_score: 50.54
2020-09-01 06:44:06.617 | INFO     | __main__:train:52 - epoch: 8, transformer: gpt2, test_confusion_matrix: 
[[106 123  30  19   1]
 [ 78 310 169  73   3]
 [ 10  96 141 134   8]
 [  1  10  65 339  95]
 [  1   2  11 164 221]]
2020-09-01 06:44:06.618 | INFO     | __main__:train:55 - Total training time elapsed: 0:06:39.662056
2020-09-01 06:44:06.619 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.957757
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 44, 768])
torch.Size([2, 34, 768])
torch.Size([32, 54, 768])


train:   1%|          | 2/267 [00:00<00:49,  5.38batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 38, 768])


train:   1%|▏         | 4/267 [00:00<00:46,  5.60batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 41, 768])


train:   2%|▏         | 6/267 [00:01<00:45,  5.75batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 39, 768])


train:   3%|▎         | 8/267 [00:01<00:46,  5.59batch/s]

torch.Size([32, 52, 768])


train:   3%|▎         | 9/267 [00:01<00:46,  5.59batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:   4%|▍         | 11/267 [00:01<00:48,  5.27batch/s]

torch.Size([32, 53, 768])


train:   4%|▍         | 12/267 [00:02<00:47,  5.32batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 56, 768])


train:   5%|▌         | 14/267 [00:02<00:50,  4.98batch/s]

torch.Size([32, 59, 768])


train:   6%|▌         | 15/267 [00:02<00:50,  5.02batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:   6%|▋         | 17/267 [00:03<00:49,  5.01batch/s]

torch.Size([32, 54, 768])


train:   7%|▋         | 18/267 [00:03<00:50,  4.94batch/s]

torch.Size([32, 56, 768])


train:   7%|▋         | 19/267 [00:03<00:48,  5.08batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:   8%|▊         | 21/267 [00:03<00:46,  5.27batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:   9%|▊         | 23/267 [00:04<00:45,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:   9%|▉         | 25/267 [00:04<00:45,  5.34batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  10%|█         | 27/267 [00:05<00:44,  5.43batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  11%|█         | 29/267 [00:05<00:44,  5.30batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


train:  12%|█▏        | 31/267 [00:05<00:44,  5.35batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  12%|█▏        | 33/267 [00:06<00:44,  5.23batch/s]

torch.Size([32, 54, 768])


train:  13%|█▎        | 34/267 [00:06<00:44,  5.28batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  13%|█▎        | 36/267 [00:06<00:42,  5.46batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  14%|█▍        | 38/267 [00:07<00:42,  5.37batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])


train:  15%|█▍        | 40/267 [00:07<00:39,  5.73batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 51, 768])


train:  16%|█▌        | 42/267 [00:07<00:42,  5.32batch/s]

torch.Size([32, 54, 768])


train:  16%|█▌        | 43/267 [00:08<00:42,  5.21batch/s]

torch.Size([32, 53, 768])


train:  16%|█▋        | 44/267 [00:08<00:43,  5.16batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 44, 768])


train:  17%|█▋        | 46/267 [00:08<00:39,  5.56batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 46, 768])


train:  18%|█▊        | 48/267 [00:08<00:39,  5.56batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 37, 768])


train:  19%|█▊        | 50/267 [00:09<00:38,  5.71batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:  19%|█▉        | 52/267 [00:09<00:41,  5.22batch/s]

torch.Size([32, 57, 768])


train:  20%|█▉        | 53/267 [00:09<00:42,  4.98batch/s]

torch.Size([32, 60, 768])


train:  20%|██        | 54/267 [00:10<00:41,  5.09batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  21%|██        | 56/267 [00:10<00:40,  5.25batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  22%|██▏       | 58/267 [00:10<00:39,  5.23batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


train:  22%|██▏       | 60/267 [00:11<00:38,  5.38batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 39, 768])


train:  23%|██▎       | 62/267 [00:11<00:36,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 50, 768])


train:  24%|██▍       | 64/267 [00:11<00:38,  5.31batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  25%|██▍       | 66/267 [00:12<00:36,  5.52batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 42, 768])


train:  25%|██▌       | 68/267 [00:12<00:37,  5.26batch/s]

torch.Size([32, 59, 768])


train:  26%|██▌       | 69/267 [00:12<00:35,  5.52batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 43, 768])


train:  27%|██▋       | 71/267 [00:13<00:36,  5.35batch/s]

torch.Size([32, 56, 768])


train:  27%|██▋       | 72/267 [00:13<00:35,  5.49batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:  28%|██▊       | 74/267 [00:13<00:34,  5.53batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 40, 768])


train:  28%|██▊       | 76/267 [00:14<00:34,  5.53batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 58, 768])


train:  29%|██▉       | 78/267 [00:14<00:37,  5.09batch/s]

torch.Size([32, 56, 768])


train:  30%|██▉       | 79/267 [00:14<00:38,  4.91batch/s]

torch.Size([32, 60, 768])


train:  30%|██▉       | 80/267 [00:14<00:37,  4.93batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 40, 768])


train:  31%|███       | 82/267 [00:15<00:33,  5.48batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 47, 768])


train:  31%|███▏      | 84/267 [00:15<00:33,  5.48batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  32%|███▏      | 86/267 [00:15<00:32,  5.62batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 46, 768])


train:  33%|███▎      | 88/267 [00:16<00:33,  5.42batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


train:  34%|███▎      | 90/267 [00:16<00:32,  5.37batch/s]

torch.Size([32, 55, 768])


train:  34%|███▍      | 91/267 [00:16<00:32,  5.39batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 54, 768])


train:  35%|███▍      | 93/267 [00:17<00:34,  5.01batch/s]

torch.Size([32, 59, 768])


train:  35%|███▌      | 94/267 [00:17<00:33,  5.23batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:  36%|███▌      | 96/267 [00:17<00:32,  5.32batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 41, 768])


train:  37%|███▋      | 98/267 [00:18<00:30,  5.49batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  37%|███▋      | 100/267 [00:18<00:31,  5.22batch/s]

torch.Size([32, 54, 768])


train:  38%|███▊      | 101/267 [00:18<00:32,  5.10batch/s]

torch.Size([32, 55, 768])


train:  38%|███▊      | 102/267 [00:19<00:31,  5.23batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 39, 768])


train:  39%|███▉      | 104/267 [00:19<00:30,  5.36batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  40%|███▉      | 106/267 [00:19<00:30,  5.30batch/s]

torch.Size([32, 54, 768])


train:  40%|████      | 107/267 [00:19<00:30,  5.22batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 46, 768])


train:  41%|████      | 109/267 [00:20<00:30,  5.20batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


train:  42%|████▏     | 111/267 [00:20<00:27,  5.74batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 40, 768])


train:  42%|████▏     | 113/267 [00:21<00:28,  5.50batch/s]

torch.Size([32, 56, 768])


train:  43%|████▎     | 114/267 [00:21<00:27,  5.58batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  43%|████▎     | 116/267 [00:21<00:28,  5.24batch/s]

torch.Size([32, 58, 768])


train:  44%|████▍     | 117/267 [00:21<00:28,  5.29batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  45%|████▍     | 119/267 [00:22<00:27,  5.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  45%|████▌     | 121/267 [00:22<00:27,  5.31batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  46%|████▌     | 123/267 [00:22<00:27,  5.31batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  47%|████▋     | 125/267 [00:23<00:27,  5.10batch/s]

torch.Size([32, 58, 768])


train:  47%|████▋     | 126/267 [00:23<00:27,  5.22batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  48%|████▊     | 128/267 [00:23<00:25,  5.38batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 51, 768])


train:  49%|████▊     | 130/267 [00:24<00:25,  5.45batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 59, 768])


train:  49%|████▉     | 132/267 [00:24<00:25,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 40, 768])


train:  50%|█████     | 134/267 [00:24<00:23,  5.62batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  51%|█████     | 136/267 [00:25<00:23,  5.60batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  52%|█████▏    | 138/267 [00:25<00:22,  5.77batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 45, 768])


train:  52%|█████▏    | 140/267 [00:26<00:23,  5.51batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  53%|█████▎    | 142/267 [00:26<00:22,  5.67batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 58, 768])


train:  54%|█████▍    | 144/267 [00:26<00:23,  5.26batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 43, 768])


train:  55%|█████▍    | 146/267 [00:27<00:22,  5.26batch/s]

torch.Size([32, 54, 768])


train:  55%|█████▌    | 147/267 [00:27<00:23,  5.21batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


train:  56%|█████▌    | 149/267 [00:27<00:21,  5.61batch/s]

torch.Size([32, 31, 768])
torch.Size([32, 46, 768])


train:  57%|█████▋    | 151/267 [00:28<00:22,  5.22batch/s]

torch.Size([32, 60, 768])


train:  57%|█████▋    | 152/267 [00:28<00:21,  5.31batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 49, 768])


train:  58%|█████▊    | 154/267 [00:28<00:22,  5.13batch/s]

torch.Size([32, 56, 768])


train:  58%|█████▊    | 155/267 [00:28<00:21,  5.21batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  59%|█████▉    | 157/267 [00:29<00:21,  5.00batch/s]

torch.Size([32, 53, 768])


train:  59%|█████▉    | 158/267 [00:29<00:21,  5.05batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 45, 768])


train:  60%|█████▉    | 160/267 [00:29<00:19,  5.40batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 59, 768])


train:  61%|██████    | 162/267 [00:30<00:18,  5.57batch/s]

torch.Size([32, 33, 768])
torch.Size([32, 54, 768])


train:  61%|██████    | 163/267 [00:30<00:19,  5.36batch/s]

torch.Size([32, 64, 768])


train:  62%|██████▏   | 165/267 [00:30<00:19,  5.15batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  63%|██████▎   | 169/267 [00:31<00:17,  5.53batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 59, 768])


train:  64%|██████▍   | 171/267 [00:31<00:19,  5.03batch/s]

torch.Size([32, 57, 768])


train:  64%|██████▍   | 172/267 [00:32<00:19,  5.00batch/s]

torch.Size([32, 53, 768])


train:  65%|██████▍   | 173/267 [00:32<00:18,  5.02batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


train:  66%|██████▌   | 175/267 [00:32<00:16,  5.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 49, 768])


train:  66%|██████▋   | 177/267 [00:33<00:17,  5.26batch/s]

torch.Size([32, 53, 768])


train:  67%|██████▋   | 178/267 [00:33<00:17,  5.20batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


train:  67%|██████▋   | 180/267 [00:33<00:16,  5.39batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:  68%|██████▊   | 182/267 [00:33<00:15,  5.50batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


train:  69%|██████▉   | 184/267 [00:34<00:14,  5.68batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 54, 768])


train:  70%|██████▉   | 186/267 [00:34<00:15,  5.15batch/s]

torch.Size([32, 58, 768])


train:  70%|███████   | 187/267 [00:34<00:15,  5.27batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


train:  71%|███████   | 189/267 [00:35<00:14,  5.40batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  72%|███████▏  | 191/267 [00:35<00:13,  5.52batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 41, 768])


train:  72%|███████▏  | 193/267 [00:35<00:13,  5.57batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  73%|███████▎  | 195/267 [00:36<00:13,  5.36batch/s]

torch.Size([32, 53, 768])


train:  73%|███████▎  | 196/267 [00:36<00:13,  5.26batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  74%|███████▍  | 198/267 [00:36<00:12,  5.52batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 41, 768])


train:  75%|███████▍  | 200/267 [00:37<00:11,  5.68batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 56, 768])


train:  76%|███████▌  | 202/267 [00:37<00:12,  5.39batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  76%|███████▋  | 204/267 [00:38<00:11,  5.43batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


train:  77%|███████▋  | 206/267 [00:38<00:11,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  78%|███████▊  | 208/267 [00:38<00:10,  5.41batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])


train:  79%|███████▊  | 210/267 [00:39<00:10,  5.31batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 53, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.43batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


train:  80%|████████  | 214/267 [00:39<00:09,  5.60batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 50, 768])


train:  81%|████████  | 216/267 [00:40<00:09,  5.27batch/s]

torch.Size([32, 54, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.03batch/s]

torch.Size([32, 59, 768])


train:  82%|████████▏ | 218/267 [00:40<00:09,  5.24batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 57, 768])


train:  82%|████████▏ | 220/267 [00:41<00:09,  5.15batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:  83%|████████▎ | 222/267 [00:41<00:08,  5.13batch/s]

torch.Size([32, 53, 768])


train:  84%|████████▎ | 223/267 [00:41<00:08,  5.15batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])


train:  84%|████████▍ | 225/267 [00:42<00:08,  5.12batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  85%|████████▍ | 226/267 [00:42<00:08,  5.11batch/s]

torch.Size([32, 64, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  4.89batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 49, 768])


train:  86%|████████▌ | 230/267 [00:43<00:07,  4.99batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 44, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.11batch/s]

torch.Size([32, 54, 768])


train:  87%|████████▋ | 233/267 [00:43<00:06,  5.29batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:  88%|████████▊ | 235/267 [00:43<00:06,  5.19batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.28batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.47batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 44, 768])


train:  90%|█████████ | 241/267 [00:45<00:04,  5.68batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 45, 768])


train:  91%|█████████ | 243/267 [00:45<00:04,  5.56batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  92%|█████████▏| 245/267 [00:45<00:04,  5.31batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 55, 768])


train:  93%|█████████▎| 247/267 [00:46<00:03,  5.28batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 54, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.00batch/s]

torch.Size([32, 57, 768])


train:  94%|█████████▎| 250/267 [00:46<00:03,  5.20batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 48, 768])


train:  94%|█████████▍| 252/267 [00:47<00:02,  5.33batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 50, 768])


train:  95%|█████████▌| 254/267 [00:47<00:02,  5.65batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 46, 768])


train:  96%|█████████▌| 256/267 [00:47<00:01,  5.52batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 57, 768])


train:  97%|█████████▋| 258/267 [00:48<00:01,  5.12batch/s]

torch.Size([32, 54, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.14batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.01batch/s]

torch.Size([32, 54, 768])


train:  98%|█████████▊| 262/267 [00:49<00:00,  5.16batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  99%|█████████▉| 264/267 [00:49<00:00,  5.42batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 53, 768])


train: 100%|█████████▉| 266/267 [00:49<00:00,  5.48batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 44, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.34batch/s]
2020-09-01 06:44:56.609 | INFO     | __main__:train:39 - epoch: 9, transformer: gpt2, train_loss: 0.0277, train_acc: 62.06
dev:   6%|▌         | 2/35 [00:00<00:02, 16.21batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 43, 768])


dev:  17%|█▋        | 6/35 [00:00<00:01, 16.56batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])
torch.Size([32, 49, 768])
torch.Size([32, 58, 768])


dev:  29%|██▊       | 10/35 [00:00<00:01, 16.45batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])
torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


dev:  40%|████      | 14/35 [00:00<00:01, 16.54batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 46, 768])


dev:  51%|█████▏    | 18/35 [00:01<00:01, 16.65batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])
torch.Size([32, 36, 768])
torch.Size([32, 43, 768])


dev:  63%|██████▎   | 22/35 [00:01<00:00, 16.85batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])
torch.Size([32, 40, 768])
torch.Size([32, 40, 768])


dev:  74%|███████▍  | 26/35 [00:01<00:00, 17.45batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 41, 768])
torch.Size([32, 54, 768])
torch.Size([32, 50, 768])


dev:  86%|████████▌ | 30/35 [00:01<00:00, 16.37batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])
torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.76batch/s]
2020-09-01 06:44:58.710 | INFO     | __main__:train:42 - epoch: 9, transformer: gpt2, dev_loss: 0.0377, dev_acc: 48.41
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])
torch.Size([13, 50, 768])
torch.Size([32, 50, 768])


test:   6%|▌         | 4/70 [00:00<00:04, 14.64batch/s]

torch.Size([32, 62, 768])
torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


test:   9%|▊         | 6/70 [00:00<00:04, 14.97batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])
torch.Size([32, 52, 768])


test:  14%|█▍        | 10/70 [00:00<00:03, 15.39batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 38, 768])
torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


test:  20%|██        | 14/70 [00:00<00:03, 15.89batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])
torch.Size([32, 53, 768])
torch.Size([32, 37, 768])


test:  26%|██▌       | 18/70 [00:01<00:03, 15.92batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 42, 768])
torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


test:  31%|███▏      | 22/70 [00:01<00:02, 16.02batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])
torch.Size([32, 40, 768])
torch.Size([32, 55, 768])


test:  37%|███▋      | 26/70 [00:01<00:02, 15.78batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])
torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


test:  43%|████▎     | 30/70 [00:01<00:02, 16.42batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 41, 768])
torch.Size([32, 49, 768])
torch.Size([32, 45, 768])


test:  49%|████▊     | 34/70 [00:02<00:02, 16.54batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])
torch.Size([32, 43, 768])
torch.Size([32, 53, 768])


test:  54%|█████▍    | 38/70 [00:02<00:01, 16.32batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 53, 768])
torch.Size([32, 47, 768])
torch.Size([32, 56, 768])


test:  60%|██████    | 42/70 [00:02<00:01, 16.57batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 42, 768])
torch.Size([32, 39, 768])
torch.Size([32, 48, 768])


test:  66%|██████▌   | 46/70 [00:02<00:01, 17.00batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 37, 768])
torch.Size([32, 56, 768])
torch.Size([32, 54, 768])


test:  71%|███████▏  | 50/70 [00:03<00:01, 15.96batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])
torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


test:  77%|███████▋  | 54/70 [00:03<00:00, 16.75batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 44, 768])
torch.Size([32, 56, 768])
torch.Size([32, 46, 768])


test:  83%|████████▎ | 58/70 [00:03<00:00, 16.20batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 48, 768])
torch.Size([32, 53, 768])
torch.Size([32, 38, 768])


test:  89%|████████▊ | 62/70 [00:03<00:00, 16.18batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])
torch.Size([32, 49, 768])
torch.Size([32, 39, 768])


test:  94%|█████████▍| 66/70 [00:04<00:00, 16.26batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 43, 768])
torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.40batch/s]
2020-09-01 06:45:02.996 | INFO     | __main__:train:46 - epoch: 9, transformer: gpt2, test_loss: 0.0357, test_acc: 51.49
2020-09-01 06:45:02.996 | INFO     | __main__:train:47 - epoch: 9, transformer: gpt2, test_precision: 51.36, test_recall: 50.64, test_f1_score: 50.76, test_accuracy_score: 51.49
2020-09-01 06:45:02.997 | INFO     | __main__:train:52 - epoch: 9, transformer: gpt2, test_confusion_matrix: 
[[128 114  22  14   1]
 [101 323 136  67   6]
 [ 18 101 133 123  14]
 [  3  16  70 307 114]
 [  1   4   8 139 247]]
2020-09-01 06:45:02.998 | INFO     | __main__:train:55 - Total training time elapsed: 0:07:29.650835
2020-09-01 06:45:02.998 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.961204
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 42, 768])
torch.Size([2, 22, 768])
torch.Size([32, 54, 768])


train:   1%|          | 2/267 [00:00<00:53,  4.92batch/s]

torch.Size([32, 54, 768])


train:   1%|          | 3/267 [00:00<00:54,  4.89batch/s]

torch.Size([32, 56, 768])


train:   1%|▏         | 4/267 [00:00<00:51,  5.12batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 53, 768])


train:   2%|▏         | 6/267 [00:01<00:51,  5.10batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 41, 768])


train:   3%|▎         | 8/267 [00:01<00:50,  5.11batch/s]

torch.Size([32, 57, 768])


train:   3%|▎         | 9/267 [00:01<00:50,  5.12batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:   4%|▍         | 11/267 [00:02<00:48,  5.26batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 49, 768])


train:   5%|▍         | 13/267 [00:02<00:47,  5.34batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:   6%|▌         | 15/267 [00:02<00:46,  5.44batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


train:   6%|▋         | 17/267 [00:03<00:47,  5.32batch/s]

torch.Size([32, 54, 768])


train:   7%|▋         | 18/267 [00:03<00:47,  5.27batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 58, 768])


train:   7%|▋         | 20/267 [00:03<00:46,  5.30batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 56, 768])


train:   8%|▊         | 22/267 [00:04<00:46,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:   9%|▉         | 24/267 [00:04<00:44,  5.46batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 51, 768])


train:  10%|▉         | 26/267 [00:04<00:44,  5.36batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  10%|█         | 28/267 [00:05<00:45,  5.24batch/s]

torch.Size([32, 53, 768])


train:  11%|█         | 29/267 [00:05<00:44,  5.32batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:  12%|█▏        | 31/267 [00:05<00:43,  5.44batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:  12%|█▏        | 33/267 [00:06<00:43,  5.40batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 38, 768])


train:  13%|█▎        | 35/267 [00:06<00:40,  5.68batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 40, 768])


train:  14%|█▍        | 37/267 [00:06<00:39,  5.80batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 42, 768])


train:  15%|█▍        | 39/267 [00:07<00:38,  5.90batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 52, 768])


train:  15%|█▌        | 41/267 [00:07<00:42,  5.35batch/s]

torch.Size([32, 55, 768])


train:  16%|█▌        | 42/267 [00:07<00:41,  5.37batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 58, 768])


train:  16%|█▋        | 44/267 [00:08<00:45,  4.93batch/s]

torch.Size([32, 59, 768])


train:  17%|█▋        | 45/267 [00:08<00:43,  5.10batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 52, 768])


train:  18%|█▊        | 47/267 [00:08<00:39,  5.62batch/s]

torch.Size([32, 29, 768])
torch.Size([32, 52, 768])


train:  18%|█▊        | 49/267 [00:09<00:41,  5.22batch/s]

torch.Size([32, 56, 768])


train:  19%|█▊        | 50/267 [00:09<00:42,  5.11batch/s]

torch.Size([32, 54, 768])


train:  19%|█▉        | 51/267 [00:09<00:40,  5.29batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])


train:  20%|█▉        | 53/267 [00:09<00:40,  5.29batch/s]

torch.Size([32, 53, 768])


train:  20%|██        | 54/267 [00:10<00:40,  5.25batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 37, 768])


train:  21%|██        | 56/267 [00:10<00:38,  5.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 40, 768])


train:  22%|██▏       | 58/267 [00:10<00:37,  5.50batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  22%|██▏       | 60/267 [00:11<00:37,  5.51batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


train:  23%|██▎       | 62/267 [00:11<00:39,  5.21batch/s]

torch.Size([32, 54, 768])


train:  24%|██▎       | 63/267 [00:11<00:39,  5.19batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 50, 768])


train:  24%|██▍       | 65/267 [00:12<00:40,  4.97batch/s]

torch.Size([32, 59, 768])


train:  25%|██▍       | 66/267 [00:12<00:38,  5.25batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 51, 768])


train:  25%|██▌       | 68/267 [00:12<00:37,  5.30batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  26%|██▌       | 70/267 [00:13<00:37,  5.32batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 54, 768])


train:  27%|██▋       | 72/267 [00:13<00:37,  5.19batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  28%|██▊       | 74/267 [00:13<00:34,  5.53batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  28%|██▊       | 76/267 [00:14<00:34,  5.51batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 52, 768])


train:  29%|██▉       | 78/267 [00:14<00:34,  5.50batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 56, 768])


train:  30%|██▉       | 80/267 [00:15<00:36,  5.13batch/s]

torch.Size([32, 56, 768])


train:  30%|███       | 81/267 [00:15<00:35,  5.22batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  31%|███       | 83/267 [00:15<00:35,  5.17batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 46, 768])


train:  32%|███▏      | 85/267 [00:15<00:34,  5.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


train:  33%|███▎      | 87/267 [00:16<00:32,  5.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  33%|███▎      | 89/267 [00:16<00:34,  5.22batch/s]

torch.Size([32, 59, 768])


train:  34%|███▎      | 90/267 [00:16<00:34,  5.17batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


train:  34%|███▍      | 92/267 [00:17<00:33,  5.20batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:  35%|███▌      | 94/267 [00:17<00:31,  5.41batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])


train:  36%|███▌      | 96/267 [00:18<00:33,  5.16batch/s]

torch.Size([32, 54, 768])


train:  36%|███▋      | 97/267 [00:18<00:32,  5.27batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train:  37%|███▋      | 99/267 [00:18<00:31,  5.29batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 40, 768])


train:  38%|███▊      | 101/267 [00:18<00:29,  5.54batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  39%|███▊      | 103/267 [00:19<00:29,  5.53batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 57, 768])


train:  39%|███▉      | 105/267 [00:19<00:30,  5.30batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 55, 768])


train:  40%|████      | 107/267 [00:20<00:30,  5.27batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 56, 768])


train:  41%|████      | 109/267 [00:20<00:29,  5.38batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  42%|████▏     | 111/267 [00:20<00:29,  5.26batch/s]

torch.Size([32, 53, 768])


train:  42%|████▏     | 113/267 [00:21<00:28,  5.39batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  43%|████▎     | 115/267 [00:21<00:30,  5.03batch/s]

torch.Size([32, 60, 768])


train:  43%|████▎     | 116/267 [00:21<00:30,  5.02batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  44%|████▍     | 118/267 [00:22<00:28,  5.15batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 51, 768])


train:  45%|████▍     | 120/267 [00:22<00:29,  5.06batch/s]

torch.Size([32, 54, 768])


train:  45%|████▌     | 121/267 [00:22<00:28,  5.20batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


train:  46%|████▌     | 122/267 [00:22<00:26,  5.40batch/s]

torch.Size([32, 64, 768])


train:  46%|████▋     | 124/267 [00:23<00:28,  5.04batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 58, 768])


train:  47%|████▋     | 126/267 [00:23<00:27,  5.17batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 50, 768])


train:  48%|████▊     | 128/267 [00:24<00:26,  5.18batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 41, 768])


train:  49%|████▊     | 130/267 [00:24<00:25,  5.42batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 49, 768])


train:  49%|████▉     | 132/267 [00:24<00:25,  5.30batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


train:  50%|█████     | 134/267 [00:25<00:23,  5.65batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 51, 768])


train:  51%|█████     | 136/267 [00:25<00:23,  5.59batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  52%|█████▏    | 138/267 [00:25<00:23,  5.42batch/s]

torch.Size([32, 54, 768])


train:  52%|█████▏    | 139/267 [00:26<00:23,  5.56batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 57, 768])


train:  53%|█████▎    | 141/267 [00:26<00:23,  5.32batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:  54%|█████▎    | 143/267 [00:26<00:23,  5.26batch/s]

torch.Size([32, 55, 768])


train:  54%|█████▍    | 144/267 [00:27<00:23,  5.16batch/s]

torch.Size([32, 53, 768])


train:  54%|█████▍    | 145/267 [00:27<00:22,  5.39batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 40, 768])


train:  55%|█████▌    | 147/267 [00:27<00:20,  5.73batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 52, 768])


train:  56%|█████▌    | 149/267 [00:27<00:21,  5.47batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 60, 768])


train:  57%|█████▋    | 151/267 [00:28<00:20,  5.57batch/s]

torch.Size([32, 34, 768])
torch.Size([32, 46, 768])


train:  57%|█████▋    | 153/267 [00:28<00:20,  5.46batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 40, 768])


train:  58%|█████▊    | 155/267 [00:29<00:20,  5.44batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  59%|█████▉    | 157/267 [00:29<00:21,  5.09batch/s]

torch.Size([32, 59, 768])


train:  59%|█████▉    | 158/267 [00:29<00:21,  5.19batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 44, 768])


train:  60%|█████▉    | 160/267 [00:30<00:18,  5.76batch/s]

torch.Size([32, 33, 768])
torch.Size([32, 39, 768])


train:  61%|██████    | 162/267 [00:30<00:18,  5.57batch/s]

torch.Size([32, 53, 768])


train:  61%|██████    | 163/267 [00:30<00:19,  5.41batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  62%|██████▏   | 165/267 [00:30<00:17,  5.68batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 53, 768])


train:  63%|██████▎   | 167/267 [00:31<00:18,  5.43batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 50, 768])


train:  63%|██████▎   | 169/267 [00:31<00:19,  5.11batch/s]

torch.Size([32, 58, 768])


train:  64%|██████▎   | 170/267 [00:31<00:19,  5.10batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  64%|██████▍   | 172/267 [00:32<00:17,  5.35batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 60, 768])


train:  65%|██████▌   | 174/267 [00:32<00:18,  5.10batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


train:  66%|██████▌   | 176/267 [00:33<00:17,  5.21batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  67%|██████▋   | 178/267 [00:33<00:17,  5.04batch/s]

torch.Size([32, 54, 768])


train:  67%|██████▋   | 179/267 [00:33<00:17,  5.16batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  68%|██████▊   | 181/267 [00:34<00:16,  5.36batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 38, 768])


train:  69%|██████▊   | 183/267 [00:34<00:16,  5.24batch/s]

torch.Size([32, 59, 768])


train:  69%|██████▉   | 184/267 [00:34<00:16,  5.18batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  70%|██████▉   | 186/267 [00:34<00:15,  5.20batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  70%|███████   | 188/267 [00:35<00:14,  5.44batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  71%|███████   | 190/267 [00:35<00:13,  5.56batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 46, 768])


train:  72%|███████▏  | 192/267 [00:36<00:13,  5.67batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 45, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.59batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 41, 768])


train:  73%|███████▎  | 196/267 [00:36<00:13,  5.35batch/s]

torch.Size([32, 57, 768])


train:  74%|███████▍  | 197/267 [00:36<00:13,  5.29batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:  75%|███████▍  | 199/267 [00:37<00:12,  5.46batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 55, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.32batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:  76%|███████▌  | 203/267 [00:38<00:11,  5.58batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 42, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.51batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 38, 768])


train:  78%|███████▊  | 207/267 [00:38<00:10,  5.68batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 43, 768])


train:  78%|███████▊  | 209/267 [00:39<00:10,  5.64batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.40batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  80%|███████▉  | 213/267 [00:39<00:09,  5.63batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 48, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.45batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 45, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.52batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 60, 768])


train:  82%|████████▏ | 219/267 [00:40<00:09,  5.14batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 53, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.41batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 44, 768])


train:  84%|████████▎ | 223/267 [00:41<00:08,  5.49batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  84%|████████▍ | 225/267 [00:42<00:07,  5.34batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


train:  85%|████████▌ | 227/267 [00:42<00:07,  5.28batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  86%|████████▌ | 229/267 [00:42<00:07,  5.16batch/s]

torch.Size([32, 58, 768])


train:  86%|████████▌ | 230/267 [00:43<00:06,  5.33batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.63batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 52, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.14batch/s]

torch.Size([32, 59, 768])


train:  88%|████████▊ | 235/267 [00:43<00:06,  5.24batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 54, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.47batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 64, 768])


train:  90%|█████████ | 241/267 [00:45<00:04,  5.30batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 58, 768])


train:  91%|█████████ | 243/267 [00:45<00:04,  5.29batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 46, 768])


train:  92%|█████████▏| 245/267 [00:45<00:04,  5.39batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  93%|█████████▎| 247/267 [00:46<00:03,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 48, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.43batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  94%|█████████▍| 251/267 [00:46<00:03,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  95%|█████████▍| 253/267 [00:47<00:02,  5.32batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 50, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.53batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 54, 768])


train:  96%|█████████▋| 257/267 [00:48<00:01,  5.18batch/s]

torch.Size([32, 56, 768])


train:  97%|█████████▋| 258/267 [00:48<00:01,  5.25batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  97%|█████████▋| 260/267 [00:48<00:01,  5.27batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 54, 768])


train:  98%|█████████▊| 262/267 [00:49<00:00,  5.35batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 48, 768])


train:  99%|█████████▉| 264/267 [00:49<00:00,  5.52batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train: 100%|█████████▉| 266/267 [00:49<00:00,  5.54batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.34batch/s]
2020-09-01 06:45:52.968 | INFO     | __main__:train:39 - epoch: 10, transformer: gpt2, train_loss: 0.0262, train_acc: 64.41
dev:   6%|▌         | 2/35 [00:00<00:02, 15.94batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


dev:  17%|█▋        | 6/35 [00:00<00:01, 16.66batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 36, 768])
torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


dev:  29%|██▊       | 10/35 [00:00<00:01, 17.09batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])
torch.Size([32, 43, 768])
torch.Size([32, 45, 768])


dev:  40%|████      | 14/35 [00:00<00:01, 16.85batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 52, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


dev:  51%|█████▏    | 18/35 [00:01<00:00, 17.27batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 41, 768])
torch.Size([32, 54, 768])
torch.Size([32, 45, 768])


dev:  63%|██████▎   | 22/35 [00:01<00:00, 16.48batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 53, 768])
torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


dev:  74%|███████▍  | 26/35 [00:01<00:00, 16.36batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])
torch.Size([32, 46, 768])
torch.Size([32, 43, 768])


dev:  86%|████████▌ | 30/35 [00:01<00:00, 16.44batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 42, 768])
torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 17.03batch/s]
2020-09-01 06:45:55.037 | INFO     | __main__:train:42 - epoch: 10, transformer: gpt2, dev_loss: 0.0379, dev_acc: 48.14
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])
torch.Size([13, 46, 768])
torch.Size([32, 39, 768])


test:   6%|▌         | 4/70 [00:00<00:03, 16.83batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 45, 768])
torch.Size([32, 41, 768])
torch.Size([32, 53, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 15.93batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.51batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])
torch.Size([32, 38, 768])
torch.Size([32, 39, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 17.08batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 40, 768])
torch.Size([32, 40, 768])
torch.Size([32, 46, 768])


test:  29%|██▊       | 20/70 [00:01<00:02, 16.92batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 42, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.33batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 46, 768])
torch.Size([32, 54, 768])
torch.Size([32, 44, 768])


test:  40%|████      | 28/70 [00:01<00:02, 17.07batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])
torch.Size([32, 39, 768])
torch.Size([32, 42, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.79batch/s]

torch.Size([32, 58, 768])
torch.Size([32, 45, 768])
torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.33batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])
torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.35batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 41, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.84batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])
torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.92batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])
torch.Size([32, 41, 768])
torch.Size([32, 36, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 17.16batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])
torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 17.15batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 52, 768])
torch.Size([32, 43, 768])
torch.Size([32, 51, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 16.29batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 62, 768])
torch.Size([32, 39, 768])
torch.Size([32, 49, 768])


test:  91%|█████████▏| 64/70 [00:03<00:00, 15.98batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 52, 768])
torch.Size([32, 56, 768])
torch.Size([32, 55, 768])


test:  97%|█████████▋| 68/70 [00:04<00:00, 15.41batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 51, 768])
torch.Size([32, 53, 768])
torch.Size([32, 52, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.67batch/s]
2020-09-01 06:45:59.254 | INFO     | __main__:train:46 - epoch: 10, transformer: gpt2, test_loss: 0.0359, test_acc: 51.67
2020-09-01 06:45:59.254 | INFO     | __main__:train:47 - epoch: 10, transformer: gpt2, test_precision: 52.14, test_recall: 50.34, test_f1_score: 50.88, test_accuracy_score: 51.67
2020-09-01 06:45:59.255 | INFO     | __main__:train:52 - epoch: 10, transformer: gpt2, test_confusion_matrix: 
[[110 127  30  11   1]
 [ 88 334 160  45   6]
 [ 14 104 166  96   9]
 [  2  13  91 302 102]
 [  1   4  19 145 230]]
2020-09-01 06:45:59.256 | INFO     | __main__:train:55 - Total training time elapsed: 0:08:19.618526
2020-09-01 06:45:59.257 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.961853
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([2, 25, 768])
torch.Size([32, 47, 768])


train:   1%|          | 2/267 [00:00<00:47,  5.56batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:   1%|▏         | 4/267 [00:00<00:46,  5.63batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 49, 768])


train:   2%|▏         | 6/267 [00:01<00:48,  5.36batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 38, 768])


train:   3%|▎         | 8/267 [00:01<00:46,  5.53batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 48, 768])


train:   4%|▎         | 10/267 [00:01<00:49,  5.17batch/s]

torch.Size([32, 59, 768])


train:   4%|▍         | 11/267 [00:02<00:51,  4.95batch/s]

torch.Size([32, 60, 768])


train:   4%|▍         | 12/267 [00:02<00:51,  4.97batch/s]

torch.Size([32, 52, 768])


train:   5%|▍         | 13/267 [00:02<00:52,  4.85batch/s]

torch.Size([32, 58, 768])


train:   5%|▌         | 14/267 [00:02<00:49,  5.15batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 52, 768])


train:   6%|▌         | 16/267 [00:03<00:48,  5.13batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])


train:   7%|▋         | 18/267 [00:03<00:46,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 47, 768])


train:   7%|▋         | 20/267 [00:03<00:44,  5.59batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 37, 768])


train:   8%|▊         | 22/267 [00:04<00:42,  5.79batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:   9%|▉         | 24/267 [00:04<00:40,  5.94batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 45, 768])


train:  10%|▉         | 26/267 [00:04<00:43,  5.59batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 46, 768])


train:  10%|█         | 28/267 [00:05<00:42,  5.56batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


train:  11%|█         | 30/267 [00:05<00:42,  5.54batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 50, 768])


train:  12%|█▏        | 32/267 [00:05<00:44,  5.32batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 43, 768])


train:  13%|█▎        | 34/267 [00:06<00:41,  5.61batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 50, 768])


train:  13%|█▎        | 36/267 [00:06<00:41,  5.50batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 42, 768])


train:  14%|█▍        | 38/267 [00:06<00:42,  5.34batch/s]

torch.Size([32, 55, 768])


train:  15%|█▍        | 39/267 [00:07<00:43,  5.21batch/s]

torch.Size([32, 54, 768])


train:  15%|█▍        | 40/267 [00:07<00:42,  5.31batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 44, 768])


train:  16%|█▌        | 42/267 [00:07<00:42,  5.28batch/s]

torch.Size([32, 53, 768])


train:  16%|█▌        | 43/267 [00:07<00:43,  5.17batch/s]

torch.Size([32, 54, 768])


train:  16%|█▋        | 44/267 [00:08<00:41,  5.41batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 39, 768])


train:  17%|█▋        | 46/267 [00:08<00:38,  5.68batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])


train:  18%|█▊        | 48/267 [00:08<00:41,  5.27batch/s]

torch.Size([32, 56, 768])


train:  18%|█▊        | 49/267 [00:09<00:41,  5.20batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  19%|█▉        | 51/267 [00:09<00:41,  5.15batch/s]

torch.Size([32, 54, 768])


train:  19%|█▉        | 52/267 [00:09<00:41,  5.23batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  20%|██        | 54/267 [00:10<00:40,  5.26batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:  21%|██        | 56/267 [00:10<00:38,  5.48batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  22%|██▏       | 58/267 [00:10<00:38,  5.48batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  22%|██▏       | 60/267 [00:11<00:37,  5.48batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  23%|██▎       | 62/267 [00:11<00:38,  5.30batch/s]

torch.Size([32, 58, 768])


train:  24%|██▎       | 63/267 [00:11<00:39,  5.22batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 56, 768])


train:  24%|██▍       | 65/267 [00:12<00:37,  5.33batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  25%|██▌       | 67/267 [00:12<00:37,  5.26batch/s]

torch.Size([32, 56, 768])


train:  25%|██▌       | 68/267 [00:12<00:37,  5.34batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


train:  26%|██▌       | 70/267 [00:12<00:36,  5.46batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 47, 768])


train:  27%|██▋       | 72/267 [00:13<00:33,  5.81batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 53, 768])


train:  28%|██▊       | 74/267 [00:13<00:34,  5.61batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 51, 768])


train:  28%|██▊       | 76/267 [00:14<00:34,  5.47batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 59, 768])


train:  29%|██▉       | 78/267 [00:14<00:36,  5.25batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


train:  30%|██▉       | 80/267 [00:14<00:34,  5.40batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


train:  31%|███       | 82/267 [00:15<00:35,  5.15batch/s]

torch.Size([32, 57, 768])


train:  31%|███       | 83/267 [00:15<00:34,  5.36batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 58, 768])


train:  32%|███▏      | 85/267 [00:15<00:35,  5.14batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  33%|███▎      | 87/267 [00:16<00:33,  5.37batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  33%|███▎      | 89/267 [00:16<00:32,  5.45batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 39, 768])


train:  34%|███▍      | 91/267 [00:16<00:33,  5.24batch/s]

torch.Size([32, 60, 768])


train:  34%|███▍      | 92/267 [00:17<00:32,  5.40batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 54, 768])


train:  35%|███▌      | 94/267 [00:17<00:33,  5.15batch/s]

torch.Size([32, 54, 768])


train:  36%|███▌      | 95/267 [00:17<00:34,  4.95batch/s]

torch.Size([32, 60, 768])


train:  36%|███▌      | 96/267 [00:17<00:32,  5.23batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 48, 768])


train:  37%|███▋      | 98/267 [00:18<00:31,  5.34batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  37%|███▋      | 100/267 [00:18<00:30,  5.51batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 39, 768])


train:  38%|███▊      | 102/267 [00:18<00:29,  5.61batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 54, 768])


train:  39%|███▉      | 104/267 [00:19<00:30,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  40%|███▉      | 106/267 [00:19<00:28,  5.62batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  40%|████      | 108/267 [00:20<00:29,  5.32batch/s]

torch.Size([32, 56, 768])


train:  41%|████      | 109/267 [00:20<00:29,  5.35batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])


train:  42%|████▏     | 111/267 [00:20<00:28,  5.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


train:  42%|████▏     | 113/267 [00:20<00:28,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 47, 768])


train:  43%|████▎     | 115/267 [00:21<00:27,  5.59batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 56, 768])


train:  44%|████▍     | 117/267 [00:21<00:27,  5.47batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 64, 768])


train:  45%|████▍     | 119/267 [00:22<00:28,  5.28batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 58, 768])


train:  45%|████▌     | 121/267 [00:22<00:28,  5.06batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  46%|████▌     | 123/267 [00:22<00:26,  5.40batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 48, 768])


train:  47%|████▋     | 125/267 [00:23<00:25,  5.59batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 48, 768])


train:  48%|████▊     | 127/267 [00:23<00:25,  5.43batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  48%|████▊     | 129/267 [00:23<00:24,  5.55batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 47, 768])


train:  49%|████▉     | 131/267 [00:24<00:24,  5.49batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 51, 768])


train:  50%|████▉     | 133/267 [00:24<00:25,  5.27batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  51%|█████     | 135/267 [00:25<00:25,  5.15batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 49, 768])


train:  51%|█████▏    | 137/267 [00:25<00:25,  5.15batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


train:  52%|█████▏    | 139/267 [00:25<00:24,  5.24batch/s]

torch.Size([32, 53, 768])


train:  52%|█████▏    | 140/267 [00:26<00:23,  5.33batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 56, 768])


train:  53%|█████▎    | 142/267 [00:26<00:23,  5.35batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])


train:  54%|█████▍    | 144/267 [00:26<00:23,  5.29batch/s]

torch.Size([32, 55, 768])


train:  54%|█████▍    | 145/267 [00:26<00:21,  5.68batch/s]

torch.Size([32, 35, 768])
torch.Size([32, 41, 768])


train:  55%|█████▌    | 147/267 [00:27<00:20,  5.86batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 47, 768])


train:  56%|█████▌    | 149/267 [00:27<00:21,  5.52batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


train:  57%|█████▋    | 151/267 [00:27<00:19,  5.84batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 53, 768])


train:  57%|█████▋    | 153/267 [00:28<00:21,  5.40batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 47, 768])


train:  58%|█████▊    | 155/267 [00:28<00:20,  5.46batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 49, 768])


train:  59%|█████▉    | 157/267 [00:29<00:20,  5.39batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 56, 768])


train:  60%|█████▉    | 159/267 [00:29<00:20,  5.16batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  60%|██████    | 161/267 [00:29<00:19,  5.34batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 44, 768])


train:  61%|██████    | 163/267 [00:30<00:20,  5.15batch/s]

torch.Size([32, 60, 768])


train:  61%|██████▏   | 164/267 [00:30<00:19,  5.27batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 45, 768])


train:  62%|██████▏   | 166/267 [00:30<00:18,  5.55batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 46, 768])


train:  63%|██████▎   | 168/267 [00:31<00:18,  5.42batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 37, 768])


train:  64%|██████▎   | 170/267 [00:31<00:17,  5.41batch/s]

torch.Size([32, 54, 768])


train:  64%|██████▍   | 171/267 [00:31<00:16,  5.65batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 48, 768])


train:  65%|██████▍   | 173/267 [00:32<00:16,  5.71batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 51, 768])


train:  66%|██████▌   | 175/267 [00:32<00:17,  5.34batch/s]

torch.Size([32, 53, 768])


train:  66%|██████▌   | 176/267 [00:32<00:16,  5.39batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:  67%|██████▋   | 178/267 [00:33<00:17,  5.14batch/s]

torch.Size([32, 59, 768])


train:  67%|██████▋   | 179/267 [00:33<00:16,  5.21batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  68%|██████▊   | 181/267 [00:33<00:15,  5.56batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])


train:  69%|██████▊   | 183/267 [00:33<00:15,  5.59batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 57, 768])


train:  69%|██████▉   | 185/267 [00:34<00:15,  5.33batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  70%|███████   | 187/267 [00:34<00:14,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 53, 768])


train:  71%|███████   | 189/267 [00:35<00:15,  5.19batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 43, 768])


train:  72%|███████▏  | 191/267 [00:35<00:14,  5.29batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 52, 768])


train:  72%|███████▏  | 193/267 [00:35<00:13,  5.31batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 44, 768])


train:  73%|███████▎  | 195/267 [00:36<00:12,  5.60batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 42, 768])


train:  74%|███████▍  | 197/267 [00:36<00:11,  5.84batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 59, 768])


train:  75%|███████▍  | 199/267 [00:36<00:13,  5.23batch/s]

torch.Size([32, 54, 768])


train:  75%|███████▍  | 200/267 [00:37<00:12,  5.20batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 47, 768])


train:  76%|███████▌  | 202/267 [00:37<00:12,  5.23batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])


train:  76%|███████▋  | 204/267 [00:37<00:11,  5.27batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


train:  77%|███████▋  | 206/267 [00:38<00:11,  5.19batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 43, 768])


train:  78%|███████▊  | 208/267 [00:38<00:10,  5.37batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  79%|███████▊  | 210/267 [00:38<00:10,  5.42batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 46, 768])


train:  79%|███████▉  | 212/267 [00:39<00:10,  5.33batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 51, 768])


train:  80%|████████  | 214/267 [00:39<00:10,  5.13batch/s]

torch.Size([32, 54, 768])


train:  81%|████████  | 215/267 [00:39<00:10,  5.10batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.02batch/s]

torch.Size([32, 53, 768])


train:  82%|████████▏ | 218/267 [00:40<00:09,  4.97batch/s]

torch.Size([32, 55, 768])


train:  82%|████████▏ | 219/267 [00:40<00:09,  5.22batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 56, 768])


train:  83%|████████▎ | 221/267 [00:41<00:09,  5.07batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 64, 768])


train:  84%|████████▎ | 223/267 [00:41<00:09,  4.71batch/s]

torch.Size([32, 59, 768])


train:  84%|████████▍ | 224/267 [00:41<00:08,  4.81batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


train:  85%|████████▍ | 226/267 [00:42<00:08,  5.08batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 45, 768])


train:  85%|████████▌ | 228/267 [00:42<00:07,  5.14batch/s]

torch.Size([32, 53, 768])


train:  86%|████████▌ | 229/267 [00:42<00:07,  5.07batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


train:  87%|████████▋ | 231/267 [00:43<00:06,  5.16batch/s]

torch.Size([32, 53, 768])


train:  87%|████████▋ | 232/267 [00:43<00:06,  5.10batch/s]

torch.Size([32, 53, 768])


train:  87%|████████▋ | 233/267 [00:43<00:06,  4.95batch/s]

torch.Size([32, 57, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  4.92batch/s]

torch.Size([32, 54, 768])


train:  88%|████████▊ | 235/267 [00:43<00:06,  5.04batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 59, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.11batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 49, 768])


train:  90%|████████▉ | 239/267 [00:44<00:05,  5.22batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 48, 768])


train:  90%|█████████ | 241/267 [00:45<00:05,  5.05batch/s]

torch.Size([32, 58, 768])


train:  91%|█████████ | 242/267 [00:45<00:04,  5.05batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  91%|█████████▏| 244/267 [00:45<00:04,  4.86batch/s]

torch.Size([32, 59, 768])


train:  92%|█████████▏| 245/267 [00:45<00:04,  4.99batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 42, 768])


train:  93%|█████████▎| 247/267 [00:46<00:03,  5.17batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 44, 768])


train:  93%|█████████▎| 249/267 [00:46<00:03,  5.17batch/s]

torch.Size([32, 56, 768])


train:  94%|█████████▎| 250/267 [00:46<00:03,  5.38batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


train:  94%|█████████▍| 252/267 [00:47<00:02,  5.31batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  95%|█████████▌| 254/267 [00:47<00:02,  5.17batch/s]

torch.Size([32, 58, 768])


train:  96%|█████████▌| 255/267 [00:47<00:02,  5.16batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 40, 768])


train:  96%|█████████▋| 257/267 [00:48<00:01,  5.30batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 41, 768])


train:  97%|█████████▋| 259/267 [00:48<00:01,  5.40batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


train:  98%|█████████▊| 261/267 [00:48<00:01,  5.71batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 44, 768])


train:  99%|█████████▊| 263/267 [00:49<00:00,  5.80batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  99%|█████████▉| 265/267 [00:49<00:00,  5.72batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 53, 768])


train: 100%|██████████| 267/267 [00:49<00:00,  5.35batch/s]
2020-09-01 06:46:49.206 | INFO     | __main__:train:39 - epoch: 11, transformer: gpt2, train_loss: 0.0249, train_acc: 66.62
dev:   0%|          | 0/35 [00:00<?, ?batch/s]

torch.Size([32, 57, 768])


dev:   6%|▌         | 2/35 [00:00<00:02, 16.49batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 51, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])


dev:  17%|█▋        | 6/35 [00:00<00:01, 16.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 46, 768])
torch.Size([32, 47, 768])
torch.Size([32, 47, 768])


dev:  29%|██▊       | 10/35 [00:00<00:01, 17.18batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 35, 768])
torch.Size([32, 46, 768])
torch.Size([32, 51, 768])


dev:  40%|████      | 14/35 [00:00<00:01, 16.62batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 51, 768])
torch.Size([32, 46, 768])
torch.Size([32, 48, 768])


dev:  51%|█████▏    | 18/35 [00:01<00:01, 16.77batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 43, 768])
torch.Size([32, 45, 768])
torch.Size([32, 46, 768])


dev:  63%|██████▎   | 22/35 [00:01<00:00, 16.37batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 58, 768])
torch.Size([32, 45, 768])
torch.Size([32, 41, 768])


dev:  74%|███████▍  | 26/35 [00:01<00:00, 16.48batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 47, 768])
torch.Size([32, 46, 768])
torch.Size([32, 54, 768])


dev:  86%|████████▌ | 30/35 [00:01<00:00, 16.27batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 52, 768])
torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


dev: 100%|██████████| 35/35 [00:02<00:00, 16.72batch/s]
2020-09-01 06:46:51.314 | INFO     | __main__:train:42 - epoch: 11, transformer: gpt2, dev_loss: 0.0377, dev_acc: 48.23
test:   0%|          | 0/70 [00:00<?, ?batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 45, 768])
torch.Size([13, 45, 768])
torch.Size([32, 46, 768])


test:   6%|▌         | 4/70 [00:00<00:03, 17.26batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 41, 768])
torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


test:  11%|█▏        | 8/70 [00:00<00:03, 16.33batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 53, 768])
torch.Size([32, 44, 768])


test:  17%|█▋        | 12/70 [00:00<00:03, 16.34batch/s]

torch.Size([32, 55, 768])
torch.Size([32, 53, 768])
torch.Size([32, 37, 768])
torch.Size([32, 56, 768])


test:  23%|██▎       | 16/70 [00:00<00:03, 16.19batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 56, 768])
torch.Size([32, 42, 768])
torch.Size([32, 53, 768])


test:  29%|██▊       | 20/70 [00:01<00:03, 15.78batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 52, 768])
torch.Size([32, 48, 768])
torch.Size([32, 47, 768])


test:  34%|███▍      | 24/70 [00:01<00:02, 16.90batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 39, 768])
torch.Size([32, 38, 768])
torch.Size([32, 50, 768])


test:  40%|████      | 28/70 [00:01<00:02, 16.51batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 52, 768])
torch.Size([32, 43, 768])
torch.Size([32, 41, 768])


test:  46%|████▌     | 32/70 [00:01<00:02, 16.52batch/s]

torch.Size([32, 56, 768])
torch.Size([32, 50, 768])
torch.Size([32, 40, 768])
torch.Size([32, 58, 768])


test:  51%|█████▏    | 36/70 [00:02<00:02, 16.12batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 48, 768])
torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


test:  57%|█████▋    | 40/70 [00:02<00:01, 16.09batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])
torch.Size([32, 45, 768])
torch.Size([32, 48, 768])


test:  63%|██████▎   | 44/70 [00:02<00:01, 16.51batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 47, 768])
torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


test:  69%|██████▊   | 48/70 [00:02<00:01, 16.44batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])
torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


test:  74%|███████▍  | 52/70 [00:03<00:01, 16.88batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 39, 768])
torch.Size([32, 44, 768])
torch.Size([32, 42, 768])


test:  80%|████████  | 56/70 [00:03<00:00, 16.77batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 53, 768])
torch.Size([32, 45, 768])
torch.Size([32, 55, 768])


test:  86%|████████▌ | 60/70 [00:03<00:00, 15.47batch/s]

torch.Size([32, 62, 768])
torch.Size([32, 51, 768])
torch.Size([32, 52, 768])


test:  89%|████████▊ | 62/70 [00:03<00:00, 15.68batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 53, 768])
torch.Size([32, 49, 768])
torch.Size([32, 43, 768])


test:  94%|█████████▍| 66/70 [00:04<00:00, 16.47batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 42, 768])
torch.Size([32, 47, 768])
torch.Size([32, 38, 768])


test: 100%|██████████| 70/70 [00:04<00:00, 16.55batch/s]
2020-09-01 06:46:55.562 | INFO     | __main__:train:46 - epoch: 11, transformer: gpt2, test_loss: 0.0361, test_acc: 50.63
2020-09-01 06:46:55.563 | INFO     | __main__:train:47 - epoch: 11, transformer: gpt2, test_precision: 52.21, test_recall: 48.59, test_f1_score: 49.57, test_accuracy_score: 50.63
2020-09-01 06:46:55.564 | INFO     | __main__:train:52 - epoch: 11, transformer: gpt2, test_confusion_matrix: 
[[102 136  31  10   0]
 [ 74 349 160  47   3]
 [ 12 117 152 100   8]
 [  1  21  98 314  76]
 [  1   3  21 172 202]]
2020-09-01 06:46:55.564 | INFO     | __main__:train:55 - Total training time elapsed: 0:09:09.567379
2020-09-01 06:46:55.565 | INFO     | __main__:train:56 - Mean time per train epoch: 0:00:49.960671
train:   0%|          | 0/267 [00:00<?, ?batch/s]

torch.Size([32, 41, 768])
torch.Size([2, 26, 768])
torch.Size([32, 48, 768])


train:   1%|          | 2/267 [00:00<00:49,  5.40batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 43, 768])


train:   1%|▏         | 4/267 [00:00<00:46,  5.72batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 59, 768])


train:   2%|▏         | 6/267 [00:01<00:47,  5.49batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 51, 768])


train:   3%|▎         | 8/267 [00:01<00:47,  5.40batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 45, 768])


train:   3%|▎         | 9/267 [00:01<00:47,  5.46batch/s]

torch.Size([32, 64, 768])


train:   4%|▍         | 11/267 [00:02<00:50,  5.03batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:   5%|▍         | 13/267 [00:02<00:51,  4.94batch/s]

torch.Size([32, 60, 768])


train:   5%|▌         | 14/267 [00:02<00:48,  5.19batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


train:   6%|▌         | 16/267 [00:03<00:46,  5.41batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 49, 768])


train:   7%|▋         | 18/267 [00:03<00:47,  5.26batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 39, 768])


train:   7%|▋         | 20/267 [00:03<00:45,  5.38batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 40, 768])


train:   8%|▊         | 22/267 [00:04<00:41,  5.88batch/s]

torch.Size([32, 36, 768])
torch.Size([32, 44, 768])


train:   9%|▉         | 24/267 [00:04<00:42,  5.78batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 39, 768])


train:  10%|▉         | 26/267 [00:04<00:40,  5.99batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 38, 768])


train:  10%|█         | 28/267 [00:05<00:43,  5.54batch/s]

torch.Size([32, 58, 768])


train:  11%|█         | 29/267 [00:05<00:41,  5.69batch/s]

torch.Size([32, 39, 768])
torch.Size([32, 59, 768])


train:  12%|█▏        | 31/267 [00:05<00:45,  5.14batch/s]

torch.Size([32, 56, 768])


train:  12%|█▏        | 32/267 [00:05<00:44,  5.33batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


train:  13%|█▎        | 34/267 [00:06<00:44,  5.28batch/s]

torch.Size([32, 54, 768])


train:  13%|█▎        | 35/267 [00:06<00:44,  5.20batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 45, 768])


train:  14%|█▍        | 37/267 [00:06<00:42,  5.39batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 42, 768])


train:  15%|█▍        | 39/267 [00:07<00:43,  5.23batch/s]

torch.Size([32, 57, 768])


train:  15%|█▍        | 40/267 [00:07<00:42,  5.29batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 50, 768])


train:  15%|█▌        | 41/267 [00:07<00:43,  5.25batch/s]

torch.Size([32, 64, 768])


train:  16%|█▌        | 43/267 [00:08<00:45,  4.89batch/s]

torch.Size([32, 56, 768])


train:  16%|█▋        | 44/267 [00:08<00:44,  5.04batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 37, 768])


train:  17%|█▋        | 46/267 [00:08<00:41,  5.32batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 50, 768])


train:  18%|█▊        | 48/267 [00:08<00:40,  5.42batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 44, 768])


train:  19%|█▊        | 50/267 [00:09<00:38,  5.60batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 52, 768])


train:  19%|█▉        | 52/267 [00:09<00:40,  5.31batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 50, 768])


train:  20%|██        | 54/267 [00:10<00:39,  5.45batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 45, 768])


train:  21%|██        | 56/267 [00:10<00:37,  5.65batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  22%|██▏       | 58/267 [00:10<00:39,  5.23batch/s]

torch.Size([32, 59, 768])


train:  22%|██▏       | 59/267 [00:10<00:41,  4.99batch/s]

torch.Size([32, 60, 768])


train:  22%|██▏       | 60/267 [00:11<00:41,  4.95batch/s]

torch.Size([32, 54, 768])


train:  23%|██▎       | 61/267 [00:11<00:41,  5.00batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 44, 768])


train:  24%|██▎       | 63/267 [00:11<00:40,  5.09batch/s]

torch.Size([32, 56, 768])


train:  24%|██▍       | 64/267 [00:11<00:40,  5.05batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 40, 768])


train:  25%|██▍       | 66/267 [00:12<00:36,  5.51batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 53, 768])


train:  25%|██▌       | 68/267 [00:12<00:38,  5.21batch/s]

torch.Size([32, 53, 768])


train:  26%|██▌       | 69/267 [00:12<00:38,  5.16batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 58, 768])


train:  27%|██▋       | 71/267 [00:13<00:39,  5.00batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 53, 768])


train:  27%|██▋       | 73/267 [00:13<00:38,  5.04batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 49, 768])


train:  28%|██▊       | 75/267 [00:14<00:36,  5.23batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 50, 768])


train:  29%|██▉       | 77/267 [00:14<00:36,  5.17batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 42, 768])


train:  30%|██▉       | 79/267 [00:14<00:36,  5.15batch/s]

torch.Size([32, 57, 768])


train:  30%|██▉       | 80/267 [00:15<00:35,  5.22batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 54, 768])


train:  31%|███       | 82/267 [00:15<00:37,  4.96batch/s]

torch.Size([32, 58, 768])


train:  31%|███       | 83/267 [00:15<00:35,  5.18batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  32%|███▏      | 85/267 [00:16<00:34,  5.34batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  33%|███▎      | 87/267 [00:16<00:33,  5.42batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 53, 768])


train:  33%|███▎      | 89/267 [00:16<00:34,  5.15batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 52, 768])


train:  34%|███▍      | 91/267 [00:17<00:33,  5.21batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  35%|███▍      | 93/267 [00:17<00:33,  5.24batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 56, 768])


train:  36%|███▌      | 95/267 [00:17<00:33,  5.14batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 53, 768])


train:  36%|███▋      | 97/267 [00:18<00:32,  5.29batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  37%|███▋      | 99/267 [00:18<00:31,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 45, 768])


train:  38%|███▊      | 101/267 [00:19<00:31,  5.34batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 44, 768])


train:  39%|███▊      | 103/267 [00:19<00:30,  5.45batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 46, 768])


train:  39%|███▉      | 105/267 [00:19<00:28,  5.59batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 48, 768])


train:  40%|████      | 107/267 [00:20<00:29,  5.42batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 41, 768])


train:  41%|████      | 109/267 [00:20<00:27,  5.66batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 48, 768])


train:  42%|████▏     | 111/267 [00:20<00:27,  5.65batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  42%|████▏     | 113/267 [00:21<00:26,  5.73batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 48, 768])


train:  43%|████▎     | 115/267 [00:21<00:28,  5.37batch/s]

torch.Size([32, 56, 768])


train:  43%|████▎     | 116/267 [00:21<00:27,  5.53batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 44, 768])


train:  44%|████▍     | 118/267 [00:22<00:26,  5.67batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 56, 768])


train:  45%|████▍     | 120/267 [00:22<00:27,  5.40batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 57, 768])


train:  46%|████▌     | 122/267 [00:22<00:28,  5.12batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 58, 768])


train:  46%|████▋     | 124/267 [00:23<00:27,  5.19batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 38, 768])


train:  47%|████▋     | 126/267 [00:23<00:25,  5.57batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 39, 768])


train:  48%|████▊     | 128/267 [00:23<00:26,  5.32batch/s]

torch.Size([32, 59, 768])


train:  48%|████▊     | 129/267 [00:24<00:24,  5.56batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 50, 768])


train:  49%|████▉     | 131/267 [00:24<00:25,  5.27batch/s]

torch.Size([32, 54, 768])


train:  49%|████▉     | 132/267 [00:24<00:24,  5.48batch/s]

torch.Size([32, 40, 768])
torch.Size([32, 47, 768])


train:  50%|█████     | 134/267 [00:25<00:23,  5.60batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 50, 768])


train:  51%|█████     | 136/267 [00:25<00:23,  5.57batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 40, 768])


train:  52%|█████▏    | 138/267 [00:25<00:22,  5.63batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 42, 768])


train:  52%|█████▏    | 140/267 [00:26<00:22,  5.67batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 47, 768])


train:  53%|█████▎    | 142/267 [00:26<00:23,  5.40batch/s]

torch.Size([32, 53, 768])


train:  54%|█████▎    | 143/267 [00:26<00:23,  5.24batch/s]

torch.Size([32, 54, 768])


train:  54%|█████▍    | 144/267 [00:26<00:22,  5.43batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 43, 768])


train:  55%|█████▍    | 146/267 [00:27<00:22,  5.39batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 51, 768])


train:  55%|█████▌    | 148/267 [00:27<00:21,  5.47batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 47, 768])


train:  56%|█████▌    | 150/267 [00:27<00:20,  5.61batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 54, 768])


train:  57%|█████▋    | 152/267 [00:28<00:22,  5.21batch/s]

torch.Size([32, 56, 768])


train:  57%|█████▋    | 153/267 [00:28<00:21,  5.28batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 40, 768])


train:  58%|█████▊    | 155/267 [00:28<00:20,  5.41batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 55, 768])


train:  59%|█████▉    | 157/267 [00:29<00:20,  5.30batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 52, 768])


train:  60%|█████▉    | 159/267 [00:29<00:20,  5.39batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 49, 768])


train:  60%|██████    | 161/267 [00:30<00:19,  5.36batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  61%|██████    | 163/267 [00:30<00:19,  5.31batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 48, 768])


train:  62%|██████▏   | 165/267 [00:30<00:19,  5.22batch/s]

torch.Size([32, 53, 768])


train:  63%|██████▎   | 167/267 [00:31<00:19,  5.07batch/s]

torch.Size([32, 53, 768])
torch.Size([32, 54, 768])


train:  63%|██████▎   | 168/267 [00:31<00:18,  5.31batch/s]

torch.Size([32, 41, 768])
torch.Size([32, 44, 768])


train:  64%|██████▎   | 170/267 [00:31<00:17,  5.55batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 50, 768])


train:  64%|██████▍   | 172/267 [00:32<00:17,  5.42batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 44, 768])


train:  65%|██████▌   | 174/267 [00:32<00:16,  5.48batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 37, 768])


train:  66%|██████▌   | 176/267 [00:32<00:16,  5.60batch/s]

torch.Size([32, 48, 768])
torch.Size([32, 39, 768])


train:  67%|██████▋   | 178/267 [00:33<00:15,  5.57batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 42, 768])


train:  67%|██████▋   | 180/267 [00:33<00:14,  5.84batch/s]

torch.Size([32, 38, 768])
torch.Size([32, 46, 768])


train:  68%|██████▊   | 182/267 [00:33<00:14,  5.68batch/s]

torch.Size([32, 46, 768])
torch.Size([32, 46, 768])


train:  69%|██████▉   | 184/267 [00:34<00:15,  5.28batch/s]

torch.Size([32, 57, 768])


train:  69%|██████▉   | 185/267 [00:34<00:16,  5.00batch/s]

torch.Size([32, 59, 768])


train:  70%|██████▉   | 186/267 [00:34<00:15,  5.21batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 54, 768])


train:  70%|███████   | 188/267 [00:35<00:14,  5.43batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 47, 768])


train:  71%|███████   | 190/267 [00:35<00:13,  5.67batch/s]

torch.Size([32, 37, 768])
torch.Size([32, 53, 768])


train:  72%|███████▏  | 192/267 [00:35<00:13,  5.48batch/s]

torch.Size([32, 45, 768])
torch.Size([32, 46, 768])


train:  73%|███████▎  | 194/267 [00:36<00:13,  5.26batch/s]

torch.Size([32, 56, 768])


train:  73%|███████▎  | 195/267 [00:36<00:13,  5.18batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 47, 768])


train:  74%|███████▍  | 197/267 [00:36<00:13,  5.31batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 45, 768])


train:  75%|███████▍  | 199/267 [00:37<00:12,  5.39batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  75%|███████▌  | 201/267 [00:37<00:12,  5.25batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 54, 768])


train:  76%|███████▌  | 203/267 [00:37<00:12,  5.12batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 54, 768])


train:  77%|███████▋  | 205/267 [00:38<00:11,  5.24batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 58, 768])


train:  78%|███████▊  | 207/267 [00:38<00:11,  5.27batch/s]

torch.Size([32, 42, 768])
torch.Size([32, 48, 768])


train:  78%|███████▊  | 209/267 [00:38<00:10,  5.44batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 54, 768])


train:  79%|███████▉  | 211/267 [00:39<00:10,  5.25batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 44, 768])


train:  80%|███████▉  | 213/267 [00:39<00:09,  5.50batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 43, 768])


train:  81%|████████  | 215/267 [00:40<00:09,  5.54batch/s]

torch.Size([32, 47, 768])
torch.Size([32, 51, 768])


train:  81%|████████▏ | 217/267 [00:40<00:09,  5.32batch/s]

torch.Size([32, 49, 768])
torch.Size([32, 48, 768])


train:  82%|████████▏ | 219/267 [00:40<00:09,  5.27batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 48, 768])


train:  83%|████████▎ | 221/267 [00:41<00:08,  5.17batch/s]

torch.Size([32, 54, 768])


train:  83%|████████▎ | 222/267 [00:41<00:08,  5.16batch/s]

torch.Size([32, 50, 768])
torch.Size([32, 51, 768])


train:  84%|████████▍ | 224/267 [00:41<00:08,  5.30batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 45, 768])


train:  85%|████████▍ | 226/267 [00:42<00:07,  5.22batch/s]

torch.Size([32, 54, 768])


train:  85%|████████▌ | 227/267 [00:42<00:07,  5.39batch/s]

torch.Size([32, 43, 768])
torch.Size([32, 52, 768])


train:  86%|████████▌ | 229/267 [00:42<00:07,  5.19batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 49, 768])


train:  87%|████████▋ | 231/267 [00:43<00:07,  5.14batch/s]

torch.Size([32, 52, 768])
torch.Size([32, 54, 768])


train:  87%|████████▋ | 233/267 [00:43<00:06,  4.99batch/s]

torch.Size([32, 55, 768])


train:  88%|████████▊ | 234/267 [00:43<00:06,  5.01batch/s]

torch.Size([32, 51, 768])
torch.Size([32, 38, 768])


train:  88%|████████▊ | 236/267 [00:44<00:05,  5.45batch/s]

torch.Size([32, 44, 768])
torch.Size([32, 46, 768])


train:  89%|████████▉ | 237/267 [00:44<00:05,  5.33batch/s]

torch.Size([32, 48, 768])





KeyboardInterrupt: ignored