In [1]:
import os
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('..')

import numpy as np
import random
import torch
# %env CUDA_VISIBLE_DEVICES=1

from lib import data_processors, utils

In [2]:
# Run configuration shared by the teacher evaluation and the student training.
params = {
    # locations of the dataset, the fine-tuned teacher and the model cache
    'data_dir': '../../data/SST-2',
    'output_dir': '../output',
    'cache_dir': '../model_cache',
    # task / base model selection
    'task_name': 'sst2',
    'bert_model': 'bert-base-uncased',
    # optimisation settings
    'max_seq_length': 128,
    'train_batch_size': 32,
    'eval_batch_size': 8,
    'learning_rate': 2e-5,
    'warmup_proportion': 0.1,
    'num_train_epochs': 5,
    'seed': 1331
}

# One row per supported task: (task key, processor class, number of labels).
_task_table = (
    ('cola', data_processors.ColaProcessor, 2),
    ('mnli', data_processors.MnliProcessor, 3),
    ('mrpc', data_processors.MrpcProcessor, 2),
    ('sst2', data_processors.SST2Processor, 2),
)

processors = {task: processor_cls for task, processor_cls, _ in _task_table}
num_labels_task = {task: label_count for task, _, label_count in _task_table}

# Seed every random number generator used here with the configured seed.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(params['seed'])

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [5]:
from lib.bert import BertForSequenceClassification
from pytorch_pretrained_bert.tokenization import BertTokenizer

from lib.train_eval import train, evaluate
from lib.train_student import train_student 
from lib.blend_cnn import BlendCNN
from pytorch_pretrained_bert.modeling import (BertConfig, WEIGHTS_NAME, CONFIG_NAME)
from lib.bert import BertForSequenceClassification

Загружаем BERT, дообученный на SST-2 (конфиг и веса берутся из `output_dir`).

In [7]:
# Data processor and label set for the task selected in `params`.
processor = processors[params['task_name']]()
num_labels = num_labels_task[params['task_name']]
label_list = processor.get_labels()

# Tokenizer for the configured BERT vocabulary; input is lower-cased.
tokenizer = BertTokenizer.from_pretrained(
    params['bert_model'], do_lower_case=True)

# Training examples are used for distillation, dev examples for evaluation.
train_examples = processor.get_train_examples(params['data_dir'])
eval_examples = processor.get_dev_examples(params['data_dir'])

# The teacher's config and weights are read from the output directory.
output_model_file = os.path.join(params['output_dir'], WEIGHTS_NAME)
output_config_file = os.path.join(params['output_dir'], CONFIG_NAME)

config = BertConfig(output_config_file)
teacher_model = BertForSequenceClassification(config, num_labels=num_labels)
# map_location remaps the stored tensors onto the device selected above, so a
# checkpoint saved from a GPU still loads when only the CPU is available.
teacher_state = torch.load(output_model_file, map_location=device)
teacher_model.load_state_dict(teacher_state)
teacher_model = teacher_model.to(device)


03/21/2019 16:41:40 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/shakhrayv/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [8]:
# Evaluate the loaded teacher on the dev examples; this is the reference score
# the student is compared against.
result = evaluate(teacher_model, eval_examples, label_list, params, tokenizer)
# Bare last expression so the notebook displays the returned metrics dict.
result

03/21/2019 16:41:46 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 16:41:46 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 16:41:46 - INFO - lib.train_eval -     Batch size = 8
Evaluating: 100%|██████████| 109/109 [00:06<00:00, 17.64it/s]
03/21/2019 16:41:52 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 16:41:52 - INFO - lib.train_eval -     eval_accuracy = 0.9277522935779816
03/21/2019 16:41:52 - INFO - lib.train_eval -     eval_loss = 0.24567339350597575


{'eval_loss': 0.24567339350597575, 'eval_accuracy': 0.9277522935779816}

В качестве студента возьмем модель BlendCNN (код находится в папке lib).

In [9]:
# Student network: a BlendCNN with 6 layers, built from the maximum sequence
# length, the tokenizer vocabulary size and the number of labels.
student_dims = (params['max_seq_length'], len(tokenizer.vocab), len(label_list))
model = BlendCNN(*student_dims, n_layers=6)
model = model.to(device)

Общее число параметров в ученике

In [35]:
# Total number of scalar parameters in the student model.
# Tensor.numel() is the product of a tensor's dimensions, so summing it over
# all parameters replaces the manual nested loop over shapes.
all_count = sum(param.numel() for param in model.parameters())

print(all_count)

4456598


Общее число параметров в учителе

In [42]:
# Total number of scalar parameters in the teacher model.
# Tensor.numel() is the product of a tensor's dimensions, so summing it over
# all parameters replaces the manual nested loop over shapes.
all_count = sum(param.numel() for param in teacher_model.parameters())

print(all_count)

109483778


In [43]:
# Compression ratio = teacher parameter count / student parameter count.
# Both counts are computed from the live models instead of being pasted in as
# literals, so the ratio stays correct if either architecture changes.
teacher_param_count = sum(param.numel() for param in teacher_model.parameters())
student_param_count = sum(param.numel() for param in model.parameters())
print('compresing in {} times'.format(teacher_param_count / student_param_count))

compresing in 24.566671259108404 times


Обучаем студента при помощи дистилляции. Далее какое-то время идет обучение, но это не конец.

In [10]:
# First distillation run: train the student `model` against `teacher_model`
# on the training examples with the current `params`.
# train_student returns a pair; `model` is rebound to its first element and
# `result` (previously the teacher's evaluation dict) to its second.
model, result = train_student(model, teacher_model, train_examples, label_list, params, tokenizer)

Evaluating:   0%|          | 0/2105 [00:00<?, ?it/s]

***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    10520


Evaluating: 100%|██████████| 2105/2105 [03:19<00:00, 10.54it/s]
  loss_first = KLDivLoss()(F.log_softmax(logits_model / temperature), F.softmax(teacher_logits / temperature))
  loss_first = KLDivLoss()(F.log_softmax(logits_model / temperature), F.softmax(teacher_logits / temperature))
Iteration:   0%|          | 4/2105 [00:00<00:56, 37.02it/s]

Epoch: 1


Iteration: 100%|██████████| 2105/2105 [00:17<00:00, 117.43it/s]
Iteration:   1%|          | 12/2105 [00:00<00:17, 117.38it/s]

Epoch: 2


Iteration: 100%|██████████| 2105/2105 [00:17<00:00, 116.95it/s]
Iteration:   1%|          | 12/2105 [00:00<00:18, 115.34it/s]

Epoch: 3


Iteration: 100%|██████████| 2105/2105 [00:17<00:00, 117.53it/s]
Iteration:   1%|          | 12/2105 [00:00<00:18, 114.03it/s]

Epoch: 4


Iteration: 100%|██████████| 2105/2105 [00:18<00:00, 115.57it/s]
Iteration:   1%|          | 12/2105 [00:00<00:18, 116.23it/s]

Epoch: 5


Iteration: 100%|██████████| 2105/2105 [00:18<00:00, 116.37it/s]


In [11]:
# Score the student on the dev examples after the first training run; the
# returned metrics dict is shown as the cell output.
evaluate(model, eval_examples, label_list, params, tokenizer)

03/21/2019 16:47:57 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 16:47:57 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 16:47:57 - INFO - lib.train_eval -     Batch size = 8
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 433.20it/s]
03/21/2019 16:47:57 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 16:47:57 - INFO - lib.train_eval -     eval_accuracy = 0.5573394495412844
03/21/2019 16:47:57 - INFO - lib.train_eval -     eval_loss = 0.11215483937241616


{'eval_loss': 0.11215483937241616, 'eval_accuracy': 0.5573394495412844}

In [12]:
# Second distillation run with unchanged `params`, starting from the student
# returned by the previous run.
# NOTE(review): presumably this continues training from the current weights —
# confirm in lib.train_student.
model, result = train_student(model, teacher_model, train_examples, label_list, params, tokenizer)

Evaluating:   0%|          | 0/2105 [00:00<?, ?it/s]

***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    10520


Evaluating: 100%|██████████| 2105/2105 [03:19<00:00, 10.54it/s]
Iteration:   0%|          | 6/2105 [00:00<00:37, 55.66it/s]

Epoch: 1


Iteration: 100%|██████████| 2105/2105 [00:19<00:00, 109.56it/s]
Iteration:   1%|          | 11/2105 [00:00<00:19, 106.38it/s]

Epoch: 2


Iteration: 100%|██████████| 2105/2105 [00:19<00:00, 107.14it/s]
Iteration:   1%|          | 11/2105 [00:00<00:20, 103.82it/s]

Epoch: 3


Iteration: 100%|██████████| 2105/2105 [00:19<00:00, 110.43it/s]
Iteration:   0%|          | 9/2105 [00:00<00:24, 86.32it/s]

Epoch: 4


Iteration: 100%|██████████| 2105/2105 [00:19<00:00, 110.52it/s]
Iteration:   1%|          | 12/2105 [00:00<00:18, 112.70it/s]

Epoch: 5


Iteration: 100%|██████████| 2105/2105 [00:18<00:00, 111.37it/s]


In [20]:
# Score the student on the dev examples after the second training run; the
# returned metrics dict is shown as the cell output.
evaluate(model, eval_examples, label_list, params, tokenizer)

03/21/2019 16:56:29 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 16:56:29 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 16:56:29 - INFO - lib.train_eval -     Batch size = 8
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 384.49it/s]
03/21/2019 16:56:29 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 16:56:29 - INFO - lib.train_eval -     eval_accuracy = 0.6009174311926605
03/21/2019 16:56:29 - INFO - lib.train_eval -     eval_loss = 0.10501195025553398


{'eval_loss': 0.10501195025553398, 'eval_accuracy': 0.6009174311926605}

In [38]:
# Third distillation run: raise the learning rate from 2e-5 to 2e-3.
params['num_train_epochs'] = 5
params['learning_rate'] = 2e-3
# The recorded output of this cell reports "Batch size: 128" and 2630 steps,
# but no visible cell sets that value (it came from an unsaved kernel state).
# Setting it here makes a fresh Restart & Run All reproduce the recorded run.
params['train_batch_size'] = 128

model, result = train_student(model, teacher_model, train_examples, label_list, params, tokenizer)




***** Running training *****
Num examples: 67349
Batch size:   128
Num steps:    2630


Evaluating:   0%|          | 0/527 [00:00<?, ?it/s][A
Evaluating:   0%|          | 2/527 [00:00<01:44,  5.01it/s][A
Evaluating:   1%|          | 3/527 [00:00<02:09,  4.04it/s][A
Evaluating:   1%|          | 4/527 [00:01<02:27,  3.56it/s][A
Evaluating:   1%|          | 5/527 [00:01<02:39,  3.27it/s][A
Evaluating:   1%|          | 6/527 [00:01<02:47,  3.10it/s][A
Evaluating:   1%|▏         | 7/527 [00:02<02:53,  3.00it/s][A
Evaluating:   2%|▏         | 8/527 [00:02<02:57,  2.92it/s][A
Evaluating:   2%|▏         | 9/527 [00:02<03:00,  2.88it/s][A
Evaluating:   2%|▏         | 10/527 [00:03<03:01,  2.84it/s][A
Evaluating:   2%|▏         | 11/527 [00:03<03:02,  2.82it/s][A
Evaluating:   2%|▏         | 12/527 [00:04<03:03,  2.81it/s][A
Evaluating:   2%|▏         | 13/527 [00:04<03:04,  2.79it/s][A
Evaluating:   3%|▎         | 14/527 [00:04<03:04,  2.79it/s][A
Evaluating:   3%|▎         | 15/527 [00:05<03:04,  2.78it/s][A
Evaluating:   3%|▎         | 16/527 [00:05<03:03,  2.78it

Evaluating:  48%|████▊     | 254/527 [01:32<01:40,  2.72it/s][A
Evaluating:  48%|████▊     | 255/527 [01:32<01:39,  2.72it/s][A
Evaluating:  49%|████▊     | 256/527 [01:32<01:39,  2.72it/s][A
Evaluating:  49%|████▉     | 257/527 [01:33<01:39,  2.72it/s][A
Evaluating:  49%|████▉     | 258/527 [01:33<01:38,  2.72it/s][A
Evaluating:  49%|████▉     | 259/527 [01:34<01:38,  2.72it/s][A
Evaluating:  49%|████▉     | 260/527 [01:34<01:37,  2.73it/s][A
Evaluating:  50%|████▉     | 261/527 [01:34<01:37,  2.72it/s][A
Evaluating:  50%|████▉     | 262/527 [01:35<01:37,  2.72it/s][A
Evaluating:  50%|████▉     | 263/527 [01:35<01:36,  2.72it/s][A
Evaluating:  50%|█████     | 264/527 [01:35<01:36,  2.72it/s][A
Evaluating:  50%|█████     | 265/527 [01:36<01:36,  2.72it/s][A
Evaluating:  50%|█████     | 266/527 [01:36<01:35,  2.72it/s][A
Evaluating:  51%|█████     | 267/527 [01:37<01:35,  2.73it/s][A
Evaluating:  51%|█████     | 268/527 [01:37<01:35,  2.72it/s][A
Evaluating:  51%|█████   

Evaluating:  96%|█████████▌| 506/527 [03:05<00:07,  2.71it/s][A
Evaluating:  96%|█████████▌| 507/527 [03:05<00:07,  2.70it/s][A
Evaluating:  96%|█████████▋| 508/527 [03:05<00:07,  2.71it/s][A
Evaluating:  97%|█████████▋| 509/527 [03:06<00:06,  2.71it/s][A
Evaluating:  97%|█████████▋| 510/527 [03:06<00:06,  2.70it/s][A
Evaluating:  97%|█████████▋| 511/527 [03:06<00:05,  2.71it/s][A
Evaluating:  97%|█████████▋| 512/527 [03:07<00:05,  2.71it/s][A
Evaluating:  97%|█████████▋| 513/527 [03:07<00:05,  2.71it/s][A
Evaluating:  98%|█████████▊| 514/527 [03:08<00:04,  2.71it/s][A
Evaluating:  98%|█████████▊| 515/527 [03:08<00:04,  2.71it/s][A
Evaluating:  98%|█████████▊| 516/527 [03:08<00:04,  2.71it/s][A
Evaluating:  98%|█████████▊| 517/527 [03:09<00:03,  2.71it/s][A
Evaluating:  98%|█████████▊| 518/527 [03:09<00:03,  2.71it/s][A
Evaluating:  98%|█████████▊| 519/527 [03:09<00:02,  2.71it/s][A
Evaluating:  99%|█████████▊| 520/527 [03:10<00:02,  2.71it/s][A
Evaluating:  99%|████████

Epoch: 1



Iteration:   2%|▏         | 8/527 [00:00<00:16, 31.31it/s][A
Iteration:   3%|▎         | 14/527 [00:00<00:14, 35.80it/s][A
Iteration:   4%|▍         | 20/527 [00:00<00:12, 39.91it/s][A
Iteration:   5%|▍         | 26/527 [00:00<00:11, 43.51it/s][A
Iteration:   6%|▌         | 32/527 [00:00<00:10, 46.33it/s][A
Iteration:   7%|▋         | 38/527 [00:00<00:10, 48.38it/s][A
Iteration:   8%|▊         | 44/527 [00:00<00:09, 50.11it/s][A
Iteration:   9%|▉         | 50/527 [00:00<00:09, 51.81it/s][A
Iteration:  11%|█         | 56/527 [00:01<00:08, 52.66it/s][A
Iteration:  12%|█▏        | 62/527 [00:01<00:08, 53.57it/s][A
Iteration:  13%|█▎        | 68/527 [00:01<00:08, 52.31it/s][A
Iteration:  14%|█▍        | 74/527 [00:01<00:08, 52.96it/s][A
Iteration:  15%|█▌        | 80/527 [00:01<00:08, 53.79it/s][A
Iteration:  16%|█▋        | 86/527 [00:01<00:08, 54.26it/s][A
Iteration:  17%|█▋        | 92/527 [00:01<00:07, 54.87it/s][A
Iteration:  19%|█▊        | 98/527 [00:01<00:07, 55.01i

Epoch: 2



Iteration:   2%|▏         | 12/527 [00:00<00:09, 55.92it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 56.09it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:08, 55.94it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:08, 56.26it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:08, 54.65it/s][A
Iteration:   8%|▊         | 42/527 [00:00<00:09, 53.60it/s][A
Iteration:   9%|▉         | 48/527 [00:00<00:08, 54.07it/s][A
Iteration:  10%|█         | 54/527 [00:00<00:08, 54.40it/s][A
Iteration:  11%|█▏        | 60/527 [00:01<00:08, 54.53it/s][A
Iteration:  13%|█▎        | 66/527 [00:01<00:08, 54.65it/s][A
Iteration:  14%|█▎        | 72/527 [00:01<00:08, 54.55it/s][A
Iteration:  15%|█▍        | 78/527 [00:01<00:08, 54.85it/s][A
Iteration:  16%|█▌        | 84/527 [00:01<00:08, 54.43it/s][A
Iteration:  17%|█▋        | 90/527 [00:01<00:07, 54.78it/s][A
Iteration:  18%|█▊        | 96/527 [00:01<00:07, 55.42it/s][A
Iteration:  19%|█▉        | 102/527 [00:01<00:07, 55.5

Epoch: 3



Iteration:   2%|▏         | 12/527 [00:00<00:09, 53.29it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 52.74it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 53.53it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:09, 53.39it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:09, 53.32it/s][A
Iteration:   8%|▊         | 42/527 [00:00<00:09, 53.81it/s][A
Iteration:   9%|▉         | 48/527 [00:00<00:08, 54.58it/s][A
Iteration:  10%|█         | 54/527 [00:00<00:08, 54.81it/s][A
Iteration:  11%|█▏        | 60/527 [00:01<00:08, 55.35it/s][A
Iteration:  13%|█▎        | 66/527 [00:01<00:08, 55.44it/s][A
Iteration:  14%|█▎        | 72/527 [00:01<00:08, 55.21it/s][A
Iteration:  15%|█▍        | 78/527 [00:01<00:08, 55.12it/s][A
Iteration:  16%|█▌        | 84/527 [00:01<00:08, 55.19it/s][A
Iteration:  17%|█▋        | 90/527 [00:01<00:07, 54.65it/s][A
Iteration:  18%|█▊        | 96/527 [00:01<00:07, 54.28it/s][A
Iteration:  19%|█▉        | 102/527 [00:01<00:07, 54.8

Epoch: 4



Iteration:   2%|▏         | 12/527 [00:00<00:09, 53.09it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 52.78it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 52.56it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:09, 52.88it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:09, 52.36it/s][A
Iteration:   8%|▊         | 42/527 [00:00<00:09, 52.58it/s][A
Iteration:   9%|▉         | 48/527 [00:00<00:08, 53.74it/s][A
Iteration:  10%|█         | 54/527 [00:01<00:08, 53.76it/s][A
Iteration:  11%|█▏        | 60/527 [00:01<00:08, 54.11it/s][A
Iteration:  13%|█▎        | 66/527 [00:01<00:08, 54.46it/s][A
Iteration:  14%|█▎        | 72/527 [00:01<00:08, 53.12it/s][A
Iteration:  15%|█▍        | 78/527 [00:01<00:08, 54.09it/s][A
Iteration:  16%|█▌        | 84/527 [00:01<00:08, 53.89it/s][A
Iteration:  17%|█▋        | 90/527 [00:01<00:08, 54.15it/s][A
Iteration:  18%|█▊        | 96/527 [00:01<00:07, 54.52it/s][A
Iteration:  19%|█▉        | 102/527 [00:01<00:07, 55.1

Epoch: 5



Iteration:   2%|▏         | 12/527 [00:00<00:09, 55.99it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 55.35it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 55.88it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:08, 55.67it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:08, 55.65it/s][A
Iteration:   8%|▊         | 42/527 [00:00<00:08, 54.79it/s][A
Iteration:   9%|▉         | 48/527 [00:00<00:08, 54.31it/s][A
Iteration:  10%|█         | 54/527 [00:00<00:08, 53.41it/s][A
Iteration:  11%|█▏        | 60/527 [00:01<00:08, 53.31it/s][A
Iteration:  13%|█▎        | 66/527 [00:01<00:08, 54.03it/s][A
Iteration:  14%|█▎        | 72/527 [00:01<00:08, 54.27it/s][A
Iteration:  15%|█▍        | 78/527 [00:01<00:08, 54.84it/s][A
Iteration:  16%|█▌        | 84/527 [00:01<00:08, 54.05it/s][A
Iteration:  17%|█▋        | 90/527 [00:01<00:08, 53.18it/s][A
Iteration:  18%|█▊        | 96/527 [00:01<00:08, 53.47it/s][A
Iteration:  19%|█▉        | 102/527 [00:01<00:08, 52.8

In [39]:
# Score the student on the dev examples after the run with the higher
# learning rate; the returned metrics dict is shown as the cell output.
evaluate(model, eval_examples, label_list, params, tokenizer)

03/21/2019 17:15:55 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 17:15:55 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 17:15:55 - INFO - lib.train_eval -     Batch size = 8

Evaluating:   0%|          | 0/109 [00:00<?, ?it/s][A
Evaluating:  39%|███▊      | 42/109 [00:00<00:00, 418.62it/s][A
Evaluating:  77%|███████▋  | 84/109 [00:00<00:00, 416.37it/s][A
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 412.48it/s][A03/21/2019 17:15:55 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 17:15:55 - INFO - lib.train_eval -     eval_accuracy = 0.7763761467889908
03/21/2019 17:15:55 - INFO - lib.train_eval -     eval_loss = 0.002517868346030559


{'eval_loss': 0.002517868346030559, 'eval_accuracy': 0.7763761467889908}

In [40]:
# Fourth distillation run, reusing the `params` left in place by the previous
# training cell and starting from the student it returned.
# NOTE(review): presumably this continues training from the current weights —
# confirm in lib.train_student.
model, result = train_student(model, teacher_model, train_examples, label_list, params, tokenizer)

***** Running training *****
Num examples: 67349
Batch size:   128
Num steps:    2630



Evaluating:   0%|          | 0/527 [00:00<?, ?it/s][A
Evaluating:   0%|          | 2/527 [00:00<01:43,  5.08it/s][A
Evaluating:   1%|          | 3/527 [00:00<02:09,  4.05it/s][A
Evaluating:   1%|          | 4/527 [00:01<02:27,  3.55it/s][A
Evaluating:   1%|          | 5/527 [00:01<02:39,  3.27it/s][A
Evaluating:   1%|          | 6/527 [00:01<02:48,  3.10it/s][A
Evaluating:   1%|▏         | 7/527 [00:02<02:54,  2.99it/s][A
Evaluating:   2%|▏         | 8/527 [00:02<02:58,  2.91it/s][A
Evaluating:   2%|▏         | 9/527 [00:02<03:00,  2.86it/s][A
Evaluating:   2%|▏         | 10/527 [00:03<03:02,  2.83it/s][A
Evaluating:   2%|▏         | 11/527 [00:03<03:03,  2.81it/s][A
Evaluating:   2%|▏         | 12/527 [00:04<03:04,  2.78it/s][A
Evaluating:   2%|▏         | 13/527 [00:04<03:04,  2.78it/s][A
Evaluating:   3%|▎         | 14/527 [00:04<03:05,  2.77it/s][A
Evaluating:   3%|▎         | 15/527 [00:05<03:05,  2.77it/s][A
Evaluating:   3%|▎         | 16/527 [00:05<03:05,  2.76i

Evaluating:  48%|████▊     | 254/527 [01:32<01:40,  2.71it/s][A
Evaluating:  48%|████▊     | 255/527 [01:33<01:40,  2.72it/s][A
Evaluating:  49%|████▊     | 256/527 [01:33<01:39,  2.72it/s][A
Evaluating:  49%|████▉     | 257/527 [01:33<01:39,  2.72it/s][A
Evaluating:  49%|████▉     | 258/527 [01:34<01:39,  2.71it/s][A
Evaluating:  49%|████▉     | 259/527 [01:34<01:38,  2.72it/s][A
Evaluating:  49%|████▉     | 260/527 [01:34<01:38,  2.71it/s][A
Evaluating:  50%|████▉     | 261/527 [01:35<01:38,  2.71it/s][A
Evaluating:  50%|████▉     | 262/527 [01:35<01:37,  2.71it/s][A
Evaluating:  50%|████▉     | 263/527 [01:36<01:37,  2.72it/s][A
Evaluating:  50%|█████     | 264/527 [01:36<01:37,  2.71it/s][A
Evaluating:  50%|█████     | 265/527 [01:36<01:36,  2.71it/s][A
Evaluating:  50%|█████     | 266/527 [01:37<01:36,  2.72it/s][A
Evaluating:  51%|█████     | 267/527 [01:37<01:35,  2.71it/s][A
Evaluating:  51%|█████     | 268/527 [01:37<01:35,  2.71it/s][A
Evaluating:  51%|█████   

Evaluating:  96%|█████████▌| 506/527 [03:05<00:07,  2.70it/s][A
Evaluating:  96%|█████████▌| 507/527 [03:06<00:07,  2.70it/s][A
Evaluating:  96%|█████████▋| 508/527 [03:06<00:07,  2.70it/s][A
Evaluating:  97%|█████████▋| 509/527 [03:06<00:06,  2.70it/s][A
Evaluating:  97%|█████████▋| 510/527 [03:07<00:06,  2.70it/s][A
Evaluating:  97%|█████████▋| 511/527 [03:07<00:05,  2.70it/s][A
Evaluating:  97%|█████████▋| 512/527 [03:08<00:05,  2.70it/s][A
Evaluating:  97%|█████████▋| 513/527 [03:08<00:05,  2.70it/s][A
Evaluating:  98%|█████████▊| 514/527 [03:08<00:04,  2.70it/s][A
Evaluating:  98%|█████████▊| 515/527 [03:09<00:04,  2.70it/s][A
Evaluating:  98%|█████████▊| 516/527 [03:09<00:04,  2.70it/s][A
Evaluating:  98%|█████████▊| 517/527 [03:09<00:03,  2.70it/s][A
Evaluating:  98%|█████████▊| 518/527 [03:10<00:03,  2.70it/s][A
Evaluating:  98%|█████████▊| 519/527 [03:10<00:02,  2.70it/s][A
Evaluating:  99%|█████████▊| 520/527 [03:11<00:02,  2.70it/s][A
Evaluating:  99%|████████

Epoch: 1



Iteration:   2%|▏         | 9/527 [00:00<00:16, 32.31it/s][A
Iteration:   3%|▎         | 15/527 [00:00<00:14, 36.30it/s][A
Iteration:   4%|▍         | 21/527 [00:00<00:12, 40.15it/s][A
Iteration:   5%|▌         | 27/527 [00:00<00:11, 44.01it/s][A
Iteration:   6%|▋         | 33/527 [00:00<00:10, 46.71it/s][A
Iteration:   7%|▋         | 39/527 [00:00<00:09, 49.36it/s][A
Iteration:   9%|▊         | 45/527 [00:00<00:09, 51.25it/s][A
Iteration:  10%|▉         | 51/527 [00:00<00:09, 52.85it/s][A
Iteration:  11%|█         | 57/527 [00:01<00:08, 53.37it/s][A
Iteration:  12%|█▏        | 63/527 [00:01<00:08, 53.65it/s][A
Iteration:  13%|█▎        | 69/527 [00:01<00:08, 54.17it/s][A
Iteration:  14%|█▍        | 75/527 [00:01<00:08, 53.83it/s][A
Iteration:  15%|█▌        | 81/527 [00:01<00:08, 54.07it/s][A
Iteration:  17%|█▋        | 87/527 [00:01<00:08, 54.47it/s][A
Iteration:  18%|█▊        | 93/527 [00:01<00:07, 55.02it/s][A
Iteration:  19%|█▉        | 99/527 [00:01<00:07, 54.70i

Epoch: 2



Iteration:   2%|▏         | 12/527 [00:00<00:09, 53.91it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 53.81it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 53.13it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:09, 53.02it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:09, 53.31it/s][A
Iteration:   8%|▊         | 42/527 [00:00<00:09, 53.27it/s][A
Iteration:   9%|▉         | 48/527 [00:00<00:08, 53.74it/s][A
Iteration:  10%|█         | 54/527 [00:01<00:08, 53.16it/s][A
Iteration:  11%|█▏        | 60/527 [00:01<00:08, 53.70it/s][A
Iteration:  13%|█▎        | 66/527 [00:01<00:08, 53.65it/s][A
Iteration:  14%|█▎        | 72/527 [00:01<00:08, 53.88it/s][A
Iteration:  15%|█▍        | 78/527 [00:01<00:08, 54.39it/s][A
Iteration:  16%|█▌        | 84/527 [00:01<00:08, 54.50it/s][A
Iteration:  17%|█▋        | 90/527 [00:01<00:07, 54.84it/s][A
Iteration:  18%|█▊        | 96/527 [00:01<00:07, 54.54it/s][A
Iteration:  19%|█▉        | 102/527 [00:01<00:07, 54.8

Epoch: 3



Iteration:   2%|▏         | 12/527 [00:00<00:09, 52.93it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 53.73it/s][A
Iteration:   4%|▍         | 23/527 [00:00<00:09, 52.35it/s][A
Iteration:   6%|▌         | 29/527 [00:00<00:09, 52.67it/s][A
Iteration:   7%|▋         | 35/527 [00:00<00:09, 53.03it/s][A
Iteration:   8%|▊         | 40/527 [00:00<00:09, 51.84it/s][A
Iteration:   9%|▊         | 46/527 [00:00<00:09, 52.64it/s][A
Iteration:  10%|▉         | 52/527 [00:00<00:08, 52.97it/s][A
Iteration:  11%|█         | 58/527 [00:01<00:08, 53.03it/s][A
Iteration:  12%|█▏        | 64/527 [00:01<00:08, 53.47it/s][A
Iteration:  13%|█▎        | 70/527 [00:01<00:08, 53.54it/s][A
Iteration:  14%|█▍        | 76/527 [00:01<00:08, 52.46it/s][A
Iteration:  16%|█▌        | 82/527 [00:01<00:08, 53.03it/s][A
Iteration:  17%|█▋        | 88/527 [00:01<00:08, 53.83it/s][A
Iteration:  18%|█▊        | 94/527 [00:01<00:08, 54.08it/s][A
Iteration:  19%|█▉        | 100/527 [00:01<00:07, 54.6

Epoch: 4



Iteration:   2%|▏         | 12/527 [00:00<00:09, 52.89it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 53.48it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 53.04it/s][A
Iteration:   6%|▌         | 30/527 [00:00<00:09, 53.04it/s][A
Iteration:   7%|▋         | 36/527 [00:00<00:09, 53.56it/s][A
Iteration:   8%|▊         | 41/527 [00:00<00:09, 52.27it/s][A
Iteration:   9%|▉         | 47/527 [00:00<00:09, 53.29it/s][A
Iteration:  10%|█         | 53/527 [00:00<00:08, 53.56it/s][A
Iteration:  11%|█         | 59/527 [00:01<00:08, 54.26it/s][A
Iteration:  12%|█▏        | 65/527 [00:01<00:08, 54.58it/s][A
Iteration:  13%|█▎        | 71/527 [00:01<00:08, 55.07it/s][A
Iteration:  15%|█▍        | 77/527 [00:01<00:08, 55.33it/s][A
Iteration:  16%|█▌        | 83/527 [00:01<00:08, 55.30it/s][A
Iteration:  17%|█▋        | 89/527 [00:01<00:08, 54.56it/s][A
Iteration:  18%|█▊        | 95/527 [00:01<00:07, 54.65it/s][A
Iteration:  19%|█▉        | 101/527 [00:01<00:07, 55.0

Epoch: 5



Iteration:   2%|▏         | 12/527 [00:00<00:09, 54.21it/s][A
Iteration:   3%|▎         | 18/527 [00:00<00:09, 54.14it/s][A
Iteration:   5%|▍         | 24/527 [00:00<00:09, 54.24it/s][A
Iteration:   6%|▌         | 29/527 [00:00<00:09, 52.86it/s][A
Iteration:   7%|▋         | 35/527 [00:00<00:09, 52.36it/s][A
Iteration:   8%|▊         | 41/527 [00:00<00:09, 52.84it/s][A
Iteration:   9%|▉         | 47/527 [00:00<00:09, 53.07it/s][A
Iteration:  10%|█         | 53/527 [00:01<00:09, 52.65it/s][A
Iteration:  11%|█         | 59/527 [00:01<00:08, 52.85it/s][A
Iteration:  12%|█▏        | 65/527 [00:01<00:08, 52.48it/s][A
Iteration:  13%|█▎        | 71/527 [00:01<00:08, 50.87it/s][A
Iteration:  15%|█▍        | 77/527 [00:01<00:08, 51.43it/s][A
Iteration:  16%|█▌        | 83/527 [00:01<00:08, 51.16it/s][A
Iteration:  17%|█▋        | 89/527 [00:01<00:08, 52.26it/s][A
Iteration:  18%|█▊        | 95/527 [00:01<00:08, 51.89it/s][A
Iteration:  19%|█▉        | 101/527 [00:01<00:08, 52.8

In [41]:
# Final score of the 6-layer student on the dev examples; the returned metrics
# dict is shown as the cell output.
evaluate(model, eval_examples, label_list, params, tokenizer)

03/21/2019 17:26:22 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 17:26:22 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 17:26:22 - INFO - lib.train_eval -     Batch size = 8

Evaluating:   0%|          | 0/109 [00:00<?, ?it/s][A
Evaluating:  37%|███▋      | 40/109 [00:00<00:00, 392.99it/s][A
Evaluating:  77%|███████▋  | 84/109 [00:00<00:00, 405.22it/s][A
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 416.47it/s][A03/21/2019 17:26:23 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 17:26:23 - INFO - lib.train_eval -     eval_accuracy = 0.7844036697247706
03/21/2019 17:26:23 - INFO - lib.train_eval -     eval_loss = -0.00040045435275506535


{'eval_loss': -0.00040045435275506535, 'eval_accuracy': 0.7844036697247706}

Всего получилось где-то 20 эпох

Теперь посмотрим на ту же модель с ***8 модулями*** свертки.

In [47]:
# Configuration for the 8-layer student. It matches the initial configuration
# except for the learning rate (2e-3) and the number of epochs (15).
params = {
    # locations of the dataset, the fine-tuned teacher and the model cache
    'data_dir': '../../data/SST-2',
    'output_dir': '../output',
    'cache_dir': '../model_cache',
    # task / base model selection
    'task_name': 'sst2',
    'bert_model': 'bert-base-uncased',
    # optimisation settings
    'max_seq_length': 128,
    'train_batch_size': 32,
    'eval_batch_size': 8,
    'learning_rate': 2e-3,
    'warmup_proportion': 0.1,
    'num_train_epochs': 15,
    'seed': 1331
}

# Build a fresh student with 8 layers (rebinding `model`), then distill the
# teacher into it.
student_dims = (params['max_seq_length'], len(tokenizer.vocab), len(label_list))
model = BlendCNN(*student_dims, n_layers=8)
model = model.to(device)
model, result = train_student(model, teacher_model, train_examples, label_list, params, tokenizer)




***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    31560


Evaluating:   0%|          | 0/2105 [00:00<?, ?it/s][A
Evaluating:   0%|          | 2/2105 [00:00<02:09, 16.21it/s][A
Evaluating:   0%|          | 4/2105 [00:00<02:29, 14.03it/s][A
Evaluating:   0%|          | 6/2105 [00:00<02:43, 12.84it/s][A
Evaluating:   0%|          | 8/2105 [00:00<02:52, 12.12it/s][A
Evaluating:   0%|          | 10/2105 [00:00<02:59, 11.67it/s][A
Evaluating:   1%|          | 12/2105 [00:01<03:04, 11.34it/s][A
Evaluating:   1%|          | 14/2105 [00:01<03:08, 11.12it/s][A
Evaluating:   1%|          | 16/2105 [00:01<03:09, 11.04it/s][A
Evaluating:   1%|          | 18/2105 [00:01<03:11, 10.92it/s][A
Evaluating:   1%|          | 20/2105 [00:01<03:12, 10.86it/s][A
Evaluating:   1%|          | 22/2105 [00:01<03:13, 10.79it/s][A
Evaluating:   1%|          | 24/2105 [00:02<03:13, 10.74it/s][A
Evaluating:   1%|          | 26/2105 [00:02<03:13, 10.76it/s][A
Evaluating:   1%|▏         | 28/2105 [00:02<03:13, 10.75it/s][A
Evaluating:   1%|▏         | 30/2105 [

Evaluating:  24%|██▎       | 498/2105 [00:46<02:32, 10.53it/s][A
Evaluating:  24%|██▍       | 500/2105 [00:46<02:32, 10.51it/s][A
Evaluating:  24%|██▍       | 502/2105 [00:47<02:32, 10.53it/s][A
Evaluating:  24%|██▍       | 504/2105 [00:47<02:31, 10.55it/s][A
Evaluating:  24%|██▍       | 506/2105 [00:47<02:31, 10.59it/s][A
Evaluating:  24%|██▍       | 508/2105 [00:47<02:30, 10.59it/s][A
Evaluating:  24%|██▍       | 510/2105 [00:47<02:30, 10.56it/s][A
Evaluating:  24%|██▍       | 512/2105 [00:48<02:30, 10.59it/s][A
Evaluating:  24%|██▍       | 514/2105 [00:48<02:30, 10.60it/s][A
Evaluating:  25%|██▍       | 516/2105 [00:48<02:30, 10.57it/s][A
Evaluating:  25%|██▍       | 518/2105 [00:48<02:30, 10.57it/s][A
Evaluating:  25%|██▍       | 520/2105 [00:48<02:30, 10.56it/s][A
Evaluating:  25%|██▍       | 522/2105 [00:49<02:30, 10.55it/s][A
Evaluating:  25%|██▍       | 524/2105 [00:49<02:29, 10.55it/s][A
Evaluating:  25%|██▍       | 526/2105 [00:49<02:29, 10.56it/s][A
Evaluating

Evaluating:  47%|████▋     | 994/2105 [01:33<01:46, 10.47it/s][A
Evaluating:  47%|████▋     | 996/2105 [01:34<01:45, 10.50it/s][A
Evaluating:  47%|████▋     | 998/2105 [01:34<01:45, 10.49it/s][A
Evaluating:  48%|████▊     | 1000/2105 [01:34<01:45, 10.48it/s][A
Evaluating:  48%|████▊     | 1002/2105 [01:34<01:45, 10.47it/s][A
Evaluating:  48%|████▊     | 1004/2105 [01:34<01:45, 10.46it/s][A
Evaluating:  48%|████▊     | 1006/2105 [01:34<01:44, 10.47it/s][A
Evaluating:  48%|████▊     | 1008/2105 [01:35<01:44, 10.49it/s][A
Evaluating:  48%|████▊     | 1010/2105 [01:35<01:44, 10.52it/s][A
Evaluating:  48%|████▊     | 1012/2105 [01:35<01:43, 10.52it/s][A
Evaluating:  48%|████▊     | 1014/2105 [01:35<01:44, 10.48it/s][A
Evaluating:  48%|████▊     | 1016/2105 [01:35<01:43, 10.51it/s][A
Evaluating:  48%|████▊     | 1018/2105 [01:36<01:43, 10.53it/s][A
Evaluating:  48%|████▊     | 1020/2105 [01:36<01:43, 10.51it/s][A
Evaluating:  49%|████▊     | 1022/2105 [01:36<01:43, 10.50it/s][

Evaluating:  70%|███████   | 1482/2105 [02:20<00:59, 10.46it/s][A
Evaluating:  70%|███████   | 1484/2105 [02:20<00:59, 10.50it/s][A
Evaluating:  71%|███████   | 1486/2105 [02:20<00:59, 10.49it/s][A
Evaluating:  71%|███████   | 1488/2105 [02:20<00:58, 10.48it/s][A
Evaluating:  71%|███████   | 1490/2105 [02:21<00:58, 10.47it/s][A
Evaluating:  71%|███████   | 1492/2105 [02:21<00:58, 10.43it/s][A
Evaluating:  71%|███████   | 1494/2105 [02:21<00:58, 10.45it/s][A
Evaluating:  71%|███████   | 1496/2105 [02:21<00:58, 10.47it/s][A
Evaluating:  71%|███████   | 1498/2105 [02:21<00:58, 10.46it/s][A
Evaluating:  71%|███████▏  | 1500/2105 [02:22<00:57, 10.49it/s][A
Evaluating:  71%|███████▏  | 1502/2105 [02:22<00:57, 10.49it/s][A
Evaluating:  71%|███████▏  | 1504/2105 [02:22<00:57, 10.47it/s][A
Evaluating:  72%|███████▏  | 1506/2105 [02:22<00:57, 10.45it/s][A
Evaluating:  72%|███████▏  | 1508/2105 [02:22<00:57, 10.45it/s][A
Evaluating:  72%|███████▏  | 1510/2105 [02:23<00:56, 10.47it/s

Evaluating:  94%|█████████▎| 1970/2105 [03:07<00:12, 10.47it/s][A
Evaluating:  94%|█████████▎| 1972/2105 [03:07<00:12, 10.45it/s][A
Evaluating:  94%|█████████▍| 1974/2105 [03:07<00:12, 10.44it/s][A
Evaluating:  94%|█████████▍| 1976/2105 [03:07<00:12, 10.45it/s][A
Evaluating:  94%|█████████▍| 1978/2105 [03:07<00:12, 10.44it/s][A
Evaluating:  94%|█████████▍| 1980/2105 [03:08<00:11, 10.44it/s][A
Evaluating:  94%|█████████▍| 1982/2105 [03:08<00:11, 10.43it/s][A
Evaluating:  94%|█████████▍| 1984/2105 [03:08<00:11, 10.44it/s][A
Evaluating:  94%|█████████▍| 1986/2105 [03:08<00:11, 10.48it/s][A
Evaluating:  94%|█████████▍| 1988/2105 [03:08<00:11, 10.44it/s][A
Evaluating:  95%|█████████▍| 1990/2105 [03:09<00:11, 10.45it/s][A
Evaluating:  95%|█████████▍| 1992/2105 [03:09<00:10, 10.42it/s][A
Evaluating:  95%|█████████▍| 1994/2105 [03:09<00:10, 10.43it/s][A
Evaluating:  95%|█████████▍| 1996/2105 [03:09<00:10, 10.41it/s][A
Evaluating:  95%|█████████▍| 1998/2105 [03:09<00:10, 10.42it/s

Epoch: 1



Iteration:   1%|          | 16/2105 [00:00<00:36, 56.66it/s][A
Iteration:   1%|          | 26/2105 [00:00<00:31, 65.05it/s][A
Iteration:   2%|▏         | 37/2105 [00:00<00:28, 73.01it/s][A
Iteration:   2%|▏         | 47/2105 [00:00<00:25, 79.32it/s][A
Iteration:   3%|▎         | 58/2105 [00:00<00:24, 85.02it/s][A
Iteration:   3%|▎         | 68/2105 [00:00<00:22, 88.88it/s][A
Iteration:   4%|▍         | 79/2105 [00:00<00:21, 92.51it/s][A
Iteration:   4%|▍         | 89/2105 [00:00<00:21, 94.53it/s][A
Iteration:   5%|▍         | 99/2105 [00:01<00:20, 95.83it/s][A
Iteration:   5%|▌         | 109/2105 [00:01<00:20, 96.82it/s][A
Iteration:   6%|▌         | 120/2105 [00:01<00:20, 98.38it/s][A
Iteration:   6%|▌         | 130/2105 [00:01<00:20, 98.58it/s][A
Iteration:   7%|▋         | 141/2105 [00:01<00:19, 99.62it/s][A
Iteration:   7%|▋         | 151/2105 [00:01<00:19, 99.53it/s][A
Iteration:   8%|▊         | 162/2105 [00:01<00:19, 100.19it/s][A
Iteration:   8%|▊         | 173/

Epoch: 2



Iteration:   1%|          | 14/2105 [00:00<00:32, 63.63it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:31, 65.90it/s][A
Iteration:   1%|▏         | 29/2105 [00:00<00:31, 66.82it/s][A
Iteration:   2%|▏         | 37/2105 [00:00<00:30, 67.91it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:31, 65.99it/s][A
Iteration:   2%|▏         | 51/2105 [00:00<00:30, 66.99it/s][A
Iteration:   3%|▎         | 59/2105 [00:00<00:29, 68.75it/s][A
Iteration:   3%|▎         | 66/2105 [00:00<00:29, 68.39it/s][A
Iteration:   4%|▎         | 74/2105 [00:01<00:29, 69.25it/s][A
Iteration:   4%|▍         | 82/2105 [00:01<00:28, 70.23it/s][A
Iteration:   4%|▍         | 89/2105 [00:01<00:30, 66.65it/s][A
Iteration:   5%|▍         | 96/2105 [00:01<00:29, 67.53it/s][A
Iteration:   5%|▍         | 103/2105 [00:01<00:29, 66.77it/s][A
Iteration:   5%|▌         | 110/2105 [00:01<00:29, 67.06it/s][A
Iteration:   6%|▌         | 117/2105 [00:01<00:30, 64.50it/s][A
Iteration:   6%|▌         | 124/2105

Iteration:  87%|████████▋ | 1840/2105 [00:27<00:03, 67.82it/s][A
Iteration:  88%|████████▊ | 1848/2105 [00:27<00:03, 69.64it/s][A
Iteration:  88%|████████▊ | 1856/2105 [00:28<00:03, 67.84it/s][A
Iteration:  89%|████████▊ | 1863/2105 [00:28<00:03, 63.70it/s][A
Iteration:  89%|████████▉ | 1870/2105 [00:28<00:03, 65.09it/s][A
Iteration:  89%|████████▉ | 1877/2105 [00:28<00:03, 65.14it/s][A
Iteration:  90%|████████▉ | 1884/2105 [00:28<00:03, 63.44it/s][A
Iteration:  90%|████████▉ | 1892/2105 [00:28<00:03, 66.49it/s][A
Iteration:  90%|█████████ | 1899/2105 [00:28<00:03, 66.63it/s][A
Iteration:  91%|█████████ | 1906/2105 [00:28<00:03, 64.80it/s][A
Iteration:  91%|█████████ | 1913/2105 [00:29<00:03, 63.26it/s][A
Iteration:  91%|█████████ | 1920/2105 [00:29<00:02, 64.87it/s][A
Iteration:  92%|█████████▏| 1927/2105 [00:29<00:02, 65.09it/s][A
Iteration:  92%|█████████▏| 1934/2105 [00:29<00:02, 66.30it/s][A
Iteration:  92%|█████████▏| 1941/2105 [00:29<00:02, 66.58it/s][A
Iteration:

Epoch: 3



Iteration:   1%|          | 14/2105 [00:00<00:30, 68.55it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:29, 70.44it/s][A
Iteration:   1%|▏         | 29/2105 [00:00<00:30, 68.98it/s][A
Iteration:   2%|▏         | 37/2105 [00:00<00:29, 71.16it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:29, 68.94it/s][A
Iteration:   2%|▏         | 52/2105 [00:00<00:28, 70.98it/s][A
Iteration:   3%|▎         | 59/2105 [00:00<00:29, 69.88it/s][A
Iteration:   3%|▎         | 66/2105 [00:00<00:30, 65.98it/s][A
Iteration:   4%|▎         | 74/2105 [00:01<00:29, 67.97it/s][A
Iteration:   4%|▍         | 81/2105 [00:01<00:31, 64.64it/s][A
Iteration:   4%|▍         | 90/2105 [00:01<00:29, 67.86it/s][A
Iteration:   5%|▍         | 98/2105 [00:01<00:28, 69.46it/s][A
Iteration:   5%|▍         | 105/2105 [00:01<00:30, 66.18it/s][A
Iteration:   5%|▌         | 112/2105 [00:01<00:30, 65.32it/s][A
Iteration:   6%|▌         | 119/2105 [00:01<00:29, 66.36it/s][A
Iteration:   6%|▌         | 127/2105

Iteration:  88%|████████▊ | 1849/2105 [00:27<00:03, 65.15it/s][A
Iteration:  88%|████████▊ | 1856/2105 [00:28<00:03, 64.17it/s][A
Iteration:  89%|████████▊ | 1863/2105 [00:28<00:03, 64.93it/s][A
Iteration:  89%|████████▉ | 1870/2105 [00:28<00:03, 62.95it/s][A
Iteration:  89%|████████▉ | 1877/2105 [00:28<00:03, 64.01it/s][A
Iteration:  90%|████████▉ | 1884/2105 [00:28<00:03, 65.30it/s][A
Iteration:  90%|████████▉ | 1891/2105 [00:28<00:03, 62.89it/s][A
Iteration:  90%|█████████ | 1898/2105 [00:28<00:03, 63.69it/s][A
Iteration:  90%|█████████ | 1905/2105 [00:28<00:03, 64.50it/s][A
Iteration:  91%|█████████ | 1912/2105 [00:28<00:02, 64.56it/s][A
Iteration:  91%|█████████ | 1919/2105 [00:29<00:02, 65.43it/s][A
Iteration:  91%|█████████▏| 1926/2105 [00:29<00:02, 64.02it/s][A
Iteration:  92%|█████████▏| 1933/2105 [00:29<00:02, 63.39it/s][A
Iteration:  92%|█████████▏| 1940/2105 [00:29<00:02, 64.19it/s][A
Iteration:  92%|█████████▏| 1947/2105 [00:29<00:02, 64.77it/s][A
Iteration:

Epoch: 4



Iteration:   1%|          | 13/2105 [00:00<00:33, 62.84it/s][A
Iteration:   1%|          | 21/2105 [00:00<00:31, 66.05it/s][A
Iteration:   1%|▏         | 28/2105 [00:00<00:32, 64.89it/s][A
Iteration:   2%|▏         | 36/2105 [00:00<00:30, 66.84it/s][A
Iteration:   2%|▏         | 43/2105 [00:00<00:31, 66.40it/s][A
Iteration:   2%|▏         | 51/2105 [00:00<00:29, 68.69it/s][A
Iteration:   3%|▎         | 58/2105 [00:00<00:30, 66.46it/s][A
Iteration:   3%|▎         | 66/2105 [00:00<00:30, 67.84it/s][A
Iteration:   4%|▎         | 74/2105 [00:01<00:28, 70.21it/s][A
Iteration:   4%|▍         | 81/2105 [00:01<00:30, 67.35it/s][A
Iteration:   4%|▍         | 88/2105 [00:01<00:30, 66.08it/s][A
Iteration:   5%|▍         | 95/2105 [00:01<00:30, 64.89it/s][A
Iteration:   5%|▍         | 102/2105 [00:01<00:30, 65.11it/s][A
Iteration:   5%|▌         | 109/2105 [00:01<00:30, 65.22it/s][A
Iteration:   6%|▌         | 116/2105 [00:01<00:30, 65.51it/s][A
Iteration:   6%|▌         | 123/2105

Iteration:  88%|████████▊ | 1857/2105 [00:27<00:04, 60.49it/s][A
Iteration:  89%|████████▊ | 1864/2105 [00:28<00:03, 62.32it/s][A
Iteration:  89%|████████▉ | 1871/2105 [00:28<00:03, 61.42it/s][A
Iteration:  89%|████████▉ | 1879/2105 [00:28<00:03, 63.65it/s][A
Iteration:  90%|████████▉ | 1886/2105 [00:28<00:03, 63.22it/s][A
Iteration:  90%|████████▉ | 1893/2105 [00:28<00:03, 62.18it/s][A
Iteration:  90%|█████████ | 1900/2105 [00:28<00:03, 63.42it/s][A
Iteration:  91%|█████████ | 1907/2105 [00:28<00:03, 63.26it/s][A
Iteration:  91%|█████████ | 1914/2105 [00:28<00:03, 62.42it/s][A
Iteration:  91%|█████████▏| 1921/2105 [00:28<00:02, 61.60it/s][A
Iteration:  92%|█████████▏| 1928/2105 [00:29<00:02, 63.64it/s][A
Iteration:  92%|█████████▏| 1935/2105 [00:29<00:02, 62.57it/s][A
Iteration:  92%|█████████▏| 1943/2105 [00:29<00:02, 65.42it/s][A
Iteration:  93%|█████████▎| 1950/2105 [00:29<00:02, 66.19it/s][A
Iteration:  93%|█████████▎| 1957/2105 [00:29<00:02, 65.12it/s][A
Iteration:

Epoch: 5



Iteration:   1%|          | 13/2105 [00:00<00:31, 66.17it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:29, 69.54it/s][A
Iteration:   1%|▏         | 28/2105 [00:00<00:31, 65.48it/s][A
Iteration:   2%|▏         | 35/2105 [00:00<00:31, 65.56it/s][A
Iteration:   2%|▏         | 42/2105 [00:00<00:31, 64.50it/s][A
Iteration:   2%|▏         | 49/2105 [00:00<00:31, 64.31it/s][A
Iteration:   3%|▎         | 55/2105 [00:00<00:33, 61.66it/s][A
Iteration:   3%|▎         | 63/2105 [00:00<00:31, 64.71it/s][A
Iteration:   3%|▎         | 70/2105 [00:01<00:31, 63.90it/s][A
Iteration:   4%|▎         | 78/2105 [00:01<00:30, 65.53it/s][A
Iteration:   4%|▍         | 86/2105 [00:01<00:30, 66.66it/s][A
Iteration:   4%|▍         | 94/2105 [00:01<00:29, 67.49it/s][A
Iteration:   5%|▍         | 102/2105 [00:01<00:29, 68.26it/s][A
Iteration:   5%|▌         | 110/2105 [00:01<00:28, 68.88it/s][A
Iteration:   6%|▌         | 117/2105 [00:01<00:30, 66.21it/s][A
Iteration:   6%|▌         | 125/2105

Iteration:  93%|█████████▎| 1960/2105 [00:27<00:02, 66.58it/s][A
Iteration:  93%|█████████▎| 1967/2105 [00:27<00:02, 65.75it/s][A
Iteration:  94%|█████████▍| 1974/2105 [00:27<00:01, 66.47it/s][A
Iteration:  94%|█████████▍| 1982/2105 [00:28<00:01, 69.02it/s][A
Iteration:  94%|█████████▍| 1989/2105 [00:28<00:01, 65.54it/s][A
Iteration:  95%|█████████▍| 1996/2105 [00:28<00:01, 66.42it/s][A
Iteration:  95%|█████████▌| 2003/2105 [00:28<00:01, 66.73it/s][A
Iteration:  95%|█████████▌| 2010/2105 [00:28<00:01, 65.47it/s][A
Iteration:  96%|█████████▌| 2017/2105 [00:28<00:01, 66.52it/s][A
Iteration:  96%|█████████▌| 2024/2105 [00:28<00:01, 66.24it/s][A
Iteration:  96%|█████████▋| 2031/2105 [00:28<00:01, 64.23it/s][A
Iteration:  97%|█████████▋| 2039/2105 [00:28<00:00, 66.33it/s][A
Iteration:  97%|█████████▋| 2046/2105 [00:29<00:00, 66.85it/s][A
Iteration:  98%|█████████▊| 2054/2105 [00:29<00:00, 68.84it/s][A
Iteration:  98%|█████████▊| 2061/2105 [00:29<00:00, 65.28it/s][A
Iteration:

Epoch: 6



Iteration:   1%|          | 15/2105 [00:00<00:28, 72.42it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:29, 69.73it/s][A
Iteration:   1%|▏         | 28/2105 [00:00<00:31, 66.23it/s][A
Iteration:   2%|▏         | 35/2105 [00:00<00:31, 66.12it/s][A
Iteration:   2%|▏         | 42/2105 [00:00<00:31, 66.43it/s][A
Iteration:   2%|▏         | 49/2105 [00:00<00:30, 66.49it/s][A
Iteration:   3%|▎         | 57/2105 [00:00<00:29, 68.90it/s][A
Iteration:   3%|▎         | 64/2105 [00:00<00:29, 68.45it/s][A
Iteration:   3%|▎         | 71/2105 [00:01<00:30, 67.51it/s][A
Iteration:   4%|▎         | 78/2105 [00:01<00:29, 67.61it/s][A
Iteration:   4%|▍         | 85/2105 [00:01<00:30, 66.38it/s][A
Iteration:   4%|▍         | 93/2105 [00:01<00:29, 68.75it/s][A
Iteration:   5%|▍         | 101/2105 [00:01<00:28, 69.88it/s][A
Iteration:   5%|▌         | 108/2105 [00:01<00:29, 68.66it/s][A
Iteration:   5%|▌         | 115/2105 [00:01<00:30, 64.67it/s][A
Iteration:   6%|▌         | 122/2105

Iteration:  87%|████████▋ | 1841/2105 [00:27<00:04, 63.76it/s][A
Iteration:  88%|████████▊ | 1848/2105 [00:27<00:04, 63.93it/s][A
Iteration:  88%|████████▊ | 1855/2105 [00:28<00:03, 62.95it/s][A
Iteration:  89%|████████▊ | 1863/2105 [00:28<00:03, 65.24it/s][A
Iteration:  89%|████████▉ | 1870/2105 [00:28<00:03, 64.93it/s][A
Iteration:  89%|████████▉ | 1877/2105 [00:28<00:03, 66.10it/s][A
Iteration:  90%|████████▉ | 1884/2105 [00:28<00:03, 65.77it/s][A
Iteration:  90%|████████▉ | 1891/2105 [00:28<00:03, 64.11it/s][A
Iteration:  90%|█████████ | 1898/2105 [00:28<00:03, 63.26it/s][A
Iteration:  90%|█████████ | 1905/2105 [00:28<00:03, 64.77it/s][A
Iteration:  91%|█████████ | 1913/2105 [00:28<00:02, 66.34it/s][A
Iteration:  91%|█████████ | 1920/2105 [00:29<00:02, 65.32it/s][A
Iteration:  92%|█████████▏| 1927/2105 [00:29<00:02, 66.51it/s][A
Iteration:  92%|█████████▏| 1934/2105 [00:29<00:02, 65.88it/s][A
Iteration:  92%|█████████▏| 1941/2105 [00:29<00:02, 65.61it/s][A
Iteration:

Epoch: 7



Iteration:   1%|          | 14/2105 [00:00<00:32, 64.12it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:31, 66.07it/s][A
Iteration:   1%|▏         | 30/2105 [00:00<00:30, 67.95it/s][A
Iteration:   2%|▏         | 37/2105 [00:00<00:30, 67.99it/s][A
Iteration:   2%|▏         | 45/2105 [00:00<00:30, 68.48it/s][A
Iteration:   3%|▎         | 53/2105 [00:00<00:29, 70.00it/s][A
Iteration:   3%|▎         | 61/2105 [00:00<00:29, 70.20it/s][A
Iteration:   3%|▎         | 68/2105 [00:00<00:29, 69.64it/s][A
Iteration:   4%|▎         | 75/2105 [00:01<00:30, 67.25it/s][A
Iteration:   4%|▍         | 83/2105 [00:01<00:29, 69.01it/s][A
Iteration:   4%|▍         | 90/2105 [00:01<00:30, 65.85it/s][A
Iteration:   5%|▍         | 98/2105 [00:01<00:29, 68.81it/s][A
Iteration:   5%|▍         | 105/2105 [00:01<00:29, 68.08it/s][A
Iteration:   5%|▌         | 112/2105 [00:01<00:31, 64.27it/s][A
Iteration:   6%|▌         | 119/2105 [00:01<00:30, 64.17it/s][A
Iteration:   6%|▌         | 126/2105

Iteration:  88%|████████▊ | 1858/2105 [00:27<00:03, 68.01it/s][A
Iteration:  89%|████████▊ | 1865/2105 [00:27<00:03, 66.00it/s][A
Iteration:  89%|████████▉ | 1873/2105 [00:28<00:03, 68.69it/s][A
Iteration:  89%|████████▉ | 1881/2105 [00:28<00:03, 70.57it/s][A
Iteration:  90%|████████▉ | 1889/2105 [00:28<00:03, 69.91it/s][A
Iteration:  90%|█████████ | 1897/2105 [00:28<00:03, 66.83it/s][A
Iteration:  90%|█████████ | 1905/2105 [00:28<00:02, 67.76it/s][A
Iteration:  91%|█████████ | 1912/2105 [00:28<00:02, 67.52it/s][A
Iteration:  91%|█████████ | 1920/2105 [00:28<00:02, 70.71it/s][A
Iteration:  92%|█████████▏| 1928/2105 [00:28<00:02, 68.77it/s][A
Iteration:  92%|█████████▏| 1936/2105 [00:28<00:02, 69.83it/s][A
Iteration:  92%|█████████▏| 1944/2105 [00:29<00:02, 67.00it/s][A
Iteration:  93%|█████████▎| 1951/2105 [00:29<00:02, 67.20it/s][A
Iteration:  93%|█████████▎| 1958/2105 [00:29<00:02, 65.04it/s][A
Iteration:  93%|█████████▎| 1965/2105 [00:29<00:02, 66.43it/s][A
Iteration:

Epoch: 8



Iteration:   1%|          | 15/2105 [00:00<00:30, 69.44it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:30, 68.43it/s][A
Iteration:   1%|▏         | 29/2105 [00:00<00:30, 68.89it/s][A
Iteration:   2%|▏         | 36/2105 [00:00<00:30, 68.38it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:29, 69.27it/s][A
Iteration:   2%|▏         | 52/2105 [00:00<00:29, 69.26it/s][A
Iteration:   3%|▎         | 59/2105 [00:00<00:30, 66.19it/s][A
Iteration:   3%|▎         | 66/2105 [00:00<00:30, 66.48it/s][A
Iteration:   4%|▎         | 74/2105 [00:01<00:29, 68.66it/s][A
Iteration:   4%|▍         | 81/2105 [00:01<00:29, 67.73it/s][A
Iteration:   4%|▍         | 88/2105 [00:01<00:30, 66.55it/s][A
Iteration:   5%|▍         | 96/2105 [00:01<00:29, 67.63it/s][A
Iteration:   5%|▍         | 103/2105 [00:01<00:29, 67.02it/s][A
Iteration:   5%|▌         | 110/2105 [00:01<00:29, 67.22it/s][A
Iteration:   6%|▌         | 117/2105 [00:01<00:30, 65.31it/s][A
Iteration:   6%|▌         | 124/2105

Iteration:  88%|████████▊ | 1851/2105 [00:27<00:03, 68.78it/s][A
Iteration:  88%|████████▊ | 1859/2105 [00:27<00:03, 69.69it/s][A
Iteration:  89%|████████▊ | 1866/2105 [00:27<00:03, 66.90it/s][A
Iteration:  89%|████████▉ | 1873/2105 [00:28<00:03, 67.07it/s][A
Iteration:  89%|████████▉ | 1880/2105 [00:28<00:03, 67.08it/s][A
Iteration:  90%|████████▉ | 1887/2105 [00:28<00:03, 63.75it/s][A
Iteration:  90%|████████▉ | 1894/2105 [00:28<00:03, 63.83it/s][A
Iteration:  90%|█████████ | 1901/2105 [00:28<00:03, 62.21it/s][A
Iteration:  91%|█████████ | 1909/2105 [00:28<00:03, 64.47it/s][A
Iteration:  91%|█████████ | 1916/2105 [00:28<00:03, 62.79it/s][A
Iteration:  91%|█████████▏| 1923/2105 [00:28<00:02, 61.44it/s][A
Iteration:  92%|█████████▏| 1931/2105 [00:28<00:02, 63.87it/s][A
Iteration:  92%|█████████▏| 1938/2105 [00:29<00:02, 63.55it/s][A
Iteration:  92%|█████████▏| 1945/2105 [00:29<00:02, 63.64it/s][A
Iteration:  93%|█████████▎| 1953/2105 [00:29<00:02, 65.84it/s][A
Iteration:

Epoch: 9



Iteration:   1%|          | 14/2105 [00:00<00:32, 63.42it/s][A
Iteration:   1%|          | 21/2105 [00:00<00:33, 63.10it/s][A
Iteration:   1%|▏         | 29/2105 [00:00<00:31, 65.36it/s][A
Iteration:   2%|▏         | 36/2105 [00:00<00:32, 64.32it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:30, 66.75it/s][A
Iteration:   2%|▏         | 50/2105 [00:00<00:32, 63.55it/s][A
Iteration:   3%|▎         | 57/2105 [00:00<00:32, 63.79it/s][A
Iteration:   3%|▎         | 64/2105 [00:01<00:32, 63.71it/s][A
Iteration:   3%|▎         | 71/2105 [00:01<00:32, 62.53it/s][A
Iteration:   4%|▎         | 78/2105 [00:01<00:31, 63.52it/s][A
Iteration:   4%|▍         | 85/2105 [00:01<00:31, 64.62it/s][A
Iteration:   4%|▍         | 93/2105 [00:01<00:29, 68.23it/s][A
Iteration:   5%|▍         | 101/2105 [00:01<00:28, 69.43it/s][A
Iteration:   5%|▌         | 108/2105 [00:01<00:28, 68.92it/s][A
Iteration:   5%|▌         | 115/2105 [00:01<00:30, 66.16it/s][A
Iteration:   6%|▌         | 122/2105

Iteration:  91%|█████████▏| 1922/2105 [00:27<00:02, 69.94it/s][A
Iteration:  92%|█████████▏| 1930/2105 [00:28<00:02, 70.04it/s][A
Iteration:  92%|█████████▏| 1938/2105 [00:28<00:02, 70.11it/s][A
Iteration:  92%|█████████▏| 1946/2105 [00:28<00:02, 67.38it/s][A
Iteration:  93%|█████████▎| 1953/2105 [00:28<00:02, 65.43it/s][A
Iteration:  93%|█████████▎| 1960/2105 [00:28<00:02, 66.55it/s][A
Iteration:  93%|█████████▎| 1967/2105 [00:28<00:02, 64.39it/s][A
Iteration:  94%|█████████▍| 1975/2105 [00:28<00:01, 67.12it/s][A
Iteration:  94%|█████████▍| 1983/2105 [00:28<00:01, 68.96it/s][A
Iteration:  95%|█████████▍| 1990/2105 [00:28<00:01, 65.98it/s][A
Iteration:  95%|█████████▍| 1997/2105 [00:29<00:01, 66.69it/s][A
Iteration:  95%|█████████▌| 2005/2105 [00:29<00:01, 69.17it/s][A
Iteration:  96%|█████████▌| 2012/2105 [00:29<00:01, 68.21it/s][A
Iteration:  96%|█████████▌| 2019/2105 [00:29<00:01, 66.60it/s][A
Iteration:  96%|█████████▌| 2026/2105 [00:29<00:01, 67.23it/s][A
Iteration:

Epoch: 10



Iteration:   1%|          | 15/2105 [00:00<00:30, 68.85it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:30, 68.83it/s][A
Iteration:   1%|▏         | 30/2105 [00:00<00:30, 69.08it/s][A
Iteration:   2%|▏         | 38/2105 [00:00<00:28, 71.37it/s][A
Iteration:   2%|▏         | 45/2105 [00:00<00:29, 70.21it/s][A
Iteration:   2%|▏         | 52/2105 [00:00<00:29, 69.83it/s][A
Iteration:   3%|▎         | 60/2105 [00:00<00:28, 70.62it/s][A
Iteration:   3%|▎         | 68/2105 [00:00<00:28, 71.13it/s][A
Iteration:   4%|▎         | 76/2105 [00:01<00:28, 71.23it/s][A
Iteration:   4%|▍         | 83/2105 [00:01<00:28, 70.70it/s][A
Iteration:   4%|▍         | 91/2105 [00:01<00:28, 70.39it/s][A
Iteration:   5%|▍         | 99/2105 [00:01<00:28, 71.64it/s][A
Iteration:   5%|▌         | 107/2105 [00:01<00:27, 71.83it/s][A
Iteration:   5%|▌         | 115/2105 [00:01<00:27, 72.36it/s][A
Iteration:   6%|▌         | 123/2105 [00:01<00:26, 73.43it/s][A
Iteration:   6%|▌         | 131/2105

Iteration:  91%|█████████ | 1920/2105 [00:27<00:02, 74.73it/s][A
Iteration:  92%|█████████▏| 1928/2105 [00:28<00:02, 72.57it/s][A
Iteration:  92%|█████████▏| 1937/2105 [00:28<00:02, 75.06it/s][A
Iteration:  92%|█████████▏| 1945/2105 [00:28<00:02, 74.64it/s][A
Iteration:  93%|█████████▎| 1953/2105 [00:28<00:02, 72.87it/s][A
Iteration:  93%|█████████▎| 1961/2105 [00:28<00:01, 73.69it/s][A
Iteration:  94%|█████████▎| 1969/2105 [00:28<00:01, 75.08it/s][A
Iteration:  94%|█████████▍| 1977/2105 [00:28<00:01, 72.54it/s][A
Iteration:  94%|█████████▍| 1985/2105 [00:28<00:01, 72.40it/s][A
Iteration:  95%|█████████▍| 1994/2105 [00:28<00:01, 74.93it/s][A
Iteration:  95%|█████████▌| 2002/2105 [00:29<00:01, 74.11it/s][A
Iteration:  95%|█████████▌| 2010/2105 [00:29<00:01, 73.04it/s][A
Iteration:  96%|█████████▌| 2018/2105 [00:29<00:01, 72.09it/s][A
Iteration:  96%|█████████▌| 2026/2105 [00:29<00:01, 72.35it/s][A
Iteration:  97%|█████████▋| 2034/2105 [00:29<00:00, 71.89it/s][A
Iteration:

Epoch: 11



Iteration:   1%|          | 17/2105 [00:00<00:27, 77.05it/s][A
Iteration:   1%|          | 25/2105 [00:00<00:27, 76.66it/s][A
Iteration:   2%|▏         | 32/2105 [00:00<00:28, 71.80it/s][A
Iteration:   2%|▏         | 40/2105 [00:00<00:28, 71.35it/s][A
Iteration:   2%|▏         | 48/2105 [00:00<00:28, 71.80it/s][A
Iteration:   3%|▎         | 55/2105 [00:00<00:28, 70.90it/s][A
Iteration:   3%|▎         | 63/2105 [00:00<00:28, 71.18it/s][A
Iteration:   3%|▎         | 71/2105 [00:00<00:28, 71.63it/s][A
Iteration:   4%|▎         | 78/2105 [00:01<00:28, 70.20it/s][A
Iteration:   4%|▍         | 86/2105 [00:01<00:28, 71.63it/s][A
Iteration:   4%|▍         | 94/2105 [00:01<00:28, 71.65it/s][A
Iteration:   5%|▍         | 102/2105 [00:01<00:28, 70.90it/s][A
Iteration:   5%|▌         | 110/2105 [00:01<00:28, 70.01it/s][A
Iteration:   6%|▌         | 117/2105 [00:01<00:28, 69.86it/s][A
Iteration:   6%|▌         | 125/2105 [00:01<00:27, 71.37it/s][A
Iteration:   6%|▋         | 133/210

Iteration:  91%|█████████ | 1911/2105 [00:28<00:02, 65.81it/s][A
Iteration:  91%|█████████ | 1918/2105 [00:28<00:02, 65.26it/s][A
Iteration:  91%|█████████▏| 1925/2105 [00:28<00:02, 64.03it/s][A
Iteration:  92%|█████████▏| 1932/2105 [00:28<00:02, 63.73it/s][A
Iteration:  92%|█████████▏| 1939/2105 [00:28<00:02, 62.90it/s][A
Iteration:  92%|█████████▏| 1947/2105 [00:28<00:02, 64.36it/s][A
Iteration:  93%|█████████▎| 1955/2105 [00:28<00:02, 66.40it/s][A
Iteration:  93%|█████████▎| 1962/2105 [00:28<00:02, 64.69it/s][A
Iteration:  94%|█████████▎| 1969/2105 [00:28<00:02, 65.38it/s][A
Iteration:  94%|█████████▍| 1976/2105 [00:29<00:01, 65.56it/s][A
Iteration:  94%|█████████▍| 1983/2105 [00:29<00:01, 65.05it/s][A
Iteration:  95%|█████████▍| 1990/2105 [00:29<00:01, 64.30it/s][A
Iteration:  95%|█████████▍| 1998/2105 [00:29<00:01, 65.75it/s][A
Iteration:  95%|█████████▌| 2005/2105 [00:29<00:01, 62.90it/s][A
Iteration:  96%|█████████▌| 2012/2105 [00:29<00:01, 64.31it/s][A
Iteration:

Epoch: 12



Iteration:   1%|          | 15/2105 [00:00<00:32, 64.36it/s][A
Iteration:   1%|          | 22/2105 [00:00<00:32, 63.38it/s][A
Iteration:   1%|▏         | 29/2105 [00:00<00:32, 63.24it/s][A
Iteration:   2%|▏         | 37/2105 [00:00<00:31, 65.11it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:31, 64.96it/s][A
Iteration:   2%|▏         | 51/2105 [00:00<00:32, 63.51it/s][A
Iteration:   3%|▎         | 58/2105 [00:00<00:31, 64.20it/s][A
Iteration:   3%|▎         | 66/2105 [00:01<00:30, 65.95it/s][A
Iteration:   4%|▎         | 74/2105 [00:01<00:30, 67.57it/s][A
Iteration:   4%|▍         | 82/2105 [00:01<00:29, 69.60it/s][A
Iteration:   4%|▍         | 90/2105 [00:01<00:28, 71.04it/s][A
Iteration:   5%|▍         | 98/2105 [00:01<00:28, 69.32it/s][A
Iteration:   5%|▍         | 105/2105 [00:01<00:30, 65.63it/s][A
Iteration:   5%|▌         | 113/2105 [00:01<00:29, 68.46it/s][A
Iteration:   6%|▌         | 120/2105 [00:01<00:29, 67.04it/s][A
Iteration:   6%|▌         | 128/2105

Epoch: 13



Iteration:   1%|          | 22/2105 [00:00<00:20, 101.41it/s][A
Iteration:   2%|▏         | 33/2105 [00:00<00:20, 101.48it/s][A
Iteration:   2%|▏         | 44/2105 [00:00<00:20, 102.25it/s][A
Iteration:   3%|▎         | 55/2105 [00:00<00:20, 101.97it/s][A
Iteration:   3%|▎         | 66/2105 [00:00<00:19, 102.39it/s][A
Iteration:   4%|▎         | 76/2105 [00:00<00:20, 101.41it/s][A
Iteration:   4%|▍         | 87/2105 [00:00<00:19, 102.16it/s][A
Iteration:   5%|▍         | 98/2105 [00:00<00:19, 101.94it/s][A
Iteration:   5%|▌         | 109/2105 [00:01<00:19, 102.57it/s][A
Iteration:   6%|▌         | 120/2105 [00:01<00:19, 102.26it/s][A
Iteration:   6%|▌         | 131/2105 [00:01<00:19, 102.66it/s][A
Iteration:   7%|▋         | 142/2105 [00:01<00:19, 102.38it/s][A
Iteration:   7%|▋         | 153/2105 [00:01<00:19, 102.56it/s][A
Iteration:   8%|▊         | 164/2105 [00:01<00:18, 102.73it/s][A
Iteration:   8%|▊         | 175/2105 [00:01<00:18, 102.44it/s][A
Iteration:   9%|▉

Epoch: 14



Iteration:   1%|          | 21/2105 [00:00<00:20, 100.75it/s][A
Iteration:   2%|▏         | 32/2105 [00:00<00:20, 100.83it/s][A
Iteration:   2%|▏         | 43/2105 [00:00<00:20, 101.11it/s][A
Iteration:   3%|▎         | 54/2105 [00:00<00:20, 101.50it/s][A
Iteration:   3%|▎         | 65/2105 [00:00<00:20, 101.54it/s][A
Iteration:   4%|▎         | 76/2105 [00:00<00:19, 101.57it/s][A
Iteration:   4%|▍         | 86/2105 [00:00<00:20, 100.76it/s][A
Iteration:   5%|▍         | 97/2105 [00:00<00:19, 101.39it/s][A
Iteration:   5%|▌         | 108/2105 [00:01<00:19, 101.29it/s][A
Iteration:   6%|▌         | 119/2105 [00:01<00:19, 101.85it/s][A
Iteration:   6%|▌         | 130/2105 [00:01<00:19, 101.63it/s][A
Iteration:   7%|▋         | 141/2105 [00:01<00:19, 100.13it/s][A
Iteration:   7%|▋         | 151/2105 [00:01<00:19, 98.31it/s] [A
Iteration:   8%|▊         | 162/2105 [00:01<00:19, 99.49it/s][A
Iteration:   8%|▊         | 172/2105 [00:01<00:19, 99.55it/s][A
Iteration:   9%|▊  

Epoch: 15



Iteration:   1%|          | 21/2105 [00:00<00:20, 99.95it/s] [A
Iteration:   2%|▏         | 32/2105 [00:00<00:20, 100.17it/s][A
Iteration:   2%|▏         | 43/2105 [00:00<00:20, 100.54it/s][A
Iteration:   3%|▎         | 54/2105 [00:00<00:20, 100.94it/s][A
Iteration:   3%|▎         | 65/2105 [00:00<00:20, 101.04it/s][A
Iteration:   4%|▎         | 76/2105 [00:00<00:20, 101.31it/s][A
Iteration:   4%|▍         | 87/2105 [00:00<00:19, 102.14it/s][A
Iteration:   5%|▍         | 98/2105 [00:00<00:19, 101.62it/s][A
Iteration:   5%|▌         | 108/2105 [00:01<00:19, 100.89it/s][A
Iteration:   6%|▌         | 118/2105 [00:01<00:20, 99.11it/s] [A
Iteration:   6%|▌         | 128/2105 [00:01<00:20, 98.54it/s][A
Iteration:   7%|▋         | 138/2105 [00:01<00:20, 97.46it/s][A
Iteration:   7%|▋         | 148/2105 [00:01<00:20, 96.49it/s][A
Iteration:   8%|▊         | 158/2105 [00:01<00:20, 95.54it/s][A
Iteration:   8%|▊         | 168/2105 [00:01<00:20, 95.99it/s][A
Iteration:   8%|▊     

In [48]:
# Score `model` on the dev-set examples (eval_examples is built with
# processor.get_dev_examples in the setup cell). Per the recorded output, the
# call returns a dict with 'eval_loss' and 'eval_accuracy'.
# NOTE(review): `model` is defined outside this excerpt — presumably the
# distilled student network; confirm.
# NOTE(review): the logged eval_loss is negative, which a plain cross-entropy
# cannot produce — verify which loss evaluate() reports for this model.
evaluate(model, eval_examples, label_list, params, tokenizer)

03/21/2019 17:45:26 - INFO - lib.train_eval -   ***** Running evaluation *****
03/21/2019 17:45:26 - INFO - lib.train_eval -     Num examples = 872
03/21/2019 17:45:26 - INFO - lib.train_eval -     Batch size = 8

Evaluating:   0%|          | 0/109 [00:00<?, ?it/s][A
Evaluating:  29%|██▉       | 32/109 [00:00<00:00, 313.20it/s][A
Evaluating:  61%|██████    | 66/109 [00:00<00:00, 320.27it/s][A
Evaluating:  93%|█████████▎| 101/109 [00:00<00:00, 326.88it/s][A
Evaluating: 100%|██████████| 109/109 [00:00<00:00, 331.13it/s][A03/21/2019 17:45:26 - INFO - lib.train_eval -   ***** Eval results *****
03/21/2019 17:45:26 - INFO - lib.train_eval -     eval_accuracy = 0.7964036697247706
03/21/2019 17:45:26 - INFO - lib.train_eval -     eval_loss = -0.023476406231249143


{'eval_loss': -0.023476406231249143, 'eval_accuracy': 0.7964036697247706}

In [53]:
# Parameter count of the reference (teacher) network used as the numerator.
# NOTE(review): hard-coded. The value equals 109,482,240 + 768 * 2 + 2, which is
# presumably the bert-base-uncased encoder plus a two-label classification head;
# confirm it still matches the teacher if the teacher or label count changes.
TEACHER_PARAM_COUNT = 109483778

# Total number of scalar weights in `model`. Tensor.numel() is the product of
# the tensor's dimensions (1 for a 0-dim tensor), i.e. exactly what multiplying
# the entries of .size() by hand yields.
all_count = sum(weights.numel() for weights in model.parameters())

# Size ratio: how many times fewer parameters `model` has than the reference.
print('compresing in {} times'.format(TEACHER_PARAM_COUNT / all_count))

compresing in 23.24299942382788 times
