# **BERT Relation Extraction Notebook**


## Imports and environment configuration

In [None]:
!pip install seqeval
!pip install boto3
!pip install transformers==3.0.0
!pip install ipython-autotime

%load_ext autotime

In [None]:
import os
import sys
import math
import time
import json
import random
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score, precision_score, recall_score

# Seed every RNG the notebook relies on. Seeding `random` alone leaves NumPy and
# PyTorch (RandomSampler shuffling, dropout, freshly initialised layers) unseeded,
# so runs would not be reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the project root depending on where the notebook is executed.
if 'google.colab' in str(get_ipython()):
  print('Running on Google Colab')
  # NOTE(review): assumes Google Drive is already mounted at /content/drive - confirm.
  root = '/content/drive/My Drive/Colab Notebooks/'
else:
  print('Running locally')
  root = Path(os.getcwd()).parent

# Make the project's MTB helper modules (modeling_bert, tokenization_bert, ...) importable.
basepath = os.path.join(root, 'relation-extraction/')
sys.path.append(os.path.join(basepath, 'MTB/code'))

from modeling_bert import BertModel as Model
from tokenization_bert import BertTokenizer as Tokenizer

Switch for data selection: if `True`, the FewRel data is used; if `False`, the Future Engineering data is used.

In [None]:
# Dataset switch read by the fine-tuning parameter cell:
# True -> FewRel data files, False -> Future Engineering ("fe") data files.
use_fewrel_data=False

## Matching the Blanks Pre-Training

The pre-training process of Matching the Blanks can run for multiple days, even with GPU support. Therefore, an already pre-trained model is provided in the GitLab repository. For additional information, see the README.

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz

import en_core_web_lg

### Pre-Training Helper functions

In [None]:
from pretrain_helper_functions import Two_Headed_Loss, pretrain_dataset, Mtb_Pad_Sequence
from pretrain_helper_functions import load_state, get_subject_objects, create_pretraining_corpus, process_textlines, mtb_evaluate_

In [None]:
def mtb_load_dataloaders(pretrain_data, batch_size, max_length=50000, window_size=40):
    """Build the Matching-the-Blanks pre-training dataset from a raw text file.

    The file is read line by line, passed through `process_textlines`, split into
    slices of at most `max_length` items, and each slice is turned into relation
    statements by `create_pretraining_corpus` using the spaCy `en_core_web_lg`
    pipeline. The collected statements are wrapped in `pretrain_dataset`.

    Args:
        pretrain_data: Path of the UTF-8 text file holding the pre-training corpus.
        batch_size: Batch size forwarded to `pretrain_dataset`.
        max_length: Maximum slice length handed to spaCy in one call.
        window_size: Window size forwarded to `create_pretraining_corpus`
            (previously hard-coded to 40).

    Returns:
        The object returned by `pretrain_dataset`; the training cell iterates over it.

    Note:
        Relies on the notebook-level `tokenizer` being defined before this
        function is called.
    """
    print("Loading pre-training data...")
    with open(pretrain_data, "r", encoding="utf8") as f:
        text = f.readlines()

    text = process_textlines(text)

    print("Length of text (characters): %d" % len(text))
    num_chunks = math.ceil(len(text)/max_length)
    print("Splitting into %d max length chunks of size %d" % (num_chunks, max_length))
    # Lazily slice the text so only one chunk is materialised at a time.
    text_chunks = (text[i*max_length:(i*max_length + max_length)] for i in range(num_chunks))

    D = []
    print("Loading Spacy NLP...")
    nlp = en_core_web_lg.load()

    for text_chunk in text_chunks:
        D.extend(create_pretraining_corpus(text_chunk, nlp, window_size=window_size))

    print("Total number of relation statements in pre-training corpus: %d" % len(D))

    train_set = pretrain_dataset(D, tokenizer, batch_size=batch_size)
    return train_set

### Pre-Training with Matching the Blanks

Definition of parameters for pre-training with Matching the Blanks

In [None]:
# Upper bound of the epoch range used by the pre-training loop.
num_epochs=18
# 1 -> freeze all parameters except the layers listed in the model-setup cell; any other value -> train everything.
freeze=0
# Learning rate handed to Adam.
lr=0.0001
# Maximum gradient norm used for gradient clipping.
max_norm=1.0
# The loss is divided by this value and the optimizer steps on this batch interval.
gradient_acc_steps=2
# Batch size forwarded to the pre-training dataset.
batch_size=32
# Raw text corpus used for Matching-the-Blanks pre-training.
pretrain_data=os.path.join(root, 'fewrel-training-data/MTB/cnn.txt')
# Checkpoint file passed to load_state to resume pre-training.
checkpoint_path = os.path.join(basepath, 'MTB/pretrain_checkpoints/pretrain_checkpoint_BERT_1.pth.tar')

Load the model and tokenizer, and initialize the optimizer and scheduler for the training routine.

In [None]:
model_name = 'bert-base-uncased'
lower_case=True

# Defined here because this cell is the first to read `cuda`; previously it was
# only assigned in a later cell, so a fresh "Restart & Run All" raised NameError.
cuda = torch.cuda.is_available()

tokenizer = Tokenizer.from_pretrained(model_name, do_lower_case=lower_case)
# Entity markers and the blank token used by Matching the Blanks.
tokenizer.add_tokens(['[E1]', '[/E1]', '[E2]', '[/E2]', '[BLANK]'])

mtb_model = Model.from_pretrained(model_name, force_download=False)
# The embedding matrix must grow to cover the tokens added above.
mtb_model.resize_token_embeddings(len(tokenizer))

if cuda:
    print("Cuda is on")
    mtb_model.cuda()

if freeze == 1:
    print("FREEZING MOST HIDDEN LAYERS...")
    # Parameters whose name contains one of these substrings stay trainable.
    unfrozen_layers = ["classifier", "pooler", "encoder.layer.11", "encoder.layer.10",
                       "encoder.layer.9", "blanks_linear", "lm_linear", "cls"]

    for name, param in mtb_model.named_parameters():
        if not any(layer in name for layer in unfrozen_layers):
            print("[FROZE]: %s" % name)
            param.requires_grad = False
        else:
            print("[FREE]: %s" % name)
            param.requires_grad = True

criterion = Two_Headed_Loss(lm_ignore_idx=tokenizer.pad_token_id, use_logits=True, normalize=False)
optimizer = optim.Adam([{"params":mtb_model.parameters(), "lr": lr}])

# The learning rate is multiplied by 0.8 each time the epoch counter reaches a milestone.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2,4,6,8,12,15,18,20,22,24,26,30], gamma=0.8)

Loading pre-training data from inputfile.

In [None]:
# Flag read by the pre-training cells to decide whether tensors/model are moved to the GPU.
cuda = torch.cuda.is_available()

# Build the pre-training dataset from the raw corpus file; `train_len` is only used for progress output.
train_loader = mtb_load_dataloaders(pretrain_data, batch_size)
train_len = len(train_loader)
print("Loaded %d pre-training samples." % train_len)

Loading pre-training data...
Length of text (characters): 1041308
Splitting into 21 max length chunks of size 50000
Loading Spacy NLP...
Total number of relation statements in pre-training corpus: 14835
Loaded 14835 pre-training samples.


Load checkpoint if available to continue training from this point.

In [None]:
# `start_epoch` is the first epoch index used by the pre-training loop below.
# NOTE(review): load_state lives in pretrain_helper_functions; presumably it restores
# model/optimizer/scheduler state from `checkpoint_path` when the file exists - confirm.
start_epoch, best_pred = load_state(mtb_model, optimizer, scheduler, checkpoint_path)

Loaded checkpoint model.
Loaded model and optimizer.


Pre-training with the Matching the Blanks method. Caution: this training runs for at least 10 to 12 hours on limited hardware.

In [None]:
# Matching-the-Blanks pre-training loop.
# `start_epoch` comes from the checkpoint-loading cell above.
losses_per_epoch=[]
accuracy_per_epoch=[]

print("Starting training process...")
pad_id = tokenizer.pad_token_id
mask_id = tokenizer.mask_token_id
# Progress is reported roughly ten times per epoch. Clamp to 1 so loaders with
# fewer than ten batches do not cause a modulo-by-zero below.
update_size = max(1, len(train_loader)//10)


def _apply_accumulated_gradients():
    """Clip the accumulated gradients, take one optimizer step and reset the gradients."""
    nn.utils.clip_grad_norm_(mtb_model.parameters(), max_norm)
    optimizer.step()
    optimizer.zero_grad()


for epoch in range(start_epoch, num_epochs):
    start_time = time.time()
    mtb_model.train()
    total_loss = 0.0; total_acc = 0.0
    losses_per_batch = []; lm_accuracy_per_batch = []
    # Number of backward passes since the last optimizer step. Counting processed
    # batches (instead of testing the loader index) keeps accumulation groups at
    # exactly `gradient_acc_steps` batches even when batches are skipped, and
    # avoids stepping after the very first batch.
    pending_batches = 0
    for i, data in enumerate(train_loader, 0):
        x, masked_for_pred, e1_e2_start, _, blank_labels, _,_,_,_,_ = data
        # Keep only the real (non-padding) target token ids for the LM head.
        masked_for_pred = masked_for_pred[(masked_for_pred != pad_id)]
        if masked_for_pred.shape[0] == 0:
            print('Empty dataset, skipping...')
            continue
        attention_mask = (x != pad_id).float()
        token_type_ids = torch.zeros((x.shape[0], x.shape[1])).long()

        if cuda:
            x = x.cuda(); masked_for_pred = masked_for_pred.cuda()
            attention_mask = attention_mask.cuda()
            token_type_ids = token_type_ids.cuda()

        blanks_logits, lm_logits, _ = mtb_model(x, token_type_ids=token_type_ids, attention_mask=attention_mask, Q=None,
                                               e1_e2_start=e1_e2_start)
        # Only positions holding the mask token contribute to the LM loss.
        lm_logits = lm_logits[(x == mask_id)]

        is_report_step = (i % update_size) == (update_size - 1)

        loss = criterion(lm_logits, blanks_logits, masked_for_pred, blank_labels, verbose=is_report_step)
        # Scale so the accumulated gradient matches the mean over the group.
        loss = loss/gradient_acc_steps

        loss.backward()
        pending_batches += 1

        if pending_batches >= gradient_acc_steps:
            # Clip once on the fully accumulated gradient, right before stepping.
            _apply_accumulated_gradients()
            pending_batches = 0

        total_loss += loss.item()
        total_acc += mtb_evaluate_(lm_logits, blanks_logits, masked_for_pred, blank_labels,
                                   tokenizer, print_=False)[0]

        if is_report_step:
            losses_per_batch.append(gradient_acc_steps*total_loss/update_size)
            lm_accuracy_per_batch.append(total_acc/update_size)
            print('[Epoch: %d, %5d/ %d points] total loss, lm accuracy per batch: %.3f, %.3f' %
                  (epoch + 1, (i + 1), train_len, losses_per_batch[-1], lm_accuracy_per_batch[-1]))
            total_loss = 0.0; total_acc = 0.0
            print("Last batch samples (pos, neg): %d, %d" % ((blank_labels.squeeze() == 1).sum().item(),
                                                             (blank_labels.squeeze() == 0).sum().item()))

    # Flush gradients of an incomplete accumulation group so they are neither
    # lost nor carried over into the next epoch.
    if pending_batches > 0:
        _apply_accumulated_gradients()

    scheduler.step()
    # max(..., 1) avoids a ZeroDivisionError if no reporting window completed.
    losses_per_epoch.append(sum(losses_per_batch)/max(len(losses_per_batch), 1))
    accuracy_per_epoch.append(sum(lm_accuracy_per_batch)/max(len(lm_accuracy_per_batch), 1))
    print("Epoch finished, took %.2f seconds." % (time.time() - start_time))
    print("Losses at Epoch %d: %.7f" % (epoch + 1, losses_per_epoch[-1]))
    print("Accuracy at Epoch %d: %.7f" % (epoch + 1, accuracy_per_epoch[-1]))

    # Write to the same file the checkpoint-loading cell reads, so training can resume.
    torch.save({
            'epoch': epoch + 1,
            'state_dict': mtb_model.state_dict(),
            'best_acc': accuracy_per_epoch[-1],
            'optimizer' : optimizer.state_dict(),
            'scheduler' : scheduler.state_dict(),
            'amp': None
        }, checkpoint_path)

print("Finished Training!")

## Fine Tuning of classification approach with pre-trained MTB-model

### Helper functions and commonly needed elements

In [None]:
from fine_tuning_helper_functions import evaluate_

Definition of parameters for fine-tuning

In [None]:
# Pre-trained BERT checkpoint name and casing flag.
model_name = 'bert-base-uncased'
lower_case=True
# Number of relation classes; also used in data and output file names.
num_classes = 7
# 1 -> initialise the classifier from the Matching-the-Blanks checkpoint, otherwise plain BERT.
use_pretrained_blanks = 1
# Fixed sequence length every example is truncated / padded to.
max_seq_length = 100

# The loss is divided by this value and the optimizer steps on this batch interval.
gradient_acc_steps = 1

num_epochs = 7
lr = 0.00005
batch_size = 32

# NOTE(review): not referenced by the cells visible here (the dev loader uses batch_size) - confirm intended use.
eval_batch_size = 8

# Matching-the-Blanks checkpoint whose weights are loaded when use_pretrained_blanks == 1.
checkpoint_path = os.path.join(basepath, 'MTB/pretrain_checkpoints/pretrain_checkpoint_BERT_1.pth.tar')

# Select data directory and file names according to the dataset switch.
if (use_fewrel_data):
    data_dir = os.path.join(root, 'fewrel-training-data/fewrel/')
    test_data_file = "test_%d_classes_disjoint.json" % num_classes
    # NOTE(review): the training file is the "dev_..." split - confirm this is intentional.
    train_data_file = "dev_%d_classes_disjoint.json" % num_classes

    # Short tag embedded in the names of saved label maps and checkpoints.
    data_type = 'fewrel'
else:
    data_dir = os.path.join(root, 'fe-training-data')
    test_data_file = 'test_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'
    train_data_file = 'train_examples_nota_manufact_operate_operatesth_order_uses_ordersth.json'

    data_type = 'fe'

Initialization of model and tokenizer

In [None]:
from transformers import BertForSequenceClassification, BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = 'bert-base-uncased'

# Use the configured casing flag. The uncased checkpoint only knows lower-case
# word pieces, so tokenizing with do_lower_case=False maps capitalised words to
# unknown/fragmented pieces, and it was inconsistent with the pre-training
# tokenizer, which is built with do_lower_case=lower_case.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=lower_case)
# Entity markers must be added exactly as in pre-training so the ids line up.
tokenizer.add_tokens(['[E1]', '[/E1]', '[E2]', '[/E2]', '[BLANK]'])

model = BertForSequenceClassification.from_pretrained(model_name, num_labels = num_classes)
# The embedding matrix must grow to cover the tokens added above.
model.resize_token_embeddings(len(tokenizer))

# Single source of truth for device placement (GPU when available, CPU otherwise).
model.to(device)

FewrelProcessor class manages loading of training and test data

In [None]:
class FewrelProcessor():
    """Loads relation-extraction examples from FewRel-style JSON files.

    Each JSON file holds a list of records with the keys ``text``, ``ents`` and
    ``label``. Every record becomes a tuple ``(guid, (text, ents), None, label)``.
    """

    def get_train_examples(self, data_dir,file_name):
        """Return ``(examples, labels)`` for the training file."""
        return self._load_split(data_dir, file_name, "train")

    def get_dev_examples(self, data_dir,file_name):
        """Return ``(examples, labels)`` for the dev file."""
        return self._load_split(data_dir, file_name, "dev")

    def get_test_examples(self, data_dir, file_name):
        """Return ``(examples, labels)`` for the test file."""
        return self._load_split(data_dir, file_name, "test")

    def _load_split(self, data_dir, file_name, set_type):
        """Read one file and return its examples plus the list of distinct labels.

        The label list is de-duplicated through a set, so its order is not
        guaranteed; consumers that need a stable order must sort it.
        """
        examples = self._create_examples(
            self._read_json(os.path.join(data_dir, file_name)), set_type)
        labels = {example[3] for example in examples}
        return examples, list(labels)

    def _create_examples(self, lines, set_type):
        """Convert raw JSON records into example tuples.

        The guid is ``"<set_type>-<index>"``. The records' ``ents`` lists are
        modified in place: an entity whose second field equals 1 gets it reset to 0.
        """
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            for x in line['ents']:
                if x[1] == 1:
                    x[1] = 0
            text_a = (line['text'], line['ents'])
            label = line['label']
            examples.append((guid, text_a, None, label))
        return examples

    @classmethod
    def _read_json(cls, input_file):
        """Parse the UTF-8 encoded JSON file at ``input_file``."""
        with open(input_file, "r", encoding='utf-8') as f:
            return json.load(f)

Helper function to convert the loaded training data to useable training instances

In [None]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Turn example tuples into fixed-length BERT input features.

    For every example the head entity is wrapped in ``[E1] ... [/E1]`` and the
    tail entity in ``[E2] ... [/E2]``; the marked text is tokenized, truncated to
    leave room for ``[CLS]``/``[SEP]``, converted to ids and zero-padded to
    ``max_seq_length``.

    Args:
        examples: Iterable of ``(guid, (text, [head, tail]), _, label)`` tuples,
            where ``head[1]:head[2]`` and ``tail[1]:tail[2]`` are character
            offsets into ``text``.
        label_list: All labels; sorted internally to get a stable label -> id map.
        max_seq_length: Length of every returned id / mask list.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        ``(features, label_map)`` where each feature is the tuple
        ``(input_ids, label_id, attention_mask, token_type_ids)``.

    Raises:
        KeyError: If an example's label is not contained in ``label_list``.

    The input examples are not modified, so the function can safely be called
    again on the same list (the previous version shifted the stored entity
    offsets in place, which misplaced the markers on a second call).
    """
    label_map = {label: i for i, label in enumerate(sorted(label_list))}

    features = []
    for example in examples:
        text = example[1][0]
        head, tail = example[1][1]
        h_start, h_end = head[1], head[2]
        t_start, t_end = tail[1], tail[2]

        head_marked = "[E1] " + text[h_start:h_end] + " [/E1]"
        tail_marked = "[E2] " + text[t_start:t_end] + " [/E2]"

        # Insert the markers in text order so the untouched spans stay intact.
        if h_start < t_start:
            marked_text = text[:h_start] + head_marked + text[h_end:t_start] + tail_marked + text[t_end:]
        else:
            marked_text = text[:t_start] + tail_marked + text[t_end:h_start] + head_marked + text[h_end:]

        tokens_a = tokenizer.tokenize(marked_text)

        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]

        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
        attention_mask = [1] * len(input_ids)
        token_type_ids = [0] * max_seq_length

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        attention_mask += padding

        assert len(input_ids) == max_seq_length

        label_id = label_map[example[3]]

        features.append((input_ids, label_id, attention_mask, token_type_ids))

    return features, label_map

Helper function for running evaluation during training process

In [None]:
def run_evaluation(dataloader, model):
    """Evaluate `model` on `dataloader` and print loss, accuracy and macro P/R/F1.

    Args:
        dataloader: Yields ``(input_ids, labels, attention_mask, token_type_ids)`` batches.
        model: Classifier returning ``(loss, logits)`` when called with labels.

    Returns:
        A dict with the keys ``loss``, ``accuracy``, ``precision``, ``recall`` and
        ``f1``, or ``None`` when the dataloader yields no batches.

    The model is switched to eval mode for the duration of the call (so dropout
    does not perturb the predictions) and restored to its previous mode
    afterwards. Precision, recall and F1 are computed once over all predictions
    rather than averaged per batch, because a mean of per-batch macro scores
    depends on batch composition and is not the macro score of the data set.
    """
    was_training = model.training
    model.eval()

    summed_loss, nb_eval_steps = 0.0, 0
    collected_logits, collected_labels = [], []
    try:
        for dev_input_ids, dev_labels, dev_attention_mask, dev_token_type_ids in dataloader:
            dev_input_ids = dev_input_ids.to(device)
            dev_labels = dev_labels.to(device)
            dev_attention_mask = dev_attention_mask.to(device)
            dev_token_type_ids = dev_token_type_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(dev_input_ids, token_type_ids=dev_token_type_ids, attention_mask=dev_attention_mask, labels=dev_labels)

            summed_loss += tmp_eval_loss.mean().item()
            collected_logits.append(logits.detach().cpu().numpy())
            collected_labels.append(dev_labels.to('cpu').numpy())
            nb_eval_steps += 1
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    print("***** Eval results *****")
    if nb_eval_steps == 0:
        print("   No evaluation batches available.")
        return None

    all_logits = np.concatenate(collected_logits, axis=0)
    all_labels = np.concatenate(collected_labels, axis=0)
    num_correct, eval_precision, eval_recall, eval_f1, _ = accuracy_precision_recall_f1(all_logits, all_labels)

    eval_loss = summed_loss / nb_eval_steps
    eval_accuracy = num_correct / len(all_labels)

    print("   Loss: %f" % eval_loss)
    print("   Accuracy: %f" % eval_accuracy)
    print("   Precision (macro-averaged): %f" % eval_precision)
    print("   Recall (macro-averaged): %f" % eval_recall)
    print("   F1-Score (macro-averaged): %f" % eval_f1)

    return {
        "loss": eval_loss,
        "accuracy": eval_accuracy,
        "precision": eval_precision,
        "recall": eval_recall,
        "f1": eval_f1,
    }

Helper function to calculate accuracy during training process

In [None]:
def accuracy_precision_recall_f1(out, labels):
    """Score class predictions against gold labels.

    Args:
        out: Per-class scores; the prediction is the argmax along axis 1.
        labels: Gold label ids, one per row of `out`.

    Returns:
        ``(num_correct, precision, recall, f1, predictions)``. The first element
        is the COUNT of correct predictions (not a ratio); precision, recall and
        F1 are macro-averaged over the label ids that occur in `labels`.
    """
    predictions = np.argmax(out, axis=1)
    num_correct = np.sum(predictions == labels)

    # Restrict the macro average to classes actually present in the gold labels.
    macro_options = dict(average='macro', labels=np.unique(labels))
    precision, recall, f1 = (
        metric(labels, predictions, **macro_options)
        for metric in (precision_score, recall_score, f1_score)
    )
    return num_correct, precision, recall, f1, predictions

### Fine-Tuning

The parameter `use_pretrained_blanks` decides whether the plain BERT model or the MTB pre-trained BERT model is used for fine-tuning.

In [None]:
if use_pretrained_blanks == 1:
    print("Loading model pre-trained on blanks ...")
    checkpoint = torch.load(checkpoint_path, map_location=torch.device(device))
    model_dict = model.state_dict()

    pretrained_dict = {}
    shape_mismatches = []
    for key, tensor in checkpoint['state_dict'].items():
        # Accept an exact name match first. As a fallback also try the name with
        # the "bert." prefix, under which BertForSequenceClassification stores
        # its encoder weights.
        # NOTE(review): assumes checkpoint keys from the pre-training model may
        # lack that prefix - confirm against the saved checkpoint.
        target = next((name for name in (key, "bert." + key) if name in model_dict), None)
        if target is None:
            continue
        if model_dict[target].shape != tensor.shape:
            shape_mismatches.append(target)
            continue
        pretrained_dict[target] = tensor

    # Make the outcome visible: previously a checkpoint with no matching keys
    # was accepted silently and fine-tuning started from plain BERT.
    print("Matched %d of %d checkpoint tensors against %d model tensors."
          % (len(pretrained_dict), len(checkpoint['state_dict']), len(model_dict)))
    if shape_mismatches:
        print("Skipped %d tensors with mismatching shapes: %s" % (len(shape_mismatches), shape_mismatches))
    if not pretrained_dict:
        print("WARNING: no pre-trained weights were loaded; the model is still plain BERT.")

    # strict=False: the classification head is not part of the selected weights.
    model.load_state_dict(pretrained_dict, strict=False)
    del checkpoint, pretrained_dict, model_dict

In [None]:
# NOTE(review): `criterion` is not referenced by the fine-tuning loop visible in this
# notebook section (the model returns its own loss when called with labels) - confirm it is still needed.
criterion = nn.CrossEntropyLoss(ignore_index=-1)
optimizer = optim.Adam([{"params":model.parameters(), "lr": lr}])

# The learning rate is multiplied by 0.8 each time the epoch counter reaches a milestone;
# with num_epochs = 7 only the first milestones can be reached.
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[2,4,6,8,12,15,18,20,22,24,26,30], gamma=0.8)

Loading and preparing training data for the fine-tuning process

In [None]:
processor = FewrelProcessor()

# Prepare training data for fine-tuning
train_examples, label_list = processor.get_train_examples(data_dir, train_data_file)
train_features, label_map = convert_examples_to_features(train_examples, label_list, max_seq_length, tokenizer)

# Each feature is (input_ids, label_id, attention_mask, token_type_ids).
all_input_ids = torch.tensor([f[0] for f in train_features], dtype=torch.long)
all_label_ids = torch.tensor([f[1] for f in train_features], dtype=torch.long)
all_attention_masks = torch.tensor([f[2] for f in train_features], dtype=torch.long)
all_token_type_ids = torch.tensor([f[3] for f in train_features], dtype=torch.long)

train_data = TensorDataset(all_input_ids, all_label_ids, all_attention_masks, all_token_type_ids)

# Shuffle the training data every epoch.
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

print('Number of train examples: %d' % len(train_examples))


# Prepare dev data for evaluation while fine-tuning
dev_examples, label_list_dev = processor.get_dev_examples(data_dir, test_data_file)

# The dev set must be encoded with the TRAINING label list. Building the map from
# the dev labels alone assigns different ids whenever a class is missing from the
# dev file, which silently corrupts every evaluation metric.
unseen_labels = set(label_list_dev) - set(label_list)
if unseen_labels:
    raise ValueError("Evaluation data contains labels missing from the training data: %s"
                     % sorted(str(label) for label in unseen_labels))
dev_features, label_map_dev = convert_examples_to_features(dev_examples, label_list, max_seq_length, tokenizer)

all_input_ids_dev = torch.tensor([f[0] for f in dev_features], dtype=torch.long)
all_label_ids_dev = torch.tensor([f[1] for f in dev_features], dtype=torch.long)
all_attention_masks_dev = torch.tensor([f[2] for f in dev_features], dtype=torch.long)
all_token_type_ids_dev = torch.tensor([f[3] for f in dev_features], dtype=torch.long)

dev_data = TensorDataset(all_input_ids_dev, all_label_ids_dev, all_attention_masks_dev, all_token_type_ids_dev)

# Keep the evaluation order fixed.
dev_sampler = SequentialSampler(dev_data)
dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=batch_size)

print('Number of evaluation examples: %d' % len(dev_examples))

Number of train examples: 1068
Number of evaluation examples: 356
time: 2.04 s


Actual training process of the model

In [None]:
# Fine-tuning loop for the relation classifier.
losses_per_epoch=[]
accuracy_per_epoch=[]
test_f1_per_epoch=[]
save_path = os.path.join(basepath, 'MTB/fine_tuning_checkpoints/')
# Create the output directory up front; otherwise writing the label map (and,
# much later, the trained model) fails on a fresh checkout.
os.makedirs(save_path, exist_ok=True)


train_len = len(train_data)
print("Starting training process...")

# Persist the label -> id mapping next to the checkpoint so predictions can be decoded later.
label_map_file = os.path.join(save_path, 'label_map_%s_%d_classes_examples.json' %(data_type, num_classes))
with open(label_map_file, 'w') as f:
    json.dump(label_map, f)

# Progress is reported roughly ten times per epoch. Clamp to 1 so loaders with
# fewer than ten batches do not cause a modulo-by-zero below.
update_size = max(1, len(train_loader)//10)

for epoch in range(0, num_epochs):
    print("------------- Epoch %d -------------" % (epoch+1))
    start_time = time.time()
    model.train()
    total_loss = 0.0; total_acc = 0.0
    losses_per_batch = []; accuracy_per_batch = []
    # Backward passes since the last optimizer step; stepping once this reaches
    # `gradient_acc_steps` avoids the premature step after the very first batch.
    pending_batches = 0
    for i, data in enumerate(train_loader, 0):

        x, labels, attention_mask, token_type_ids = data

        if torch.cuda.is_available():
            x = x.cuda()
            labels = labels.cuda()
            attention_mask = attention_mask.cuda()
            token_type_ids = token_type_ids.cuda()

        loss, classification_logits = model(x, token_type_ids=token_type_ids, attention_mask=attention_mask, labels=labels)
        # Scale so the accumulated gradient matches the mean over the group.
        loss = loss/gradient_acc_steps

        loss.backward()
        pending_batches += 1

        if pending_batches >= gradient_acc_steps:
            optimizer.step()
            optimizer.zero_grad()
            pending_batches = 0

        total_loss += loss.item()
        total_acc += evaluate_(classification_logits, labels,
                               ignore_idx=-1)[0]

        if (i % update_size) == (update_size - 1):
            losses_per_batch.append(gradient_acc_steps*total_loss/update_size)
            accuracy_per_batch.append(total_acc/update_size)

            print('[Epoch: %d, %5d/ %d points] total loss, accuracy per batch: %.3f, %.3f' %
                  (epoch + 1, (i + 1)*batch_size, train_len, losses_per_batch[-1], accuracy_per_batch[-1]))

            total_loss = 0.0; total_acc = 0.0

    # Flush gradients of an incomplete accumulation group so they are neither
    # lost nor carried over into the next epoch.
    if pending_batches > 0:
        optimizer.step()
        optimizer.zero_grad()

    scheduler.step()

    # max(..., 1) avoids a ZeroDivisionError if no reporting window completed.
    losses_per_epoch.append(sum(losses_per_batch)/max(len(losses_per_batch), 1))
    accuracy_per_epoch.append(sum(accuracy_per_batch)/max(len(accuracy_per_batch), 1))

    print("Epoch finished, took %.2f seconds." % (time.time() - start_time))
    print("Losses at Epoch %d: %.7f" % (epoch + 1, losses_per_epoch[-1]))
    print("Train accuracy at Epoch %d: %.7f" % (epoch + 1, accuracy_per_epoch[-1]))

    print("***** Running evaluation on Dev data *****")
    print("   Num examples = %d" % len(dev_examples))
    run_evaluation(dev_dataloader, model)


mtb = 'no_MTB'
if (use_pretrained_blanks==1):
    mtb = 'MTB'

# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(save_path, "fine_tuning_checkpoint_BERT_%s_%s_%d_classes.pth.tar" % (mtb, data_type, num_classes))
torch.save(model_to_save.state_dict(), output_model_file)

print("Finished Training!")

Starting training process...
------------- Epoch 1 -------------
[Epoch: 1,    96/ 1068 points] total loss, accuracy per batch: 1.923, 0.146
[Epoch: 1,   192/ 1068 points] total loss, accuracy per batch: 1.962, 0.177
[Epoch: 1,   288/ 1068 points] total loss, accuracy per batch: 1.891, 0.156
[Epoch: 1,   384/ 1068 points] total loss, accuracy per batch: 1.866, 0.292
[Epoch: 1,   480/ 1068 points] total loss, accuracy per batch: 1.907, 0.198
[Epoch: 1,   576/ 1068 points] total loss, accuracy per batch: 1.887, 0.198
[Epoch: 1,   672/ 1068 points] total loss, accuracy per batch: 1.836, 0.188
[Epoch: 1,   768/ 1068 points] total loss, accuracy per batch: 1.759, 0.323
[Epoch: 1,   864/ 1068 points] total loss, accuracy per batch: 1.674, 0.333
[Epoch: 1,   960/ 1068 points] total loss, accuracy per batch: 1.756, 0.385
[Epoch: 1,  1056/ 1068 points] total loss, accuracy per batch: 1.668, 0.344
Epoch finished, took 10.86 seconds.
Losses at Epoch 1: 1.8300044
Train accuracy at Epoch 1: 0.24905

  _warn_prf(average, modifier, msg_start, len(result))


***** Eval results *****
   Loss: 1.734531
   Accuracy: 0.292135
   Precision (macro-averaged): 0.211494
   Recall (macro-averaged): 0.219161
   F1-Score (macro-averaged): 0.167689
------------- Epoch 2 -------------
[Epoch: 2,    96/ 1068 points] total loss, accuracy per batch: 1.629, 0.354
[Epoch: 2,   192/ 1068 points] total loss, accuracy per batch: 1.414, 0.583
[Epoch: 2,   288/ 1068 points] total loss, accuracy per batch: 1.396, 0.500
[Epoch: 2,   384/ 1068 points] total loss, accuracy per batch: 1.396, 0.490
[Epoch: 2,   480/ 1068 points] total loss, accuracy per batch: 1.298, 0.500
[Epoch: 2,   576/ 1068 points] total loss, accuracy per batch: 1.304, 0.490
[Epoch: 2,   672/ 1068 points] total loss, accuracy per batch: 1.329, 0.510
[Epoch: 2,   768/ 1068 points] total loss, accuracy per batch: 1.257, 0.490
[Epoch: 2,   864/ 1068 points] total loss, accuracy per batch: 1.198, 0.458
[Epoch: 2,   960/ 1068 points] total loss, accuracy per batch: 1.172, 0.448
[Epoch: 2,  1056/ 1068 