In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import csv
import os
import logging
import argparse
import random
from tqdm import tqdm, trange

import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pytorch_pretrained_bert.optimization import BertAdam
from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
PYTORCH_PRETRAINED_BERT_CACHE

PosixPath('/Users/varunn/.pytorch_pretrained_bert')

In [3]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
        Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
            Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [4]:
class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id

In [12]:
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_csv(cls, input_file, size=2000, header=None):
        """Reads a csv file."""
        df = pd.read_csv(input_file, header=header)
        labels = df[0].tolist()
        text = df[1].tolist()
        out = []
        for idx, line in enumerate(labels):
            out_line = []
            out_line.append(line)
            out_line.append(text[idx])
            out.append(out_line)
        return out[:size]

In [13]:
train_lines = DataProcessor._read_csv('data/imdb_clas/train.csv')

In [17]:
labels = [int(x[0]) for x in train_lines]
print(np.unique(labels))

[0 1]


In [18]:
class Processor(DataProcessor):
    """Processor for the custom data set"""

    def get_train_examples(self, data_dir):
        """See base class."""
        logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.csv")))
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "train.csv")), "train")
    
    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_csv(os.path.join(data_dir, "test.csv")), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for train and test sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, label=label))
        return examples

In [19]:
data = Processor().get_train_examples('data/imdb_clas/')

02/03/2019 09:27:29 - INFO - __main__ -   LOOKING AT data/imdb_clas/train.csv


In [22]:
print(data[0].guid)
print(data[0].text_a)
print(data[0].text_b)
print(data[0].label)

train-0
Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!
None
0


In [49]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Loads a data file into a list of `InputBatch`s."""

    label_map = {label : i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Modifies `tokens_a` and `tokens_b` in place so that the total
            # length is less than the specified length.
            # Account for [CLS], [SEP], [SEP] with "- 3"
            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambigiously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        label_id = label_map[str(example.label)]
        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features

In [24]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x/warmup
    return 1.0 - x

# training

In [25]:
data_dir = 'data/imdb_clas/'
task_name = 'mrpc'
bert_model = 'bert-base-uncased'
output_dir = 'data/imdb_Bert_Predictions'
max_seq_length = 128
do_train = True
do_eval = True
do_lower_case = True
train_batch_size = 32
eval_batch_size = 8
learning_rate = 5e-5
num_train_epochs = 3.0
warmup_proportion = 0.1
no_cuda = True
local_rank = -1
seed = 42
gradient_accumulation_steps = 1
fp16 = False
loss_scale = 0

processors = {
        "mrpc": Processor,
    }

num_labels_task = {
    "cola": 2,
    "mnli": 3,
    "mrpc": 2,
}

In [26]:
if local_rank == -1 or no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, bool(local_rank != -1), fp16))

02/03/2019 09:56:54 - INFO - __main__ -   device: cpu n_gpu: 0, distributed training: False, 16-bits training: False


In [27]:
device

device(type='cpu')

In [28]:
if gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                     gradient_accumulation_steps))

In [29]:
train_batch_size = int(train_batch_size / gradient_accumulation_steps)

In [30]:
train_batch_size

32

In [31]:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

if not do_train and not do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

if os.path.exists(output_dir) and os.listdir(output_dir) and do_train:
    raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
os.makedirs(output_dir, exist_ok=True)

task_name = task_name.lower()

if task_name not in processors:
    raise ValueError("Task not found: %s" % (task_name))

processor = processors[task_name]()
num_labels = num_labels_task[task_name]
label_list = processor.get_labels()

In [33]:
num_labels, label_list

(2, ['0', '1'])

In [34]:
tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=do_lower_case)

02/03/2019 09:58:35 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /Users/varunn/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [35]:
train_examples = None
num_train_steps = None
if do_train:
    train_examples = processor.get_train_examples(data_dir)
    num_train_steps = int(
        len(train_examples) / train_batch_size / gradient_accumulation_steps * num_train_epochs)

02/03/2019 09:59:19 - INFO - __main__ -   LOOKING AT data/imdb_clas/train.csv


In [36]:
num_train_steps

187

In [37]:
# Prepare model
model = BertForSequenceClassification.from_pretrained(bert_model,
          cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(local_rank),
          num_labels = num_labels)
if fp16:
    model.half()
model.to(device)
if local_rank != -1:
    try:
        from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

02/03/2019 09:59:50 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/varunn/.pytorch_pretrained_bert/distributed_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
02/03/2019 09:59:50 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/varunn/.pytorch_pretrained_bert/distributed_-1/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/_y/2dty3nzx05zdf0lpd_r9bj1jbr9qbr/T/tmptiw2hoty
02/03/2019 09:59:54 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "

In [38]:
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
t_total = num_train_steps
if local_rank != -1:
    t_total = t_total // torch.distributed.get_world_size()
if fp16:
    try:
        from apex.optimizers import FP16_Optimizer
        from apex.optimizers import FusedAdam
    except ImportError:
        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if loss_scale == 0:
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
        optimizer = FP16_Optimizer(optimizer, static_loss_scale=loss_scale)

else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=learning_rate,
                         warmup=warmup_proportion,
                         t_total=t_total)

In [39]:
optimizer

BertAdam (
Parameter Group 0
    b1: 0.9
    b2: 0.999
    e: 1e-06
    lr: 5e-05
    max_grad_norm: 1.0
    schedule: warmup_linear
    t_total: 187
    warmup: 0.1
    weight_decay: 0.01
    weight_decay_rate: 0.01

Parameter Group 1
    b1: 0.9
    b2: 0.999
    e: 1e-06
    lr: 5e-05
    max_grad_norm: 1.0
    schedule: warmup_linear
    t_total: 187
    warmup: 0.1
    weight_decay: 0.0
    weight_decay_rate: 0.01
)

In [44]:
a = train_examples[0]
label_map = {label : i for i, label in enumerate(label_list)}

In [48]:
print(a.label, label_map)
print(label_map[str(a.label)])

0 {'0': 0, '1': 1}
0


In [50]:
# training begins
global_step = 0
nb_tr_steps = 0
tr_loss = 0
if do_train:
    train_features = convert_examples_to_features(
        train_examples, label_list, max_seq_length, tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)
    logger.info("  Num steps = %d", num_train_steps)
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if local_rank == -1:
        train_sampler = RandomSampler(train_data)
    else:
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

    model.train()
    for _ in trange(int(num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            loss = model(input_ids, segment_ids, input_mask, label_ids)
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if fp16:
                optimizer.backward(loss)
            else:
                loss.backward()

            tr_loss += loss.item()
            nb_tr_examples += input_ids.size(0)
            nb_tr_steps += 1
            if (step + 1) % gradient_accumulation_steps == 0:
                # modify learning rate with special warm up BERT uses
                lr_this_step = learning_rate * warmup_linear(global_step/t_total, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

02/03/2019 10:09:49 - INFO - __main__ -   *** Example ***
02/03/2019 10:09:49 - INFO - __main__ -   guid: train-0
02/03/2019 10:09:49 - INFO - __main__ -   tokens: [CLS] un - b ##lee ##ping - bel ##ie ##vable ! meg ryan doesn ' t even look her usual per ##t lo ##vable self in this , which normally makes me forgive her shallow tick ##y acting sc ##ht ##ick . hard to believe she was the producer on this dog . plus kevin k ##line : what kind of suicide trip has his career been on ? who ##osh . . . ban ##zai ! ! ! finally this was directed by the guy who did big chill ? must be a replay of jones ##town - hollywood style . woo ##off ##f ! [SEP]
02/03/2019 10:09:49 - INFO - __main__ -   input_ids: 101 4895 1011 1038 10559 4691 1011 19337 2666 12423 999 12669 4575 2987 1005 1056 2130 2298 2014 5156 2566 2102 8840 12423 2969 1999 2023 1010 2029 5373 3084 2033 9641 2014 8467 16356 2100 3772 8040 11039 6799 1012 2524 2000 2903 2016 2001 1996 3135 2006 2023 3899 1012 4606 4901 1047 4179 1024 2054

02/03/2019 10:09:49 - INFO - __main__ -   label: 1 (id = 1)
02/03/2019 10:09:49 - INFO - __main__ -   *** Example ***
02/03/2019 10:09:49 - INFO - __main__ -   guid: train-4
02/03/2019 10:09:49 - INFO - __main__ -   tokens: [CLS] this movie succeeds at being one of the most unique movies you ' ve seen . however this comes from the fact that you can ' t make heads or tails of this mess . it almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid . if you don ' t want to feel slight ##ed you ' ll sit through this horrible film and develop a real sense of pity for the actors involved , they ' ve all seen better days , but then you realize they actually got paid quite a bit of money to do this and you ' ll [SEP]
02/03/2019 10:09:49 - INFO - __main__ -   input_ids: 101 2023 3185 21645 2012 2108 2028 1997 1996 2087 4310 5691 2017 1005 2310 2464 1012 2174 2023 3310 2013 1996 2755 2008 2017 2064 1005

Iteration:  43%|████▎     | 27/63 [12:46<16:59, 28.32s/it][A
Iteration:  44%|████▍     | 28/63 [13:14<16:32, 28.36s/it][A
Iteration:  46%|████▌     | 29/63 [13:43<16:04, 28.38s/it][A
Iteration:  48%|████▊     | 30/63 [14:14<16:00, 29.10s/it][A
Iteration:  49%|████▉     | 31/63 [14:45<15:48, 29.64s/it][A
Iteration:  51%|█████     | 32/63 [15:13<15:05, 29.20s/it][A
Iteration:  52%|█████▏    | 33/63 [15:41<14:25, 28.85s/it][A
Iteration:  54%|█████▍    | 34/63 [16:09<13:49, 28.59s/it][A
Iteration:  56%|█████▌    | 35/63 [16:37<13:15, 28.40s/it][A
Iteration:  57%|█████▋    | 36/63 [17:05<12:43, 28.29s/it][A
Iteration:  59%|█████▊    | 37/63 [17:33<12:13, 28.22s/it][A
Iteration:  60%|██████    | 38/63 [18:01<11:44, 28.18s/it][A
Iteration:  62%|██████▏   | 39/63 [18:29<11:15, 28.14s/it][A
Iteration:  63%|██████▎   | 40/63 [18:57<10:49, 28.26s/it][A
Iteration:  65%|██████▌   | 41/63 [19:26<10:24, 28.37s/it][A
Iteration:  67%|██████▋   | 42/63 [19:55<09:58, 28.50s/it][A
Iteratio

In [52]:
tr_loss, global_step

(16.010441914200783, 189)

In [53]:
# Save a trained model
model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
output_model_file = os.path.join(output_dir, "pytorch_model.bin")
if do_train:
    torch.save(model_to_save.state_dict(), output_model_file)

In [54]:
# Load a trained model that you have fine-tuned
model_state_dict = torch.load(output_model_file, map_location=device)
model1 = BertForSequenceClassification.from_pretrained(bert_model, num_labels=num_labels)
model1.load_state_dict(model_state_dict)
model1.to(device)

02/03/2019 12:09:56 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/varunn/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
02/03/2019 12:09:56 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/varunn/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/_y/2dty3nzx05zdf0lpd_r9bj1jbr9qbr/T/tmp0s8ohwrd
02/03/2019 12:10:00 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "n

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediate): BertInterme

In [55]:
if do_eval and (local_rank == -1 or torch.distributed.get_rank() == 0):
    eval_examples = processor.get_dev_examples(data_dir)
    eval_features = convert_examples_to_features(
        eval_examples, label_list, max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    logger.info("  Batch size = %d", eval_batch_size)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=eval_batch_size)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples
    loss = tr_loss/nb_tr_steps if do_train else None
    result = {'eval_loss': eval_loss,
              'eval_accuracy': eval_accuracy,
              'global_step': global_step,
              'loss': loss}

    output_eval_file = os.path.join(output_dir, "test_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

02/03/2019 12:12:36 - INFO - __main__ -   *** Example ***
02/03/2019 12:12:36 - INFO - __main__ -   guid: test-0
02/03/2019 12:12:36 - INFO - __main__ -   tokens: [CLS] bud abbott and lou costello always had a good following among children , but in their careers i think you could say that they only made one film that could be designated for kids . jack and the beans ##talk was that one film . < br / > < br / > it was part of a two picture independent deal from warner brothers , the second film being abbott and costello meet captain kidd . these were the only two films the boys made in color . < br / > < br / > the two of them , out of work as usual , take a job for a very pre ##co ##cious and ob ##no ##xious young david st ##oll ##ery [SEP]
02/03/2019 12:12:36 - INFO - __main__ -   input_ids: 101 13007 14455 1998 10223 21015 2467 2018 1037 2204 2206 2426 2336 1010 2021 1999 2037 10922 1045 2228 2017 2071 2360 2008 2027 2069 2081 2028 2143 2008 2071 2022 4351 2005 4268 1012 2990 1998 19

02/03/2019 12:12:36 - INFO - __main__ -   input_ids: 101 1045 2074 2387 2023 2006 1037 2334 2981 2276 1999 1996 2047 2259 2103 2181 1012 1996 3459 3662 4872 2021 2043 1045 2387 1996 2472 1010 2577 2522 25855 13122 1010 1045 2150 10027 1012 1998 2469 2438 1010 2009 2001 2296 2978 2004 2919 1010 2296 2978 2004 23100 1998 5236 2004 2296 2577 2522 25855 13122 3185 1045 2412 2387 1012 2002 1005 1055 2066 1037 5236 2158 1005 1055 2745 20289 1011 1011 2007 2035 1996 9643 2791 2008 16222 6030 3207 10659 1012 1026 7987 1013 1028 1026 7987 1013 1028 2045 1005 1055 2053 2391 2000 1996 9714 1010 2053 5255 3314 2008 9075 1996 9530 13102 7895 6591 2006 1012 2057 2024 2187 2000 9731 2000 7532 1996 14981 102
02/03/2019 12:12:36 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 

In [56]:
result

{'eval_loss': 0.4050906317234039,
 'eval_accuracy': 0.8245,
 'global_step': 189,
 'loss': 0.25413399863810765}