In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd
import numpy as np
import json
import gc
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torch.nn import Module
from sklearn import metrics
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from gensim.models import KeyedVectors
from underthesea import word_tokenize, sent_tokenize
from keras.preprocessing.sequence import pad_sequences
from pandas.io.json import json_normalize
from operator import itemgetter

import logging
import os
import sys
from io import open

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, mean_squared_error, matthews_corrcoef, confusion_matrix
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import glob
import random

from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange

from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer)

from transformers import AdamW, WarmupLinearSchedule
from tensorboardX import SummaryWriter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import warnings
warnings.filterwarnings('ignore')
from IPython.core.debugger import set_trace

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I1121 17:48:01.479136 139887018583808 file_utils.py:39] PyTorch version 1.3.1 available.
I1121 17:48:01.516149 139887018583808 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [None]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

## Text to Features

In [11]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, id_a, text_a, id_b=None, text_b=None, label=None):
        """Constructs a InputExample."""
        self.id_a = id_a
        self.id_b = id_b
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, train_file):
        examples = []
        
        train_df = pd.read_json(train_file, encoding= 'utf-8')
        for index, row in train_df.iterrows():
            quest = row['question']
            title = row['title']
            para = row['text']
            label = str(int(row['label']))
            
            examples.append(InputExample(id_a=None, text_a=quest, id_b=None, text_b=para, label=label))
        return examples
        
    def get_test_examples(self, test_file):
        examples = []
        
        with open(test_file) as json_file:
            test_json = json.load(json_file)
        test_df = json_normalize(test_json, 'paragraphs', ['__id__', 'question', 'title'])
        
        for index, row in test_df.iterrows():
            quest_id = row['__id__']
            quest = row['question']
            title = row['title']
            para_id = row['id']
            para = row['text']
            
            examples.append(InputExample(id_a=quest_id, text_a=quest, id_b=para_id, text_b=para, label=None))
        return examples
        
    def get_labels(self):
        """Gets the list of labels for this data set."""
        return ["0", "1"]

In [12]:
def convert_example_to_feature(example_row, pad_token=0,
                               sequence_a_segment_id=0, sequence_b_segment_id=1,
                               cls_token_segment_id=0, pad_token_segment_id=0,
                               mask_padding_with_zero=True):
    example, label_map, max_seq_length, tokenizer, cls_token, sep_token, cls_token_segment_id, pad_token_segment_id = example_row

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)

    # Create segment ids
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    tokens += tokens_b + [sep_token]
    segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

    tokens = [cls_token] + tokens
    segment_ids = [cls_token_segment_id] + segment_ids

    # Create input mask
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    if padding_length >= 0:
        input_ids = input_ids + ([pad_token] * padding_length)
        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
    else:
        input_ids = input_ids[:max_seq_length]
        input_mask = input_mask[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
    
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

#     print(tokens_a)
#     print(tokens_b)
#     print(input_ids)
#     set_trace()
    
    label_id = None
    if example.label:
        label_id = label_map[example.label]

    return InputFeatures(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_id)
    

def convert_examples_to_features(examples, label_list, max_seq_length,
                                 tokenizer,
                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
                                 cls_token_segment_id=0, pad_token_segment_id=0,
                                 mask_padding_with_zero=True):

    label_map = {label : i for i, label in enumerate(label_list)}
    examples = [(example, label_map, max_seq_length, tokenizer, cls_token, sep_token, cls_token_segment_id, pad_token_segment_id) for example in examples]

    process_count = cpu_count() - 2
    with Pool(process_count) as p:
        features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=100), total=len(examples)))

    return features

## Config Model

In [13]:
args = {
    'data_dir': '/data/ai_challenge/vietnameseqa/data/',
    'train_file': 'train.json',
    'test_file': 'test.json',
    'model_type':  'bert',
    'model_name': 'bert-base-multilingual-cased',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_train': True,
    'do_eval': True,
    'max_seq_length': 128,
    'train_batch_size': 8,
    'eval_batch_size': 8,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 0,
    'learning_rate': 4e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 50,
    'evaluate_during_training': False,
    'create_checkpoint': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,
    'reprocess_input_data': True,
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def load_data(tokenizer, test=False):
    processor = DataProcessor()
    logger.info("Creating features from dataset file at %s", args['data_dir'])
    label_list = processor.get_labels()
    
    if not test:
        examples = processor.get_train_examples(args['data_dir'] + args['train_file'])
        features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer,
                                                cls_token=tokenizer.cls_token,
                                                sep_token=tokenizer.sep_token,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)

        return features
    
    else:
        examples = processor.get_test_examples(args['data_dir'] + args['test_file'])
        features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer,
                                                cls_token=tokenizer.cls_token,
                                                sep_token=tokenizer.sep_token,
                                                cls_token_segment_id=0,
                                                pad_token_segment_id=0)
        
        test_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        test_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        test_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        
        test_dataset = TensorDataset(test_input_ids, test_input_mask, test_segment_ids)
        test_sampler = SequentialSampler(test_dataset)
        test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args['eval_batch_size'])
        
        return test_dataloader

In [15]:
def export_submit_file(test_file, all_preds):
    preds = all_preds / 5
    preds = [1 if i >= 0.5 else 0 for i in preds]    
    
    with open(test_file) as json_file:
        test_json = json.load(json_file)
    test_df = json_normalize(test_json, 'paragraphs', ['__id__', 'question', 'title'])
    test_df['preds'] = preds
    test_df = test_df.loc[test_df['preds'] == 1]
    
    submit_df = pd.DataFrame()
    submit_df['test_id'] = test_df['__id__']
    submit_df['answer'] = test_df['id']
    submit_df.to_csv(args['data_dir'] + 'submits/sample_submission_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '.csv', index=False)

## Train

In [None]:
tokenizer = BertTokenizer.from_pretrained(args['model_name'])

features = np.array(load_data(tokenizer))
labels = [f.label_id for f in features]

all_preds = None
test_dataloader = load_data(tokenizer, test=True)

splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=2019).split(features, labels))
for idx, (train_idx, valid_idx) in enumerate(splits):
    seed_everything(idx)
    print('\nTrain Fold {}'.format(idx))
#     set_trace()
    train_features = features[train_idx]
    valid_features = features[valid_idx]

    train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    train_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    train_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    train_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    
    valid_input_ids = torch.tensor([f.input_ids for f in valid_features], dtype=torch.long)
    valid_input_mask = torch.tensor([f.input_mask for f in valid_features], dtype=torch.long)
    valid_segment_ids = torch.tensor([f.segment_ids for f in valid_features], dtype=torch.long)
    valid_label_ids = torch.tensor([f.label_id for f in valid_features], dtype=torch.long)
    
    train_dataset = TensorDataset(train_input_ids, train_input_mask, train_segment_ids, train_label_ids)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])
    
    valid_dataset = TensorDataset(valid_input_ids, valid_input_mask, valid_segment_ids, valid_label_ids)
    valid_sampler = RandomSampler(valid_dataset)
    valid_dataloader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args['train_batch_size'])
    
    if idx > 0:
        model = model.cpu()
        del model
    gc.collect()
    model = BertForSequenceClassification.from_pretrained(args['model_name'])
    model.to(device)
    
    t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']
    no_decay = ['bias', 'LayerNorm.weight']
#     no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    
    optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)
   
    logger.info("***** Running kfolds %d *****", idx)
    logger.info("  Num train examples = %d", len(train_dataset))
    logger.info("  Num valid examples = %d", len(valid_dataset))
    logger.info("  Num Epochs = %d", args['num_train_epochs'])
    logger.info("  Total train batch size  = %d", args['train_batch_size'])
    logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    
    for epoch in range(int(args['num_train_epochs'])):
        epoch_loss = 0.0
        epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2],
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]
            print("\r%f" % loss, end='')

            if args['gradient_accumulation_steps'] > 1:
                loss = loss / args['gradient_accumulation_steps']

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

            tr_loss += loss.item()
            epoch_loss += loss.item()
            if (step + 1) % args['gradient_accumulation_steps'] == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                    logger.info("  Loss = %f", (tr_loss - logging_loss)/args['logging_steps'])
                    logging_loss = tr_loss
                    
        # Run validation
        model.eval()
        valid_loss = 0
        valid_iterator = tqdm_notebook(valid_dataloader, desc="Iteration")
        for step, batch in enumerate(valid_iterator):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids':      batch[0],
                          'attention_mask': batch[1],
                          'token_type_ids': batch[2],
                          'labels':         batch[3]}
                outputs = model(**inputs)
                valid_loss += outputs[0]
                
        print("Epoch: {} - train_loss: {:.5f} - valid_loss: {:.5f}".format(epoch, epoch_loss/len(train_dataloader), valid_loss/len(valid_dataloader)))
        
    # Run test when finish a kfold
    logger.info("***** Running test kfolds {} *****".format(idx))
    preds = None
    
    model.eval()
    for batch in tqdm_notebook(test_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2]}
            outputs = model(**inputs)
            logits = outputs[0]
            
        if preds is None:
            preds = logits.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds, axis=1)
    if all_preds is None:
        all_preds = preds
    else:
        all_preds += preds
        
# Export test result
export_submit_file(args['data_dir'] + args['test_file'], all_preds)

I1121 16:19:42.016399 139982407472896 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/thinhvd/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I1121 16:19:42.103125 139982407472896 <ipython-input-14-b16a02e78f4c>:3] Creating features from dataset file at /data/ai_challenge/vietnameseqa/data/
 37%|███▋      | 6701/18108 [00:02<00:06, 1852.03it/s]W1121 16:19:46.414457 139982407472896 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
 43%|████▎     | 7747/18108 [00:03<00:04, 2114.99it/s]W1121 16:19:46.804887 139982407472896 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence l


Train Fold 0


I1121 16:19:54.307997 139982407472896 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json from cache at /home/thinhvd/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.83b0fa3d7f1ac0e113ad300189a938c6f14d0588a4200f30eef109d0a047c484
I1121 16:19:54.311425 139982407472896 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler

HBox(children=(IntProgress(value=0, description='Iteration', max=1811, style=ProgressStyle(description_width='…

0.430478

I1121 16:22:07.786190 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.579167


0.571300

I1121 16:24:14.475793 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.519407


0.454158

I1121 16:26:22.964429 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.529891


0.426434

I1121 16:28:31.625786 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.510113


0.995664

I1121 16:30:40.038077 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.583542


0.539857

I1121 16:32:48.507176 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.552434


0.412913

I1121 16:34:53.274606 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.590257


1.014968

I1121 16:37:00.330760 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.535445


0.643940

I1121 16:39:10.737546 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.521648


0.625057

I1121 16:41:24.587378 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.536389


0.714369

I1121 16:43:37.554720 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.492245


0.795915

I1121 16:45:43.054695 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.497754


0.268978

I1121 16:47:51.373232 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.488951


0.876515

I1121 16:49:58.435474 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.473299


0.295239

I1121 16:52:04.618452 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.481282


1.075687

I1121 16:54:11.157785 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.496460


0.185331

I1121 16:56:16.945099 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.511279


0.494734

I1121 16:58:22.478798 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.469914


0.281266

I1121 17:00:30.620354 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.503204


0.428183

I1121 17:02:41.236700 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.417011


0.391885

I1121 17:04:48.168551 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.409459


0.534073

I1121 17:06:54.126525 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.436620


0.676727

I1121 17:09:01.072154 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.393424


0.120374

I1121 17:11:06.648174 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.485414


0.208022

I1121 17:13:11.437283 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.422013


0.721711

I1121 17:15:15.455099 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.387030


0.612316

I1121 17:17:19.464758 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.411640


0.617452

I1121 17:19:23.278908 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.381125


0.408824

I1121 17:21:26.943753 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.417023


0.153973

I1121 17:23:30.743607 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.416425


0.461007

I1121 17:25:34.556547 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.419191


0.457242

I1121 17:27:40.095969 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.394976


0.160324

I1121 17:29:46.104550 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.421874


0.086642

I1121 17:31:53.570724 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.329842


1.056057

I1121 17:33:58.559988 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.390656


0.217096

I1121 17:36:04.545773 139982407472896 <ipython-input-18-aa23d39ff0e9>:89]   Loss = 0.357226


0.278560


HBox(children=(IntProgress(value=0, description='Iteration', max=453, style=ProgressStyle(description_width='i…

I1121 17:40:41.077284 139982407472896 <ipython-input-18-aa23d39ff0e9>:110] ***** Running test kfolds 0 *****



Epoch: 0 - train_loss: 0.46567 - valid_loss: 0.37737


HBox(children=(IntProgress(value=0, description='Evaluating', max=335, style=ProgressStyle(description_width='…



Train Fold 1


I1121 17:43:53.992027 139982407472896 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json from cache at /home/thinhvd/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.83b0fa3d7f1ac0e113ad300189a938c6f14d0588a4200f30eef109d0a047c484
I1121 17:43:53.994375 139982407472896 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler

HBox(children=(IntProgress(value=0, description='Iteration', max=1811, style=ProgressStyle(description_width='…

0.688691