In [1]:
# version 12

In [1]:
from __future__ import absolute_import, division, print_function

import pandas as pd
import numpy as np
import json
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, random_split
import torch.nn.functional as F
from torch.nn import Module
from sklearn import metrics
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from gensim.models import KeyedVectors
from keras.preprocessing.sequence import pad_sequences
from pandas.io.json import json_normalize
from operator import itemgetter

import logging
import os
import sys
from io import open

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, mean_squared_error, matthews_corrcoef, confusion_matrix
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

import regex as re
import glob
import random

from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm_notebook, trange

from transformers import WEIGHTS_NAME, BertForSequenceClassification, BertTokenizer
from transformers import XLMForSequenceClassification, XLMTokenizer
from transformers import AdamW, WarmupLinearSchedule
from tensorboardX import SummaryWriter

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import warnings
warnings.filterwarnings('ignore')
from IPython.core.debugger import set_trace

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I1126 16:56:54.576250 140596968896256 file_utils.py:39] PyTorch version 1.3.1 available.
I1126 16:56:54.645592 140596968896256 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

## Text to Features

In [3]:
EMAIL = re.compile(r"([\w0-9_\.-]+)(@)([\d\w\.-]+)(\.)([\w\.]{2,6})")
URL = re.compile(r"https?:\/\/(?!.*:\/\/)\S+")
PHONE = re.compile(r"(09|01[2|6|8|9])+([0-9]{8})\b")
MENTION = re.compile(r"@.+?:")
NUMBER = re.compile(r"\d+.?\d*")
DATETIME = '\d{1,2}\s?[/-]\s?\d{1,2}\s?[/-]\s?\d{4}'

RE_HTML_TAG = re.compile(r'<[^>]+>')
RE_CLEAR_1 = re.compile("[^_<>\s\p{Latin}]")
RE_CLEAR_2 = re.compile("__+")
RE_CLEAR_3 = re.compile("\s+")

def replace_common_token(txt):
    txt = re.sub(EMAIL, ' ', txt)
    txt = re.sub(URL, ' ', txt)
    txt = re.sub(MENTION, ' ', txt)
    txt = re.sub(DATETIME, ' ', txt)
    txt = re.sub(NUMBER, ' ', txt)
    return txt

def remove_emoji(txt):
    txt = re.sub(':v', '', txt)
    txt = re.sub(':D', '', txt)
    txt = re.sub(':3', '', txt)
    txt = re.sub(':\(', '', txt)
    txt = re.sub(':\)', '', txt)
    return txt

def remove_html_tag(txt):
    return re.sub(RE_HTML_TAG, ' ', txt)

def remove_stop_words(txt):
    tokenized_words = txt.split()
    return " ".join([word for word in tokenized_words if word not in stop_words])

def preprocess(txt):
#     txt = remove_html_tag(txt)
#     txt = re.sub('&.{3,4};', ' ', txt)
# #   txt = convertwindown1525toutf8(txt)
# #   txt = replace_common_token(txt)
#     txt = remove_emoji(txt)
#     txt = re.sub(RE_CLEAR_1, ' ', txt)
#     txt = re.sub(RE_CLEAR_2, ' ', txt)
#     txt = re.sub(RE_CLEAR_3, ' ', txt)
# #   txt = chuan_hoa_dau_cau_tieng_viet(txt)
    return txt.strip()

In [4]:
class InputExample(object):
    def __init__(self, id_a, text_a, id_b=None, text_b=None, label=None):
        self.id_a = id_a
        self.id_b = id_b
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


class InputFeatures(object):
    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id


class DataProcessor(object):
    def get_train_examples(self, train_file):
        examples = []
        
        train_df = pd.read_json(train_file, encoding= 'utf-8')
        for index, row in train_df.iterrows():
            quest = row['question']
            quest = quest[:1].upper() + quest[1:]
            title = row['title']
            para = preprocess(row['text'])
            label = str(int(bool(row['label'])))
            
            examples.append(InputExample(id_a=None, text_a=quest, id_b=None, text_b=title + " . " + para, label=label))
        return examples
        
    def get_test_examples(self, test_file):
        examples = []
        
        with open(test_file) as json_file:
            test_json = json.load(json_file)
        test_df = json_normalize(test_json, 'paragraphs', ['__id__', 'question', 'title'])
        
        for index, row in test_df.iterrows():
            quest_id = row['__id__']
            quest = row['question']
            quest = quest[:1].upper() + quest[1:]
            title = row['title']
            para_id = row['id']
            para = preprocess(row['text'])
            
            examples.append(InputExample(id_a=quest_id, text_a=quest, id_b=para_id, text_b=title + " . " + para, label=None))
        return examples
        
    def get_labels(self):
        """Gets the list of labels for this data set."""
        return ["0", "1"]

In [5]:
def convert_example_to_feature(example_row):
    sequence_a_segment_id=0
    sequence_b_segment_id=1
    mask_padding_with_zero=True
    
    example, label_map, max_seq_length, tokenizer, cls_token_segment_id, pad_token_segment_id = example_row

    pad_token_id = tokenizer.pad_token_id
    cls_token = tokenizer.cls_token
    sep_token = tokenizer.sep_token
    
    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = tokenizer.tokenize(example.text_b)

    # Create segment ids
    tokens = tokens_a + [sep_token]
    segment_ids = [sequence_a_segment_id] * len(tokens)

    tokens += tokens_b + [sep_token]
    segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)

    tokens = [cls_token] + tokens
    segment_ids = [cls_token_segment_id] + segment_ids

    # Create input mask
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_seq_length - len(input_ids)
    if padding_length >= 0:
        input_ids = input_ids + ([pad_token_id] * padding_length)
        input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
    else:
        input_ids = input_ids[:max_seq_length]
        input_mask = input_mask[:max_seq_length]
        segment_ids = segment_ids[:max_seq_length]
    
    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    
    label_id = None
    if example.label:
        label_id = label_map[example.label]

    return InputFeatures(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_id=label_id)
    

def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer,
                                 cls_token_segment_id=0, pad_token_segment_id=0):

    label_map = {label : i for i, label in enumerate(label_list)}
    examples = [(example, label_map, max_seq_length, tokenizer, cls_token_segment_id, pad_token_segment_id) for example in examples]

    process_count = cpu_count()
    with Pool(process_count) as p:
        features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=100), total=len(examples)))

    return features

## Config Model

In [6]:
# 'model_type':  'bert',
# 'model_name': 'bert-base-multilingual-cased',

# 'model_type':  'xlm',
# 'model_name': 'xlm-mlm-17-1280',

In [17]:
args = {
    'data_dir': '/data/ai_challenge/vietnameseqa/data/',
    'train_file': 'train.json',
    'test_file': 'test.json',
    'model_type': 'bert',
    'model_name': 'bert-base-multilingual-cased',
    'task_name': 'binary',
    'output_dir': 'outputs/',
    'cache_dir': 'cache/',
    'do_eval': True,
    'max_seq_length': 200,
    'train_batch_size': 8,
    'test_batch_size': 8,

    'gradient_accumulation_steps': 1,
    'num_train_epochs': 1,
    'weight_decay': 1e-7,
    'learning_rate': 1e-5,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'max_grad_norm': 1.0,

    'logging_steps': 100,
    'evaluate_during_training': False,
    'create_checkpoint': False,
    'save_steps': 2000,
    'eval_all_checkpoints': True,
    'reprocess_input_data': True,
}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [8]:
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer),
    'xlm': (XLMForSequenceClassification, XLMTokenizer),
}

model_class, tokenizer_class = MODEL_CLASSES[args['model_type']]

tokenizer = tokenizer_class.from_pretrained(args['model_name'])
model = model_class.from_pretrained(args['model_name'])

I1124 17:21:11.714929 140159970518784 tokenization_utils.py:374] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt from cache at /home/thinhvd/.cache/torch/transformers/96435fa287fbf7e469185f1062386e05a075cadbf6838b74da22bf64b080bc32.99bcd55fc66f4f3360bc49ba472b940b8dcf223ea6a345deb969d607ca900729
I1124 17:21:12.716949 140159970518784 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json from cache at /home/thinhvd/.cache/torch/transformers/45629519f3117b89d89fd9c740073d8e4c1f0a70f9842476185100a8afe715d1.83b0fa3d7f1ac0e113ad300189a938c6f14d0588a4200f30eef109d0a047c484
I1124 17:21:12.721658 140159970518784 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.0

In [9]:
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [7]:
def load_and_cache_examples(tokenizer, test=False):
    processor = DataProcessor()
    
    logger.info("Creating features from dataset file at %s", args['data_dir'])
    label_list = processor.get_labels()
    examples = processor.get_test_examples(args['data_dir'] + args['test_file']) if test else processor.get_train_examples(args['data_dir'] + args['train_file'])

    features = convert_examples_to_features(examples, label_list, args['max_seq_length'], tokenizer,
                                            cls_token_segment_id=0,
                                            pad_token_segment_id=0)
        
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    
    # Output
    if not test:
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    else: 
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    
    return dataset

## Train

In [11]:
train_dataset = load_and_cache_examples(tokenizer)
if args['do_eval']:
    train_dataset_size = len(train_dataset)
    train_size = int(0.8 * train_dataset_size)
    valid_size = train_dataset_size - train_size
    
    train_dataset_splitted, valid_dataset = random_split(train_dataset, (train_size, valid_size))
    train_dataloader = DataLoader(train_dataset_splitted, batch_size=args['train_batch_size'], shuffle=True)
    valid_dataloader = DataLoader(valid_dataset, batch_size=args['train_batch_size'], shuffle=False)
else:
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args['train_batch_size'])

t_total = len(train_dataloader) // args['gradient_accumulation_steps'] * args['num_train_epochs']

no_decay = ['bias', 'LayerNorm.weight']
#     no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args['weight_decay']},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args['warmup_steps'], t_total=t_total)

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", args['num_train_epochs'])
logger.info("  Total train batch size  = %d", args['train_batch_size'])
logger.info("  Gradient Accumulation steps = %d", args['gradient_accumulation_steps'])
logger.info("  Total optimization steps = %d", t_total)

model.zero_grad()
train_iterator = trange(int(args['num_train_epochs']), desc="Epoch")

for epoch in train_iterator:
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    epoch_labels = []
    epoch_preds = []
    
    epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration")
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,
                  'labels':         batch[3]}
        labels = batch[3].detach().cpu().numpy()
        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]
        preds = logits.detach().cpu().numpy()
        preds = np.argmax(preds, axis=1)
        f1_scr = f1_score(labels, preds)
        print("\r%f - %f" % (loss, f1_scr), end='')
        
        epoch_labels.extend(labels)
        epoch_preds.extend(preds)
        
        if args['gradient_accumulation_steps'] > 1:
            loss = loss / args['gradient_accumulation_steps']

        loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), args['max_grad_norm'])

        tr_loss += loss.item()
        if (step + 1) % args['gradient_accumulation_steps'] == 0:
            scheduler.step()
            optimizer.step()
            model.zero_grad()
            global_step += 1

            if args['logging_steps'] > 0 and global_step % args['logging_steps'] == 0:
                logger.info('loss = %s', (tr_loss - logging_loss)/args['logging_steps'])
                logging_loss = tr_loss
    
    print(" Train global_step = %f, average loss = %f, average f1 = %f" % (global_step, tr_loss / global_step, f1_score(epoch_labels, epoch_preds)))
    
    
    if args['do_eval']:
        epoch_labels = []
        epoch_preds = []
        eval_loss = 0.0
        
        for step, batch in enumerate(tqdm_notebook(valid_dataloader, desc="Iteration")):
            model.eval()
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None,
                      'labels':         batch[3]}
            outputs = model(**inputs)
            loss = outputs[0]
            logits = outputs[1]
            eval_loss += loss.item()
            
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            
            epoch_labels.extend(batch[3].detach().cpu().numpy())
            epoch_preds.extend(preds)
            
        print(" Eval average loss = %f, average f1 = %f" % (eval_loss/len(valid_dataloader), f1_score(epoch_labels, epoch_preds)))
        
    # Save model
    epoch_output_dir = os.path.join(args['data_dir'], args['output_dir'], "epoch_" + str(epoch))
    if not os.path.exists(epoch_output_dir):
        os.makedirs(epoch_output_dir)
    model.save_pretrained(epoch_output_dir)
    tokenizer.save_pretrained(epoch_output_dir)
    torch.save(args, os.path.join(epoch_output_dir, 'training_args.bin'))

I1124 17:21:21.556224 140159970518784 <ipython-input-10-24cce704e7f0>:4] Creating features from dataset file at /data/ai_challenge/vietnameseqa/data/
 37%|███▋      | 6701/18108 [00:02<00:06, 1869.41it/s]W1124 17:21:25.979099 140159970518784 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors
 42%|████▏     | 7601/18108 [00:03<00:04, 2171.62it/s]W1124 17:21:26.337863 140159970518784 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors
 52%|█████▏    | 9501/18108 [00:04<00:03, 2339.40it/s]W1124 17:21:27.189528 140159970518784 tokenization_utils.py:677] Token indices sequence length is longer than the specified maximum sequence length for this model (583 > 512). Running this seque

HBox(children=(IntProgress(value=0, description='Iteration', max=1811, style=ProgressStyle(description_width='…

0.517139 - 0.750000

I1124 17:25:46.344910 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.604013594686985


0.942570 - 0.000000

I1124 17:29:56.290971 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.5254137025773525


0.476239 - 0.800000

I1124 17:34:05.541409 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.5126111339777708


0.893886 - 0.500000

I1124 17:38:28.508153 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.5063785469532013


0.515242 - 0.666667

I1124 17:42:51.530393 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.49125157579779627


0.304526 - 0.800000

I1124 17:47:18.098905 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.4772423630207777


0.417090 - 0.500000

I1124 17:51:37.003913 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.45955790795385837


0.489325 - 0.500000

I1124 17:56:06.656484 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.4478921489417553


0.579581 - 0.500000

I1124 18:00:25.952988 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.44140344828367234


0.558991 - 0.666667

I1124 18:04:56.721226 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.441366666816175


0.544780 - 0.400000

I1124 18:09:27.605429 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.42437400184571744


0.426828 - 0.857143

I1124 18:13:57.329216 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.43303107157349585


0.265039 - 0.857143

I1124 18:18:26.310726 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.399442097954452


0.632408 - 0.800000

I1124 18:22:59.125565 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.40474018748849633


0.222304 - 0.666667

I1124 18:27:33.804156 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.3467920238897204


0.111951 - 1.000000

I1124 18:32:09.547373 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.38865942392498254


0.425867 - 0.666667

I1124 18:36:43.326843 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.39194029103964567


0.762835 - 0.500000

I1124 18:41:15.071382 140159970518784 <ipython-input-11-7bf73b6382cb>:74] loss = 0.3589174585416913


0.382988 - 0.500000

I1124 18:41:44.732513 140159970518784 <ipython-input-11-7bf73b6382cb>:77]  Train global_step = 1811, average loss = 0.44700860518659763, average f1 = 0.658465991316932





HBox(children=(IntProgress(value=0, description='Iteration', max=453, style=ProgressStyle(description_width='i…

I1124 18:46:13.374503 140159970518784 <ipython-input-11-7bf73b6382cb>:102]  Eval average loss = 0.38164129043224104, average f1 = 0.7372764786795047
Epoch: 100%|██████████| 1/1 [1:24:42<00:00, 5082.28s/it]







## Run Tests

In [8]:
def export_submit_file(test_file, preds):
    with open(test_file) as json_file:
        test_json = json.load(json_file)
    test_df = json_normalize(test_json, 'paragraphs', ['__id__', 'question', 'title'])
    test_df['preds'] = preds
    test_df = test_df.loc[test_df['preds'] == 1]
    
    submit_df = pd.DataFrame()
    submit_df['test_id'] = test_df['__id__']
    submit_df['answer'] = test_df['id']
    submit_df.to_csv(args['data_dir'] + 'submits/sample_submission_' + datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + '.csv', index=False)

In [9]:
def test(model, tokenizer, prefix=""):
    results = {}
    test_dataset = load_and_cache_examples(tokenizer, test=True)

    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=args['test_batch_size'])

    logger.info("***** Running test {} *****".format(prefix))
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", args['test_batch_size'])
    preds = None
    model.eval()
    for batch in tqdm_notebook(test_dataloader, desc="Testing"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {'input_ids':      batch[0],
                      'attention_mask': batch[1],
                      'token_type_ids': batch[2] if args['model_type'] in ['bert', 'xlnet'] else None}
            outputs = model(**inputs)
            logits = outputs[0]
            
        if preds is None:
            preds = logits.detach().cpu().numpy()
        else:
            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)

    preds = np.argmax(preds, axis=1)
    
    export_submit_file(args['data_dir'] + args['test_file'], preds)
    return preds

In [18]:
# Export test from pre_train model
pretrain_model = os.path.join(args['data_dir'], args['output_dir'], "epoch_5")
model = BertForSequenceClassification.from_pretrained(pretrain_model)
tokenizer = BertTokenizer.from_pretrained(pretrain_model)
model.to(device)

I1128 21:55:03.004326 140596968896256 configuration_utils.py:148] loading configuration file /data/ai_challenge/vietnameseqa/data/outputs/epoch_5/config.json
I1128 21:55:03.005117 140596968896256 configuration_utils.py:168] Model config {
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 119547
}

I1128 21:55:03.005844 14059

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [20]:
preds = test(model, tokenizer)
logger.info("Finished Export Submit File!")

I1128 21:55:18.758872 140596968896256 <ipython-input-7-24cce704e7f0>:4] Creating features from dataset file at /data/ai_challenge/vietnameseqa/data/
100%|██████████| 3512/3512 [00:01<00:00, 2258.79it/s]
I1128 21:55:21.258114 140596968896256 <ipython-input-9-1b93e49a2345>:8] ***** Running test  *****
I1128 21:55:21.258772 140596968896256 <ipython-input-9-1b93e49a2345>:9]   Num examples = 3512
I1128 21:55:21.259322 140596968896256 <ipython-input-9-1b93e49a2345>:10]   Batch size = 8


HBox(children=(IntProgress(value=0, description='Testing', max=439, style=ProgressStyle(description_width='ini…

I1128 22:01:37.812880 140596968896256 <ipython-input-20-8792cd99b14e>:2] Finished Export Submit File!



