# Artificial Intelligence II - Homework 4
# Question 3

**Note:** 
I call set_seed() at the exact same spot at each run of a model to have deterministic results.


In [1]:
# from google.colab import drive
# drive.mount('/content/drive')

In [2]:
# PATH = '/content/drive/MyDrive/Colab Notebooks/Artificial Intelligence II/bert/'

# Import Libraries and Read Datasets

Import libraries that will be used in this notebook, define a seeding function and set device to cuda if available.


In [3]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import os

import numpy as np
from numpy import unravel_index
import pandas as pd
import math

import matplotlib.pyplot as plt
%matplotlib inline
import sklearn.metrics
import seaborn as sns
import random
import sys
from IPython.display import Image
import time

# for text preprocessing
import re
import string

!CUBLAS_WORKSPACE_CONFIG=:4096:2 # for cuda deterministic behavior

######### BERT ############
# first install transformers from hugging face
!pip install transformers

# imports
from transformers import BertTokenizer, BertForQuestionAnswering

# dataloaders 
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def set_seed(seed = 1234):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.enabled = False
    torch.backends.cudnn.benchmark = False
    # torch.use_deterministic_algorithms(False)
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Working on:', device)

Working on: cuda


Convert trivia QA dataset to SQuAD format

In [4]:
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Script based on https://github.com/mandarjoshi90/triviaqa/blob/master/utils/convert_to_squad_format.py
# We include functions that are modified from https://github.com/mandarjoshi90/triviaqa/tree/master/utils
# cite: https://github.com/mandarjoshi90/triviaqa/

import os
import argparse
import json
import nltk
# from utils.convert_to_squad_format import get_qad_triples
def add_triple_data(datum, page, domain):
    qad = {'Source': domain}
    for key in ['QuestionId', 'Question', 'Answer']:
        qad[key] = datum[key]
    for key in page:
        qad[key] = page[key]
    return qad


def get_qad_triples(data):
    qad_triples = []
    for datum in data['Data']:
        for key in ['EntityPages', 'SearchResults']:
            for page in datum.get(key, []):
                qad = add_triple_data(datum, page, key)
                qad_triples.append(qad)
    return qad_triples

# from utils.utils import get_file_contents

def get_file_contents(filename, encoding='utf-8'):
    with open(filename, encoding=encoding) as f:
        content = f.read()
    return content

# from utils.dataset_utils import read_triviaqa_data, get_question_doc_string

def read_clean_part(datum):
    for key in ['EntityPages', 'SearchResults']:
        new_page_list = []
        for page in datum.get(key, []):
            if page['DocPartOfVerifiedEval']:
                new_page_list.append(page)
        datum[key] = new_page_list
    assert len(datum['EntityPages']) + len(datum['SearchResults']) > 0
    return datum

def read_json(filename, encoding='utf-8'):
    contents = get_file_contents(filename, encoding=encoding)
    return json.loads(contents)

def read_triviaqa_data(qajson):
    data = read_json(qajson)
    # read only documents and questions that are a part of clean data set
    if data['VerifiedEval']:
        clean_data = []
        for datum in data['Data']:
            if datum['QuestionPartOfVerifiedEval']:
                if data['Domain'] == 'Web':
                    datum = read_clean_part(datum)
                clean_data.append(datum)
        data['Data'] = clean_data
    return data


def get_question_doc_string(qid, doc_name):
    return '{}--{}'.format(qid, doc_name)
#-------------------------------------------------------



def answer_index_in_document(answer, document):
    answer_list = answer['Aliases'] + answer['NormalizedAliases']
    for answer_string_in_doc in answer_list:
        index = document.find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer['NormalizedValue'], -1


def select_relevant_portion(text):
    paras = text.split('\n')
    selected = []
    done = False
    for para in paras:
        sents = sent_tokenize.tokenize(para)
        for sent in sents:
            words = nltk.word_tokenize(sent)
            for word in words:
                selected.append(word)
                if len(selected) >= 800:
                    done = True
                    break
            if done:
                break
        if done:
            break
        selected.append('\n')
    st = ' '.join(selected).strip()
    return st


def triviaqa_to_squad_format(triviaqa_file, data_dir, output_file):
    triviaqa_json = read_triviaqa_data(triviaqa_file)
    qad_triples = get_qad_triples(triviaqa_json)

    data = []

    for triviaqa_example in qad_triples:
        question_text = triviaqa_example['Question']
        text = get_file_contents(os.path.join(data_dir, triviaqa_example['Filename']), encoding='utf-8')
        context = select_relevant_portion(text)

        para = {'context': context, 'qas': [{'question': question_text, 'answers': []}]}
        data.append({'paragraphs': [para]})
        qa = para['qas'][0]
        qa['id'] = get_question_doc_string(triviaqa_example['QuestionId'], triviaqa_example['Filename'])
        qa['is_impossible'] = True
        ans_string, index = answer_index_in_document(triviaqa_example['Answer'], context)

        if index != -1:
            qa['answers'].append({'text': ans_string, 'answer_start': index})
            qa['is_impossible'] = False

    triviaqa_as_squad = {'data': data, 'version': '2.0'}

    with open(output_file, 'w', encoding='utf-8') as outfile:
        json.dump(triviaqa_as_squad, outfile, indent=2, sort_keys=True, ensure_ascii=False)



sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')

In [5]:
# !wget "https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz"

In [6]:
# !mkdir ./triviaqa 
# !tar -C ./triviaqa -zxf triviaqa-rc.tar.gz

In [7]:
# triviaqa_to_squad_format('triviaqa/qa/wikipedia-train.json', 'triviaqa/evidence/wikipedia', 'triviaqa/triviaqa_train.json')

In [8]:
# triviaqa_to_squad_format('triviaqa/qa/wikipedia-dev.json', 'triviaqa/evidence/wikipedia', 'triviaqa/triviaqa_dev.json')

Now we are going to preprocess like SQuAD

I used the datasets library from hugging face.

In [9]:
!pip install datasets
import datasets

Collecting datasets
  Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
     |████████████████████████████████| 312 kB 779 kB/s            
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
     |████████████████████████████████| 212 kB 11.7 MB/s            
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-1.18.4 responses-0.18.0 xxhash-3.0.0


In [10]:
import pyarrow as pa
import json

def jsontodataset(filepath, up_to=None):
    contexts = []
    questions = []
    ids = []
    answers_list = []
    count = 0
    #     print("here")
    with open(filepath) as f:
        squad = json.load(f)
        for article in squad["data"]:
            title = article.get("title", "").strip()
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].strip()
                for qa in paragraph["qas"]:
                    count+=1
                    if up_to!=None and count>=up_to: 
                        dataset = datasets.Dataset(pa.Table.from_pydict({'context': contexts, 'question': questions, 'id': ids, 'answers': answers_list}))
                        return dataset
                    
                    question = qa["question"].strip()
                    id_ = qa["id"]

                    answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                    answers = [answer["text"].strip() for answer in qa["answers"]]


                    contexts.append(context)
                    questions.append(question)
                    ids.append(id_)
                    answers_list.append({"answer_start": answer_starts, "text": answers,})

    dataset = datasets.Dataset(pa.Table.from_pydict({'context': contexts, 'question': questions, 'id': ids, 'answers': answers_list}))
    #     print(dataset)
    return dataset


In [11]:
# train_dataset = jsontodataset('./triviaqa/triviaqa_train.json', 33194)
train_dataset = jsontodataset('../input/trivia-squadformat/triviaqa_train (1).json', 55323)

In [12]:
validation_dataset = jsontodataset('../input/trivia-squadformat/triviaqa_dev (1).json', 7114)

In [13]:
train_dataset

Dataset({
    features: ['context', 'question', 'id', 'answers'],
    num_rows: 55322
})

In [14]:
train_dataset[0]

{'context': "England is a country that is part of the United Kingdom . It shares land borders with Scotland to the north and Wales to the west . The Irish Sea lies northwest of England and the Celtic Sea lies to the southwest . England is separated from continental Europe by the North Sea to the east and the English Channel to the south . The country covers much of the central and southern part of the island of Great Britain , which lies in the North Atlantic ; and includes over 100 smaller islands such as the Isles of Scilly , and the Isle of Wight . \n \n The area now called England was first inhabited by modern humans during the Upper Palaeolithic period , but takes its name from the Angles , one of the Germanic tribes who settled during the 5th and 6th centuries . England became a unified state in the 10th century , and since the Age of Discovery , which began during the 15th century , has had a significant cultural and legal impact on the wider world . The English language , the A

Overview of the feature names of the dataset.

Let's print the first example.

We see that for 'answers' column the dataset contains a dictionary with keys 'text' and 'answer_start', that each contain a list with one element. 

In [15]:
# train_dataset[0]

Same features for the validation dataset.

In [16]:
validation_dataset

Dataset({
    features: ['context', 'question', 'id', 'answers'],
    num_rows: 7113
})

In validation, the column 'answers' contains a dictionary with keys 'text' and 'answer_start' as well, but each contains a list with multiple elements

In [17]:
validation_dataset[4]

{'context': "Kathleen Mary Ferrier , CBE ( 22 April 1912 - 8 October 1953 ) was an English contralto singer who achieved an international reputation as a stage , concert and recording artist , with a repertoire extending from folksong and popular ballads to the classical works of Bach , Brahms , Mahler and Elgar . Her death from cancer , at the height of her fame , was a shock to the musical world and particularly to the general public , which was kept in ignorance of the nature of her illness until after her death . \n \n The daughter of a Lancashire village schoolmaster , Ferrier showed early talent as a pianist , and won numerous amateur piano competitions while working as a telephonist with the General Post Office . She did not take up singing seriously until 1937 , when after winning a prestigious singing competition at the Carlisle Festival she began to receive offers of professional engagements as a vocalist . Thereafter she took singing lessons , first with J.E . Hutchinson and

If there is no answer, the lists are empty

In [18]:
# validation_dataset[-10]

In [19]:
# df = pd.DataFrame(train_dataset)

In [20]:
# df.tail(10)

### datasets


Let's load the bert model for question answering. This model gives as outputs the start and end logits, as described in the BERT paper, before the softmax.

Load bert-large-uncased model and tokenizer.

In [21]:
from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

Find encodings for BERT with tokenizer. The max length of a sequence (question+anwer with special tokens) is 489, so I use that number for padding.

Preprocessing the dataset

In [22]:
# train_dataset = train_dataset.map(lambda example: {'answer_text': example['answers']['text'][0] if len(example['answers']['text'])!=0 else None, 'answer_start': example['answers']['answer_start'][0] if len(example['answers']['text'])!=0 else None}, remove_columns=['answers'])

In [23]:
# train_dataset[1]

For the training dataset, I noticed that for each question there is only one answer, so there is no need to keep the values of the answers dictionary in lists. For example: `'answers': {'text': ['singing and dancing'], 'answer_start': [207]}}` can be reformated to `'answers': {'text': 'singing and dancing', 'answer_start': 207}}`. As for questions that are inanswerable (they look like this:`'answers': {'text': [], 'answer_start': []}}` we can just have `'answers': {'text': "", 'answer_start': 0}}`.

In [24]:
def find_end(example):

    if (len(example['answers']['text']) != 0):
        context = example['context']
        text = example['answers']['text'][0]
        start_idx = example['answers']['answer_start'][0]

        end_idx = start_idx + len(text)
        
        temp = example['answers'] # to change the value
        temp['answer_end']=end_idx 
        temp['answer_start'] = start_idx # [num]->num
        temp['text'] = text # ['text']->text
    
    else:
        temp = example['answers']
        temp['answer_end'] = 0 # []->0
        temp['answer_start'] = 0 # []->0
        temp['text'] = "" # []->""
        
    return example

train_dataset = train_dataset.map(find_end)

0ex [00:00, ?ex/s]

Check some examples:

In [25]:
train_dataset[1]

{'context': "Dame Judith Olivia `` Judi '' Dench , ( born 9 December 1934 ) is an English actress and author . Dench made her professional debut in 1957 with the Old Vic Company . Over the following few years she performed in several of Shakespeare 's plays in such roles as Ophelia in Hamlet , Juliet in Romeo and Juliet and Lady Macbeth in Macbeth . Although most of her work during this period was in theatre , she also branched into film work , and won a BAFTA Award as Most Promising Newcomer . She drew strong reviews for her leading role in the musical Cabaret in 1968 . \n \n Over the next two decades , Dench established herself as one of the most significant British theatre performers , working for the National Theatre Company and the Royal Shakespeare Company . She achieved success in television during this period , in the series A Fine Romance from 1981 until 1984 , and in 1992 with a starring role in the romantic comedy series As Time Goes By . Her film appearances were infrequent

Example with no answer

Tokenize train dataset and find end and start tokens. The sequence lenght will be 512, the maximum one for bert.

In [26]:
tokenized_train = tokenizer(train_dataset['context'], train_dataset['question'], truncation=True, padding=True)

In [27]:
def find_token_indexes(tokenized, dataset):
    start_positions = []
    end_positions = []
    answers = dataset['answers']
    for i in range(len(answers)):
#         print(answers)
        if (answers[i]['text'] != ''):
            start_positions.append(tokenized.char_to_token(i, answers[i]['answer_start']))
            end_positions.append(tokenized.char_to_token(i, answers[i]['answer_end'] - 1))
            # if start position is None, the answer passage has been truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
            if end_positions[-1] is None:
                end_positions[-1] = tokenizer.model_max_length
        else:
            start_positions.append(0)
            end_positions.append(0)
            

    return start_positions, end_positions
    
s, e = find_token_indexes(tokenized_train, train_dataset)
train_dataset = train_dataset.add_column("start_position", s)
train_dataset = train_dataset.add_column("end_position", e)

In [28]:
tokenizer.save_pretrained('.')

('./tokenizer_config.json',
 './special_tokens_map.json',
 './vocab.txt',
 './added_tokens.json',
 './tokenizer.json')

In [29]:
train_dataset

Dataset({
    features: ['context', 'question', 'id', 'answers', 'start_position', 'end_position'],
    num_rows: 55322
})

In [30]:
batch_size = 8
train_data = TensorDataset(torch.tensor(tokenized_train['input_ids'], dtype=torch.int64), 
                           torch.tensor(tokenized_train['attention_mask'], dtype=torch.float), 
                           torch.tensor(train_dataset['start_position'], dtype=torch.int64), 
                           torch.tensor(train_dataset['start_position'], dtype=torch.int64))

train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

Validation dataset does not need that much preprocessing. I pass to the dataloader only the input_ids, token_type_ids and attention masks, that will be passed to bert model in batches. I use a Sequential sampler to keep the indexing same as the validation dataset. We will need the offsets mapping to construct the sentence from the predicted start and end tokens and compare it with the actual answers.

In [31]:
# tokenized_validation = tokenizer(validation_dataset['context'], validation_dataset['question'], truncation=True, padding=True, return_offsets_mapping=True)

In [32]:
# batch_size = 8
# val_data = TensorDataset(torch.tensor(tokenized_validation['input_ids'], dtype=torch.int64), 
#                         torch.tensor(tokenized_validation['attention_mask'], dtype=torch.float))
# val_sampler = SequentialSampler(val_data)
# val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

## Initialize & train functions

The training for each epoch took aprox. 2 hours so I couldn't try many epochs and do many runs when using the whole dataset.

For optimizer, I used AdamW (Adam with weight decay) which is the one that was used in BERT during pre-training. 

In [33]:
epochs = 3
model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

In [34]:
from tqdm import tqdm
# model.load_state_dict(torch.load("./weights_" + str(0) + ".pth"))
for epoch in range(epochs):
    epoch_loss = []
    validation_loss = []
    
    total_loss = 0
    model.train()

    count=-1
    progress_bar = tqdm(train_dataloader, leave=True, position=0)
    progress_bar.set_description(f"Epoch {epoch+1}")
    for batch in progress_bar:
        count+=1
        input_ids, mask, start, end  = tuple(t.to(device) for t in batch)

        model.zero_grad()
        loss, start_logits, end_logits = model(input_ids = input_ids, 
                                                attention_mask = mask, 
                                                start_positions = start, 
                                                end_positions = end,
                                                return_dict = False)           

        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        
        if (count % 20 == 0 and count != 0):
            avg = total_loss/count
            progress_bar.set_postfix(Loss=avg)
            
    torch.save(model.state_dict(), "./trivia" + str(epoch) + ".h5") # save for later use
    avg_train_loss = total_loss / len(train_dataloader)
    epoch_loss.append(avg_train_loss)
    print(f"Epoch {epoch} Loss: {avg_train_loss}\n")

Epoch 1: 100%|██████████| 6916/6916 [27:25<00:00,  4.20it/s, Loss=2.4]


Epoch 0 Loss: 2.399498201775251



Epoch 2: 100%|██████████| 6916/6916 [27:29<00:00,  4.19it/s, Loss=1.49]


Epoch 1 Loss: 1.4934059961867545



Epoch 3: 100%|██████████| 6916/6916 [27:28<00:00,  4.20it/s, Loss=1.02]


Epoch 2 Loss: 1.019124074005767



# EVALUATION

evaluate on bert
---



In [35]:
# from tqdm import tqdm
# model.load_state_dict(torch.load("./trivia2.h5"))

# threshold = 1.0
# epoch_i = 0
# correct = 0 
# pred_dict = {}
# na_prob_dict = {}

# model.eval()
# correct = 0
# batch_val_losses = []
# row = 0
# for test_batch in tqdm(val_dataloader):
#     input_ids, masks = tuple(t.to(device) for t in test_batch)

#     with torch.no_grad():
#         # prediction logits
#         start_logits, end_logits = model(input_ids=input_ids,
#                                         attention_mask=masks,
#                                         return_dict=False)
# #         print(outputs)

#     # to cpu
#     start_logits = start_logits.detach().cpu()
#     end_logits = end_logits.detach().cpu()

#     # for every sequence in batch 
#     for bidx in range(len(start_logits)):
#         # apply softmax to logits to get scores
#         start_scores = np.array(F.softmax(start_logits[bidx], dim = 0))
#         end_scores = np.array(F.softmax(end_logits[bidx], dim = 0))
# #             print(start_scores.max(), end_scores.max(), "|", start_scores[0], end_scores[0])

#         # find max for start<=end
#         size = len(start_scores)
#         scores = np.zeros((size, size))
#         # print(start_logits[bidx])
#         # print(end_logits[bidx])
#         for j in range(size):
#             for i in range(j+1): # include j
#                 scores[i,j] = start_scores[i] + end_scores[j]

#         # find best i and j
#         # print(unravel_index(scores.argmax(), scores.shape))
#         start_pred, end_pred = unravel_index(scores.argmax(), scores.shape)
#         answer_pred = ""
#         if (scores[start_pred, end_pred] > scores[0,0]+threshold):

#             offsets = tokenized_validation.offset_mapping[row]
#             pred_char_start = offsets[start_pred][0]

#             # compare with answer starts
#             # if pred_char_start in validation_dataset[row]['answers']['answer_start']:
#             if end_pred < len(offsets):
#                 pred_char_end = offsets[end_pred][1]
#                 answer_pred = validation_dataset[row]['context'][pred_char_start:pred_char_end]
#             else:
#                 answer_pred = validation_dataset[row]['context'][pred_char_start:]
# #                 print(answer_pred)
# #                 print(validation_dataset[row]['answers']['text'])
#             if answer_pred in validation_dataset[row]['answers']['text']:
#                 correct += 1
# #                     print("correct")
# #                 if (len(validation_dataset[row]['answers']['text']) ==0):
# #                     print("here")
# #                     print("correct empty")



#         else:
#             if (len(validation_dataset[row]['answers']['text']) ==0):
#                 correct += 1        
# #                 else:
# #                     print("wrong empty")

#         pred_dict[validation_dataset[row]['id']] = answer_pred
#         na_prob_dict[validation_dataset[row]['id']] = scores[0,0]

#         row+=1


# accuracy = correct/validation_dataset.num_rows
# print("accuracy is: ", accuracy)

In [36]:
# import json 
# with open("pred.json", "w") as outfile:
#     json.dump(pred_dict, outfile)

In [37]:
# import json 
# with open("na_prob.json", "w") as outfile:
#     json.dump(na_prob_dict, outfile)

In [38]:
# for i in range(5):
#     print(f"Question: {validation_dataset[i]['question']}")
#     print(f"Predicted answer: {pred_dict[validation_dataset[i]['id']]}")
#     print(f"Answers: {validation_dataset[i]['answers']['text']}\n")

Write the official evaluation

In [39]:
# !wget https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ -O evaluation.py

In [40]:
# !python evaluation.py dev-v2.0.json pred.json --na-prob-file na_prob.json --na-prob-thresh 1 --out-image-dir ./

In [41]:
# !python evaluation.py dev-v2.0.json pred.json --na-prob-file na_prob.json --out-image-dir ./