In [1]:
!pip install transformers
#==4.19.2
from google.colab import drive
drive.mount('./mydata')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 9.6 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 52.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.9 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [2]:
import sys
sys.path.append('./mydata/MyDrive/CSNLP_Project/Bert_model_COQA')

In [3]:
import collections
import glob
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from transformers import (AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, BertTokenizer, BertModel, BertConfig)
from processors.coqa import Extract_Features, Processor, Result
from processors.evaluate import CoQAEvaluator, parse_args
from processors.Bert_model import BertBaseUncasedModel, load_dataset, Write_predictions

import torch
import csv
import numpy as np

import json

# locations
train_file="coqa-train-v1.0.json"
predict_file="coqa-dev-v1.0.json"
cur_path = os.getcwd()
output_directory = cur_path + "/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models"
input_dir = cur_path + "/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data"
# can use either BERT base or BERT large
pretrained_model="bert-base-uncased"
# pretrained_model="bert-large-uncased"
epochs = 2
evaluation_batch_size=16
train_batch_size=2

In [4]:
### train function

def train(train_dataset, model, tokenizer, device, batch_size=train_batch_size):

    train_sampler = RandomSampler(train_dataset) 
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    t_total = len(train_dataloader) // 1 * epochs

    # Preparing optimizer and scheduler
    optimizer_parameters = [{"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "LayerNorm.weight"])],"weight_decay": 0.01,},{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in ["bias", "LayerNorm.weight"])], "weight_decay": 0.0}]
    optimizer = AdamW(optimizer_parameters,lr=1e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2000, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(pretrained_model, "optimizer.pt")) and os.path.isfile(os.path.join(pretrained_model, "scheduler.pt")):
        optimizer.load_state_dict(torch.load(
            os.path.join(pretrained_model, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(
            os.path.join(pretrained_model, "scheduler.pt")))

    counter = 1
    epochs_trained = 0
    train_loss, loss = 0.0, 0.0
    model.zero_grad()
    iterator = trange(epochs_trained, int(epochs), desc="Epoch", disable=False)
    for _ in iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
        for i,batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = { "input_ids": batch[0],"token_type_ids": batch[1], "attention_mask": batch[2],"start_positions": batch[3],"end_positions": batch[4],"rational_mask": batch[5],"cls_idx": batch[6]}
            # loss = model(**inputs, return_dict=False)
            loss = model(**inputs)
            loss.backward()
            train_loss += loss.item()

            #   optimizing training parameters
            if (i + 1) % 1 == 0:
                optimizer.step()
                scheduler.step()  
                model.zero_grad()
                counter += 1
                #   Saving model weights every 1000 iterations
                if counter % 1000 == 0:
                    output_dir = os.path.join(output_directory, "model_weights"+str(epochs_trained))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            if (i+1) % 1000 == 0:
                print('iter: {}, loss: {}'.format(i,train_loss/counter))
    return train_loss/counter

## Training

In [5]:
#   check if gpu is available to use it or not
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   initialize configurations and tokenizer of Bert model 
config = BertConfig.from_pretrained(pretrained_model, return_dict=False)
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

model = BertBaseUncasedModel.from_pretrained(pretrained_model, from_tf=bool(".ckpt" in pretrained_model), config=config,cache_dir=None,)
# print(model)
model.to(device)

# if (os.path.exists(output_directory) and os.listdir(output_directory)):
#     raise ValueError("Output directory " + output_directory + " already exists, Change output_directory name")

method = 'replace_all'

#   Loading dataset and training
# this command will take several hours
cache_file_name = 'bert-base-uncased_train_with_T5_{}'.format(method)

train_file_name = 'coqa-train-v1.0-{}_with_T5.json'.format('append_v3')

train_dataset = load_dataset(tokenizer, input_dir=input_dir, evaluate=False, cache_file_name=cache_file_name, train_file_name=train_file_name, append_method='replace_all')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertBaseUncasedModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertBaseUncasedModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertBaseUncasedModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertBaseUncasedModel were not initialized from the model checkpoint at bert-base-uncased and are newly ini

Loading cache /content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/bert-base-uncased_train_with_T5_replace_all


In [None]:
# train the model on CoQA
train_loss = train(train_dataset, model, tokenizer, device)

variant_name = 'Bert_with_T5_rewritten_epoch2_replace_all'

#   create output directory for model parameters and to write predictions
if not os.path.exists(output_directory+'/'+variant_name) :
    os.makedirs(output_directory+'/'+variant_name)
            
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_directory+'/'+variant_name)
tokenizer.save_pretrained(output_directory+'/'+variant_name)

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

iter: 999, loss: 5.533492145242986
iter: 1999, loss: 4.690178165848049


## Prediction

predict on dev dataset

In [None]:
#   check if gpu is available to use it or not
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   Loading Bert model for writing predictions
# model = BertBaseUncasedModel.from_pretrained(output_directory)
# tokenizer = BertTokenizer.from_pretrained(output_directory, do_lower_case=True)
# model.to(device)
# run for different parameters

# check if gpu is available to use it or not
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

# for m in model_parameter_directory:
#     variant_name = m.split('/')[-1]
#     model = BertBaseUncasedModel.from_pretrained(m) 
#     tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
#     model.to(device)
#     Write_predictions(model, tokenizer, device, variant_name)
# train_dataset = load_dataset(tokenizer, evaluate=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

cache_file_name = 'bert-base-uncased_dev_with_T5_append'

predict_file_name = 'coqa-dev-v1.0.json'

model_parameter_directory

['/content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/Bert_from_original_Surya_epoch4',
 '/content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models/Bert_with_T5_rewritten_epoch4_append_v2']

In [None]:
# for m in model_parameter_directory:
m = model_parameter_directory[1]
variant_name = m.split('/')[-1]
# m = m + '/pytorch_model_2.bin'
model = BertBaseUncasedModel.from_pretrained(m) 
tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
model.to(device)
Write_predictions(model, tokenizer, device, variant_name, input_dir=input_dir, output_directory=output_directory, cache_file_name=cache_file_name, predict_file_name=None,method='append_v3.2', append_method='append')

Loading cache /content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/bert-base-uncased_dev_with_T5_append


Evaluating: 100%|██████████| 8775/8775 [03:14<00:00, 45.15it/s]
Writing preditions: 100%|██████████| 7983/7983 [00:28<00:00, 284.79it/s]


## Evaluation

In [None]:
evaluator = CoQAEvaluator(input_dir+'/'+predict_file)

variant_name = 'Bert_from_original_Surya_epoch4'

pre_file_bert = output_directory+'/'+variant_name+'/'+'predictions.json'

# evaluate
with open(pre_file_bert) as f:
    pred_data = CoQAEvaluator.preds_to_dict(pre_file_bert)

# write evaluate result
with open(output_directory+'/'+variant_name+'/'+'evaluation.json', 'w') as f:
    json.dump(evaluator.model_performance(pred_data), f, indent=2)

# show
# print(json.dumps(evaluator.model_performance(pred_data), indent=2))

get_domain_scores --> model_performance --> get_raw_scores --> compute_turn_score --> _compute_turn_score --> compute_exact /  compute_f1


## Test any input on the fine-tuned model: By ZYZ

In [None]:
### this cell load the fine tuned model

# check if gpu is available to use it or not
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

for m in model_parameter_directory:
    variant_name = m.split('/')[-1]
    model = BertBaseUncasedModel.from_pretrained(m) 
    tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
    model.to(device)

In [None]:
### load the dev dataset with gold answer

cache_file = os.path.join(input_dir,"bert-base-uncased_dev_test")

if os.path.exists(cache_file):
    print("Loading cache",cache_file)
    features_and_dataset = torch.load(cache_file)
    features, dataset, examples = (
        features_and_dataset["features"],features_and_dataset["dataset"],features_and_dataset["examples"])
else:
    print("Creating features from dataset file at", input_dir)

    processor = Processor()

    examples = processor.get_examples(input_dir, 2, filename=predict_file, threads=1)

    # max_seq_length is the total length for input sequence of BERT 
    features, dataset = Extract_Features(examples=examples,tokenizer=tokenizer,max_seq_length=512, doc_stride=128, max_query_length=64, is_training=True, threads=1)
    #   caching it in a cache file to reduce time
    torch.save({"features": features, "dataset": dataset, "examples": examples}, cache_file)

# create evaluation_test_dataloader
evalutation_test_sampler = SequentialSampler(dataset)
evaluation_test_dataloader = DataLoader(dataset, sampler=evalutation_test_sampler, batch_size=1)


Loading cache /content/mydata/MyDrive/Colab Notebooks/data/bert-base-uncased_dev_test


1. model.eval() will notify all your layers that you are in eval mode, that way, batchnorm or dropout layers will work in eval mode instead of training mode.
2. torch.no_grad() impacts the autograd engine and deactivate it. It will reduce memory usage and speed up computations but you won’t be able to backprop (which you don’t want in an eval script).

In [None]:
def predict(batch):
  '''
  given input_ids of text, QA history and current question, predict answer with 
  model and compare with ground truth answer span
  '''
  model.eval()
  batch = tuple(t.to(device) for t in batch)
  with torch.no_grad():
      inputs = {"input_ids": batch[0],"token_type_ids": batch[1],"attention_mask": batch[2]}
      inputs_text = inputs["input_ids"]
      outputs = model(**inputs)

  # convert ids to texts
  input_ids = inputs_text[0]
  # tokens are all QA and text
  tokens = tokenizer.convert_ids_to_tokens(input_ids)

  # sep_idx = list(input_ids).index(tokenizer.sep_token_id)
  # question = " ".join(tokens[:(sep_idx+1)])
  # sep_idx2 = list(input_ids[(sep_idx+1):]).index(tokenizer.sep_token_id)
  # text = " ".join(tokens[(sep_idx+1):(sep_idx2+(sep_idx+1))])

  def convert_to_list(tensor):
      return tensor.detach().cpu().tolist()
      
  # extract answer from prediction of bert
  output = [convert_to_list(output[0]) for output in outputs]
  start_logits, end_logits, yes_logits, no_logits, unk_logits = output
  start_pos = np.argmax(start_logits)
  end_pos = np.argmax(end_logits)

  print("\nQuestion:\n{}".format(question.capitalize()))
  answer = " ".join(tokens[start_pos:end_pos+1])
  print("Answer:\n{}.".format(answer.capitalize()))

  # pass

In [None]:
random_num = np.random.randint(0,len(evaluation_test_dataloader))
random_num = 3

## evaluation_test_dataloader is not iteratable
## dummy method
count = 0
for batch in evaluation_test_dataloader:
    # print(len(batch[3]))
    if count >= random_num:
        break
    count += 1

# data from batch
input_ids = batch[0][0]
token_type_ids = batch[1][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# sep_idx = list(input_ids).index(tokenizer.sep_token_id)
sep_idx = torch.where(token_type_ids==1)[0][0]
question = " ".join(tokens[:(sep_idx)])
sep_idx2 = list(input_ids[(sep_idx+1):]).index(tokenizer.sep_token_id)
text = " ".join(tokens[(sep_idx+1):(sep_idx2+(sep_idx+1))])

# extract gold answer
start_positions = batch[3][0]
end_positions = batch[4][0]
gold_answer = " ".join(tokens[start_positions:end_positions+1])

print('question: '+question,'\n\n','text: '+text,'\n\n','gold answer: '+gold_answer)#+data["answer"][random_num])

question: [CLS] where did she live ? in a barn [SEP] did she live alone ? no [SEP] who did she live with ? [SEP] 

 text: upon a time , in a barn near a farm house , there lived a little white kitten named cotton . cotton lived high up in a nice warm place above the barn where all of the farmer ' s horses slept . but cotton was n ' t alone in her little home above the barn , oh no . she shared her hay bed with her mommy and 5 other sisters . all of her sisters were cute and fluffy , like cotton . but she was the only white one in the bunch . the rest of her sisters were all orange with beautiful white tiger stripes like cotton ' s mommy . being different made cotton quite sad . she often wished she looked like the rest of her family . so one day , when cotton found a can of the old farmer ' s orange paint , she used it to paint herself like them . when her mommy and sisters found her they started laughing . " what are you doing , cotton ? ! " " i only wanted to be more like you " . cot

In [None]:
predict(batch)


Question:
[cls] where did she live ? in a barn [sep] did she live alone ? no [sep] who did she live with ? [sep]
Answer:
Her mommy and 5 other sisters.


## Write all the failure cases

#### Use evaluator.compute_turn_score to evaluate each case

e.g.

evaluator.compute_turn_score('3dr23u6we5exclen4th8uq9rb42tel', 4, 'her mommy and 5 other sisters')

output: {'em': 1.0, 'f1': 1.0} (exact match and f1 score)


In [None]:
# failure criteria
criteria_flag = 'full'

# tuples for all prediction data
pre_data_list = []
for key in pred_data.keys():
    pre_data_list.append(key + (pred_data[key],) )

# find out all the failure cases: their ids
fail_results = []
for prediction in pre_data_list:
    result = evaluator.compute_turn_score(prediction[0],prediction[1],prediction[2])
    # totally wrong
    if criteria_flag == 'full':
        if result == {'em': 0.0, 'f1': 0.0}:
            fail_results.append(prediction)
    # not fully correct
    elif criteria_flag == 'partial':
        if result != {'em': 1.0, 'f1': 1.0}:
            fail_results.append(prediction)
    # print(result)


In [None]:
len(fail_results) / len(pre_data_list)

0.1663535011900288

In [None]:
# write results
with open(output_directory+'/'+variant_name+'/'+'fully_failed_predictions.csv', mode='w+', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['passage_id','turn_id','story_type', 'story', 'history QA', 'current question', 'gold_answers', 'failed_prediction', 'scores'])

with open(output_directory+'/'+variant_name+'/'+'fully_failed_predictions.csv', mode='a+', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    
    for fail_result in fail_results:
        story_id = fail_result[0]
        turn_id = fail_result[1]
        key = (story_id, turn_id)
        a_gold_list = evaluator.gold_data[key]
        scores = evaluator.compute_turn_score(fail_result[0],fail_result[1],fail_result[2])
        # evaluator.questions[turn_id]
        writer.writerow([story_id,turn_id,evaluator.id_to_source[story_id],evaluator.story_dict[story_id],\
                         evaluator.question_dict[story_id][:turn_id-1],evaluator.question_dict[story_id][turn_id-1],a_gold_list, fail_result[2], scores])


### Test results of yes/no answers

In [None]:
# tuples for all prediction data
yn_data_list = []
for key in pred_data.keys():
    if pred_data[key] in ['yes','no']:
        yn_data_list.append(key + (pred_data[key],) )


# find out all the failure cases: their ids
yn_fail_results = []
for prediction in yn_data_list:
    result = evaluator.compute_turn_score(prediction[0],prediction[1],prediction[2])
    # not fully correct
    # if result != {'em': 1.0, 'f1': 1.0}:
    # totally wrong
    if result == {'em': 0.0, 'f1': 0.0}:
        yn_fail_results.append(prediction)
    # print(result)


In [None]:
yn_ratio = len(yn_data_list) / len(pred_data)
yn_ratio

0.22059376174370537

In [None]:
non_yn_failure_ratio = (len(fail_results)-len(yn_fail_results)) / (len(pre_data_list)-len(yn_data_list))
non_yn_failure_ratio

0.1486660237865638

In [None]:
yn_failure_ratio = len(yn_fail_results) / len(yn_data_list)
yn_failure_ratio

0.22884724588302102

Observation: The fully wrong ({'em': 0.0, 'f1': 0.0}) ratio of yes/no type question is higher then the normal question.


Need to construct a dictionary for computing the overall F1 score of these two different type of questions.

In [None]:
# tuples for all prediction data
yn_dict = {}
non_yn_dict = {}
yn_turns  = {}
for i in list(domain_mappings.values()):
    yn_turns[i] = 0
    
for key in pred_data.keys():
    if pred_data[key] in ['yes','no']:
        story_type = evaluator.id_to_source[key[0]]
        yn_turns[domain_mappings[story_type]] += 1
        yn_dict[key] = pred_data[key]
    else:
        non_yn_dict[key] = pred_data[key]

In [None]:
list(domain_mappings.values())

['children_stories',
 'literature',
 'mid-high_school',
 'news',
 'wikipedia',
 'science',
 'reddit']

In [None]:
yn_turns[story_type]

290

In [None]:
# evaluator.model_performance(yn_dict)
result_yn = json.dumps(evaluator.model_performance(yn_dict), indent=2)
result_yn = json.loads(result_yn)
for case in result_yn:
    if case not in ['in_domain', 'out_domain','overall']:
        if yn_turns[case] == 0:
            print('no yes/no answer in {}'.format(case))
            pass
        else:
            result_yn[case]['em'] = result_yn[case]['em'] * result_yn[case]['turns'] / yn_turns[case]
            result_yn[case]['f1'] = result_yn[case]['f1'] * result_yn[case]['turns'] / yn_turns[case]
            result_yn[case]['turns'] = yn_turns[case]

no yes/no answer in reddit
no yes/no answer in science


In [None]:
result_yn

{'children_stories': {'em': 74.27762039660055,
  'f1': 74.27762039660055,
  'turns': 353},
 'in_domain': {'em': 16.8, 'f1': 16.8, 'turns': 7983},
 'literature': {'em': 74.05432098765431,
  'f1': 74.05432098765431,
  'turns': 405},
 'mid-high_school': {'em': 75.67442455242966,
  'f1': 75.67442455242966,
  'turns': 391},
 'news': {'em': 78.35310559006211, 'f1': 78.35310559006211, 'turns': 322},
 'out_domain': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'overall': {'em': 16.8, 'f1': 16.8, 'turns': 7983},
 'reddit': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'science': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'wikipedia': {'em': 80.17862068965518, 'f1': 80.17862068965518, 'turns': 290}}

In [None]:
# evaluator.model_performance(yn_dict)
result_non_yn = json.dumps(evaluator.model_performance(non_yn_dict), indent=2)
result_non_yn = json.loads(result_non_yn)
for case in result_non_yn:
    if case not in ['in_domain', 'out_domain','overall']:
        if yn_turns[case] == result_non_yn[case]['turns']:
            print('no yes/no answer in {}'.format(case))
            pass
        else:
            result_non_yn[case]['em'] = result_non_yn[case]['em'] * result_non_yn[case]['turns'] / (result_non_yn[case]['turns']-yn_turns[case])
            result_non_yn[case]['f1'] = result_non_yn[case]['f1'] * result_non_yn[case]['turns'] / (result_non_yn[case]['turns']-yn_turns[case])
            result_non_yn[case]['turns'] = (result_non_yn[case]['turns']-yn_turns[case])

no yes/no answer in reddit
no yes/no answer in science


In [None]:
result_non_yn

{'children_stories': {'em': 60.748600746268664,
  'f1': 73.64272388059702,
  'turns': 1072},
 'in_domain': {'em': 48.1, 'f1': 58.1, 'turns': 7983},
 'literature': {'em': 58.81306122448979,
  'f1': 71.98612244897959,
  'turns': 1225},
 'mid-high_school': {'em': 57.63232963549921,
  'f1': 71.25451664025357,
  'turns': 1262},
 'news': {'em': 63.62381311228335, 'f1': 76.92019593067067, 'turns': 1327},
 'out_domain': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'overall': {'em': 48.1, 'f1': 58.1, 'turns': 7983},
 'reddit': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'science': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'wikipedia': {'em': 67.4254491017964, 'f1': 78.37904191616767, 'turns': 1336}}

In [None]:
print(json.dumps(evaluator.model_performance(pred_data), indent=2))

{
  "children_stories": {
    "em": 64.1,
    "f1": 73.8,
    "turns": 1425
  },
  "literature": {
    "em": 62.5,
    "f1": 72.5,
    "turns": 1630
  },
  "mid-high_school": {
    "em": 61.9,
    "f1": 72.3,
    "turns": 1653
  },
  "news": {
    "em": 66.6,
    "f1": 77.2,
    "turns": 1649
  },
  "wikipedia": {
    "em": 69.6,
    "f1": 78.7,
    "turns": 1626
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 65.0,
    "f1": 74.9,
    "turns": 7983
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 65.0,
    "f1": 74.9,
    "turns": 7983
  }
}


so the actuall overall f1 score for yes/no question is higher