In [1]:
# Colab setup: install Hugging Face transformers into the runtime, then mount
# Google Drive at ./mydata so files under MyDrive can be read and written.
# NOTE(review): the install is unpinned; the recorded output below resolved 4.20.0.
!pip install transformers
from google.colab import drive
drive.mount('./mydata')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 4.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 76.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 61.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 5.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYA

In [5]:
import sys
# Add the project folder on the mounted Drive to the import path; presumably this is
# what lets the `data.processors.*` imports in the next cell resolve — confirm.
sys.path.append('./mydata/MyDrive/CSNLP_Project/Bert_model_COQA')

In [15]:
import collections
import glob
import os
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm, trange
from transformers import (AdamW, AutoConfig, AutoTokenizer, get_linear_schedule_with_warmup, BertTokenizer, BertModel, BertConfig)
from data.processors.coqa import Extract_Features, Processor, Result
from data.processors.metrics import get_predictions
from data.processors.evaluate import CoQAEvaluator, parse_args
from transformers import BertModel, BertPreTrainedModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
import csv
import numpy as np

import json

# locations
# File names of the CoQA train / dev JSON files; they are resolved against input_dir.
train_file="coqa-train-v1.0.json"
predict_file="coqa-dev-v1.0.json"
# Paths are built from the current working directory, so the Drive mount at ./mydata
# must be relative to the directory the notebook runs in.
cur_path = os.getcwd()
# Where checkpoints, saved model variants and prediction/evaluation files are written.
output_directory = cur_path + "/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data/Bert_models"
# Where the raw dataset files and the cached feature files live.
input_dir = cur_path + "/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data"
# can use either BERT base or BERT large
pretrained_model="bert-base-uncased"
# pretrained_model="bert-large-uncased"
# Training hyper-parameters read as globals by train() and Write_predictions().
epochs = 4
evaluation_batch_size=16
train_batch_size=2

## Classes and Functions

In [8]:
#   our model is adapted from the baseline model of https://arxiv.org/pdf/1909.10772.pdf

class BertBaseUncasedModel(BertPreTrainedModel):
    """BERT encoder with rationale-weighted span, yes/no and unknown answer heads.

    The heads share three small layers: ``fc`` (hidden -> hidden), ``linear1``
    (hidden -> 1) and ``linear2`` (hidden -> 2). The same ``fc``/``linear1`` pair
    scores the rationale, the attention pooling and the "unknown" answer, and the
    same ``fc``/``linear2`` pair scores both the span start/end and the yes/no
    answer. Layer names and maths are kept as they are so previously saved
    checkpoints keep loading and behaving identically.
    """

    #   Initialize Layers for our model
    def __init__(self,config,activation='relu'):
        """Build the encoder and the shared head layers.

        Args:
            config: BERT configuration; ``config.hidden_size`` sizes the heads.
            activation: name of a function in ``torch.nn.functional`` applied by the heads.
        """
        super(BertBaseUncasedModel, self).__init__(config)
        self.bert = BertModel(config)
        hidden_size = config.hidden_size
        self.fc=nn.Linear(hidden_size,hidden_size)
        self.linear1 =nn.Linear(hidden_size,1)
        self.linear2= nn.Linear(hidden_size,2)
        self.activation = getattr(F, activation)
        self.init_weights()

    def forward(self,input_ids,token_type_ids=None,attention_mask=None,start_positions=None,end_positions=None,rational_mask=None,cls_idx=None,head_mask=None):
        """Run the encoder and the answer heads.

        Args:
            input_ids, attention_mask, head_mask: forwarded to the BERT encoder.
            token_type_ids: segment ids; required, because they are also used as a
                multiplicative mask on the per-token rationale scores.
            start_positions, end_positions: gold targets; when both are given the
                method returns the training loss instead of logits.
            rational_mask: accepted for interface compatibility but not used.
            cls_idx: offset added to the gold positions before the loss; required
                whenever the gold positions are given.

        Returns:
            The scalar training loss when both gold positions are given, otherwise
            the tuple ``(start_logits, end_logits, yes_logits, no_logits, unk_logits)``.

        Raises:
            ValueError: if ``token_type_ids`` is None.
        """
        if token_type_ids is None:
            # The original code failed later with an AttributeError on None.
            raise ValueError("token_type_ids is required: it masks the rationale scores")

        #   Bert-base outputs
        outputs = self.bert(input_ids,token_type_ids=token_type_ids,attention_mask=attention_mask,head_mask=head_mask)
        # Index instead of unpacking: this works for a plain tuple (return_dict=False),
        # for a ModelOutput, and when extra entries such as hidden states are returned.
        output_vector, bert_pooled_output = outputs[0], outputs[1]

        #   rational logits (rationale probability to calculate start and end logits)
        #   fc = w2 x relu(W1 x h)
        # NOTE(review): the code applies the activation AFTER linear1, not between fc
        # and linear1 as the formula above reads. Left unchanged to preserve behavior.
        rational_logits = self.fc(output_vector)
        rational_logits = self.activation(self.linear1(rational_logits))

        #   pr = sigmoid(fc)
        rational_logits = torch.sigmoid(rational_logits)
        #   h1 = pr x outputvector-h
        output_vector = output_vector * rational_logits
        # Only tokens whose segment id is non-zero keep a non-zero rationale score.
        mask = token_type_ids.type(output_vector.dtype)
        rational_logits = rational_logits.squeeze(-1) * mask

        #   calculating start and end logits using FC(h1)
        start_end_logits = self.fc(output_vector)
        start_end_logits = self.activation(self.linear2(start_end_logits))

        start_logits, end_logits = start_end_logits.split(1, dim=-1)
        start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)
        start_logits= start_logits * rational_logits
        end_logits =  end_logits * rational_logits

        #   fc2 = wa2 x relu(Wa1 x h1)
        attention  = self.fc(output_vector)
        attention  = (self.activation(self.linear1(attention))).squeeze(-1)

        #   a = SoftMax(fc2)
        attention = F.softmax(attention, dim=-1)
        # Attention-weighted sum over the token axis gives one pooled vector per example.
        attention_pooled_output = (attention.unsqueeze(-1) * output_vector).sum(dim=-2)
        unk_logits = self.fc(bert_pooled_output)
        unk_logits = self.activation(self.linear1(unk_logits))

        #   calculate yes and no logits using pooled-output = FC(a)
        yes_no_logits =self.fc(attention_pooled_output)
        yes_no_logits =self.activation(self.linear2(yes_no_logits))
        yes_logits, no_logits = yes_no_logits.split(1, dim=-1)

        if start_positions is not None and end_positions is not None:
            # The gold positions are shifted by cls_idx and then used as class indices
            # into the concatenation [yes, no, unk, token_0, token_1, ...].
            start_positions, end_positions = start_positions + cls_idx, end_positions + cls_idx
            start = torch.cat((yes_logits, no_logits, unk_logits, start_logits), dim=-1)
            end = torch.cat((yes_logits, no_logits, unk_logits, end_logits), dim=-1)
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            #   calculate cross entropy loss for start and end logits
            Entropy_loss = CrossEntropyLoss()
            start_loss = Entropy_loss(start, start_positions)
            end_loss = Entropy_loss(end, end_positions)
            #   Training objective: to minimize the total loss of both start and end logits
            total_loss = (start_loss + end_loss) / 2
            return total_loss
        return start_logits, end_logits, yes_logits, no_logits, unk_logits


In [13]:
### load dataset for training or evaluation

def load_dataset(tokenizer, evaluate=False, cache_file_name=None, train_file_name=None):
    '''
    Convert the raw CoQA dataset into features for BERT, caching the result on disk.

    Args:
        tokenizer: tokenizer passed to Extract_Features.
        evaluate: when True, process the dev file (global predict_file) and also
            return the examples and features; otherwise process the training file.
        cache_file_name: cache file name inside input_dir; when None a default
            "bert-base-uncased_dev" / "bert-base-uncased_train" name is used.
        train_file_name: training file name inside input_dir; when None the
            global train_file is used. Ignored when evaluate is True.

    Returns:
        (dataset, examples, features) when evaluate is True, otherwise dataset.

    Raises:
        ValueError: if the data file name that would be read is empty.
    '''
    # use user-defined cache_file_name or the default ones
    if cache_file_name is not None:
        cache_file = os.path.join(input_dir,cache_file_name)
    elif evaluate:
        cache_file = os.path.join(input_dir,"bert-base-uncased_dev")
    else:
        cache_file = os.path.join(input_dir,"bert-base-uncased_train")

    if os.path.exists(cache_file):
        print("Loading cache",cache_file)
        # NOTE: torch.load unpickles the file, which can execute arbitrary code;
        # only load cache files that this notebook produced itself.
        features_and_dataset = torch.load(cache_file)
        features, dataset, examples = (
            features_and_dataset["features"],features_and_dataset["dataset"],features_and_dataset["examples"])
    else:
        print("Creating features from dataset file at", input_dir)
        # Pick the file in a separate local name. Assigning to `train_file` here
        # would make it function-local and raise UnboundLocalError whenever
        # train_file_name is None.
        if evaluate:
            data_file = predict_file
        elif train_file_name is not None:
            data_file = train_file_name
        else:
            data_file = train_file

        if not data_file:
            raise ValueError("predict_file or train_file not found")

        processor = Processor()
        # def get_examples(self, data_dir, history_len, filename=None, threads=1)
        # history_len=2: each example carries the two previous conversation turns.
        examples = processor.get_examples(input_dir, 2, filename=data_file, threads=1)

        # max_seq_length is the total length for input sequence of BERT
        features, dataset = Extract_Features(examples=examples,tokenizer=tokenizer,max_seq_length=512, doc_stride=128, max_query_length=64, is_training=not evaluate, threads=1)
        #   caching it in a cache file to reduce time
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cache_file)
    if evaluate:
        return dataset, examples, features
    return dataset

In [10]:
### train function

def convert_to_list(tensor):
    """Return the tensor's values as (nested) Python lists, detached and moved to CPU."""
    host_copy = tensor.detach().cpu()
    return host_copy.tolist()

def train(train_dataset, model, tokenizer, device):
    """Fine-tune the model on train_dataset and return the mean training loss.

    Reads the globals train_batch_size, epochs, pretrained_model and
    output_directory. Every 1000 optimizer steps the model, tokenizer, optimizer
    state and scheduler state are saved under output_directory.

    Args:
        train_dataset: dataset whose batches hold, in order, input_ids,
            token_type_ids, attention_mask, start_positions, end_positions,
            rational_mask and cls_idx.
        model: model whose forward returns the loss when gold positions are given.
        tokenizer: saved next to each checkpoint.
        device: device every batch tensor is moved to.

    Returns:
        Sum of the per-step losses divided by the number of optimizer steps
        (0.0 if no step was taken).
    """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=train_batch_size)
    # One optimizer step per batch, so the schedule spans batches * epochs steps.
    t_total = len(train_dataloader) * epochs

    # Preparing optimizer and scheduler
    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": 0.01},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_parameters,lr=1e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=2000, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    # NOTE(review): this only triggers when pretrained_model is a local directory
    # holding both files; with a hub model name it is presumably a no-op — confirm.
    optimizer_state = os.path.join(pretrained_model, "optimizer.pt")
    scheduler_state = os.path.join(pretrained_model, "scheduler.pt")
    if os.path.isfile(optimizer_state) and os.path.isfile(scheduler_state):
        optimizer.load_state_dict(torch.load(optimizer_state))
        scheduler.load_state_dict(torch.load(scheduler_state))

    # Count completed optimizer steps from 0 so that the running average divides by
    # the true number of steps (the previous counter started at 1 and over-divided).
    steps_done = 0
    # NOTE(review): epochs_trained is never advanced, so every checkpoint is written
    # to "model_weights0" and overwrites the previous one — confirm this is intended.
    epochs_trained = 0
    train_loss = 0.0
    model.zero_grad()
    iterator = trange(epochs_trained, int(epochs), desc="Epoch", disable=False)
    for _ in iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
        for i,batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = { "input_ids": batch[0],"token_type_ids": batch[1], "attention_mask": batch[2],"start_positions": batch[3],"end_positions": batch[4],"rational_mask": batch[5],"cls_idx": batch[6]}
            loss = model(**inputs)
            loss.backward()
            train_loss += loss.item()

            #   optimizing training parameters (no gradient accumulation)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            steps_done += 1

            #   Saving model weights every 1000 iterations
            if steps_done % 1000 == 0:
                output_dir = os.path.join(output_directory, "model_weights"+str(epochs_trained))
                os.makedirs(output_dir, exist_ok=True)
                # Unwrap a DataParallel-style wrapper before saving.
                model_to_save = model.module if hasattr(model, "module") else model
                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))

            if (i+1) % 1000 == 0:
                print('iter: {}, loss: {}'.format(i,train_loss/steps_done))
    return train_loss/steps_done if steps_done else 0.0

In [11]:
### writing predictions

def Write_predictions(model, tokenizer, device, variant_name):
    """Run the model over the dev set and write predictions.json for one variant.

    The file is written to output_directory/variant_name, which is created if it
    does not exist. The dev features come from load_dataset(evaluate=True).
    """
    dataset, examples, features = load_dataset(tokenizer, evaluate=True)

    variant_dir = output_directory+'/'+variant_name
    if not os.path.exists(variant_dir):
        os.makedirs(variant_dir)

    #   writing predictions once training is complete
    eval_loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=evaluation_batch_size)
    mod_results = []
    for batch in tqdm(eval_loader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            # each batch has 4 elements; the last one holds the example indices
            example_indices = batch[3]
            outputs = model(input_ids=batch[0], token_type_ids=batch[1], attention_mask=batch[2])
        for row, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            # Take this example's row from each of the five output heads.
            per_example = [convert_to_list(head[row]) for head in outputs]
            start_logits, end_logits, yes_logits, no_logits, unk_logits = per_example
            mod_results.append(
                Result(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits,
                       yes_logits=yes_logits, no_logits=no_logits, unk_logits=unk_logits))

    # Get predictions for development dataset and store it in predictions.json
    # NOTE(review): 20 and 30 are presumably the n-best size and the maximum answer
    # length — confirm against get_predictions.
    output_prediction_file = os.path.join(variant_dir, "predictions.json")
    get_predictions(examples, features, mod_results, 20, 30, True, output_prediction_file, False, tokenizer)


## Training

In [None]:
#   check if gpu is available to use it or not
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   initialize configurations and tokenizer of Bert model 
# return_dict=False makes the encoder return a plain tuple, which is what
# BertBaseUncasedModel.forward unpacks.
config = BertConfig.from_pretrained(pretrained_model, return_dict=False)
tokenizer = BertTokenizer.from_pretrained(pretrained_model)

# from_tf is only True when the model name/path contains ".ckpt".
model = BertBaseUncasedModel.from_pretrained(pretrained_model, from_tf=bool(".ckpt" in pretrained_model), config=config,cache_dir=None,)
# print(model)
model.to(device)

# if (os.path.exists(output_directory) and os.listdir(output_directory)):
#     raise ValueError("Output directory " + output_directory + " already exists, Change output_directory name")

#   Loading dataset and training
# this command will take several hours
# A dedicated cache name keeps the features of this training file separate from the
# default "bert-base-uncased_train" cache.
cache_file_name = 'bert-base-uncased_train_with_T5_append'

train_file_name = 'coqa-train-v1.0-append_with_T5.json'

train_dataset = load_dataset(tokenizer, evaluate=False, cache_file_name=cache_file_name, train_file_name=train_file_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertBaseUncasedModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertBaseUncasedModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertBaseUncasedModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertBaseUncasedModel were not initialized from the model checkpoint at bert-base-uncased and are newly ini

Creating features from dataset file at /content/mydata/MyDrive/CSNLP_Project/Bert_model_COQA/data


Extracting features from dataset:  19%|█▉        | 1387/7199 [14:43<1:06:01,  1.47it/s]

In [None]:
# train the model on CoQA
# train() returns the mean training loss; it also writes periodic checkpoints itself.
train_loss = train(train_dataset, model, tokenizer, device)

# Sub-directory of output_directory that receives the final weights of this run.
variant_name = 'Bert_with_T5_rewritten_epoch4'

#   create output directory for model parameters and to write predictions
if not os.path.exists(output_directory+'/'+variant_name) :
    os.makedirs(output_directory+'/'+variant_name)
            
# Unwrap a DataParallel-style wrapper (if any) before saving.
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(output_directory+'/'+variant_name)
tokenizer.save_pretrained(output_directory+'/'+variant_name)

## Prediction

predict on dev dataset

In [None]:
#   check if gpu is available to use it or not
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   Loading Bert model for writing predictions
# model = BertBaseUncasedModel.from_pretrained(output_directory)
# tokenizer = BertTokenizer.from_pretrained(output_directory, do_lower_case=True)
# model.to(device)
# run for different parameters

# check if gpu is available to use it or not
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

# for m in model_parameter_directory:
#     variant_name = m.split('/')[-1]
#     model = BertBaseUncasedModel.from_pretrained(m) 
#     tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
#     model.to(device)
#     Write_predictions(model, tokenizer, device, variant_name)
# train_dataset = load_dataset(tokenizer, evaluate=False)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Every sub-directory of output_directory is treated as one saved model variant.
model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

# for m in model_parameter_directory:
# NOTE(review): os.scandir yields entries in arbitrary order, so index 1 does not
# reliably select the same variant between runs — consider selecting by name.
m = model_parameter_directory[1]
# The directory name doubles as the variant name under which predictions are written.
variant_name = m.split('/')[-1]
# m = m + '/pytorch_model_2.bin'
model = BertBaseUncasedModel.from_pretrained(m) 
tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
model.to(device)
Write_predictions(model, tokenizer, device, variant_name)

/content/mydata/MyDrive/Colab Notebooks/data/bert-base-uncased_train
Loading cache /content/mydata/MyDrive/Colab Notebooks/data/bert-base-uncased_dev


Evaluating: 100%|██████████| 536/536 [05:05<00:00,  1.76it/s]
Writing preditions: 100%|██████████| 7983/7983 [00:28<00:00, 283.33it/s]


## Evaluation

In [None]:
# The evaluator is built from the gold dev file.
evaluator = CoQAEvaluator(input_dir+'/'+predict_file)

# Variant whose predictions.json is scored; this overrides any earlier variant_name.
variant_name = 'Bert_from_original_Surya_epoch4'

pre_file_bert = output_directory+'/'+variant_name+'/'+'predictions.json'

# evaluate
# preds_to_dict takes the file path itself, so no file handle needs to be opened
# here (the previous `with open(...)` created a handle that was never used).
pred_data = CoQAEvaluator.preds_to_dict(pre_file_bert)

# write evaluate result
with open(output_directory+'/'+variant_name+'/'+'evaluation.json', 'w') as f:
    json.dump(evaluator.model_performance(pred_data), f, indent=2)

# show
# print(json.dumps(evaluator.model_performance(pred_data), indent=2))

get_domain_scores --> model_performance --> get_raw_scores --> compute_turn_score --> _compute_turn_score --> compute_exact /  compute_f1


## Test any input on the fine-tuned model: By ZYZ

In [None]:
### this cell loads the fine-tuned model

# check if gpu is available to use it or not
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Every sub-directory of output_directory is treated as one saved model variant.
model_parameter_directory = [ f.path for f in os.scandir(output_directory) if f.is_dir() ]

# NOTE(review): each iteration overwrites model/tokenizer/variant_name, so every
# variant is loaded but only the last one yielded by os.scandir (arbitrary order)
# is kept — confirm which variant is meant to be tested.
for m in model_parameter_directory:
    variant_name = m.split('/')[-1]
    model = BertBaseUncasedModel.from_pretrained(m) 
    tokenizer = BertTokenizer.from_pretrained(m, do_lower_case=True)
    model.to(device)

In [None]:
### load the dev dataset with gold answer

# Separate cache file from the one load_dataset() uses for the dev set, because these
# features are extracted with is_training=True.
cache_file = os.path.join(input_dir,"bert-base-uncased_dev_test")

if os.path.exists(cache_file):
    print("Loading cache",cache_file)
    # NOTE: torch.load unpickles the file; only load caches this notebook produced.
    features_and_dataset = torch.load(cache_file)
    features, dataset, examples = (
        features_and_dataset["features"],features_and_dataset["dataset"],features_and_dataset["examples"])
else:
    print("Creating features from dataset file at", input_dir)

    processor = Processor()

    # history_len=2: each example carries the two previous conversation turns.
    examples = processor.get_examples(input_dir, 2, filename=predict_file, threads=1)

    # max_seq_length is the total length for input sequence of BERT 
    # is_training=True on the dev file: presumably so each batch also carries the
    # gold start/end positions read by a later cell — confirm in Extract_Features.
    features, dataset = Extract_Features(examples=examples,tokenizer=tokenizer,max_seq_length=512, doc_stride=128, max_query_length=64, is_training=True, threads=1)
    #   caching it in a cache file to reduce time
    torch.save({"features": features, "dataset": dataset, "examples": examples}, cache_file)

# create evaluation_test_dataloader
# batch_size=1 so that each batch is exactly one feature window.
evalutation_test_sampler = SequentialSampler(dataset)
evaluation_test_dataloader = DataLoader(dataset, sampler=evalutation_test_sampler, batch_size=1)


Loading cache /content/mydata/MyDrive/Colab Notebooks/data/bert-base-uncased_dev_test


1. model.eval() puts every layer into evaluation mode, so that batchnorm and dropout layers behave as they do at inference time instead of as in training.
2. torch.no_grad() disables the autograd engine. This reduces memory usage and speeds up computation, but you cannot backpropagate (which an evaluation script does not need anyway).

In [None]:
def predict(batch):
  '''
  Run the model on one batch and print the question and the predicted answer span.

  Only the first example of the batch is decoded. The batch must hold, in order,
  input_ids, token_type_ids and attention_mask; any further entries are ignored.
  Uses the globals model, tokenizer and device. Nothing is returned and no
  comparison with the gold answer is made here.
  '''
  model.eval()
  batch = tuple(t.to(device) for t in batch)
  with torch.no_grad():
      inputs = {"input_ids": batch[0],"token_type_ids": batch[1],"attention_mask": batch[2]}
      outputs = model(**inputs)

  # tokens cover the conversation history, the current question and the passage
  input_ids = inputs["input_ids"][0]
  token_type_ids = inputs["token_type_ids"][0]
  tokens = tokenizer.convert_ids_to_tokens(input_ids)

  # The question part is everything before the first token of segment 1. It is
  # derived here so the function no longer depends on a global `question` that is
  # only defined by a later cell.
  segment_b = torch.where(token_type_ids == 1)[0]
  question_end = int(segment_b[0]) if len(segment_b) > 0 else len(tokens)
  question = " ".join(tokens[:question_end])

  # extract answer from prediction of bert
  start_logits, end_logits, yes_logits, no_logits, unk_logits = [
      head[0].detach().cpu().tolist() for head in outputs]
  # Only the span heads are decoded; the yes/no/unknown logits are not consulted,
  # and an end position before the start position yields an empty answer.
  start_pos = int(np.argmax(start_logits))
  end_pos = int(np.argmax(end_logits))

  print("\nQuestion:\n{}".format(question.capitalize()))
  answer = " ".join(tokens[start_pos:end_pos+1])
  print("Answer:\n{}.".format(answer.capitalize()))

In [None]:
# The random draw is immediately overridden by a fixed index, so the cell always
# inspects the same batch.
random_num = np.random.randint(0,len(evaluation_test_dataloader))
random_num = 3

## evaluation_test_dataloader cannot be indexed directly
## workaround: iterate until the wanted batch is reached; `batch` keeps that batch
count = 0
for batch in evaluation_test_dataloader:
    # print(len(batch[3]))
    if count >= random_num:
        break
    count += 1

# data from batch
# batch_size is 1, so index 0 selects the only example in the batch.
input_ids = batch[0][0]
token_type_ids = batch[1][0]
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# sep_idx = list(input_ids).index(tokenizer.sep_token_id)
# Position of the first token whose segment id is 1; everything before it is printed
# as the question (conversation history plus current question).
sep_idx = torch.where(token_type_ids==1)[0][0]
question = " ".join(tokens[:(sep_idx)])
# Offset of the next [SEP] after sep_idx; the tokens in between are printed as the passage.
sep_idx2 = list(input_ids[(sep_idx+1):]).index(tokenizer.sep_token_id)
text = " ".join(tokens[(sep_idx+1):(sep_idx2+(sep_idx+1))])

# extract gold answer
# NOTE(review): batch[3]/batch[4] are read as gold start/end token positions, which
# assumes the features were extracted with is_training=True — confirm.
start_positions = batch[3][0]
end_positions = batch[4][0]
gold_answer = " ".join(tokens[start_positions:end_positions+1])

print('question: '+question,'\n\n','text: '+text,'\n\n','gold answer: '+gold_answer)#+data["answer"][random_num])

question: [CLS] where did she live ? in a barn [SEP] did she live alone ? no [SEP] who did she live with ? [SEP] 

 text: upon a time , in a barn near a farm house , there lived a little white kitten named cotton . cotton lived high up in a nice warm place above the barn where all of the farmer ' s horses slept . but cotton was n ' t alone in her little home above the barn , oh no . she shared her hay bed with her mommy and 5 other sisters . all of her sisters were cute and fluffy , like cotton . but she was the only white one in the bunch . the rest of her sisters were all orange with beautiful white tiger stripes like cotton ' s mommy . being different made cotton quite sad . she often wished she looked like the rest of her family . so one day , when cotton found a can of the old farmer ' s orange paint , she used it to paint herself like them . when her mommy and sisters found her they started laughing . " what are you doing , cotton ? ! " " i only wanted to be more like you " . cot

In [None]:
predict(batch)


Question:
[cls] where did she live ? in a barn [sep] did she live alone ? no [sep] who did she live with ? [sep]
Answer:
Her mommy and 5 other sisters.


## Write all the failure cases

#### Use evaluator.compute_turn_score to evaluate each case

e.g.

evaluator.compute_turn_score('3dr23u6we5exclen4th8uq9rb42tel', 4, 'her mommy and 5 other sisters')

output: {'em': 1.0, 'f1': 1.0} (exact match and f1 score)


In [None]:
# failure criteria
# 'full'    -> keep predictions that score exactly em=0 and f1=0
# 'partial' -> keep every prediction that is not a perfect em=1, f1=1
criteria_flag = 'full'

# tuples for all prediction data
# Each pred_data key is extended with its prediction; the tuple is then read below as
# (prediction[0], prediction[1], prediction[2]).
pre_data_list = []
for key in pred_data.keys():
    pre_data_list.append(key + (pred_data[key],) )

# find out all the failure cases: their ids
fail_results = []
for prediction in pre_data_list:
    result = evaluator.compute_turn_score(prediction[0],prediction[1],prediction[2])
    # totally wrong
    if criteria_flag == 'full':
        if result == {'em': 0.0, 'f1': 0.0}:
            fail_results.append(prediction)
    # not fully correct
    elif criteria_flag == 'partial':
        if result != {'em': 1.0, 'f1': 1.0}:
            fail_results.append(prediction)
    # print(result)


In [None]:
len(fail_results) / len(pre_data_list)

0.1663535011900288

In [None]:
# write results
# Header and rows are written through a single handle: 'w+' truncates any previous
# file, and utf-8-sig emits the BOM once at the start.
failed_csv_path = output_directory+'/'+variant_name+'/'+'fully_failed_predictions.csv'
csv_header = ['passage_id','turn_id','story_type', 'story', 'history QA', 'current question', 'gold_answers', 'failed_prediction', 'scores']

with open(failed_csv_path, mode='w+', encoding='utf-8-sig', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(csv_header)

    for story_id, turn_id, failed_prediction in fail_results:
        a_gold_list = evaluator.gold_data[(story_id, turn_id)]
        scores = evaluator.compute_turn_score(story_id, turn_id, failed_prediction)
        story_questions = evaluator.question_dict[story_id]
        # Questions before index turn_id-1 form the history; the one at turn_id-1 is
        # the current question.
        writer.writerow([
            story_id,
            turn_id,
            evaluator.id_to_source[story_id],
            evaluator.story_dict[story_id],
            story_questions[:turn_id-1],
            story_questions[turn_id-1],
            a_gold_list,
            failed_prediction,
            scores,
        ])


### Test results of yes/no answers

In [None]:
# tuples for all prediction data
# NOTE: a turn counts as "yes/no" here when the model's PREDICTION is 'yes' or 'no';
# the gold answer is not consulted for this split.
yn_data_list = []
for key in pred_data.keys():
    if pred_data[key] in ['yes','no']:
        yn_data_list.append(key + (pred_data[key],) )


# find out all the failure cases: their ids
yn_fail_results = []
for prediction in yn_data_list:
    result = evaluator.compute_turn_score(prediction[0],prediction[1],prediction[2])
    # not fully correct
    # if result != {'em': 1.0, 'f1': 1.0}:
    # totally wrong
    # This always applies the "totally wrong" criterion, regardless of criteria_flag.
    if result == {'em': 0.0, 'f1': 0.0}:
        yn_fail_results.append(prediction)
    # print(result)


In [None]:
# Share of all predictions whose predicted answer is 'yes' or 'no'.
yn_ratio = len(yn_data_list) / len(pred_data)
yn_ratio

0.22059376174370537

In [None]:
# Failure rate among the predictions that are not 'yes'/'no'.
# NOTE: only meaningful when fail_results was built with criteria_flag == 'full',
# because yn_fail_results always uses the "totally wrong" criterion.
non_yn_failure_ratio = (len(fail_results)-len(yn_fail_results)) / (len(pre_data_list)-len(yn_data_list))
non_yn_failure_ratio

0.1486660237865638

In [None]:
# Failure rate among the predictions that are 'yes' or 'no'.
yn_failure_ratio = len(yn_fail_results) / len(yn_data_list)
yn_failure_ratio

0.22884724588302102

Observation: The fully wrong ({'em': 0.0, 'f1': 0.0}) ratio of yes/no type questions is higher than that of the other questions.


We need to construct a dictionary for each of these two types of questions in order to compute their overall F1 scores separately.

In [None]:
# tuples for all prediction data
# NOTE(review): domain_mappings is not imported or defined anywhere in this notebook,
# so this cell fails on a fresh kernel; presumably it comes from
# data.processors.evaluate — confirm and add it to the import cell.
yn_dict = {}
non_yn_dict = {}
# Number of yes/no predictions per domain, keyed by the values of domain_mappings.
yn_turns  = {}
for i in list(domain_mappings.values()):
    yn_turns[i] = 0
    
# Split the predictions by whether the predicted answer is 'yes'/'no'.
for key in pred_data.keys():
    if pred_data[key] in ['yes','no']:
        # key[0] is looked up in id_to_source and then mapped to its domain name.
        story_type = evaluator.id_to_source[key[0]]
        yn_turns[domain_mappings[story_type]] += 1
        yn_dict[key] = pred_data[key]
    else:
        non_yn_dict[key] = pred_data[key]

In [None]:
list(domain_mappings.values())

['children_stories',
 'literature',
 'mid-high_school',
 'news',
 'wikipedia',
 'science',
 'reddit']

In [None]:
# NOTE(review): story_type is the leftover value from the last yes/no prediction of
# the loop above, and it is a source name while yn_turns is keyed by domain name; this
# lookup only works when the two names coincide.
yn_turns[story_type]

290

In [None]:
# evaluator.model_performance(yn_dict)
# The dumps/loads round trip gives a plain, independent copy that is edited in place below.
result_yn = json.dumps(evaluator.model_performance(yn_dict), indent=2)
result_yn = json.loads(result_yn)
# Rescale each domain's scores from the reported 'turns' denominator to the number of
# yes/no predictions in that domain.
# NOTE(review): this assumes model_performance averages over the reported 'turns' even
# though only yn_dict was passed in — confirm in the evaluator.
# The aggregate rows ('in_domain', 'out_domain', 'overall') are left unscaled.
for case in result_yn:
    if case not in ['in_domain', 'out_domain','overall']:
        if yn_turns[case] == 0:
            print('no yes/no answer in {}'.format(case))
            pass
        else:
            result_yn[case]['em'] = result_yn[case]['em'] * result_yn[case]['turns'] / yn_turns[case]
            result_yn[case]['f1'] = result_yn[case]['f1'] * result_yn[case]['turns'] / yn_turns[case]
            result_yn[case]['turns'] = yn_turns[case]

no yes/no answer in reddit
no yes/no answer in science


In [None]:
result_yn

{'children_stories': {'em': 74.27762039660055,
  'f1': 74.27762039660055,
  'turns': 353},
 'in_domain': {'em': 16.8, 'f1': 16.8, 'turns': 7983},
 'literature': {'em': 74.05432098765431,
  'f1': 74.05432098765431,
  'turns': 405},
 'mid-high_school': {'em': 75.67442455242966,
  'f1': 75.67442455242966,
  'turns': 391},
 'news': {'em': 78.35310559006211, 'f1': 78.35310559006211, 'turns': 322},
 'out_domain': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'overall': {'em': 16.8, 'f1': 16.8, 'turns': 7983},
 'reddit': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'science': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'wikipedia': {'em': 80.17862068965518, 'f1': 80.17862068965518, 'turns': 290}}

In [None]:
# evaluator.model_performance(non_yn_dict)
# The dumps/loads round trip gives a plain, independent copy that is edited in place below.
result_non_yn = json.dumps(evaluator.model_performance(non_yn_dict), indent=2)
result_non_yn = json.loads(result_non_yn)
# Rescale each domain's scores from the reported 'turns' denominator to the number of
# predictions in that domain that are not 'yes'/'no'.
# The aggregate rows ('in_domain', 'out_domain', 'overall') are left unscaled.
for case in result_non_yn:
    if case not in ['in_domain', 'out_domain','overall']:
        # Guards the division below: true when the domain has no non-yes/no predictions,
        # which includes domains with no turns at all.
        # NOTE(review): the printed message is copied from the yes/no cell and is
        # misleading here — what is missing in this branch are non-yes/no answers.
        if yn_turns[case] == result_non_yn[case]['turns']:
            print('no yes/no answer in {}'.format(case))
            pass
        else:
            result_non_yn[case]['em'] = result_non_yn[case]['em'] * result_non_yn[case]['turns'] / (result_non_yn[case]['turns']-yn_turns[case])
            result_non_yn[case]['f1'] = result_non_yn[case]['f1'] * result_non_yn[case]['turns'] / (result_non_yn[case]['turns']-yn_turns[case])
            result_non_yn[case]['turns'] = (result_non_yn[case]['turns']-yn_turns[case])

no yes/no answer in reddit
no yes/no answer in science


In [None]:
result_non_yn

{'children_stories': {'em': 60.748600746268664,
  'f1': 73.64272388059702,
  'turns': 1072},
 'in_domain': {'em': 48.1, 'f1': 58.1, 'turns': 7983},
 'literature': {'em': 58.81306122448979,
  'f1': 71.98612244897959,
  'turns': 1225},
 'mid-high_school': {'em': 57.63232963549921,
  'f1': 71.25451664025357,
  'turns': 1262},
 'news': {'em': 63.62381311228335, 'f1': 76.92019593067067, 'turns': 1327},
 'out_domain': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'overall': {'em': 48.1, 'f1': 58.1, 'turns': 7983},
 'reddit': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'science': {'em': 0.0, 'f1': 0.0, 'turns': 0},
 'wikipedia': {'em': 67.4254491017964, 'f1': 78.37904191616767, 'turns': 1336}}

In [None]:
print(json.dumps(evaluator.model_performance(pred_data), indent=2))

{
  "children_stories": {
    "em": 64.1,
    "f1": 73.8,
    "turns": 1425
  },
  "literature": {
    "em": 62.5,
    "f1": 72.5,
    "turns": 1630
  },
  "mid-high_school": {
    "em": 61.9,
    "f1": 72.3,
    "turns": 1653
  },
  "news": {
    "em": 66.6,
    "f1": 77.2,
    "turns": 1649
  },
  "wikipedia": {
    "em": 69.6,
    "f1": 78.7,
    "turns": 1626
  },
  "reddit": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "science": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "in_domain": {
    "em": 65.0,
    "f1": 74.9,
    "turns": 7983
  },
  "out_domain": {
    "em": 0.0,
    "f1": 0.0,
    "turns": 0
  },
  "overall": {
    "em": 65.0,
    "f1": 74.9,
    "turns": 7983
  }
}


So the actual overall F1 score for yes/no questions is higher.