In [1]:
import os
import sys
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm, trange
from transformers import *
from transformers.data.processors.squad import SquadV2Processor
from probe import Probe
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys
import glob
import csv, json
import pandas as pd
from random import sample
from evaluate import *

In [2]:
def main_alt(data_file, preds):
    '''
    Score a (possibly partial) set of predictions against a SQuAD-format JSON file.

    @param data_file (str), path to a JSON file whose top-level 'data' entry holds the dataset
    @param preds (dict), maps question id -> predicted answer text; may cover only a
           subset of the questions found in data_file

    Returns: (exact, f1, exact_no_ans, f1_no_ans, exact_has_ans, f1_has_ans)
             The has_ans / no_ans entries are NaN when preds contains no question
             of that kind, since the metric is undefined for an empty subset.
    '''
    # Be explicit about the encoding so the result does not depend on the platform default
    with open(data_file, encoding="utf-8") as f:
        dataset = json.load(f)['data']

    # Every prediction gets a no-answer probability of 0.0 and the threshold used
    # below is 1.0. NOTE(review): presumably this turns the thresholding step into
    # a no-op -- confirm against evaluate.apply_no_ans_threshold.
    na_probs = {k: 0.0 for k in preds}

    qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False

    # Only questions that actually received a prediction take part in the breakdown
    has_ans_qids_in_pred = [qid for qid, has_ans in qid_to_has_ans.items()
                            if has_ans and qid in preds]
    no_ans_qids_in_pred = [qid for qid, has_ans in qid_to_has_ans.items()
                           if not has_ans and qid in preds]

    exact_raw, f1_raw = get_raw_scores(dataset, preds)

    exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, 1.0)
    f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 1.0)

    out_eval = make_eval_dict(exact_thresh, f1_thresh)

    # Guard on the predicted subsets, not on the whole dataset: a small random
    # sample may contain no answerable (or no unanswerable) question at all, and
    # an empty qid_list must not be handed to make_eval_dict.
    if has_ans_qids_in_pred:
        has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids_in_pred)
        merge_eval(out_eval, has_ans_eval, 'has_ans')

    if no_ans_qids_in_pred:
        no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids_in_pred)
        merge_eval(out_eval, no_ans_eval, 'no_ans')

    # A skipped category has no keys in out_eval; report it as NaN instead of
    # failing with a KeyError
    undefined = float('nan')
    exact, f1 = out_eval['exact'], out_eval['f1']
    exact_no_ans = out_eval.get('no_ans_exact', undefined)
    f1_no_ans = out_eval.get('no_ans_f1', undefined)
    exact_has_ans = out_eval.get('has_ans_exact', undefined)
    f1_has_ans = out_eval.get('has_ans_f1', undefined)

    return exact, f1, exact_no_ans, f1_no_ans, exact_has_ans, f1_has_ans

In [3]:
def _context_span(token_type_ids):
    '''
    Locate the context segment of one encoded feature.

    @param token_type_ids (torch.Tensor), 1-D segment ids of a single feature

    Returns: (first, last) positions that hold a non-zero segment id, as ints

    Raises: ValueError if every segment id is zero
    '''
    # Look the positions up explicitly rather than through torch.argmax: argmax has
    # to break a tie between many equal maxima here, and if it reports the first
    # of them the "start" and "end" computed from it come out swapped.
    positions = torch.nonzero(token_type_ids, as_tuple=False).flatten()
    if positions.numel() == 0:
        raise ValueError("token_type_ids has no non-zero entries; cannot locate the context")
    return int(positions[0].item()), int(positions[-1].item())


def make_predictions(p,
                     threshold,
                     tokenizer,
                     model,
                     q_ids,
                     layer,
                     trial,
                     max_seq_length,
                     dev_examples,
                     eval_dataloader,
                     device = "cpu"):

    '''
    Run the model over eval_dataloader and let the probe pick one answer per question.

    @param p, probe exposing predict(hidden_states, device, threshold=, context_start=, context_end=)
    @param threshold, passed through to p.predict
    @param tokenizer, used to turn input ids back into tokens and strings
    @param model, called with input_ids / attention_mask / token_type_ids; its third
           output is indexed as a sequence of per-layer hidden states
    @param q_ids, question ids, one per entry of dev_examples
    @param layer (int), 1-based layer whose hidden states are fed to the probe
           (the first entry of the hidden-state sequence is skipped)
    @param trial, only used in the progress message
    @param max_seq_length, kept for backward compatibility; the context span is
           read from each feature's token_type_ids instead
    @param dev_examples, examples behind eval_dataloader; only their count is used
    @param eval_dataloader, yields batches whose first four tensors are
           input ids, attention mask, token type ids and a per-feature index
    @param device, device the batches are moved to

    Returns: DataFrame with columns Id, Predicted, Question, Score (one row per example)

    Note: a question whose context was split into several features keeps the answer
    with the highest score. Features are recognised as belonging to the same
    question by comparing the decoded question text with the previous row, so two
    neighbouring examples with identical question text would be merged as well.
    '''

    # Initialize predictions
    pred = pd.DataFrame()
    pred['Id'] = q_ids
    pred['Predicted'] = [""] * len(dev_examples)
    pred['Question'] = [""] * len(dev_examples)
    # Float from the start so that storing a float score never changes the column dtype
    pred['Score'] = [0.0] * len(dev_examples)

    # Row that the next previously unseen question will be written to
    next_row = 0

    # Evaluation batches
    print("Trial: {}".format(trial))
    model.eval()
    for batch in tqdm(eval_dataloader, desc = "Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }

            # Forward pass; drop the first hidden state so index layer-1 is layer `layer`
            outputs = model(**inputs)
            attention_hidden_states = outputs[2][1:]

            # One probe prediction per feature in the batch
            for j in range(len(batch[3])):

                # Extract tokens for the current feature
                tokens = tokenizer.convert_ids_to_tokens(batch[0][j].tolist())

                # Find where context starts and ends, since we want to predict in context
                context_start, context_end = _context_span(batch[2][j])

                # The question starts right after [CLS]; subtract 1 to chop off the [SEP] token
                question = tokenizer.convert_tokens_to_string(tokens[1:context_start - 1])

                # Extract predicted indices
                score, start_idx, end_idx = p.predict(attention_hidden_states[layer-1][j].unsqueeze(0),
                                                      device,
                                                      threshold=threshold,
                                                      context_start=context_start,
                                                      context_end=context_end)
                start_idx = int(start_idx[0])
                end_idx = int(end_idx[0])

                # Extract predicted answer, converting start tokens to empty strings (no answer)
                answer = tokenizer.convert_tokens_to_string(tokens[start_idx:end_idx + 1])
                if answer == '[CLS]':
                    answer = ''

                # Same question as the previous row: keep whichever answer scored higher.
                # Otherwise this is a new question and takes the next free row.
                same_as_previous = next_row > 0 and pred.loc[next_row - 1, 'Question'] == question
                if same_as_previous:
                    row = next_row - 1
                    if score > pred.loc[row, 'Score']:
                        pred.loc[row, 'Predicted'] = answer
                        pred.loc[row, 'Score'] = score
                else:
                    pred.loc[next_row, 'Question'] = question
                    pred.loc[next_row, 'Predicted'] = answer
                    pred.loc[next_row, 'Score'] = score
                    next_row += 1

    return pred

In [4]:
def eval_thresh(probe,
                threshold,
                layer,
                model_prefix,
                data_dir = "squad-master/data/",
                dev_file = "dev-v2.0.json",
                n = 100,
                batch_size = 4,
                trials = 1,
                max_seq_length = 384,
                device = "cpu",
                verbose = True,
                seed = None):
    '''
    Evaluate a probe at a single threshold on random samples of the dev set.

    @param probe, probe handed to make_predictions
    @param threshold, threshold handed to make_predictions
    @param layer (int), layer whose hidden states the probe reads
    @param model_prefix (str), name or path used to load tokenizer, config and model
    @param data_dir (str), directory that contains dev_file
    @param dev_file (str), SQuAD v2.0 JSON file name
    @param n (int), number of examples sampled per trial
    @param batch_size (int), evaluation batch size
    @param trials (int), number of independent samples to average over (>= 1)
    @param max_seq_length (int), maximum feature length used when encoding examples
    @param device, device the model and the batches are placed on
    @param verbose (bool), currently unused
    @param seed, optional seed for the example sampling; None keeps the previous
           behaviour of drawing from the global random state

    Returns: (exact, f1, exact_no_ans, f1_no_ans, exact_has_ans, f1_has_ans),
             each averaged over the trials

    Raises: ValueError if trials is smaller than 1
    '''
    # Local import: the notebook's import cell only brings in `sample`
    from random import Random

    # Fail before the expensive loading instead of dividing by zero afterwards
    if trials < 1:
        raise ValueError("trials must be at least 1, got {}".format(trials))

    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    dev_examples = processor.get_dev_examples(data_dir = data_dir, filename = dev_file)

    # Initialize ALBERT model
    config = AlbertConfig.from_pretrained(model_prefix, output_hidden_states = True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix, config = config)

    # The batches are moved to `device`, so the model has to live there too.
    # DataParallel is only useful (and only valid) when running on CUDA.
    model.to(device)
    if torch.device(device).type == "cuda":
        model = torch.nn.DataParallel(model)

    # A dedicated generator makes the sampling reproducible when a seed is given
    rng = Random(seed) if seed is not None else None

    # Running sums of the six metrics returned by main_alt
    totals = [0.0] * 6

    # Execute trials
    for trial in range(trials):

        # Randomly sample n examples
        if rng is not None:
            examples = rng.sample(dev_examples, n)
        else:
            examples = sample(dev_examples, n)

        # Extract features, using the same length limit that make_predictions receives
        features, dataset = squad_convert_examples_to_features(examples=examples,
                                                               tokenizer=tokenizer,
                                                               max_seq_length=max_seq_length,
                                                               doc_stride=128,
                                                               max_query_length=64,
                                                               is_training=False,
                                                               return_dataset="pt",
                                                               threads=1)

        # Extract IDs
        q_ids = [example.qas_id for example in examples]

        # Generate predictions with the probe that was passed in
        eval_sampler = SequentialSampler(dataset)
        eval_dataloader = DataLoader(dataset, sampler = eval_sampler, batch_size = batch_size)
        preds = make_predictions(p = probe,
                                 threshold = threshold,
                                 tokenizer = tokenizer,
                                 model = model,
                                 layer = layer,
                                 q_ids = q_ids,
                                 trial = trial,
                                 max_seq_length = max_seq_length,
                                 dev_examples = examples,
                                 eval_dataloader = eval_dataloader,
                                 device = device)
        preds = dict(zip(preds.Id, preds.Predicted))

        # Evaluate; join the path the same way the example loader is given it
        metrics = main_alt(os.path.join(data_dir, dev_file), preds)
        totals = [total + value for total, value in zip(totals, metrics)]

    exact, f1, exact_no_ans, f1_no_ans, exact_has_ans, f1_has_ans = (
        total / trials for total in totals
    )

    return exact, f1, exact_no_ans, f1_no_ans, exact_has_ans, f1_has_ans



In [None]:
# TODO(review): draft cell that was never executed. `eval_threshs` and `layer` are
# not defined in any cell shown here, and `p` is only created in a later cell, so
# running this raises NameError and breaks "Restart & Run All". Left disabled
# until a multi-threshold helper exists.
# eval_threshs(p,
#              layer,
#              )

In [5]:
# Build the probe and load its saved state; `p` is the probe evaluated below.
# NOTE(review): 768 is presumably the hidden size of the encoder being probed --
# confirm against the checkpoint's config.
hidden_dim = 768
p = Probe(hidden_dim)
# NOTE(review): the arguments look like (checkpoint directory, layer, device) --
# confirm against probe.Probe.load.
p.load("old/fine_tuned_5_epoches_probes/", 12, "cpu")

In [6]:
# Average the probe's SQuAD metrics for layer 12 over 10 random samples of 50 examples.
eval_thresh(probe = p,
            threshold = -100,
            layer = 12,
            model_prefix = "twmkn9/albert-base-v2-squad2",
            n = 50,
            trials = 10)

100%|██████████| 16/16 [00:02<00:00,  7.73it/s]
convert squad examples to features: 100%|██████████| 50/50 [00:00<00:00, 188.22it/s]
add example index and unique id: 100%|██████████| 50/50 [00:00<00:00, 94722.31it/s]
Evaluating:   0%|          | 0/13 [00:00<?, ?it/s]

Trial: 0


Evaluating:  23%|██▎       | 3/13 [00:06<00:21,  2.19s/it]


KeyboardInterrupt: 