In [1]:
# imports

from transformers.data.processors.squad import SquadV2Processor, SquadExample, squad_convert_examples_to_features
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import json, torch
import numpy as np
from tqdm import tqdm


In [2]:
# load cuad data

data_path = "../cuad-data/train_separate_questions.json"

sample_path = "../cuad-data/cuad_sample.json"

def get_data(path):
    with open(path,'r') as fobj:
        data = json.loads(fobj.read())
        data = data["data"]
    return data

def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens

def is_whitespace(c):
    if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
      return True
    return False

In [3]:
### Setting hyperparameters
max_seq_length = 512
doc_stride = 256
n_best_size = 1
max_query_length = 64
max_answer_length = 512
do_lower_case = False
null_score_diff_threshold = 0.0
batch_size = 2
threads = 2

In [4]:
def create_examples(input_data, is_training=False):
    examples = []
    for entry in tqdm(input_data):
        title = entry["title"]
        for paragraph in entry["paragraphs"]:
            context_text = paragraph["context"]
            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position_character = None
                answer_text = None
                answers = []

                is_impossible = qa.get("is_impossible", False)
                if not is_impossible:
                    if is_training:
                        answer = qa["answers"][0]
                        answer_text = answer["text"]
                        start_position_character = answer["answer_start"]
                    else:
                        answers = qa["answers"]

                example = SquadExample(
                    qas_id=qas_id,
                    question_text=question_text,
                    context_text=context_text,
                    answer_text=answer_text,
                    start_position_character=start_position_character,
                    title=title,
                    is_impossible=is_impossible,
                    answers=answers,
                )
                examples.append(example)
    return examples


In [5]:
def get_dataset_pos_mask(dataset):
    """
    Returns a list, pos_mask, where pos_mask[i] indicates is True if the ith example in the dataset is positive
    (i.e. it contains some text that should be highlighted) and False otherwise.
    """
    pos_mask = []
    for i in range(len(dataset)):
        ex = dataset[i]
        start_pos = ex[3]
        end_pos = ex[4]
        is_positive = end_pos > start_pos
        pos_mask.append(is_positive)
    return pos_mask


def get_random_subset(dataset, keep_frac=1):
    """
    Takes a random subset of dataset, where a keep_frac fraction is kept.
    """
    keep_indices = [i for i in range(
        len(dataset)) if np.random.random() < keep_frac]
    subset_dataset = torch.utils.data.Subset(dataset, keep_indices)
    return subset_dataset


def get_balanced_dataset(dataset):
    """
    returns a new dataset, where positive and negative examples are approximately balanced
    """
    pos_mask = get_dataset_pos_mask(dataset)
    neg_mask = [~mask for mask in pos_mask]
    npos, nneg = np.sum(pos_mask), np.sum(neg_mask)

    # So that in expectation there will be npos negative examples (--> balanced)
    neg_keep_frac = npos / nneg
    neg_keep_mask = [mask and np.random.random(
    ) < neg_keep_frac for mask in neg_mask]

    # keep all positive examples and subset of negative examples
    keep_mask = [pos_mask[i] or neg_keep_mask[i] for i in range(len(pos_mask))]
    keep_indices = [i for i in range(len(keep_mask)) if keep_mask[i]]

    subset_dataset = torch.utils.data.Subset(dataset, keep_indices)
    return subset_dataset

In [6]:
def create_features_and_dataset(examples, is_training=False):
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=is_training,
        return_dataset="pt",
        threads=threads,
    )

    if is_training:
        dataset = get_balanced_dataset(dataset)
    
    return features, dataset


In [7]:
tokenizer = AutoTokenizer.from_pretrained(
    "MariamD/distilbert-base-uncased-finetuned-legal_data")

model = AutoModelForSequenceClassification.from_pretrained(
    "MariamD/distilbert-base-uncased-finetuned-legal_data")

Some weights of the model checkpoint at MariamD/distilbert-base-uncased-finetuned-legal_data were not used when initializing DistilBertForSequenceClassification: ['qa_outputs.bias', 'qa_outputs.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at MariamD/distilbert-base-uncased-finetuned-legal_data and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably T

In [8]:
sdata = get_data(sample_path)
sexamples = create_examples(sdata, True)
features, dataset = create_features_and_dataset(sexamples, True)


100%|██████████| 2/2 [00:04<00:00,  2.14s/it]
convert squad examples to features:   0%|          | 0/124 [00:00<?, ?it/s]
