## Read and explore data

We'll start by reading the data and doing some basic processing. We'll assume that:
* you've downloaded the data from http://www.eraserbenchmark.com/ and have unpacked it to a directory called `data`
* you're running the kernel in the root of the `eraserbenchmark` repo

We're going to work with the movies dataset as it's the smallest and easiest to get started with. All the data is stored in either plain text, or jsonl, and should be pre-tokenized and ready to go!

In [1]:
import numpy as np
import itertools

In [2]:
import os
from rationale_benchmark.utils import load_documents, load_datasets, annotations_from_jsonl, Annotation


In [3]:
# data_root = os.path.join('data', 'movies')
# NOTE(review): machine-specific absolute path; the commented-out line above is
# the repo-relative alternative.
data_root = "/work/dzhang5/attention/movie_data"
# `documents` is indexed by annotation id further down (documents[ann.annotation_id]).
documents = load_documents(data_root)
# Load only the validation annotations from val.jsonl.
val = annotations_from_jsonl(os.path.join(data_root, 'val.jsonl'))
## Or load everything:
# NOTE: this rebinds `val`, so the single-split load above is redundant once
# this line has run.
train, val, test = load_datasets(data_root)

In [3]:
# Inspect the first training annotation: its type, query, class label and the
# number of evidence spans returned by all_evidences().
ann = train[0]
evidences = ann.all_evidences()
print(type(ann))
print(ann.query)
print(ann.classification)
print(len(evidences))

<class 'rationale_benchmark.utils.Annotation'>
What is the sentiment of this review?
NEG
16


# Extract text, label and attention label

In [26]:
def update_attention_label(attention_label, evidences):
    """Set every evidence span to 1 in `attention_label`, in place.

    # Arguments
        attention_label: Numpy array whose first axis is the token position.
        evidences: Iterable of objects exposing `start_token` and
            `end_token`; each span covers `[start_token, end_token)`.

    # Returns
        The same `attention_label` object that was passed in.

    # Raises
        AssertionError: If a span violates
            `0 <= start_token <= end_token <= len(attention_label)`.
            Spans handled before the offending one have already been written.
    """
    spans = [(ev.start_token, ev.end_token) for ev in evidences]
    n_tokens = attention_label.shape[0]
    for begin, stop in spans:
        # Callers rely on AssertionError to detect out-of-range spans.
        assert 0 <= begin <= stop <= n_tokens
        attention_label[begin:stop] = 1
    return attention_label

def get_list_data(data, documents):
    """Build aligned token, label and attention-label lists for one split.

    Examples whose evidence spans fail validation are reported and skipped
    entirely, so the three returned lists always have the same length and
    index `i` refers to the same example in each of them.

    # Arguments
        data: Iterable of annotations exposing `all_evidences()`,
            `classification` ('NEG' or 'POS') and `annotation_id`.
        documents: Mapping from annotation id to a document given as an
            iterable of token iterables; it is flattened to one token list.

    # Returns
        Tuple `(x_list, y_list, attention_label_list)`: flattened token
        lists, integer labels (NEG -> 0, POS -> 1) and one numpy array per
        example with 1 on evidence tokens and 0 elsewhere.

    # Raises
        AssertionError: If an annotation has a label other than NEG/POS.
    """
    label_ids = {'NEG': 0, 'POS': 1}
    x_list = []
    y_list = []
    attention_label_list = []
    for e, ann in enumerate(data):
        evidences = ann.all_evidences()
        assert ann.classification in label_ids
        docid = ann.annotation_id
        flattened_doc = list(itertools.chain.from_iterable(documents[docid]))
        attention_label = np.zeros(len(flattened_doc))
        try:
            attention_label = update_attention_label(attention_label, evidences)
        except AssertionError:
            # Skip the whole example: appending x/y without a matching
            # attention label would shift every later example out of step.
            print(f"id is: {e}, docid is: {docid}")
            continue
        x_list.append(flattened_doc)
        y_list.append(label_ids[ann.classification])
        attention_label_list.append(attention_label)
    return x_list, y_list, attention_label_list

In [150]:
import six

def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
    """Pads sequences to the same length.

    This function transforms a list of
    `num_samples` sequences (lists of integers)
    into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence otherwise
    (0 when `sequences` is empty).

    Sequences that are shorter than `num_timesteps`
    are padded with `value`.

    Sequences longer than `num_timesteps` are truncated
    so that they fit the desired length.
    The position where padding or truncation happens is determined by
    the arguments `padding` and `truncating`, respectively.

    Pre-padding and pre-truncation are the defaults.

    # Arguments
        sequences: List of lists, where each element is a sequence.
        maxlen: Int, maximum length of all sequences.
        dtype: Type of the output sequences.
            To pad sequences with variable length strings, you can use `object`.
        padding: String, 'pre' or 'post':
            pad either before or after each sequence.
        truncating: String, 'pre' or 'post':
            remove values from sequences larger than
            `maxlen`, either at the beginning or at the end of the sequences.
        value: Float or String, padding value.

    # Returns
        x: Numpy array with shape `(len(sequences), maxlen)`

    # Raises
        ValueError: In case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    num_samples = len(sequences)

    lengths = []
    for x in sequences:
        try:
            lengths.append(len(x))
        except TypeError:
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))

    if maxlen is None:
        # `default=0` lets an empty `sequences` produce an empty (0, 0) array
        # instead of failing inside the max reduction.
        maxlen = max(lengths, default=0)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    # `np.unicode_` was only an alias of `np.str_` and no longer exists in
    # NumPy 2.0, so a single check against `np.str_` is both sufficient and
    # portable across NumPy versions.
    is_dtype_str = np.issubdtype(dtype, np.str_)
    if isinstance(value, str) and dtype != object and not is_dtype_str:
        raise ValueError("`dtype` {} is not compatible with `value`'s type: {}\n"
                         "You should set `dtype=object` for variable length strings."
                         .format(dtype, type(value)))

    x = np.full((num_samples, maxlen) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen]
        else:
            raise ValueError('Truncating type "%s" '
                             'not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s '
                             'is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

def get_list_data_with_head_attention(data, documents, max_len=200):
    """Keep only examples with evidence inside their first `max_len` tokens.

    Each document is flattened to a token list and truncated to `max_len`
    tokens. An example is kept only when at least one evidence token falls
    inside that head window.

    # Arguments
        data: Iterable of annotations exposing `all_evidences()`,
            `classification` ('NEG' or 'POS') and `annotation_id`.
        documents: Mapping from annotation id to a document given as an
            iterable of token iterables.
        max_len: Int, size of the head window (default 200, the value that
            was previously hard-coded).

    # Returns
        Tuple `(x_list, y_list, attention_label_list, id_list)`:
        truncated token lists, integer labels (NEG -> 0, POS -> 1), a numpy
        array of shape `(num_kept, max_len)` with post-padded attention
        labels, and the annotation ids of the kept examples.

    # Raises
        AssertionError: If an annotation has a label other than NEG/POS.
    """
    label_ids = {'NEG': 0, 'POS': 1}
    attention_label_list = []
    y_list = []
    x_list = []
    id_list = []
    for e, ann in enumerate(data):
        evidences = ann.all_evidences()
        assert ann.classification in label_ids
        docid = ann.annotation_id
        flattened_doc = list(itertools.chain.from_iterable(documents[docid]))
        attention_label = np.zeros(len(flattened_doc))
        try:
            attention_label = update_attention_label(attention_label, evidences)
        except AssertionError:
            # Best effort: report the bad example and carry on with whatever
            # spans were written in place before the invalid one.
            print(f"id is: {e}, docid is: {docid}")
        head_label = attention_label[:max_len]
        if head_label.sum() > 0:
            id_list.append(docid)
            x_list.append(flattened_doc[:max_len])
            attention_label_list.append(head_label)
            y_list.append(label_ids[ann.classification])
    # Documents shorter than max_len are zero-padded on the right.
    attention_label_list = pad_sequences(attention_label_list, maxlen=max_len, dtype="long", truncating="post", padding="post")
    return x_list, y_list, attention_label_list, id_list

In [117]:
# Full-length training split (tokens, labels, attention labels). These names
# are rebound by the later get_list_data_with_head_attention cell.
x_train, y_train, attention_label_train = get_list_data(train, documents)

In [118]:
# Full-length validation split (tokens, labels, attention labels).
x_val, y_val, attention_label_val = get_list_data(val, documents)

In [119]:
# Full-length test split (tokens, labels, attention labels).
x_test, y_test, attention_label_test = get_list_data(test, documents)

In [151]:
# Rebuild all three splits keeping only examples that have evidence within
# their first 200 tokens; this overwrites the full-length variables above and
# additionally returns the kept annotation ids.
x_train, y_train, attention_label_train, id_train = get_list_data_with_head_attention(train, documents)
x_val, y_val, attention_label_val, id_val = get_list_data_with_head_attention(val, documents)
x_test, y_test, attention_label_test, id_test = get_list_data_with_head_attention(test, documents)

In [155]:
# Sanity check of the padded test attention labels (recorded output: (170, 200)).
attention_label_test.shape

(170, 200)

In [159]:
# Build a combined validation+test split. The lists are copied first so that
# extending them does not modify x_val / y_val / id_val themselves.
x_val_test = x_val.copy()
x_val_test.extend(x_test)
y_val_test = y_val.copy()
y_val_test.extend(y_test)
# The attention labels are already numpy arrays, so they are stacked row-wise.
attention_label_val_test = np.concatenate([attention_label_val, attention_label_test])
id_val_test = id_val.copy()
id_val_test.extend(id_test)

In [173]:
# Convert the label lists to numpy arrays.
# NOTE: after this cell y_val is an array, so the list-based merge cell above
# (which calls .extend on a copy of y_val) cannot be re-run without rebuilding
# the splits first.
y_train = np.array(y_train)
y_val_test = np.array(y_val_test)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [174]:
# Number of examples kept in the train / val / test splits.
print(len(y_train), len(y_val), len(y_test))

1241 150 170


In [1]:
# Write every split to ./movie_data as .npy files, one file per
# (kind, split) pair named <kind>_<split>.npy.
# NOTE(review): the x_* values are lists of token lists that may differ in
# length; recent NumPy versions may refuse to build such ragged arrays
# implicitly -- confirm whether an explicit dtype=object conversion is needed.
np.save("./movie_data/raw_text_train.npy", x_train)
np.save("./movie_data/raw_text_val_test.npy", x_val_test)
np.save("./movie_data/raw_text_val.npy", x_val)
np.save("./movie_data/raw_text_test.npy", x_test)

np.save("./movie_data/y_train.npy", y_train)
np.save("./movie_data/y_val_test.npy", y_val_test)
np.save("./movie_data/y_val.npy", y_val)
np.save("./movie_data/y_test.npy", y_test)

# The combined ids get their own id_val_test.npy file, and id_test.npy lives
# in the same directory as every other output.
np.save("./movie_data/id_train.npy", id_train)
np.save("./movie_data/id_val_test.npy", id_val_test)
np.save("./movie_data/id_val.npy", id_val)
np.save("./movie_data/id_test.npy", id_test)

np.save("./movie_data/att_labels_train.npy", attention_label_train)
np.save("./movie_data/att_labels_val_test.npy", attention_label_val_test)
np.save("./movie_data/att_labels_val.npy", attention_label_val)
np.save("./movie_data/att_labels_test.npy", attention_label_test)

NameError: name 'x_train' is not defined

## Check length after tokenization

In [142]:
from transformers import BertTokenizer

In [140]:
def token_align(orig_tokens, orig_attentions, tokenizer):
    """
    tokenize a sentence and generate corresponding attention labels

    # Arguments
        orig_tokens: Either a string, which is split on whitespace, or any
            sliceable sequence of tokens (list, tuple, array, ...), of which
            only the first 200 tokens are used.
        orig_attentions: Per-token attention labels, paired with the tokens
            position by position; extra entries on either side are ignored.
        tokenizer: Object whose `tokenize(token)` returns the word pieces of
            one token.

    # Returns
        Tuple `(bert_tokens, new_attentions)` of equal length: the word
        pieces wrapped in "[CLS]" / "[SEP]", and one label per word piece
        (each piece inherits the label of its source token; the two special
        tokens get 0).
    """
    if isinstance(orig_tokens, str):
        words = orig_tokens.split()
    else:
        # Pre-tokenized input of any sequence type: keep the first 200 tokens.
        words = orig_tokens[0:200]
    bert_tokens = ["[CLS]"]
    new_attentions = [0]
    for word, label in zip(words, orig_attentions):
        pieces = tokenizer.tokenize(word)
        bert_tokens.extend(pieces)
        # Every word piece inherits the label of the token it came from.
        new_attentions.extend([label for _ in pieces])
    bert_tokens.append("[SEP]")
    new_attentions.append(0)
    return bert_tokens, new_attentions


def token_align_float(orig_tokens, orig_attentions, tokenizer):
    """
    tokenize a sentence and generate corresponding attention labels (float)

    # Arguments
        orig_tokens: Either a string, which is split on whitespace, or any
            sliceable sequence of tokens (list, tuple, array, ...), of which
            only the first 200 tokens are used.
        orig_attentions: Per-token attention values, paired with the tokens
            position by position; extra entries on either side are ignored.
        tokenizer: Object whose `tokenize(token)` returns the word pieces of
            one token.

    # Returns
        Tuple `(bert_tokens, new_attentions)` of equal length: the word
        pieces wrapped in "[CLS]" / "[SEP]", and one value per word piece
        (each piece inherits the value of its source token; the two special
        tokens get 0.0).
    """
    if isinstance(orig_tokens, str):
        words = orig_tokens.split()
    else:
        # Pre-tokenized input of any sequence type: keep the first 200 tokens.
        words = orig_tokens[0:200]
    bert_tokens = ["[CLS]"]
    new_attentions = [0.0]
    for word, weight in zip(words, orig_attentions):
        pieces = tokenizer.tokenize(word)
        bert_tokens.extend(pieces)
        # Every word piece inherits the value of the token it came from.
        new_attentions.extend([weight for _ in pieces])
    bert_tokens.append("[SEP]")
    new_attentions.append(0.0)
    return bert_tokens, new_attentions


def token_align_two(orig_tokens, orig_attentions, orig_attentions_for_val, tokenizer):
    """
    tokenize a sentence and generate two corresponding attention labels

    # Arguments
        orig_tokens: Either a string, which is split on whitespace, or any
            sliceable sequence of tokens (list, tuple, array, ...), of which
            only the first 200 tokens are used.
        orig_attentions: First set of per-token attention labels.
        orig_attentions_for_val: Second set of per-token attention labels.
            Tokens and both label sets are paired position by position, and
            iteration stops at the shortest of the three.
        tokenizer: Object whose `tokenize(token)` returns the word pieces of
            one token.

    # Returns
        Tuple `(bert_tokens, new_attentions, new_attentions_val)` of equal
        length: the word pieces wrapped in "[CLS]" / "[SEP]" and, for each
        label set, one label per word piece (special tokens get 0).
    """
    if isinstance(orig_tokens, str):
        words = orig_tokens.split()
    else:
        # Pre-tokenized input of any sequence type: keep the first 200 tokens.
        words = orig_tokens[0:200]
    bert_tokens = ["[CLS]"]
    new_attentions = [0]
    new_attentions_val = [0]
    for word, label, label_val in zip(words, orig_attentions, orig_attentions_for_val):
        pieces = tokenizer.tokenize(word)
        bert_tokens.extend(pieces)
        # Every word piece inherits both labels of the token it came from.
        new_attentions.extend([label for _ in pieces])
        new_attentions_val.extend([label_val for _ in pieces])
    bert_tokens.append("[SEP]")
    new_attentions.append(0)
    new_attentions_val.append(0)
    return bert_tokens, new_attentions, new_attentions_val


def tokenize_with_new_attentions(orig_text, orig_attention_list, max_length, tokenizer, if_float=False):
    """
    Word-piece tokenize a collection of texts and build the matching
    id, attention-label and attention-mask arrays.

    # Arguments
        orig_text: Iterable of texts, each passed to the alignment helper.
        orig_attention_list: Iterable of per-token attention labels,
            paired with `orig_text` position by position.
        max_length: Int, length every row is padded / truncated to (post).
        tokenizer: Tokenizer providing `tokenize` (used by the alignment
            helpers) and `convert_tokens_to_ids`.
        if_float: When equal to True, labels are aligned with
            `token_align_float` and padded as "float"; otherwise
            `token_align` and "long" are used.

    # Returns
        Tuple `(input_ids, attent_labels, attention_masks)`; the mask is 1.0
        wherever the token id is greater than 0 and 0.0 elsewhere.
    """
    use_float = if_float == True  # noqa: E712 -- keeps the original equality test
    align = token_align_float if use_float else token_align
    label_dtype = "float" if use_float else "long"

    aligned = [align(text, labels, tokenizer)
               for text, labels in zip(orig_text, orig_attention_list)]
    id_rows = [tokenizer.convert_tokens_to_ids(pieces) for pieces, _ in aligned]
    label_rows = [labels for _, labels in aligned]

    input_ids = pad_sequences(id_rows, maxlen=max_length, dtype="long",
                              truncating="post", padding="post")
    attent_labels = pad_sequences(label_rows, maxlen=max_length, dtype=label_dtype,
                                  truncating="post", padding="post")
    # Padding positions hold id 0, so "id > 0" marks the real tokens.
    attention_masks = np.array([[float(token_id > 0) for token_id in row]
                                for row in input_ids])
    return input_ids, attent_labels, attention_masks


In [129]:
# Tokenizer used for the word-piece alignment below
# ('bert-base-uncased' vocabulary with lower-casing enabled).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [86]:
# Word-piece tokenize every example of each split and expand its attention
# labels to match, collecting the results per split.
x_token_train, att_token_train = [], []
for orig_x, orig_att in zip(x_train, attention_label_train):
    bert_x, bert_att = token_align(orig_x, orig_att, tokenizer)
    x_token_train.append(bert_x)
    att_token_train.append(bert_att)

# Same procedure for the validation split.
x_token_val, att_token_val = [], []
for orig_x, orig_att in zip(x_val, attention_label_val):
    bert_x, bert_att = token_align(orig_x, orig_att, tokenizer)
    x_token_val.append(bert_x)
    att_token_val.append(bert_att)
    
# Same procedure for the test split.
x_token_test, att_token_test = [], []
for orig_x, orig_att in zip(x_test, attention_label_test):
    bert_x, bert_att = token_align(orig_x, orig_att, tokenizer)
    x_token_test.append(bert_x)
    att_token_test.append(bert_att)

In [87]:
# Longest word-piece sequence in each split; the counts include the
# "[CLS]" and "[SEP]" tokens added by token_align.
max_len_train = max([len(bert_x) for bert_x in x_token_train])
max_len_val = max([len(bert_x) for bert_x in x_token_val])
max_len_test = max([len(bert_x) for bert_x in x_token_test])

In [92]:
# Maximum word-piece lengths for train / val / test.
print(max_len_train, max_len_val, max_len_test)

264 267 258


In [97]:
# Median word-piece sequence length over the test split.
np.median([len(bert_x) for bert_x in x_token_test])

220.0

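After tokenization, we can choose 256 as the maximum length for the movie data. Note that the longest sequences in each split (264, 267 and 258 tokens) exceed this, so a few examples would be slightly truncated.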