In [1]:
import json
import pickle
import numpy as np

## Prepare json file for attention extraction

In [2]:
# Input: text file with the sentences, read one per line further below.
sentences_file = '../data/europarl/intersect.en.tok.test'
# Output: JSON file listing the sentences, used as input for attention extraction.
json_file = '../BertAA/entest_4bert.json'

In [3]:
# Read the sentences (one per line) and wrap each as {'text': <sentence>}.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default (assumes the corpus file is UTF-8 — TODO confirm).
with open(sentences_file, 'r', encoding='utf-8') as infile:
    in_data = [{'text': line.strip()} for line in infile]

# Dump the whole list as a single JSON array.
with open(json_file, 'w', encoding='utf-8') as outfile:
    json.dump(in_data, outfile)

In [4]:
# Sanity check: number of sentences read from the input file.
len(in_data)

1000

## Load attention matrices

In [5]:
# Pickle file holding the extracted attention maps (loaded below).
attention_pickle = '../BertAA/entest_4bert_attn.pkl'
def load_pickle(fname, encoding="ASCII"):
    """Load and return the object stored in the pickle file `fname`.

    Parameters
    ----------
    fname : path of the pickle file; it is opened in binary mode.
    encoding : forwarded to `pickle.load`. The default "ASCII" is the
        `pickle.load` default, so existing one-argument calls behave as
        before. Pass "latin1" when reading a pickle that was written by
        Python 2 (e.g. downloaded data).

    Returns
    -------
    The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open(fname, "rb") as f:
        return pickle.load(f, encoding=encoding)

# BERT attention maps read from `attention_pickle`.
# NOTE(review): this comment previously said "extracted from Wikipedia"; the
# file name suggests the maps belong to the Europarl test sentences — confirm.
# Data is a list of dicts of the following form:
# {
#    "tokens": list of strings
#    "attns": [n_layers, n_heads, n_tokens, n_tokens]
#             tensor of attention weights
# }
data = load_pickle(attention_pickle)
n_docs = len(data)

In [17]:
# Inspect the token list of one example (index 214 appears to be an arbitrary spot check).
data[214]['tokens']

['[CLS]',
 '(',
 'it',
 ')',
 'mr',
 'president',
 ',',
 'i',
 'take',
 'the',
 'floor',
 'to',
 'express',
 'my',
 'satisfaction',
 'at',
 'the',
 'widespread',
 'support',
 'the',
 'assembly',
 'has',
 'given',
 'to',
 'the',
 'motion',
 'that',
 'we',
 'have',
 'put',
 'forward',
 'with',
 'mr',
 'pan',
 '##nell',
 '##a',
 'and',
 'mr',
 'on',
 '##ys',
 '##kiewicz',
 ',',
 'the',
 'motion',
 'does',
 'something',
 'different',
 'from',
 'what',
 'we',
 'heard',
 'from',
 'mrs',
 'ferrer',
 '##owa',
 '##ld',
 '##ner',
 'today',
 ';',
 'that',
 'is',
 ',',
 'it',
 'takes',
 'sides',
 ':',
 'the',
 'side',
 'of',
 'the',
 'search',
 'for',
 'truth',
 ',',
 'for',
 'the',
 'real',
 'reasons',
 'why',
 'talks',
 'broke',
 'down',
 'between',
 'the',
 'chinese',
 'and',
 'the',
 'tibetan',
 '##s',
 ',',
 'rather',
 'than',
 'looking',
 'at',
 'this',
 'from',
 'a',
 'neutral',
 'stand',
 '##point',
 ',',
 'as',
 'the',
 'commission',
 'and',
 'the',
 'council',
 'unfortunately',
 'continu

## Processing of attention matrices
* The row and column corresponding to the `[CLS]` token are deleted.
* The row and column corresponding to `[SEP]` are kept, so the format matches the attention matrices obtained in the previous experiments.
* Tokenization is changed to `BPE`. `[CLS]` and `[SEP]` tokens are deleted.
* Tokens are saved in a text file. One line per sentence.

In [9]:
# Output archive for the processed attention matrices (written with np.savez below).
output_attention = '../BertAA/attentions.npz'
# Output text file for the converted tokens, one sentence per line.
output_tokens = '../BertAA/source.txt'

In [10]:
def to_bpe(tokens):
    """Rewrite `##`-prefixed continuation pieces in BPE `@@` notation.

    The tokens are joined with single spaces, every ' ##' is replaced by
    '@@ ' (so the marker moves from the start of a continuation piece to
    the end of the piece before it), and the string is split on single
    spaces again. For example ['pan', '##nell', '##a'] becomes
    ['pan@@', 'nell@@', 'a'].

    Note: an empty token list yields [''] because ''.split(' ') == [''].
    """
    joined = ' '.join(tokens)
    return joined.replace(' ##', '@@ ').split(' ')

In [11]:
# Sentences: convert each example's tokens to `@@` notation, drop the first
# and last token ([CLS] and [SEP]) and join the rest with spaces.
sentences = [' '.join(to_bpe(example['tokens'])[1:-1]) for example in data]

# Attention: drop index 0 ([CLS]) along the last two axes; the final
# position ([SEP]) is kept.
attentions = [example['attns'][:, :, 1:, 1:] for example in data]
    

In [12]:
# Store every attention array in a single .npz archive. Arrays passed
# positionally are saved under numpy's automatic names arr_0, arr_1, ...
with open(output_attention, 'wb') as attn_file:
    np.savez(attn_file, *attentions)

# Write one sentence per line (no newline after the last sentence). The
# encoding is explicit so the file does not depend on the locale default.
with open(output_tokens, 'w', encoding='utf-8') as src_file:
    src_file.write('\n'.join(sentences))
        

In [15]:
# Spot check: the converted sentence for the same example index (214) inspected earlier.
sentences[214]

'( it ) mr president , i take the floor to express my satisfaction at the widespread support the assembly has given to the motion that we have put forward with mr pan@@ nell@@ a and mr on@@ ys@@ kiewicz , the motion does something different from what we heard from mrs ferrer@@ owa@@ ld@@ ner today ; that is , it takes sides : the side of the search for truth , for the real reasons why talks broke down between the chinese and the tibetan@@ s , rather than looking at this from a neutral stand@@ point , as the commission and the council unfortunately continue to do , as if it were enough for us to simply hope for dialogue between two parties .'