# Data processing for the reading comprehension model paradigm

This is the data processing for reading comprehension paradigm. 
For each sentence: 
- BERT sequence tokenization 
- Tokenization for glove embeddings (NLTK)
- char sequence tokenization 
- POS tag tokenization

For Glove Embedding, you need to download the original glove embedding file from their official website. 


In [4]:
from transformers import BertTokenizer
import json 
from tqdm import tqdm
from nltk.tokenize import word_tokenize
import nltk
import numpy as np

In [5]:
# I/O Helper functions

def read_json_lines(path_to_file): 
    with open(path_to_file) as f:
        content = f.readlines()
    f.close()
    raw_data  = [json.loads(x) for x in content] 
    return raw_data

def read_json_file(path):
    with open(path, 'r') as f:
        return json.load(f)
    
def write_json_file(path, data):
    with open(path, 'w') as f:
        json.dump(data, f)
    return

In [9]:
# Settings 
split = 'random'
DATA_NAME = 'MAGPIE'
# Input Paths
PATH_TO_RAW_DATA = '../../data/MAGPIE/raw/MAGPIE_filtered_split_{}_raw_processed.json'.format(split)

# Output paths
PATH_TO_SAVE_DATA = '../../data/{}/processed/{}_{}_read_comp_data.json'.format(DATA_NAME, DATA_NAME, split)
PATH_TO_SAVE_TARGET_VOCAB = '../../data/{}/processed/{}_read_comp_target_vocab.json'.format(DATA_NAME, DATA_NAME)
PATH_TO_SAVE_GLOVE_VOCAB = '../../data/{}/processed/{}_{}_read_comp_glove_vocab.json'.format(DATA_NAME, DATA_NAME, split)
PATH_TO_SAVE_CHAR_VOCAB = '../../data/{}/processed/{}_{}_read_comp_char_vocab.json'.format(DATA_NAME, DATA_NAME, split)
PATH_TO_SAVE_GLOVE_EMB = '../../data/{}/processed/{}_{}_read_comp_glove_embed.npy'.format(DATA_NAME, DATA_NAME, split)
PATH_TO_SAVE_POS_VOCAB = '../../data/{}/processed/{}_{}_read_comp_pos_vocab.json'.format(DATA_NAME, DATA_NAME, split)
PATH_TO_SAVE_DATA_IDX = '../../data/{}/processed/{}_{}_read_comp_data_idx.json'.format(DATA_NAME, DATA_NAME, split)


# Other settings 
max_seq_len = 50

## 1. Construct dictionaries


In [10]:
target_vocab = {
    '<PAD>': 0,
    '<s>': 1, 
    '<e>': 2,
    '<l>': 3,
    '<i>': 4
}

## 2. Data contruction and sentence tokenization 

In [11]:
# initialize tokenizer
# model parameters
pretrained_model_name = 'bert-base-uncased'
# tokenizer parameters
do_lower_case = True
# model and tokenizer initialization
tokenizer = BertTokenizer.from_pretrained(pretrained_model_name, 
                                          do_lower_case = do_lower_case)

# load processed data 
raw_data = read_json_file(PATH_TO_RAW_DATA)

In [12]:
raw_data['train'][12]

{'label': 'l',
 'split': 'training',
 'id': 13,
 'genre': 'W fict prose',
 'idiom': 'in the hole',
 'offsets': [[27, 29], [32, 36]],
 'sentence': "At least back when I lived in a hole I knew everything there was to know about living in a hole , and now it 's a year later and I 'm at a place so far away I do n't even know how far away it is , watching something I do n't understand go to a place so far up there is no down ."}

In [13]:
def process_source_and_target_sequence(raw_data, new_data, data_indices): 
    num_exceptions = 0
    for data_entry in tqdm(raw_data): 
        # ==============================================================================
        # STRING PRE-PROCESSING
        # ------------------------------------------------------------------------------
        try: 
            # extract raw source and target sentence
            source_sentence = data_entry['sentence'].lower()
            if len(source_sentence.split()) > max_seq_len: 
                continue 
            label = data_entry['label']
            data_id = data_entry['id']
            
            offsets = [data_entry['offsets'][0][0], data_entry['offsets'][-1][0]]

            while source_sentence[offsets[0]] != " " and offsets[0] != -1: 
                offsets[0] -= 1
            offsets[0] += 1
            while offsets[1] != len(source_sentence) and source_sentence[offsets[1]] not in [" ", ",", ".", "’"]: 
                offsets[1] += 1

            if source_sentence[offsets[0]] in ["‘", ","]: 
                offsets[0] += 1
            if source_sentence[offsets[1]-1] in ["’", "!", '?', ')']: 
                offsets[1] -= 1

            idiom_in_sentence = ''.join(source_sentence[offsets[0]: offsets[1]].split())
            target_sentence = source_sentence[: offsets[0]] + ''.join([' [SEP]' for i in source_sentence[offsets[0]: offsets[1]].split()]) + source_sentence[offsets[1]:]

            def apply_contraction_change(s): 
                s = s.replace(" n't", "n't")
                s = s.replace("\n", "")
                s = s.replace("‘", " ‘ ")
                s = s.replace("’", " ’ ")
                s = s.replace(",", " , ")
                s = s.replace(".", " . ")
                s = s.replace('?', ' ? ')
                s = s.replace('!', ' ! ')
                s = s.replace('-', ' - ')
                return s
            # apply contraction 
            source_sentence = apply_contraction_change(source_sentence)   
            target_sentence = apply_contraction_change(target_sentence) 


            # ==============================================================================
            # TOKENIZATION
            # ------------------------------------------------------------------------------

            # BERT TOKENIZATION 
            # ------------------------------------------------
            # process source sequence 
            source_sequence = tokenizer.batch_encode_plus([source_sentence])['input_ids'][0]
            target_sequence = tokenizer.batch_encode_plus([target_sentence])['input_ids'][0]

            # GLOVE TOKENIZATION
            # ------------------------------------------------
            # add start and end symbols
            source_sentence_glove_tknz =  word_tokenize(source_sentence)

            # add start and end symbols
            tags_tokens = nltk.pos_tag(source_sentence_glove_tknz)
            source_sentence_glove_tknz =[t[0] for t in tags_tokens]
            source_sentence_pos_taggs = [t[1] for t in tags_tokens]
            source_sentence_glove_tknz = ['<S>'] + source_sentence_glove_tknz + ['<E>']
            source_sentence_pos_taggs = ['<S>'] + source_sentence_pos_taggs + ['<E>']

            # STRING TOKENIZATION
            # ------------------------------------------------
            source_sentence_char_tknz = []
            for word in source_sentence_glove_tknz: 
                if word in ['<S>', '<E>']: 
                    source_sentence_char_tknz.append(['<SPEC>'])
                else: 
                    source_sentence_char_tknz.append(list(word))
            #print(source_sentence_char_tknz)
            #raise

            # ==============================================================================
            # PROCESS TARGET SEQUENCE
            # ------------------------------------------------------------------------------

            # Generate target sentence
            # identify the start and end indices 
            target_indices = []
            for t_idx, t in enumerate(target_sequence): 
                if tokenizer._convert_id_to_token(t) == '[SEP]': 
                    target_indices.append(t_idx)
            target_indices = [min(target_indices[:-1]), max(target_indices[:-1])]
            idiom_in_target = ''.join([tokenizer._convert_id_to_token(t) for t in source_sequence[target_indices[0]: target_indices[1]+1]]).replace('#', '')
            while idiom_in_target != idiom_in_sentence: 
                target_indices[1] += 1
                idiom_in_target = ''.join([tokenizer._convert_id_to_token(t) for t in source_sequence[target_indices[0]: target_indices[1]+1]]).replace('#', '')
                if target_indices[1] > len(source_sequence): 
                    raise

            # process target sequence
            target_sequence = [target_vocab['<s>']] + \
                              [target_vocab['<l>'] for i in source_sequence[1:-1]] + \
                              [target_vocab['<e>']]
            if label == 'i': 
                for t_idx in range(target_indices[0], target_indices[1]+1): 
                    target_sequence[t_idx] = target_vocab['<i>']
                # print([tokenizer._convert_id_to_token(source_sequence[t_idx]) for t_idx, t in enumerate(target_sequence) if t == target_vocab['<i>']])
            try: 
                # assert len(source_sequence) == len(source_sentence_glove_tknz)
                assert len(source_sequence) == len(target_sequence)
            except: 
                # print(' '.join(source_sentence_bert_tknz))
                print(' '.join(source_sentence_glove_tknz))
                raise
            new_data_entry = [source_sequence, source_sentence_glove_tknz, source_sentence_char_tknz, source_sentence_pos_taggs, target_sequence, label]
            new_data.append(new_data_entry)
            data_indices.append(data_id)
        except: 
            num_exceptions += 1
        
    print('Final Number of Exceptions: {}'.format(num_exceptions)) 

            


In [14]:
# data processing 
processed_data = {'train': [], 'valid': [], 'test': []}
data_indices = {'train': [], 'valid': [], 'test': []}
process_source_and_target_sequence(raw_data['train'], processed_data['train'], data_indices['train'])
process_source_and_target_sequence(raw_data['test'], processed_data['valid'], data_indices['valid'])

print('Final Number of train data: {}'.format(len(processed_data['train'])))
print('Final Number of test data: {}'.format(len(processed_data['test'])))
print('Final Number of valid data: {}'.format(len(processed_data['valid'])))

100%|██████████| 35533/35533 [00:48<00:00, 733.25it/s]
  3%|▎         | 153/4451 [00:00<00:05, 735.65it/s]

Final Number of Exceptions: 25


100%|██████████| 4451/4451 [00:06<00:00, 697.29it/s]

Final Number of Exceptions: 4
Final Number of train data: 32162
Final Number of test data: 0
Final Number of valid data: 4030





In [9]:
write_json_file(PATH_TO_SAVE_DATA_IDX, data_indices)

In [8]:
processed_data['train'][12]

[[101,
  1998,
  2059,
  1010,
  2065,
  2673,
  2743,
  2995,
  2000,
  2433,
  1010,
  1996,
  6151,
  8067,
  5999,
  5726,
  2015,
  2052,
  9498,
  2091,
  2000,
  1037,
  4899,
  4873,
  2041,
  1997,
  4356,
  1012,
  102],
 ['<S>',
  'and',
  'then',
  ',',
  'if',
  'everything',
  'ran',
  'true',
  'to',
  'form',
  ',',
  'the',
  'undamaged',
  'clays',
  'would',
  'sail',
  'down',
  'to',
  'a',
  'landing',
  'somewhere',
  'out',
  'of',
  'sight',
  '.',
  '<E>'],
 [['<SPEC>'],
  ['a', 'n', 'd'],
  ['t', 'h', 'e', 'n'],
  [','],
  ['i', 'f'],
  ['e', 'v', 'e', 'r', 'y', 't', 'h', 'i', 'n', 'g'],
  ['r', 'a', 'n'],
  ['t', 'r', 'u', 'e'],
  ['t', 'o'],
  ['f', 'o', 'r', 'm'],
  [','],
  ['t', 'h', 'e'],
  ['u', 'n', 'd', 'a', 'm', 'a', 'g', 'e', 'd'],
  ['c', 'l', 'a', 'y', 's'],
  ['w', 'o', 'u', 'l', 'd'],
  ['s', 'a', 'i', 'l'],
  ['d', 'o', 'w', 'n'],
  ['t', 'o'],
  ['a'],
  ['l', 'a', 'n', 'd', 'i', 'n', 'g'],
  ['s', 'o', 'm', 'e', 'w', 'h', 'e', 'r', 'e'],
  [

## 3. Construct Glove dictionary and convert sequence to indices

In [15]:
def get_glove_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        vocab.extend(example[1])  # index 1 is the source sentence tkn for glove
    vocab = set(vocab)
    vocab.remove('<S>')
    vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_glove_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<S>': 2, '<E>': 3}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<S>', 3: '<E>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word

In [16]:
glove_vocab =  get_glove_vocab(processed_data['train'] + processed_data['valid'])

vocab size:  38660


In [17]:
word2idx_glove, idx2word_glove = get_glove_word2idx_idx2word(glove_vocab)
len(word2idx_glove.keys())

38664

In [18]:
glove_vocab = list(glove_vocab)
glove_tknz_idx = 1
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[glove_tknz_idx]: 
        toknz_sent.append(word2idx_glove[w])
    d[glove_tknz_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[glove_tknz_idx]: 
        toknz_sent.append(word2idx_glove[w])
    d[glove_tknz_idx] = toknz_sent
    processed_data['valid'][i] = d

## 4. Construct Char dictionary

In [19]:
char_tknz_idx = 2
def get_char_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        #print(example)
        for cs in example[char_tknz_idx]: 
         #   print(cs)
            vocab.extend(cs)  # index 2 is the source sentence tkn for char tokens
        
    vocab = set(vocab)

    vocab.remove('<SPEC>')
    # vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_char_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<SPEC>': 2}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<SPEC>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word

In [20]:
char_vocab =  get_char_vocab(processed_data['train'] + processed_data['valid'])


vocab size:  117


In [21]:
word2idx_char, idx2word_char = get_char_word2idx_idx2word(char_vocab)


In [22]:
char_vocab = list(char_vocab)
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[char_tknz_idx]:
        toknz_word = []
        for c in w: 
            toknz_word.append(word2idx_char[c])
        toknz_sent.append(toknz_word)
    d[char_tknz_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[char_tknz_idx]:
        toknz_word = []
        for c in w: 
            toknz_word.append(word2idx_char[c])
        toknz_sent.append(toknz_word)
    d[char_tknz_idx] = toknz_sent
    processed_data['valid'][i] = d

In [23]:
processed_data['train'][12]

[[101,
  1998,
  2059,
  1010,
  2065,
  2673,
  2743,
  2995,
  2000,
  2433,
  1010,
  1996,
  6151,
  8067,
  5999,
  5726,
  2015,
  2052,
  9498,
  2091,
  2000,
  1037,
  4899,
  4873,
  2041,
  1997,
  4356,
  1012,
  102],
 [2,
  14992,
  21728,
  17443,
  11563,
  22365,
  21909,
  15271,
  9121,
  30573,
  17443,
  5743,
  25078,
  14109,
  11117,
  20119,
  16058,
  9121,
  11277,
  4135,
  14265,
  3083,
  6552,
  19598,
  31420,
  3],
 [[2],
  [79, 72, 71],
  [37, 33, 117, 72],
  [101],
  [64, 48],
  [117, 107, 117, 50, 43, 37, 33, 64, 72, 116],
  [50, 79, 72],
  [37, 50, 94, 117],
  [37, 105],
  [48, 105, 50, 23],
  [101],
  [37, 33, 117],
  [94, 72, 71, 79, 23, 79, 116, 117, 71],
  [17, 61, 79, 43, 73],
  [104, 105, 94, 61, 71],
  [73, 79, 64, 61],
  [71, 105, 104, 72],
  [37, 105],
  [79],
  [61, 79, 72, 71, 64, 72, 116],
  [73, 105, 23, 117, 104, 33, 117, 50, 117],
  [105, 94, 37],
  [105, 48],
  [73, 64, 116, 33, 37],
  [29],
  [2]],
 ['<S>',
  'CC',
  'RB',
  ',',
  

## 4. Construct POS tag dictionary 


In [24]:
pos_tag_idx = 3

def get_pos_vocab(raw_dataset):
    """
    return vocab set, and prints out the vocab size
    :param raw_dataset: a list of lists: each inner list is a triple:
                a sentence: string
                a list of labels:
                a list of pos:
    :return: a set: the vocabulary in the raw_dataset
    """
    vocab = []
    for example in raw_dataset:
        vocab.extend(example[pos_tag_idx])  # index 3 is the source sentence tkn - pos tags 
    vocab = set(vocab)
    vocab.remove('<S>')
    vocab.remove('<E>')
    print("vocab size: ", len(vocab))
    return vocab

def get_pos_word2idx_idx2word(vocab):
    """
    :param vocab: a set of strings: vocabulary
    :return: word2idx: string to an int
             idx2word: int to a string
    """
    word2idx = {"<PAD>": 0, "<UNK>": 1, '<S>': 2, '<E>': 3}
    idx2word = {0: "<PAD>", 1: "<UNK>", 2: '<S>', 3: '<E>'}
    for word in vocab:
        assigned_index = len(word2idx)
        word2idx[word] = assigned_index
        idx2word[assigned_index] = word
    return word2idx, idx2word

In [25]:
pos_vocab =  get_pos_vocab(processed_data['train'] + processed_data['valid'])
word2idx_pos, idx2word_pos = get_pos_word2idx_idx2word(pos_vocab)


vocab size:  44


In [26]:
pos_vocab = list(pos_vocab)
# replace the words with indices 
for i, d in enumerate(processed_data['train'] ): 
    toknz_sent = []
    for w in d[pos_tag_idx]: 
        toknz_sent.append(word2idx_pos[w])
    d[pos_tag_idx] = toknz_sent
    processed_data['train'][i] = d
    
for i, d in enumerate(processed_data['valid'] ): 
    toknz_sent = []
    for w in d[pos_tag_idx]: 
        toknz_sent.append(word2idx_pos[w])
    d[pos_tag_idx] = toknz_sent
    processed_data['valid'][i] = d

## 5.  create glove embedding layer weights


In [27]:
import mmap
PATH_TO_STATIC_GLOVE_EMBEDDINGS = '/home/zzeng/workspace/UIUC_research/IdiomDetection/data/resources/glove.840B.300d.txt'
GLOVE_EMBEDDING_DIM = 300
GLOVE_EMBED_NORAM = False

In [28]:
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    return lines

In [29]:
glove_vectors = {}
print('Loading Pre-trained GLOVE word Embeddings...')
with open(PATH_TO_STATIC_GLOVE_EMBEDDINGS) as glove_file:
    for line in tqdm(glove_file, total=get_num_lines(PATH_TO_STATIC_GLOVE_EMBEDDINGS)):
        split_line = line.rstrip().split()
        word = split_line[0]
        if len(split_line) != (GLOVE_EMBEDDING_DIM + 1) or word not in word2idx_glove:
            continue
        assert (len(split_line) == GLOVE_EMBEDDING_DIM + 1)
        vector = np.array([float(x) for x in split_line[1:]], dtype="float32")
        if GLOVE_EMBED_NORAM:
            vector = vector / np.linalg.norm(vector)
        assert len(vector) == GLOVE_EMBEDDING_DIM
        glove_vectors[word] = vector
print("Number of pre-trained word vectors loaded: ", len(glove_vectors))



Loading Pre-trained GLOVE word Embeddings...


100%|██████████| 2196017/2196017 [00:25<00:00, 87659.94it/s]

Number of pre-trained word vectors loaded:  35041





In [30]:
all_embeddings = np.array(list(glove_vectors.values()))
embeddings_mean = float(np.mean(all_embeddings))
embeddings_stdev = float(np.std(all_embeddings))
print("Embeddings mean: ", embeddings_mean)
print("Embeddings stdev: ", embeddings_stdev)

# Randomly initialize an embedding matrix of (vocab_size, embedding_dim) shape
# with a similar distribution as the pretrained embeddings for words in vocab.
vocab_size = len(word2idx_glove)
embedding_matrix = np.random.normal(embeddings_mean, embeddings_stdev, size=(vocab_size, GLOVE_EMBEDDING_DIM))
# Go through the embedding matrix and replace the random vector with a
# pretrained one if available. Start iteration at 2 since 0, 1 are PAD, UNK
hit, miss = 0, 0 
for i in range(2, vocab_size):
    word = idx2word_glove[i]
    if word in glove_vectors:
        hit += 1
        embedding_matrix[i] = np.array(glove_vectors[word])
    else: 
        miss += 1
        
if GLOVE_EMBED_NORAM:
    for i in range(vocab_size):
        embedding_matrix[i] = embedding_matrix[i] / float(np.linalg.norm(embedding_matrix[i]))

print('Glove Embedding shape: ')
print(embedding_matrix.shape)
print('Hit ratio: {}'.format(hit/(hit + miss)))

Embeddings mean:  -0.003942171111702919
Embeddings stdev:  0.39030006527900696
Glove Embedding shape: 
(38664, 300)
Hit ratio: 0.9063421447416068


## SAVE THINGS

In [25]:
write_json_file(PATH_TO_SAVE_DATA, processed_data)
write_json_file(PATH_TO_SAVE_TARGET_VOCAB, target_vocab)
write_json_file(PATH_TO_SAVE_GLOVE_VOCAB, idx2word_glove)
write_json_file(PATH_TO_SAVE_CHAR_VOCAB, idx2word_char)
write_json_file(PATH_TO_SAVE_POS_VOCAB, idx2word_pos)

np.save(PATH_TO_SAVE_GLOVE_EMB, embedding_matrix)