* Fine processing: convert the tokenized word sequences into token indices

In [1]:
import gzip
import os
import re
import tarfile

from os.path import join as pjoin
from six.moves import urllib

from tensorflow import gfile
from tqdm import tqdm
import numpy as np

from preprocessing.download_preprocess import tokenize

#### Data Environment

In [6]:
# Root directory of the preprocessed SQuAD files, relative to the working directory.
source_dir = pjoin(os.curdir, "data", "squad")

# Per-split path prefixes; file suffixes (e.g. ".context") are appended where the files are used.
train_path, valid_path, dev_path = (
    pjoin(source_dir, split) for split in ("train", "val", "dev")
)

# Vocabulary file.
vocab_path = pjoin(source_dir, "vocab.dat")

In [11]:
# Reserved tokens; they are placed at the head of the vocabulary in this order.
_PAD, _SOS, _UNK = "<pad>", "<sos>", "<unk>"
_START_VOCAB = [_PAD, _SOS, _UNK]

# Ids of the reserved tokens (0, 1, 2), i.e. their positions in _START_VOCAB.
PAD_ID, SOS_ID, UNK_ID = range(len(_START_VOCAB))

#### SQuAD Vocabulary I/O
Build the complete vocabulary token list from the question and paragraph text.

In [12]:
def basic_tokenizer(sentence):
    """Split a sentence into whitespace-separated word tokens.

    :param sentence: raw text, either UTF-8 encoded ``bytes`` (e.g. a line read
        from a file opened in binary mode) or ``str``.
    :return: list of ``str`` tokens; empty for blank input.
    """
    # Accept str as well as bytes. Encoding str first keeps both input types on
    # the same code path, so they are split on the same (ASCII) whitespace and
    # produce identical tokens.
    if isinstance(sentence, str):
        sentence = sentence.encode('utf-8')
    # Fragments returned by split() contain no spaces and are never empty, so no
    # further splitting or filtering is required before decoding.
    return [fragment.decode('utf-8') for fragment in sentence.strip().split()]

def create_vocabulary(vocabulary_path, data_paths, tokenizer):
    """Count the tokens in the data files and write the vocabulary file.

    Does nothing when ``vocabulary_path`` already exists. Otherwise every line
    of every file in ``data_paths`` is tokenized and the vocabulary is written
    one token per line: the reserved tokens of ``_START_VOCAB`` first, followed
    by the observed tokens in order of descending frequency.

    :param vocabulary_path: path of the vocabulary file to create.
    :param data_paths: iterable of text file paths; each line is handed to
        ``tokenizer`` as ``bytes`` (files are opened in binary mode).
    :param tokenizer: callable mapping one line to a list of tokens.
    :return: None
    """
    if gfile.Exists(vocabulary_path):
        return

    print("creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths)))
    counts = {}
    for path in tqdm(data_paths):
        with open(path, mode="rb") as f:
            for line in f:
                for w in tokenizer(line):
                    counts[w] = counts.get(w, 0) + 1

    # A reserved token that also occurs in the data must not be listed twice:
    # a duplicate entry further down the file would give it a second,
    # non-reserved index when the vocabulary is loaded.
    for reserved in _START_VOCAB:
        counts.pop(reserved, None)

    # Most frequent tokens first, directly after the reserved tokens.
    vocab_list = _START_VOCAB + sorted(counts, key=counts.get, reverse=True)
    print("Vocabulary size: %d" % len(vocab_list))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
            vocab_file.write(w + "\n")

def initialize_vocabulary(vocabulary_path):
    """Load a vocabulary file that holds one token per line.

    :param vocabulary_path: path of the vocabulary file.
    :return: ``(vocab, rev_vocab)`` where ``rev_vocab`` is the list of tokens in
        file order and ``vocab`` maps each token to its index in that list.
    :raises ValueError: if ``vocabulary_path`` does not exist.
    """
    if not gfile.Exists(vocabulary_path):
        # Interpolate the path into the message; passing it as a second argument
        # to ValueError would leave the "%s" placeholder unformatted.
        raise ValueError("Vocabulary file %s not found." % vocabulary_path)

    with gfile.GFile(vocabulary_path, mode="r") as f:
        # Remove only the newline so tokens keep any other characters intact.
        rev_vocab = [line.strip('\n') for line in f.readlines()]
    # The line index doubles as the token id.
    vocab = {token: idx for idx, token in enumerate(rev_vocab)}
    return vocab, rev_vocab


In [13]:
# Build and store the vocabulary from the train and val contexts and questions
# (create_vocabulary skips the work when the vocabulary file already exists).
create_vocabulary(
    vocabulary_path=vocab_path,
    data_paths=[
        pjoin(source_dir, file_name)
        for file_name in ("train.context", "train.question",
                          "val.context", "val.question")
    ],
    tokenizer=basic_tokenizer,
)

  0%|          | 0/4 [00:00<?, ?it/s]

creating vocabulary ./data/squad/vocab.dat from data ['./data/squad/train.context', './data/squad/train.question', './data/squad/val.context', './data/squad/val.question']


100%|██████████| 4/4 [00:17<00:00,  4.36s/it]


Vocabulary size: 115365


In [18]:
# Read the vocabulary back from disk: `vocab` maps token -> index and
# `rev_vocab` is the list of tokens in index order.
vocab, rev_vocab = initialize_vocabulary(vocab_path)

In [22]:
def generate_glove(vocab_list, save_path, glove_dir, glove_source,
                   glove_dim=300, size=4e5, random_init=True):
    """
    Build and store the embedding matrix of the vocabulary from GloVe vectors.

    Row ``i`` of the matrix belongs to ``vocab_list[i]``. Each GloVe word is
    looked up in the vocabulary as-is, then capitalized, lower-cased and
    upper-cased; its vector is written to the first variant that is found.
    Rows that never match keep their initial value. Nothing is done when
    ``save_path + ".npz"`` already exists.

    :param vocab_list: list of vocabulary tokens; the position is the row index.
    :param save_path: output path passed to ``np.savez_compressed``; the matrix
        is stored under the key ``glove``.
    :param glove_dir: directory containing the GloVe text file.
    :param glove_source: ``'wiki'`` (glove.6B.<glove_dim>d.txt), ``'crawl_cs'``
        (glove.840B.300d.txt) or ``'crawl_ci'`` (glove.42B.300d.txt).
    :param glove_dim: vector dimensionality; forced to 300 for both crawl sources.
    :param size: expected number of lines in the GloVe file; only used as the
        progress-bar total.
    :param random_init: if True rows start as standard-normal samples,
        otherwise as zeros.
    :return: None
    :raises ValueError: if ``glove_source`` is not one of the supported values.
    """
    if gfile.Exists(save_path + ".npz"):
        return

    if glove_source == 'wiki':
        glove_path = os.path.join(glove_dir, "glove.6B.{}d.txt".format(glove_dim))
    elif glove_source == 'crawl_cs':
        glove_path = os.path.join(glove_dir, "glove.840B.300d.txt")
        glove_dim = 300
    elif glove_source == 'crawl_ci':
        glove_path = os.path.join(glove_dir, "glove.42B.300d.txt")
        glove_dim = 300
    else:
        # Fail early with a clear message instead of an unbound `glove_path` later.
        raise ValueError("Unknown glove_source: %r" % (glove_source,))

    if random_init:
        glove = np.random.randn(len(vocab_list), glove_dim)
    else:
        glove = np.zeros((len(vocab_list), glove_dim))

    # Token -> first row index, mirroring list.index(). A dict lookup is O(1),
    # whereas scanning the list for every GloVe line is quadratic overall.
    row_of = {}
    for idx, token in enumerate(vocab_list):
        row_of.setdefault(token, idx)

    # Rows that received a vector; a set so each vocabulary word counts once.
    filled_rows = set()
    # tqdm expects an integer total (the default 4e5 is a float).
    total = int(size) if size is not None else None
    with open(glove_path, 'r', encoding='utf8') as fh:  # NOTE: encoding='utf8, new addition, may cause problems elsewhere
        for line in tqdm(fh, total=total):
            # Split from the right so that a token which itself contains spaces
            # stays in one piece and only the last glove_dim fields are numbers.
            parts = line.lstrip().rstrip().rsplit(" ", glove_dim)
            word = parts[0]
            vector = list(map(float, parts[1:]))
            # First matching case variant wins, in this fixed order.
            for candidate in (word, word.capitalize(), word.lower(), word.upper()):
                idx = row_of.get(candidate)
                if idx is not None:
                    glove[idx, :] = vector
                    filled_rows.add(idx)
                    break

    found = len(filled_rows)
    print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path))
    np.savez_compressed(save_path, glove=glove)
    print("saved trimmed glove matrix at: {}".format(save_path))

In [23]:
# Store the GloVe word-embedding matrix, trimmed to the vocabulary, under glove.trimmed.300.
generate_glove(
    vocab_list=rev_vocab,
    save_path=source_dir + "/glove.trimmed.300",
    glove_dir=source_dir,
    glove_source='wiki',
)

100%|██████████| 400000/400000.0 [35:59<00:00, 185.25it/s]


71734/115365 of word vocab have corresponding vectors in ./data/squad/glove.6B.300d.txt
saved trimmed glove matrix at: ./data/squad/glove.trimmed.300


#### create the {train | dev | val} token id dataset
* Training Set

In [27]:
def sentence_to_token_ids(sentence, vocabulary, tokenizer):
    """Tokenize a sentence and map every token to its vocabulary id.

    :param sentence: raw sentence, passed unchanged to ``tokenizer``.
    :param vocabulary: mapping from token to integer id.
    :param tokenizer: callable returning the list of tokens of a sentence.
    :return: list of ids; tokens missing from ``vocabulary`` become ``UNK_ID``.
    """
    token_ids = []
    for token in tokenizer(sentence):
        token_ids.append(vocabulary.get(token, UNK_ID))
    return token_ids

def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer):
    """Convert a text file into a file of space-separated token ids, line by line.

    Nothing happens when ``target_path`` already exists. Progress is printed
    every 5000 input lines.

    :param data_path: input text file, read in binary mode.
    :param target_path: output file; one line of ids per input line.
    :param vocabulary_path: vocabulary file loaded with ``initialize_vocabulary``.
    :param tokenizer: callable used to split each line into tokens.
    :return: None
    """
    if gfile.Exists(target_path):
        return

    print("Tokenizing data in %s" % data_path)
    token_to_id, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file, \
            gfile.GFile(target_path, mode="w") as tokens_file:
        for line_no, line in enumerate(data_file, start=1):
            if line_no % 5000 == 0:
                print("tokenizing line %d" % line_no)
            ids = sentence_to_token_ids(line, token_to_id, tokenizer)
            tokens_file.write(" ".join(map(str, ids)) + "\n")

In [28]:
# Token-id files for the training contexts and questions.
x_train_dis_path = train_path + ".ids.context"
y_train_ids_path = train_path + ".ids.question"

data_to_token_ids(
    data_path=train_path + ".context",
    target_path=x_train_dis_path,
    vocabulary_path=vocab_path,
    tokenizer=basic_tokenizer,
)
data_to_token_ids(
    data_path=train_path + ".question",
    target_path=y_train_ids_path,
    vocabulary_path=vocab_path,
    tokenizer=basic_tokenizer,
)

Tokenizing data in ./data/squad/train.context
tokenizing line 5000
tokenizing line 10000
tokenizing line 15000
tokenizing line 20000
tokenizing line 25000
tokenizing line 30000
tokenizing line 35000
tokenizing line 40000
tokenizing line 45000
tokenizing line 50000
tokenizing line 55000
tokenizing line 60000
tokenizing line 65000
tokenizing line 70000
tokenizing line 75000
tokenizing line 80000
Tokenizing data in ./data/squad/train.question
tokenizing line 5000
tokenizing line 10000
tokenizing line 15000
tokenizing line 20000
tokenizing line 25000
tokenizing line 30000
tokenizing line 35000
tokenizing line 40000
tokenizing line 45000
tokenizing line 50000
tokenizing line 55000
tokenizing line 60000
tokenizing line 65000
tokenizing line 70000
tokenizing line 75000
tokenizing line 80000


* Val Set

In [29]:
# Token-id files for the validation contexts and questions.
x_dis_path = valid_path + ".ids.context"
y_ids_path = valid_path + ".ids.question"

data_to_token_ids(
    data_path=valid_path + ".context",
    target_path=x_dis_path,
    vocabulary_path=vocab_path,
    tokenizer=basic_tokenizer,
)
data_to_token_ids(
    data_path=valid_path + ".question",
    target_path=y_ids_path,
    vocabulary_path=vocab_path,
    tokenizer=basic_tokenizer,
)

Tokenizing data in ./data/squad/val.context
Tokenizing data in ./data/squad/val.question
