## Augmentation Inspection
This notebook is used to sample the augmented training data and inspect it.

In [92]:
import numpy as np
import os
import tensorflow as tf
import collections
from utils import tokenization
tf.enable_eager_execution()

### Get Paths to the Different Data Augmentation Directories

In [98]:
data_base_path = './Data/proc_Data/GoT/unsup'
prob_factors = np.arange(0.1,0.2,0.1)
copy_number = '0'
data_record_paths = [os.path.join(data_base_path, 'tf_idf-{:0.1f}'.format(x), copy_number, "tf_examples.tfrecord*") for x in prob_factors]
data_files = [tf.contrib.slim.parallel_reader.get_data_files(
          data_record_path) for data_record_path in data_record_paths]

### Feature Specifications is a mapping of the different "columns" of data stored in the tfrecords files.

In [99]:
feature_specs = collections.OrderedDict()
feature_specs["ori_input_ids"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)
feature_specs["ori_input_mask"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)
feature_specs["ori_input_type_ids"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)
feature_specs["aug_input_ids"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)
feature_specs["aug_input_mask"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)
feature_specs["aug_input_type_ids"] = tf.io.FixedLenFeature([max_seq_len], tf.int64)


NameError: name 'max_seq_len' is not defined

### Use the cell below to create mappings of words to ids and ids to words

In [100]:
vocab_file = "./bert_pretrained/bert_base/vocab.txt"

vocab = tokenization.load_vocab(vocab_file)
ids_dict = tokenization.load_ids(vocab)

In [101]:
for i,infile in enumerate(data_files):
    for example in tf.python_io.tf_record_iterator(infile[-1]):
        a = tf.train.Example.FromString(example)
        orig_int_list = [a.features.feature['ori_input_ids'].int64_list.value[i] for i in range(0,128)]
        aug_int_list = [a.features.feature['aug_input_ids'].int64_list.value[i] for i in range(0,128)]
        orig_seq = tokenization.convert_ids_to_words(orig_int_list, ids_dict)
        aug_seq = tokenization.convert_ids_to_words(aug_int_list, ids_dict)
        print("Original Sequence:\n {}\n\n".format(" ".join(orig_seq)))
        print("Augmented Sequence with p={}:\n {}\n\n".format(prob_factors[i], " ".join(aug_seq)))
        break

Original Sequence:
 [CLS] brien ##ne was moving , slow and wary , sword to hand ; step , turn , and listen . each step made a little splash . a cave lion ? dire ##wo ##lves ? some bear ? tell me , jaime . what lives here ? what lives in the darkness ? doom . no bear , he knew . no lion . only doom . in the cool silvery - blue light of the swords , the big wen ##ch looked pale and fierce . i mis ##like this place . i ’ m not fond of it myself . their blades made a little island of light , but all around them stretched a sea of darkness , une ##nding . [SEP] [PAD] [PAD]


Augmented Sequence with p=0.1:
 [CLS] moving , slow and wary , sword to hand ; step , turn , and listen figured bath ##house step made a wrists splash gods ##way a cave lion ? dire ##wo ##lves ? some bear ? tell tap , jaime . what lives here ? what lives in ( darkness trees doom . no bear , he knew . no moaning . only doom . in the cool silvery - blue light of the swords , the big wen ##ch looked pale and fierce . i mis

In [105]:
orig_int_list

[101,
 9848,
 2638,
 2001,
 3048,
 1010,
 4030,
 1998,
 15705,
 1010,
 4690,
 2000,
 2192,
 1025,
 3357,
 1010,
 2735,
 1010,
 1998,
 4952,
 1012,
 2169,
 3357,
 2081,
 1037,
 2210,
 17624,
 1012,
 1037,
 5430,
 7006,
 1029,
 18704,
 12155,
 20899,
 1029,
 2070,
 4562,
 1029,
 2425,
 2033,
 1010,
 14519,
 1012,
 2054,
 3268,
 2182,
 1029,
 2054,
 3268,
 1999,
 1996,
 4768,
 1029,
 12677,
 1012,
 2053,
 4562,
 1010,
 2002,
 2354,
 1012,
 2053,
 7006,
 1012,
 2069,
 12677,
 1012,
 1999,
 1996,
 4658,
 21666,
 1011,
 2630,
 2422,
 1997,
 1996,
 10689,
 1010,
 1996,
 2502,
 19181,
 2818,
 2246,
 5122,
 1998,
 9205,
 1012,
 1045,
 28616,
 10359,
 2023,
 2173,
 1012,
 1045,
 1521,
 1049,
 2025,
 13545,
 1997,
 2009,
 2870,
 1012,
 2037,
 10491,
 2081,
 1037,
 2210,
 2479,
 1997,
 2422,
 1010,
 2021,
 2035,
 2105,
 2068,
 7121,
 1037,
 2712,
 1997,
 4768,
 1010,
 16655,
 15683,
 1012,
 102,
 0,
 0]

In [104]:
aug_int_list

128

In [107]:
a = list(zip(orig_int_list, aug_int_list))

In [108]:
a

[(101, 101),
 (9848, 3048),
 (2638, 1010),
 (2001, 4030),
 (3048, 1998),
 (1010, 15705),
 (4030, 1010),
 (1998, 4690),
 (15705, 2000),
 (1010, 2192),
 (4690, 1025),
 (2000, 3357),
 (2192, 1010),
 (1025, 2735),
 (3357, 1010),
 (1010, 1998),
 (2735, 4952),
 (1010, 6618),
 (1998, 7198),
 (4952, 4580),
 (1012, 3357),
 (2169, 2081),
 (3357, 1037),
 (2081, 12150),
 (1037, 17624),
 (2210, 5932),
 (17624, 4576),
 (1012, 1037),
 (1037, 5430),
 (5430, 7006),
 (7006, 1029),
 (1029, 18704),
 (18704, 12155),
 (12155, 20899),
 (20899, 1029),
 (1029, 2070),
 (2070, 4562),
 (4562, 1029),
 (1029, 2425),
 (2425, 11112),
 (2033, 1010),
 (1010, 14519),
 (14519, 1012),
 (1012, 2054),
 (2054, 3268),
 (3268, 2182),
 (2182, 1029),
 (1029, 2054),
 (2054, 3268),
 (3268, 1999),
 (1999, 1006),
 (1996, 4768),
 (4768, 3628),
 (1029, 12677),
 (12677, 1012),
 (1012, 2053),
 (2053, 4562),
 (4562, 1010),
 (1010, 2002),
 (2002, 2354),
 (2354, 1012),
 (1012, 2053),
 (2053, 22653),
 (7006, 1012),
 (1012, 2069),
 (2069, 12