In [1]:
import random
import numpy as np
from tqdm import tqdm_notebook

In [2]:
# Load the two pre-computed arrays this notebook works from: the tokenized
# questions (used to build the test pairs) and the extended programs (used to
# build the sentence pool for training pairs).
tokenized_questions, extended_programs = (
    np.load('../data/tokenized_questions.npy'),
    np.load('../data/extended_programs.npy'),
)

In [3]:
# Flatten the nested programs -> episodes -> sentences structure into one flat
# list, keeping the original order.
sentences = [
    sentence
    for program in extended_programs
    for episode in program
    for sentence in episode
]

# generate_training_data builds a positive pair as
# (sentences[i], sentences[i + 3]), so only indices that still have a partner
# three positions ahead are usable start positions. Restricting the candidate
# indices here keeps out-of-range starts from being silently replaced by the
# generator's fallback pair (sentences[0], sentences[3]).
PAIR_OFFSET = 3
shuffle_list = list(range(max(len(sentences) - PAIR_OFFSET, 0)))

# NOTE(review): `random` is not seeded anywhere in the visible cells, so the
# shuffle (and every split derived from it) differs between runs.
random.shuffle(shuffle_list)

In [4]:
def generate_testing_data(questions):
    """Expand question samples into (question, option) pairs.

    Args:
        questions: iterable of indexable samples; element 0 of each sample is
            the question and every following element is one candidate option.

    Returns:
        A list of ``(question, option)`` tuples, in sample order and then in
        option order within each sample.

    Raises:
        IndexError: if a sample is empty (it has no element 0).
    """
    pairs = []
    for sample in questions:
        # Read the question first so an empty sample fails loudly.
        question = sample[0]
        pairs.extend((question, option) for option in sample[1:])
    return pairs

In [5]:
def write_question(x):
    """Write (first, second) token-sequence pairs to ``../data/test.tsv``.

    Each pair becomes one line: the tokens of the first element joined by
    single spaces, a tab, the tokens of the second element joined by single
    spaces, then a newline. Tokens that are exactly a tab character are
    dropped so they cannot be mistaken for the column separator.

    Args:
        x: sequence of pairs; ``pair[0]`` and ``pair[1]`` are each an iterable
            of string tokens.
    """
    def _join_tokens(tokens):
        # Drop bare-tab tokens, then space-join what is left.
        return ' '.join(tok for tok in tokens if tok != '\t')

    with open('../data/test.tsv', 'w', encoding='utf-8') as out:
        for pair in tqdm_notebook(x):
            first_col = _join_tokens(pair[0])
            second_col = _join_tokens(pair[1])
            out.write(first_col + '\t' + second_col + '\n')

In [6]:
def generate_training_data(sentences, num_samples, shuffle_list):
    """Build labelled sentence pairs for training or validation.

    Each sample is, with equal probability, either

    * a positive pair (label ``1``): ``(sentences[i], sentences[i + 3])``,
      where ``i`` is the next start index taken from ``shuffle_list``; or
    * a negative pair (label ``0``): two sentences drawn independently and
      uniformly at random from ``sentences``.

    Start indices are consumed from ``shuffle_list`` in order, and the cursor
    wraps around at the end of ``shuffle_list``, so the list is reused once it
    is exhausted. If a start index has no partner three positions ahead (or
    ``shuffle_list`` is empty), the pair falls back to
    ``(sentences[0], sentences[3])``.

    Args:
        sentences: indexable sequence of sentences.
        num_samples: total number of pairs to generate.
        shuffle_list: sequence of candidate start indices for positive pairs.

    Returns:
        ``(x, y)`` where ``x`` is a list of ``(first, second)`` sentence
        tuples and ``y`` the matching list of ``0``/``1`` labels.
    """
    offset = 3  # distance between the two sentences of a positive pair
    num_starts = len(shuffle_list)
    last_index = len(sentences) - 1

    x, y = [], []
    count = 0  # cursor into shuffle_list
    for _ in tqdm_notebook(range(num_samples)):
        if random.randint(0, 1) < 1:
            try:
                start = shuffle_list[count]
                pair = (sentences[start], sentences[start + offset])
            except IndexError:
                # No valid start, or no partner `offset` positions ahead.
                pair = (sentences[0], sentences[offset])
            x.append(pair)
            y.append(1)
            # Wrap on the length of shuffle_list (the sequence the cursor
            # indexes), not on the length of sentences.
            count = (count + 1) % num_starts if num_starts else 0
        else:
            f = random.randint(0, last_index)
            s = random.randint(0, last_index)
            x.append((sentences[f], sentences[s]))
            y.append(0)

    return x, y

In [7]:
def write_tsv(type_, x, y):
    """Write labelled token-sequence pairs to ``../data/<type_>.tsv``.

    Each ``(pair, label)`` becomes one line: the tokens of ``pair[0]`` joined
    by single spaces, a tab, the tokens of ``pair[1]`` joined by single
    spaces, a tab, ``str(label)``, then a newline. Tokens that are exactly a
    tab character are dropped so they cannot be mistaken for a column
    separator.

    Args:
        type_: split name used as the file stem (e.g. ``'train'``).
        x: iterable of pairs; ``pair[0]`` and ``pair[1]`` are each an iterable
            of string tokens.
        y: iterable of labels, consumed in step with ``x``; iteration stops at
            the shorter of the two.
    """
    # zip() has no length, so without an explicit total the progress bar is
    # indeterminate. Pass one when both inputs are sized.
    try:
        total = min(len(x), len(y))
    except TypeError:
        total = None

    with open('../data/{}.tsv'.format(type_), 'w', encoding='utf-8') as file:
        for xx, yy in tqdm_notebook(zip(x, y), total=total):
            c1 = ' '.join(t for t in xx[0] if t != '\t')
            c2 = ' '.join(t for t in xx[1] if t != '\t')
            file.write(c1 + '\t' + c2 + '\t' + str(yy) + '\n')

In [8]:
# Expand every tokenized question sample into (question, option) pairs.
test_x = generate_testing_data(tokenized_questions)

In [9]:
# Write the (question, option) pairs to ../data/test.tsv, one pair per line.
write_question(test_x)

HBox(children=(IntProgress(value=0, max=3000), HTML(value='')))




In [10]:
# Training split: 4,000,000 samples whose positive pairs take their start
# indices from the first 2,000,000 shuffled indices.
# Validation split: 10,000 samples whose positive pairs take their start
# indices from the remaining shuffled indices, so the two index slices are
# disjoint. Negative pairs in both splits are drawn from all of `sentences`.
# NOTE(review): generate_training_data picks positive/negative with a random
# 0/1 draw, so roughly half of the samples are positives. Presumably that is
# why the training slice is half of num_samples; confirm.
# NOTE(review): assumes `sentences` has well over 2,000,000 entries; otherwise
# the validation slice is empty. Verify against the data.
train_x, train_y = generate_training_data(sentences, 4000000, shuffle_list[:2000000])
valid_x, valid_y = generate_training_data(sentences, 10000, shuffle_list[2000000:])

HBox(children=(IntProgress(value=0, max=4000000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [11]:
# Persist both splits as TSV files: ../data/train.tsv and ../data/dev.tsv,
# one `first<TAB>second<TAB>label` row per sample.
write_tsv('train', train_x, train_y)
write_tsv('dev', valid_x, valid_y)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


