In [1]:
"""
4/13: Generate fake numpy files for testing.
"""


import numpy as np
import os
import sys
sys.path.append("../src") 

In [28]:
import math
import random


# Output location: each dataset is written to <data_path>/<data_name>/.
data_path = "../data/"
data_name = "auto-last-toy"
# makedirs(exist_ok=True) also creates a missing "../data/" parent and is a
# no-op when the directory already exists, so this cell is safe to re-run.
os.makedirs(os.path.join(data_path, data_name), exist_ok=True)

# Seed the stdlib RNG so that the random.shuffle calls used while building the
# datasets give the same result on every Restart & Run All.
SEED = 0
random.seed(SEED)

# Token-id layout used by the generated sequences.
max_length = 13  # maximum sequence length
vocab_size = 2   # number of vocabulary tokens; their ids are EOS + 1 .. EOS + vocab_size
PAD = 0          # filler id used to pad rows to a common width
SOS = 1          # id placed at the start of a sequence
EOS = 2          # id placed at the end of a sequence
DUMMY = EOS + vocab_size + 1  # first id after the vocabulary tokens (5 with the values above)


def get_random(length, candidates=(3, 4)):
    """Enumerate every sequence of `length` tokens drawn from `candidates`.

    Despite the name, nothing here is random: the full Cartesian product is
    returned in a fixed order (the first candidate varies slowest).

    Args:
        length: Number of tokens in each sequence; must be >= 0.
        candidates: Tokens that may appear at each position.

    Returns:
        A list of ``len(candidates) ** length`` lists. ``length == 0`` yields
        ``[[]]`` (one empty sequence). Each inner list is a fresh object.

    Raises:
        ValueError: If `length` is negative (the previous recursive version
            never terminated in that case).
    """
    # Local import keeps this definition self-contained within the cell.
    from itertools import product

    if length < 0:
        raise ValueError("length must be non-negative, got %r" % (length,))
    return [list(combo) for combo in product(candidates, repeat=length)]


def turn_to_same_number(lst, target_size=1000):
    """Return a shuffled list of `target_size` items drawn from `lst`.

    If `lst` is shorter than `target_size`, it is repeated a power-of-two
    number of times (the smallest that reaches `target_size`), so every item
    is duplicated equally often before sampling. The pool is then shuffled
    with the global `random` state and truncated. `lst` itself is not
    modified; the returned items are the same objects as in `lst`.

    Args:
        lst: Items to sample from.
        target_size: Number of items to return; expected to be non-negative.

    Returns:
        A new list holding `target_size` items in random order.

    Raises:
        ValueError: If `lst` is empty while `target_size` is positive, since
            no amount of repetition can reach the requested size.
    """
    pool = list(lst)
    if len(pool) < target_size:
        if not pool:
            raise ValueError(
                "cannot draw %r items from an empty list" % (target_size,))
        # Exact integer arithmetic: ceil(target_size / len) and ceil(log2(.)).
        # Floating-point log can overshoot for large powers of two.
        copies_needed = -(-target_size // len(pool))
        doublings = (copies_needed - 1).bit_length()
        pool = pool * (1 << doublings)
    random.shuffle(pool)
    return pool[:target_size]


# Sanity check: with the two default symbols, 3-token sequences give 8
# candidates, and resampling should return exactly the requested count,
# whether that is larger or smaller than the candidate pool.
candidates = get_random(3)
print(len(candidates))
for requested in (30, 20, 15, 5):
    a = turn_to_same_number(candidates, requested)
    print(len(a))

8
30
20
15
5


In [29]:
# Autoenc-last
# encoder = [0, 1, x, x, 3, 2]
# decoder = [1, 5, 5, 3, 2, 0]
# Encoder rows are left-padded with PAD: SOS, a prefix of (length - 1) vocab
# tokens, the final token, EOS. Decoder rows are right-padded: SOS, DUMMY in
# every prefix position, the same final token, EOS.
data_name = "auto-last-toy"
min_length = 1
max_length = 13
target_size = 1000

# Create the output directory here too, so this cell does not depend on the
# directory having been made by another cell.
output_dir = os.path.join(data_path, data_name)
os.makedirs(output_dir, exist_ok=True)

encoder_train = []
decoder_train = []
for length in range(min_length, max_length + 1):
    # Every possible (length - 1)-token prefix, resampled to target_size rows.
    candidates = get_random(length - 1)
    candidates = turn_to_same_number(candidates, target_size)

    # One batch of rows per vocabulary token used as the final token.
    for token in range(EOS + 1, EOS + 1 + vocab_size):
        encoder_train += [[PAD] * (max_length - length) + [SOS] + candidate + [token, EOS]
                          for candidate in candidates]
        decoder_train += [[SOS] + [DUMMY] * (length - 1) + [token, EOS] + [PAD] * (max_length - length)
                          for _ in range(len(candidates))]

# Encoder and decoder rows are paired by index and must all share one width
# (max_length + 2) so that they save as rectangular 2-D arrays.
assert len(encoder_train) == len(decoder_train)
assert all(len(row) == max_length + 2 for row in encoder_train)
assert all(len(row) == max_length + 2 for row in decoder_train)

print(len(encoder_train))
print(len(decoder_train))

# The validation split is a plain copy of the training split.
encoder_valid = np.copy(encoder_train)
decoder_valid = np.copy(decoder_train)
for split_name, split_data in (("encoder_train", encoder_train),
                               ("decoder_train", decoder_train),
                               ("encoder_valid", encoder_valid),
                               ("decoder_valid", decoder_valid)):
    np.save(os.path.join(output_dir, split_name + ".npy"), split_data)


26000
26000


In [30]:
# Token-positioning
# encoder = [3, 3]
# decoder = [1, x, x, 3, 2, 0, 0]
# Each encoder row is [token, length]. The matching decoder row is SOS, a
# prefix of (length - 1) vocab tokens, that token, EOS, then PAD up to a
# fixed width.
data_name = 'token-posi-toy'
# exist_ok=True makes the cell re-runnable; a missing parent is created too.
output_dir = os.path.join(data_path, data_name)
os.makedirs(output_dir, exist_ok=True)
min_length = 1
max_length = 13
target_size = 1000

encoder_train = []
decoder_train = []
for length in range(min_length, max_length + 1):
    # Every possible (length - 1)-token prefix, resampled to target_size rows.
    candidates = get_random(length - 1)
    candidates = turn_to_same_number(candidates, target_size)

    # One batch of rows per vocabulary token placed at position `length`.
    for token in range(EOS + 1, EOS + 1 + vocab_size):
        encoder_train += [[token, length] for _ in range(len(candidates))]
        decoder_train += [[SOS] + candidate + [token, EOS] + [PAD] * (max_length - length)
                          for candidate in candidates]

# Encoder and decoder rows are paired by index; each side must be rectangular
# so that it saves as a 2-D array.
assert len(encoder_train) == len(decoder_train)
assert all(len(row) == 2 for row in encoder_train)
assert all(len(row) == max_length + 2 for row in decoder_train)

print(len(encoder_train))
print(len(decoder_train))

# The validation split is a plain copy of the training split.
encoder_valid = np.copy(encoder_train)
decoder_valid = np.copy(decoder_train)
for split_name, split_data in (("encoder_train", encoder_train),
                               ("decoder_train", decoder_train),
                               ("encoder_valid", encoder_valid),
                               ("decoder_valid", decoder_valid)):
    np.save(os.path.join(output_dir, split_name + ".npy"), split_data)


26000
26000


In [31]:
# Generate fake data.
# A hand-written 8-row dataset in the autoenc-last layout: encoder rows are
# left-padded with 0 and hold 1, a run of 3s (or 4s), then 2; decoder rows
# hold 1, a run of 5s, the final 3 (or 4), then 2, right-padded with 0.

data_path = "../data/"
data_name = "fake-data"

# Make sure the target directory exists; np.save does not create it.
output_dir = os.path.join(data_path, data_name)
os.makedirs(output_dir, exist_ok=True)

encoder_train = np.array([[1, 3, 3, 3, 3, 2],
                          [0, 1, 3, 3, 3, 2],
                          [0, 0, 1, 3, 3, 2],
                          [0, 0, 0, 1, 3, 2],
                          [1, 4, 4, 4, 4, 2],
                          [0, 1, 4, 4, 4, 2],
                          [0, 0, 1, 4, 4, 2],
                          [0, 0, 0, 1, 4, 2]])
decoder_train = np.array([[1, 5, 5, 5, 3, 2],
                          [1, 5, 5, 3, 2, 0],
                          [1, 5, 3, 2, 0, 0],
                          [1, 3, 2, 0, 0, 0],
                          [1, 5, 5, 5, 4, 2],
                          [1, 5, 5, 4, 2, 0],
                          [1, 5, 4, 2, 0, 0],
                          [1, 4, 2, 0, 0, 0]])
# The validation split is a plain copy of the training split.
encoder_valid = np.copy(encoder_train)
decoder_valid = np.copy(decoder_train)
for split_name, split_data in (("encoder_train", encoder_train),
                               ("decoder_train", decoder_train),
                               ("encoder_valid", encoder_valid),
                               ("decoder_valid", decoder_valid)):
    np.save(os.path.join(output_dir, split_name + ".npy"), split_data)

In [25]:
# Generate fake data.
# Writes four all-ones arrays whose only purpose is to have the right file
# names and shapes (100 training rows, 50 validation rows, 8 columns each).

data_path = "../data/"
data_name = "fake-data-2"
data_type_list = ['encoder_train', 'decoder_train', 'encoder_valid', 'decoder_valid']
data_shape_list = [[100, 8], [100, 8], [50, 8], [50, 8]]

# exist_ok=True lets the cell be re-run without a FileExistsError and also
# creates a missing "../data/" parent.
os.makedirs(os.path.join(data_path, data_name), exist_ok=True)

for data_type, data_shape in zip(data_type_list, data_shape_list):
    print("data_type = %s, data_shape =" % data_type, data_shape)
    npy_file = np.ones(data_shape)  # float64 ones, unlike the integer token arrays saved elsewhere
    np.save(os.path.join(data_path, data_name, "%s.npy" % data_type), npy_file)

data_type = encoder_train, data_shape = [100, 8]
data_type = decoder_train, data_shape = [100, 8]
data_type = encoder_valid, data_shape = [50, 8]
data_type = decoder_valid, data_shape = [50, 8]


In [None]:
# Test src/main.py with task = "autoenc-last".
# From src/, run (data_name must match the directory created above, "fake-data"):
#   $ python3 main.py --task=autoenc-last --units=10 --max_epochs=1 --mode=train --data_name=fake-data

In [21]:
# Test evaluate_autoencoder_last_step.

import evaluator
import importlib
importlib.reload(evaluator)  # pick up edits to evaluator.py without restarting the kernel

# Token ids: 0 = PAD, 2 = EOS, 3 and 4 = vocab.
# Each case is (label showing the expected accuracy, real batch, predicted batch).
last_step_cases = [
    # Prediction identical to the reference.
    ("1.0 ? ->", [[3, 3, 3, 2, 0, 0]], [[3, 3, 3, 2, 0, 0]]),
    # The last words are different.
    ("0.0 ? ->", [[3, 3, 3, 2, 0, 0]], [[3, 3, 4, 2, 0, 0]]),
    # Other words are different.
    ("1.0 ? ->", [[3, 3, 3, 2, 0, 0]], [[4, 4, 3, 2, 0, 0]]),
    # Every word is the same, but the last word appears at the wrong step.
    ("0.0 ? ->", [[3, 3, 3, 2, 0, 0]], [[3, 3, 3, 3, 2, 2]]),
    # Two-row batch: one wrong, one right.
    ("0.5 ? ->",
     [[3, 3, 3, 2, 0, 0], [3, 3, 3, 2, 0, 0]],
     [[3, 3, 3, 3, 2, 2], [4, 4, 3, 2, 0, 0]]),
]

for label, real_rows, pred_rows in last_step_cases:
    real = np.array(real_rows)
    pred = np.array(pred_rows)
    acc = evaluator.evaluate_autoencoder_last_step(real, pred)
    print(label, acc)

1.0 ? -> 1.0
0.0 ? -> 0.0
1.0 ? -> 1.0
0.0 ? -> 0.0
0.5 ? -> 0.5
