In [1]:
# Inventory of syllables from which every generated sequence is drawn
# (read as a module-level global by the generator functions).
syllables = ["be", "bi", "de", "di", "ge", "gi", "po", "pu", "to", "tu", "ko", "ku"]
# Display how many syllables are available.
len(syllables)

12

In [56]:
def make_list_blue(lst):
    """Wrap each string of ``lst`` in a quoted, blue-coloured ``<p>`` tag.

    Args:
        lst: Iterable of strings (syllables).

    Returns:
        A new list with one formatted string per input item, in the same
        order. Each result is the HTML snippet surrounded by literal double
        quotes.
    """
    opening = "\"<p style='color:blue'>"
    closing = "</p>\""
    return [opening + syllable + closing for syllable in lst]

In [57]:
def make_list_green(lst):
    """Wrap each string of ``lst`` in a quoted, green-coloured ``<p>`` tag.

    Args:
        lst: Iterable of strings (syllables).

    Returns:
        A new list with one formatted string per input item, in the same
        order. Each result is the HTML snippet surrounded by literal double
        quotes.
    """
    opening = "\"<p style='color:green'>"
    closing = "</p>\""
    return [opening + syllable + closing for syllable in lst]

In [58]:
import random
from random import shuffle

def gen_seq(half_length, used_dict):
    """Generate one well-formed, colour-coded mirror sequence.

    Draws ``half_length`` distinct syllables from the module-level
    ``syllables`` list and returns them formatted green, followed by the same
    syllables in reverse order formatted blue.

    Args:
        half_length: Number of syllables in each half of the sequence.
        used_dict: Container of first halves that must not be produced again.
            A key may be either a tuple of raw syllables or a tuple of the
            green-formatted first half (the form ``gen_dataset`` records);
            both representations are honoured.

    Returns:
        A list of ``2 * half_length`` formatted strings: the green first half
        followed by its blue mirror image.

    Raises:
        ValueError: If ``half_length`` is larger than the number of distinct
            syllables, in which case no repeat-free sequence exists.

    Note:
        The function keeps drawing until it finds an unused first half, so it
        does not terminate if every possible first half is already present in
        ``used_dict``.
    """
    if half_length > len(set(syllables)):
        raise ValueError(
            "half_length (%r) exceeds the number of distinct syllables (%d)"
            % (half_length, len(set(syllables)))
        )

    # A non-positive length yields an empty sequence, as range() did before.
    count = max(half_length, 0)

    while True:
        # Sampling without replacement gives a repeat-free draw directly
        # instead of drawing with replacement and rejecting repeats.
        first_half = random.sample(syllables, count)
        if len(set(first_half)) != count:
            # Only reachable if `syllables` itself contains duplicate entries.
            continue

        green_half = make_list_green(first_half)

        # Callers record the *formatted* first half, so comparing only the raw
        # syllables would never detect an already-used sequence.
        if tuple(first_half) in used_dict or tuple(green_half) in used_dict:
            continue

        return green_half + make_list_blue(first_half[::-1])
    

In [59]:
def gen_dataset(half_length, num):
    """Build ``num`` pairwise-distinct well-formed sequences.

    Each sequence comes from ``gen_seq`` and is identified by its
    green-formatted first half; a sequence whose first half has already been
    produced is discarded and redrawn, so the returned dataset contains no
    duplicates.

    Args:
        half_length: Number of syllables in each half of every sequence.
        num: Number of sequences to generate.

    Returns:
        A list of ``num`` sequences (each a list of formatted strings).

    Raises:
        ValueError: If ``num`` is larger than the number of distinct
            repeat-free sequences of the requested length, which could
            otherwise never be satisfied.
    """
    # Number of ordered, repeat-free selections of `half_length` syllables.
    available = 1
    distinct_syllables = len(set(syllables))
    for offset in range(half_length):
        available *= max(distinct_syllables - offset, 0)
    if num > available:
        raise ValueError(
            "cannot generate %r distinct sequences of half_length %r; only %d exist"
            % (num, half_length, available)
        )

    used_dict = {}
    dataset = []

    while len(dataset) < num:
        new_seq = gen_seq(half_length, used_dict)
        key = tuple(new_seq[:half_length])

        # Enforce uniqueness here as well: the key stored below is the
        # formatted first half, which the generator may not recognise.
        if key in used_dict:
            continue

        used_dict[key] = 1
        dataset.append(new_seq)

    return dataset
    

In [60]:
def gen_bad_dependencies(half_length, index_to_change, used_dict):
    """Generate a sequence whose second half breaks one mirror dependency.

    A well-formed sequence is drawn with ``gen_seq`` and the blue element at
    position ``-index_to_change`` is replaced by a blue-formatted syllable
    that does not occur anywhere in the green first half (and therefore also
    differs from the element it replaces).

    Args:
        half_length: Number of syllables in each half of the sequence.
        index_to_change: 1-based position, counted from the end of the
            sequence, of the blue element to corrupt. Must lie in
            ``1..half_length`` so that only the blue half is modified.
        used_dict: Container of already-produced sequences (as tuples of
            formatted strings) that must not be returned again.

    Returns:
        A list of ``2 * half_length`` formatted strings with exactly one
        mismatching blue element.

    Raises:
        ValueError: If ``index_to_change`` is outside ``1..half_length``, or
            if every syllable already occurs in the first half so that no
            replacement exists.

    Note:
        Does not terminate if every possible corrupted sequence is already in
        ``used_dict``.
    """
    if not 1 <= index_to_change <= half_length:
        raise ValueError(
            "index_to_change (%r) must be between 1 and half_length (%r)"
            % (index_to_change, half_length)
        )

    while True:
        base_seq = gen_seq(half_length, {})
        first_half = base_seq[:half_length]

        # The first half is green-formatted, so membership must be tested in
        # that format; comparing a blue-formatted string against it would
        # never match and the exclusion would silently do nothing.
        candidates = [
            syll for syll in syllables
            if make_list_green([syll])[0] not in first_half
        ]
        if not candidates:
            raise ValueError(
                "no syllable is left outside the first half for half_length %r"
                % (half_length,)
            )

        new_seq = base_seq[:]
        new_seq[-1 * index_to_change] = make_list_blue([random.choice(candidates)])[0]

        if tuple(new_seq) not in used_dict:
            return new_seq
        

In [61]:
def gen_bad_dependencies_dataset(half_length, index_to_change, num):
    """Collect ``num`` distinct sequences with one broken dependency each.

    Args:
        half_length: Number of syllables in each half of every sequence.
        index_to_change: Position (counted from the end) of the element that
            ``gen_bad_dependencies`` replaces.
        num: Number of sequences to generate.

    Returns:
        A list of ``num`` sequences; every sequence already produced is
        recorded so the generator does not return it a second time.
    """
    seen = {}
    results = []

    for _ in range(num):
        corrupted = gen_bad_dependencies(half_length, index_to_change, seen)
        seen[tuple(corrupted)] = 1
        results.append(corrupted)

    return results

In [62]:
def gen_unmatched_length(first_half_length, second_half_length, used_dict):
    """Generate a sequence whose green and blue halves differ in length.

    A well-formed sequence of the longer half length is drawn with
    ``gen_seq`` and then trimmed: trailing blue elements are dropped when the
    first half should be longer, otherwise leading green elements are dropped.

    Args:
        first_half_length: Desired number of green elements.
        second_half_length: Desired number of blue elements.
        used_dict: Container of already-produced sequences (as tuples of
            formatted strings) that must not be returned again.

    Returns:
        The trimmed list of formatted strings. When both lengths are equal
        nothing is trimmed and a full well-formed sequence is returned.
    """
    longer = max(first_half_length, second_half_length)
    surplus = first_half_length - second_half_length

    while True:
        full_seq = gen_seq(longer, {})

        if surplus > 0:
            # Too many blue items: cut them off the end.
            trimmed = full_seq[:-surplus]
        else:
            # Too many green items (or none to remove): cut from the front.
            trimmed = full_seq[-surplus:]

        if tuple(trimmed) in used_dict:
            continue
        return trimmed
        

In [63]:
def gen_unmatched_length_dataset(first_half_length, second_half_length, num):
    """Collect ``num`` distinct sequences with unequal half lengths.

    Args:
        first_half_length: Number of green elements in every sequence.
        second_half_length: Number of blue elements in every sequence.
        num: Number of sequences to generate.

    Returns:
        A list of ``num`` sequences; every sequence already produced is
        recorded so the generator does not return it a second time.
    """
    seen = {}
    results = []

    for _ in range(num):
        mismatched = gen_unmatched_length(first_half_length, second_half_length, seen)
        seen[tuple(mismatched)] = 1
        results.append(mismatched)

    return results

In [64]:
# Training
# Generate 12 well-formed sequences for each half length (1 and 2). The first
# 10 of each become training items; the last 2 are held out for condition 2.
ones = gen_dataset(1, 12)
twos = gen_dataset(2, 12)

train_ones = ones[:10]
train_twos = twos[:10]

# 1: Test on seen, seen length, same number of A's and B's, correct dependencies
# Shuffle a copy (so the training lists keep their order) and keep 2 items.
test_seen_ones = train_ones[:]
shuffle(test_seen_ones)
test_seen_ones = test_seen_ones[:2]

test_seen_twos = train_twos[:]
shuffle(test_seen_twos)
test_seen_twos = test_seen_twos[:2]

# 2: Test on unseen, seen length, same number of A's and B's, correct dependencies
# NOTE(review): these are only truly "unseen" if the 12 generated sequences
# are pairwise distinct -- confirm gen_dataset guarantees that.
test_unseen_ones = ones[10:]
test_unseen_twos = twos[10:]

# 3: Test on unseen, seen length, same number of A's and B's, incorrect dependencies
# Arguments are (half_length, index_to_change, num).
test_bad_deps_one = gen_bad_dependencies_dataset(1,1,2)
test_bad_deps_two = gen_bad_dependencies_dataset(2,1,2)

# 4: Test on unseen, seen length, different number of A's and B's, incorrect dependencies
# Arguments are (first_half_length, second_half_length, num); both orderings
# of the lengths 1 and 2 are covered.
test_unmatched_length_seen = gen_unmatched_length_dataset(2,1,2) + gen_unmatched_length_dataset(1,2,2)

# 5: Test on unseen, unseen length, same number of A's and B's, correct dependencies
# Half length 3 never occurs in the training data.
test_unseen_length_good = gen_dataset(3,4)

# 6: Test on unseen, unseen length, same number of A's and B's, incorrect dependencies
# Corrupt the last element (index 1) and the second-to-last element (index 2).
test_bad_deps_three = gen_bad_dependencies_dataset(3,1,2) + gen_bad_dependencies_dataset(3,2,2)

# 7: Test on unseen, unseen length, different number of A's and B's, incorrect dependencies
# Both orderings of the lengths 3 and 2 are covered.
test_unmatched_length_three = gen_unmatched_length_dataset(3,2,2) + gen_unmatched_length_dataset(2,3,2)





In [65]:
# Merge the per-condition lists into one training set and one test set.
train = [*train_ones, *train_twos]
test = [
    *test_seen_ones,
    *test_seen_twos,
    *test_unseen_ones,
    *test_unseen_twos,
    *test_bad_deps_one,
    *test_bad_deps_two,
    *test_unmatched_length_seen,
    *test_unseen_length_good,
    *test_bad_deps_three,
    *test_unmatched_length_three,
]

# Randomise presentation order in place.
for pool in (train, test):
    shuffle(pool)

In [66]:
# Display the shuffled training set (a list of formatted-string sequences).
train

[['"<p style=\'color:green\'>de</p>"', '"<p style=\'color:blue\'>de</p>"'],
 ['"<p style=\'color:green\'>po</p>"',
  '"<p style=\'color:green\'>ko</p>"',
  '"<p style=\'color:blue\'>ko</p>"',
  '"<p style=\'color:blue\'>po</p>"'],
 ['"<p style=\'color:green\'>tu</p>"', '"<p style=\'color:blue\'>tu</p>"'],
 ['"<p style=\'color:green\'>to</p>"', '"<p style=\'color:blue\'>to</p>"'],
 ['"<p style=\'color:green\'>bi</p>"',
  '"<p style=\'color:green\'>ko</p>"',
  '"<p style=\'color:blue\'>ko</p>"',
  '"<p style=\'color:blue\'>bi</p>"'],
 ['"<p style=\'color:green\'>ku</p>"',
  '"<p style=\'color:green\'>ko</p>"',
  '"<p style=\'color:blue\'>ko</p>"',
  '"<p style=\'color:blue\'>ku</p>"'],
 ['"<p style=\'color:green\'>po</p>"',
  '"<p style=\'color:green\'>to</p>"',
  '"<p style=\'color:blue\'>to</p>"',
  '"<p style=\'color:blue\'>po</p>"'],
 ['"<p style=\'color:green\'>po</p>"', '"<p style=\'color:blue\'>po</p>"'],
 ['"<p style=\'color:green\'>bi</p>"', '"<p style=\'color:blue\'>bi</p>"'],


In [67]:
# Display the shuffled test set (a list of formatted-string sequences).
test

[['"<p style=\'color:green\'>ge</p>"',
  '"<p style=\'color:green\'>de</p>"',
  '"<p style=\'color:blue\'>de</p>"',
  '"<p style=\'color:blue\'>ge</p>"'],
 ['"<p style=\'color:green\'>di</p>"',
  '"<p style=\'color:green\'>ku</p>"',
  '"<p style=\'color:green\'>gi</p>"',
  '"<p style=\'color:blue\'>gi</p>"',
  '"<p style=\'color:blue\'>ku</p>"'],
 ['"<p style=\'color:green\'>bi</p>"',
  '"<p style=\'color:blue\'>bi</p>"',
  '"<p style=\'color:blue\'>be</p>"'],
 ['"<p style=\'color:green\'>gi</p>"',
  '"<p style=\'color:green\'>bi</p>"',
  '"<p style=\'color:green\'>ge</p>"',
  '"<p style=\'color:blue\'>ge</p>"',
  '"<p style=\'color:blue\'>bi</p>"',
  '"<p style=\'color:blue\'>to</p>"'],
 ['"<p style=\'color:green\'>to</p>"', '"<p style=\'color:blue\'>to</p>"'],
 ['"<p style=\'color:green\'>tu</p>"', '"<p style=\'color:blue\'>be</p>"'],
 ['"<p style=\'color:green\'>di</p>"',
  '"<p style=\'color:green\'>po</p>"',
  '"<p style=\'color:green\'>ku</p>"',
  '"<p style=\'color:blue\'>ku</p>

In [73]:
def print_list_of_lists(lol):
    """Render a list of string lists as one bracketed, comma-separated string.

    Despite the name, nothing is printed: the text is returned. Items are
    inserted verbatim (no ``repr`` quoting), so every item must already be a
    string.

    Args:
        lol: Iterable of iterables of strings.

    Returns:
        A string such as ``"[[a, b], [c]]"``.
    """
    rendered_rows = []
    for row in lol:
        rendered_rows.append("[" + ", ".join(row) + "]")

    return "[" + ", ".join(rendered_rows) + "]"

In [77]:
# Print the first three training sequences in the bracketed text format.
print(print_list_of_lists(train[:3]))

[["<p style='color:green'>de</p>", "<p style='color:blue'>de</p>"], ["<p style='color:green'>po</p>", "<p style='color:green'>ko</p>", "<p style='color:blue'>ko</p>", "<p style='color:blue'>po</p>"], ["<p style='color:green'>tu</p>", "<p style='color:blue'>tu</p>"]]


In [78]:
# Print the first three test sequences in the bracketed text format.
print(print_list_of_lists(test[:3]))

[["<p style='color:green'>ge</p>", "<p style='color:green'>de</p>", "<p style='color:blue'>de</p>", "<p style='color:blue'>ge</p>"], ["<p style='color:green'>di</p>", "<p style='color:green'>ku</p>", "<p style='color:green'>gi</p>", "<p style='color:blue'>gi</p>", "<p style='color:blue'>ku</p>"], ["<p style='color:green'>bi</p>", "<p style='color:blue'>bi</p>", "<p style='color:blue'>be</p>"]]
