In [79]:
import numpy as np
import tensorflow as tf
import keras

# Ask TensorFlow to enable memory growth on every visible GPU.
gpu_devices = tf.config.list_physical_devices("GPU")
for gpu in gpu_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

# Show all physical devices TensorFlow detected.
print(tf.config.list_physical_devices())

# Fix the global NumPy and TensorFlow seeds before any random draw happens.
np.random.seed(42)
tf.random.set_seed(42)

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Data Preparation

In [None]:
# Transition table used for Reber string generation.
# Entry i describes node i; "next_node" lists its two outgoing edges as
# (target_node, emitted_letter). Node 5 repeats the same edge twice so that a
# pick between index 0 and index 1 is valid for every node.
nodes = [
    dict(current_node=0, next_node=[(1, "T"), (2, "P")]),
    dict(current_node=1, next_node=[(1, "S"), (4, "X")]),
    dict(current_node=2, next_node=[(2, "T"), (3, "V")]),
    dict(current_node=3, next_node=[(4, "P"), (5, "V")]),
    dict(current_node=4, next_node=[(2, "X"), (5, "S")]),
    dict(current_node=5, next_node=[(6, "E"), (6, "E")]),
]

In [81]:
import pandas as pd

# Tabular view of the transition graph: one row per node.
df = pd.DataFrame(data=nodes)
df

Unnamed: 0,current_node,next_node
0,0,"[(1, T), (2, P)]"
1,1,"[(1, S), (4, X)]"
2,2,"[(2, T), (3, V)]"
3,3,"[(4, P), (5, V)]"
4,4,"[(2, X), (5, S)]"
5,5,"[(6, E), (6, E)]"


In [82]:
from scipy.stats import norm

def pick_random_length(max_char_count, mu=7.5, sigma=2.0):
    """Draw a random length from a normal curve sampled at integer points.

    Parameters
    ----------
    max_char_count : int
        Exclusive upper bound; candidate lengths are ``0 .. max_char_count - 1``.
    mu : float, default 7.5
        Centre of the bell curve (lengths around 7-8 are the most likely).
    sigma : float, default 2.0
        Spread of the bell curve.

    Returns
    -------
    NumPy integer
        One candidate length, drawn through the global ``np.random`` state.
    """
    candidates = np.arange(max_char_count)

    # Evaluate the normal density at each candidate and rescale to sum to 1.
    weights = norm.pdf(candidates, loc=mu, scale=sigma)
    weights = weights / weights.sum()

    return np.random.choice(candidates, p=weights)

def pick_path(randomized_node=False):
    """Draw a random integer used to pick an edge or a graph node.

    Parameters
    ----------
    randomized_node : bool, default False
        Falsy: return 0 or 1 (index of one of a node's two outgoing edges).
        Truthy: return an integer in ``0 .. 5`` (index of a node in the graph).

    Returns
    -------
    int
        The drawn index, taken from the global ``np.random`` state.
    """
    # Plain truthiness instead of `== False`, so None/0 also mean "edge choice".
    upper_bound = 6 if randomized_node else 2
    return np.random.randint(0, upper_bound)


def generate_reber_string(nodes, is_reber=True, is_generator=False, **kwargs):
    """Generate Reber-grammar strings from a transition graph.

    Parameters
    ----------
    nodes : list of dict
        Transition table. ``nodes[i]["next_node"]`` holds two
        ``(target_node, label)`` edges. ``label`` is either a letter or
        another transition table (a list); in the latter case a complete
        string generated from that table is embedded at that position.
    is_reber : bool or "random", default True
        Truthy: follow the graph from node 0 until node 6 is reached.
        Falsy: build a deliberately malformed string.
        ``"random"``: draw the flag with ``pick_path()`` for each string.
    is_generator : bool, default False
        When truthy, return a lazy iterator of ``(string, flag)`` tuples
        instead of a single result.
    **kwargs
        max_char_count : int, default 16
            Length budget used only by the malformed-string branch.
        dataset_size : int, default 10000
            Number of tuples produced when ``is_generator`` is truthy.

    Returns
    -------
    str
        When ``is_generator`` is falsy and ``is_reber`` is not ``"random"``.
    tuple of (str, bool)
        When ``is_generator`` is falsy and ``is_reber == "random"``.
    generator of (str, flag)
        When ``is_generator`` is truthy. With ``is_reber == "random"`` the
        flag is a fresh bool per sample; otherwise it is ``is_reber`` as given.
    """
    max_char_count = kwargs.get("max_char_count", 16)

    def create_reber_string(nodes, is_reber):
        node = 0
        reber = "B"  # every string opens with the begin marker

        if is_reber:
            # Walk the graph until the terminal node (6) is reached.
            while node < 6:
                node, label = nodes[node]["next_node"][pick_path()]
                if isinstance(label, list):
                    # The edge carries a whole sub-grammar: embed one string of it.
                    reber += generate_reber_string(label)
                else:
                    reber += label
            return reber

        # Malformed-string branch.
        # NOTE(review): `node` is only reassigned under `node == 6`, which is
        # never reached when starting from 0, so every letter comes from node
        # 0's edges and (for graphs whose node-0 labels are letters) the loop
        # runs until `char_count` exceeds `max_char_count`. Behaviour is kept
        # as is; confirm the intended walk before relying on this branch.
        mistake_count = 0
        char_count = 0
        try:
            while (node < 6 or mistake_count == 0) and char_count <= max_char_count:
                label = nodes[node]["next_node"][pick_path()][1]  # letter or sub-grammar
                if isinstance(label, list):
                    reber += generate_reber_string(label)
                else:
                    reber += label
                    if node != 6:
                        random_node = pick_path(True)
                        mistake_count += 1 if random_node != node else 0
                    else:
                        node = pick_path(True)
                    char_count += 1
            # Cut the over-long string down to a randomly drawn length.
            reber = reber[:pick_random_length(max_char_count)]
        except IndexError:
            # Best effort: report the state and return what was built so far.
            print(f"IndexError : {node}, {reber}, {mistake_count}")

        return reber

    if not is_generator:
        if is_reber == "random":
            flag = bool(pick_path())
            return create_reber_string(nodes, flag), flag
        return create_reber_string(nodes, is_reber)

    dataset_size = kwargs.get("dataset_size", 10000)

    def iter_samples():
        # Draw the flag per sample so that "random" yields a real bool label
        # and actually switches between the two generation branches.
        for _ in range(dataset_size):
            flag = bool(pick_path()) if is_reber == "random" else is_reber
            yield create_reber_string(nodes, flag), flag

    return iter_samples()

In [83]:
# Quick look at five strings produced by walking the plain Reber graph.
for sample_no in range(5):
    sample = generate_reber_string(nodes)
    print(sample)

BTXXTTVPXTVPXTTVPSE
BPTVPSE
BPVVE
BPVPXVVE
BTXXTTTTVVE


In [84]:
# Ten samples with a coin-flipped flag; each line shows the generated string
# followed by the flag that was passed to the generator.
for sample_no in range(10):
    is_reber = bool(pick_path())
    generated = generate_reber_string(nodes, is_reber)
    print(generated, is_reber)

BPTVVE True
BTXXVVE True
BTTTPPPT False
BTSXSE True
BPVVE True
BPTTPPPPT False
BTTTTPPP False
BPPTPPP False
BTXSE True
BTSXXTTTVPSE True


In [None]:
# Transition table of the embedded Reber grammar. Nodes 1 and 2 carry the
# plain `nodes` table as their edge label, so a full inner Reber string is
# inserted there. Every node from 1 on repeats its single edge twice so that
# picking index 0 or 1 is always valid.
embedded_reber_nodes = [
    dict(current_node=0, next_node=[(1, "T"), (2, "P")]),
    dict(current_node=1, next_node=[(4, nodes), (4, nodes)]),
    dict(current_node=2, next_node=[(3, nodes), (3, nodes)]),
    dict(current_node=3, next_node=[(5, "P"), (5, "P")]),
    dict(current_node=4, next_node=[(5, "T"), (5, "T")]),
    dict(current_node=5, next_node=[(6, "E"), (6, "E")]),
]

def create_embedded_reber(embedded_nodes, can_corrupt=True, **kwargs):
    """Yield ``(string, label)`` pairs of embedded Reber strings.

    Parameters
    ----------
    embedded_nodes : list of dict
        Transition table handed to ``generate_reber_string``.
    can_corrupt : bool, default True
        Truthy: each string is corrupted with a coin flip (``pick_path()``).
        Falsy: no string is corrupted.
    **kwargs
        dataset_size : int, default 10000
            Number of pairs to yield.

    Yields
    ------
    tuple of (str, bool)
        The string and its label. The label is ``True`` when exactly one
        character of the generated string was replaced, ``False`` when the
        string was left untouched. This holds for both ``can_corrupt``
        settings, so an uncorrupted string is always labelled ``False``.
    """
    dataset_size = kwargs.get("dataset_size", 10000)
    POSSIBLE_CHARS = "BEPSTVX"
    alphabet = list(POSSIBLE_CHARS)

    for _ in range(dataset_size):
        # Decide first whether this instance gets corrupted, then generate it.
        corrupt = bool(pick_path()) if can_corrupt else False
        embedded_reber_str = generate_reber_string(embedded_nodes)

        if corrupt:
            corrupter_char = np.random.choice(alphabet)
            # Re-draw the position until it holds a different character, so
            # the replacement really changes the string.
            idx = np.random.randint(0, len(embedded_reber_str))
            while embedded_reber_str[idx] == corrupter_char:
                idx = np.random.randint(0, len(embedded_reber_str))
            embedded_reber_str = (
                embedded_reber_str[:idx] + corrupter_char + embedded_reber_str[idx + 1:]
            )

        yield embedded_reber_str, corrupt


In [None]:
# Stream (string, label) pairs straight from the Python generator; both
# elements are scalars (a string tensor and a bool tensor).
dataset = tf.data.Dataset.from_generator(
    lambda: create_embedded_reber(
        embedded_reber_nodes, can_corrupt=True, dataset_size=10000
    ),
    output_signature=(
        tf.TensorSpec(shape=(), dtype=tf.string),
        tf.TensorSpec(shape=(), dtype=tf.bool),
    ),
)
# Shuffle with a 10000-element buffer, then repeat so the dataset never ends.
dataset = dataset.shuffle(10000).repeat()

In [None]:
# Pull three raw (string, label) elements to confirm the pipeline yields data.
for x, y in dataset.take(3):
    print(x, y) # it works

tf.Tensor(b'BTBPVPXTTTVVTTE', shape=(), dtype=string) tf.Tensor(True, shape=(), dtype=bool)
tf.Tensor(b'BPBPVVEPE', shape=(), dtype=string) tf.Tensor(False, shape=(), dtype=bool)
tf.Tensor(b'BPETXSEPE', shape=(), dtype=string) tf.Tensor(True, shape=(), dtype=bool)


In [88]:
from keras.layers import TextVectorization

# The seven letters that can occur in a (possibly corrupted) Reber string.
POSSIBLE_CHARS = "BEPSTVX"

# Character-level tokenizer with a fixed vocabulary: no text standardisation,
# one integer id per character.
vectorizer = TextVectorization(
    vocabulary=list(POSSIBLE_CHARS),
    split="character",
    standardize=None,
    output_mode="int",
)

def preprocess(x, y):
    """Vectorize one string into int32 token ids; the label passes through.

    Parameters
    ----------
    x : string tensor handed to the module-level ``vectorizer``.
    y : label, returned unchanged.

    Returns
    -------
    tuple
        ``(token_ids, y)`` with ``token_ids`` cast to ``tf.int32``.
    """
    token_ids = tf.cast(vectorizer(x), tf.int32)
    return token_ids, y

In [89]:
# Shared tf.data tuning constant, passed to `map` and `prefetch` calls.
AUTOTUNE = tf.data.AUTOTUNE

# Vectorize every string of the demo pipeline. NOTE: this rebinds `dataset` to
# its own mapped version, so running the cell a second time would apply
# `preprocess` to elements that are already vectorized.
dataset = dataset.map(preprocess, num_parallel_calls=AUTOTUNE)
dataset

<_ParallelMapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.bool, name=None))>

In [90]:
def create_dataset(size=10000, batch_size=32, can_corrupt=True):
    """Build a repeating, padded and batched dataset of vectorized strings.

    Parameters
    ----------
    size : int, default 10000
        ``dataset_size`` handed to ``create_embedded_reber``.
    batch_size : int, default 32
        Number of sequences per padded batch.
    can_corrupt : bool, default True
        Forwarded to ``create_embedded_reber``.

    Returns
    -------
    tf.data.Dataset
        Batches of ``(token_ids, label)``; sequences are padded with 0 to the
        longest one in the batch, and the dataset repeats indefinitely.
    """
    def sample_stream():
        return create_embedded_reber(
            embedded_reber_nodes, can_corrupt=can_corrupt, dataset_size=size
        )

    raw = tf.data.Dataset.from_generator(
        sample_stream,
        output_signature=(tf.TensorSpec((), tf.string), tf.TensorSpec((), tf.bool)),
    )
    encoded = raw.map(preprocess, num_parallel_calls=AUTOTUNE)
    batched = encoded.padded_batch(
        batch_size,
        padded_shapes=([None], []),
        # Pad token ids with 0; the scalar label needs a padding value too.
        padding_values=(tf.constant(0, tf.int32), tf.constant(False)),
    )
    return batched.repeat().prefetch(AUTOTUNE)

In [91]:
# Samples generated per pass for training / validation, and the batch size
# shared by all three splits.
train_size, val_size, batch_size = 10000, 2000, 32

# Each split is its own generator-backed dataset.
train_set = create_dataset(size=train_size, batch_size=batch_size)
valid_set = create_dataset(size=val_size, batch_size=batch_size)
test_set = create_dataset(size=1000, batch_size=batch_size)

## Model Training

In [92]:
# Batches needed to cover one pass over each split. Ceiling division is used
# so that a size which is an exact multiple of batch_size does not get an
# extra step (the previous `n // b + 1` form over-counted in that case).
steps_per_epoch = (train_size + batch_size - 1) // batch_size
validation_steps = (val_size + batch_size - 1) // batch_size

In [93]:
# Vocabulary size reported by the vectorizer; shown as the cell output.
vocab_size = vectorizer.vocabulary_size()
vocab_size

9

In [94]:
from keras import layers, Model

# Variable-length sequences of int32 token ids.
inputs = layers.Input(shape=(None,), dtype="int32")
# 5-dimensional embeddings; mask_zero=True marks id 0 (the padding value) as masked.
embedded = layers.Embedding(input_dim=vocab_size, output_dim=5, mask_zero=True)(inputs)
# Two stacked GRUs: the first returns the full sequence, the second only its
# final state.
sequence_features = layers.GRU(units=32, return_sequences=True)(embedded)
summary_vector = layers.GRU(units=16)(sequence_features)
# Single sigmoid unit for the binary label.
outputs = layers.Dense(units=1, activation="sigmoid")(summary_vector)

model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Binary classification setup: Nadam optimizer, cross-entropy loss, accuracy metric.
optimizer = keras.optimizers.Nadam()
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"])

# The datasets repeat forever, so the step counts define the length of an epoch.
# Author's recorded result: accuracy: 0.9997 val_accuracy: 1.0000 in epoch 20
history = model.fit(
    train_set,
    validation_data=valid_set,
    epochs=20,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

Epoch 1/20


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 25ms/step - accuracy: 0.5470 - loss: 0.6762 - val_accuracy: 0.6130 - val_loss: 0.6455
Epoch 2/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.6298 - loss: 0.6214 - val_accuracy: 0.6590 - val_loss: 0.5916
Epoch 3/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 25ms/step - accuracy: 0.7452 - loss: 0.5102 - val_accuracy: 0.8310 - val_loss: 0.4025
Epoch 4/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 22ms/step - accuracy: 0.8512 - loss: 0.3520 - val_accuracy: 0.8595 - val_loss: 0.3269
Epoch 5/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.8985 - loss: 0.2650 - val_accuracy: 0.8985 - val_loss: 0.2348
Epoch 6/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 24ms/step - accuracy: 0.9234 - loss: 0.2117 - val_accuracy: 0.9365 - val_loss: 0.1792
Epoch 7/20
[1m313/313[0m [32m━