In [1]:
import numpy as np

import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Reshape
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

import pickle as pk

# Locations of the training data and of the model file written at the end.
INPUT_DATA = "../data/problem_1_train.dat"
MODEL_NAME = "model_problem_1.keras"

# Training hyperparameters.
EPOCHS = 20
BATCH_SIZE = 32
# NOTE(review): VALIDATION_SPLIT is not consumed by any active code in this notebook.
VALIDATION_SPLIT = 0.02

2023-08-08 14:15:08.546082: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Parse the training file: after a header line, each non-blank row holds a
# label in column 0 and the sequence symbols from column 2 onwards
# (column 1 is not read here).
labels = []
sequences = []
alphabet = set()

PROGRESS_EVERY = 10_000

with open(INPUT_DATA, "rt") as inf:
    next(inf, None)  # discard the header line
    for i, raw_line in enumerate(inf):
        if not raw_line.strip():
            continue  # skip blank rows (they still advance the counter i)
        fields = raw_line.split()
        labels.append(int(fields[0]))
        symbols = [int(token) for token in fields[2:]]
        sequences.append(np.array(symbols, dtype=np.int32))
        alphabet = alphabet | set(symbols)
        if i % PROGRESS_EVERY == 0:
            print("Line: ", i)

len(labels), len(sequences), alphabet

Line:  0
Line:  10000
Line:  20000
Line:  30000


(35621, 35621, {0, 1})

In [3]:
# Peek at the first five parsed sequences.
tuple(sequences[k] for k in range(5))

(array([1, 0, 1, 1, 0, 1, 1, 1], dtype=int32),
 array([0, 0, 1, 0, 1, 0, 0], dtype=int32),
 array([1, 1, 0, 1, 0, 0], dtype=int32),
 array([1, 1, 1, 1, 0, 1], dtype=int32),
 array([0, 0, 1, 1, 1], dtype=int32))

In [4]:
# Labels become a column vector so that labels[i] is a length-1 array.
labels = np.array(labels).reshape(-1, 1)

# Assign indices in sorted symbol order: iterating a set directly yields an
# arbitrary order, which would make the persisted mapping depend on set
# internals rather than on the symbols themselves.
alphabet_map = {symbol: index for index, symbol in enumerate(sorted(alphabet))}

# Persist the mapping for later reuse; the context manager guarantees the
# file is flushed and closed (the bare open() call previously leaked the handle).
with open("alphabet_mapping.pk", "wb") as mapping_file:
    pk.dump(alphabet_map, mapping_file)

len(sequences), labels.shape, alphabet_map

(35621, (35621, 1), {0: 0, 1: 1})

In [5]:
# One-hot encode every sequence separately (lengths differ, so they cannot be
# stacked). Each result has shape (1, len(seq), len(alphabet)); the leading
# axis of size 1 is kept so a single sequence can be fed as its own batch.
num_symbols = len(alphabet)
sequences_one_hot = []
for seq in sequences:
    encoded = np.zeros((1, len(seq), num_symbols), dtype=np.int32)
    # Column index of the "hot" entry for every position in the sequence.
    hot_columns = np.fromiter(
        (alphabet_map[symbol] for symbol in seq), dtype=np.intp, count=len(seq)
    )
    encoded[0, np.arange(len(seq)), hot_columns] = 1
    sequences_one_hot.append(encoded)

print("Before: ", len(sequences))
print("Shape: ", sequences[:2])
print("After: ", len(sequences_one_hot))
print("Shape: ", sequences_one_hot[:2])

Before:  35621
Shape:  [array([1, 0, 1, 1, 0, 1, 1, 1], dtype=int32), array([0, 0, 1, 0, 1, 0, 0], dtype=int32)]
After:  35621
Shape:  [array([[[0, 1],
        [1, 0],
        [0, 1],
        [0, 1],
        [1, 0],
        [0, 1],
        [0, 1],
        [0, 1]]], dtype=int32), array([[[1, 0],
        [1, 0],
        [0, 1],
        [1, 0],
        [0, 1],
        [1, 0],
        [1, 0]]], dtype=int32)]


In [6]:
# Sanity check: shape of the first encoded sequence, number of encoded
# sequences, and shape of the label array.
(
    sequences_one_hot[0].shape,
    len(sequences_one_hot),
    labels.shape,
)

((1, 8, 2), 35621, (35621, 1))

In [7]:
def get_model(input_shape):
    """Build and compile the sequence classifier.

    The network is: Input -> Bidirectional(LSTM(4)) -> Dense(1, sigmoid)
    -> Reshape((1, 1)). It is compiled with the Adam optimizer and binary
    cross-entropy as both the loss and the reported metric.

    Parameters
    ----------
    input_shape : tuple
        Shape handed to ``Input(shape=...)``. The notebook passes
        ``(None, len(alphabet))``.

    Returns
    -------
    Model
        The compiled Keras model.
    """
    output_units = 1  # single sigmoid unit

    inputs = Input(shape=input_shape)

    recurrent_features = Bidirectional(LSTM(4))(inputs)
    probability = Dense(output_units, activation="sigmoid")(recurrent_features)
    outputs = Reshape((1, 1))(probability)

    classifier = Model(inputs, outputs)
    classifier.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy"],
    )
    return classifier

In [8]:
# The encoded sequences have different lengths, so they cannot be stacked into
# one array; a generator hands them to model.fit() one sequence at a time.

def data_generator(inputs=None, targets=None):
    """Yield ``(sequence, label)`` pairs forever, cycling through the data.

    Parameters
    ----------
    inputs : sequence of arrays, optional
        Samples to serve, one per step. Defaults to the notebook-level
        ``sequences_one_hot`` list.
    targets : sequence or array, optional
        Labels aligned with ``inputs`` (``targets[i]`` belongs to
        ``inputs[i]``). Defaults to the notebook-level ``labels`` array.

    Yields
    ------
    tuple
        ``(inputs[i], targets[i])`` for i = 0, 1, ..., len(inputs) - 1, then
        starting over from 0 indefinitely.

    Raises
    ------
    ValueError
        On the first ``next()`` call, if ``inputs`` is empty or if ``inputs``
        and ``targets`` differ in length.
    """
    # Fall back to the notebook globals so existing data_generator() calls
    # keep working; only reads are needed, hence no `global` statement.
    if inputs is None:
        inputs = sequences_one_hot
    if targets is None:
        targets = labels

    if len(inputs) == 0:
        raise ValueError("data_generator needs at least one sample")
    if len(inputs) != len(targets):
        raise ValueError(
            "inputs and targets differ in length: "
            f"{len(inputs)} != {len(targets)}"
        )

    idx = 0
    while True:
        yield inputs[idx], targets[idx]
        idx = (idx + 1) % len(inputs)  # wrap around after the last sample

In [9]:
# Variable-length input: the time axis is left as None.
model = get_model((None, len(alphabet)))

# fit() receives no validation data (validation_split is not supported for
# generator input), so "val_loss" is never reported and a callback monitoring
# it can never trigger. Monitor the training loss, which is always available.
es = EarlyStopping(
    monitor="loss",
    patience=1,
    mode="auto",
)

data_gen = data_generator()

# data_generator() yields a single sequence per step, so one full pass over
# the data takes len(sequences_one_hot) steps. Dividing by BATCH_SIZE made
# each "epoch" cover only a fraction of the training set.
model.fit(
    data_gen,
    steps_per_epoch=len(sequences_one_hot),
    epochs=EPOCHS,
    callbacks=[es],
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7f12706882b0>

In [10]:
# Write the trained model to the path configured in MODEL_NAME.
model.save(filepath=MODEL_NAME)