In [1]:
import numpy as np

import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Reshape
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

import pickle as pk

# Training configuration.
# BATCH_SIZE is only used to derive `steps_per_epoch` in the model.fit() call;
# the data generators in this notebook yield one sequence per step.
BATCH_SIZE = 32
# Upper bound on training epochs passed to model.fit(); the EarlyStopping
# callback configured in the training cell may stop earlier.
EPOCHS = 50
# Fraction of the samples (taken from the front of the data) held out for validation.
VALIDATION_SPLIT = 0.02
    
# Path of the training file read by the loading cell.
INPUT_DATA = "../data/problem_1_train.dat"
# File name the trained model is saved under at the end of the notebook.
MODEL_NAME = "model_problem_1.keras"

2023-09-29 11:20:41.339802: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Parse the training file into parallel lists of labels and integer-encoded
# sequences. Each data line is whitespace separated: field 0 is the float
# label, field 1 is ignored here (presumably a length column -- TODO confirm),
# and the remaining fields are the symbols of the sequence.
labels = []
sequences = []
alphabet = {}  # symbol -> integer code, assigned in order of first appearance

with open(INPUT_DATA, "rt") as data_file:
    next(data_file, None)  # discard the header line
    for line_no, raw_line in enumerate(data_file):
        fields = raw_line.split()
        if not fields:
            # blank line: nothing to parse (and no progress message either)
            continue
        labels.append(float(fields[0]))
        # setdefault hands out the next free code the first time a symbol is seen
        codes = [alphabet.setdefault(symbol, len(alphabet)) for symbol in fields[2:]]
        sequences.append(np.array(codes, dtype=np.int32))
        if line_no % 10000 == 0:
            print("Line: ", line_no)
len(labels), len(sequences), alphabet

Line:  0
Line:  10000
Line:  20000
Line:  30000
Line:  40000
Line:  50000
Line:  60000
Line:  70000
Line:  80000
Line:  90000


(100000, 100000, {'c': 0, 'a': 1, 'b': 2})

In [3]:
# Peek at the first five integer-encoded sequences (note they differ in length).
sequences[0], sequences[1], sequences[2], sequences[3], sequences[4]

(array([0, 0, 0, 0, 1, 1], dtype=int32),
 array([0, 2, 0, 0, 0], dtype=int32),
 array([1, 1], dtype=int32),
 array([2], dtype=int32),
 array([0, 0, 0], dtype=int32))

In [4]:
# Turn the label list into a column vector of shape (n_samples, 1).
labels = np.array(labels).reshape(-1, 1)

# Map every integer symbol code to its one-hot column index.
# NOTE(review): the keys come from `alphabet.values()`, i.e. they are the integer
# codes, not the original symbols, so this mapping is code -> position (the
# identity for codes 0..n-1). The pickled file therefore does not contain the
# symbol -> code table held in `alphabet`; confirm that downstream consumers
# do not need it.
alphabet_map = {symbol: index for index, symbol in enumerate(alphabet.values())}

# Persist the mapping; the context manager guarantees the file is flushed and
# closed (the handle was previously left open).
with open("alphabet_mapping.pk", "wb") as mapping_file:
    pk.dump(alphabet_map, mapping_file)

len(sequences), labels.shape, alphabet_map

(100000, (100000, 1), {0: 0, 1: 1, 2: 2})

In [5]:
# One-hot encode every sequence into its own array of shape
# (1, sequence_length, alphabet_size). Sequences have different lengths, so
# they are kept in a Python list instead of one stacked array.
n_symbols = len(alphabet)
sequences_one_hot = []
for seq in sequences:
    # column to switch on for every time step of this sequence
    hot_columns = np.array([alphabet_map[sym] for sym in seq], dtype=np.intp)
    new_sequence = np.zeros((1, len(seq), n_symbols), dtype=np.int32)
    new_sequence[0, np.arange(len(seq)), hot_columns] = 1
    sequences_one_hot.append(new_sequence)

print("Before: ", len(sequences))
print("Shape: ", sequences[:2])
print("After: ", len(sequences_one_hot))
print("Shape: ", sequences_one_hot[:2])

Before:  100000
Shape:  [array([0, 0, 0, 0, 1, 1], dtype=int32), array([0, 2, 0, 0, 0], dtype=int32)]
After:  100000
Shape:  [array([[[1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0],
        [0, 1, 0],
        [0, 1, 0]]], dtype=int32), array([[[1, 0, 0],
        [0, 0, 1],
        [1, 0, 0],
        [1, 0, 0],
        [1, 0, 0]]], dtype=int32)]


In [6]:
# Sanity check: shape of the first one-hot sequence, number of sequences, label array shape.
sequences_one_hot[0].shape, len(sequences_one_hot), labels.shape

((1, 6, 3), 100000, (100000, 1))

In [7]:
# Hold out the leading VALIDATION_SPLIT fraction of the samples for validation
# and keep the remainder for training (no shuffling is applied).
# NOTE(review): this cell rebinds `sequences_one_hot` and `labels`, so running
# it a second time carves another slice off the training data.
validation_split_idx = int(VALIDATION_SPLIT * len(sequences_one_hot))

validation_set = sequences_one_hot[:validation_split_idx]
validation_labels = labels[:validation_split_idx]

sequences_one_hot = sequences_one_hot[validation_split_idx:]
labels = labels[validation_split_idx:]

In [8]:
def get_model(input_shape):
    """Build and compile the sequence regression network.

    The network feeds the input through a bidirectional LSTM with 4 units per
    direction and a single sigmoid output unit. It is compiled with the Adam
    optimizer and mean absolute error as both loss and reported metric.

    Args:
        input_shape: shape of one sample (without the batch axis), passed
            straight to the Keras ``Input`` layer.

    Returns:
        The compiled Keras ``Model``.
    """
    inputs = Input(shape=input_shape)

    encoded = Bidirectional(LSTM(4))(inputs)
    # one sigmoid unit -> a single prediction per sequence
    prediction = Dense(1, activation="sigmoid")(encoded)

    network = Model(inputs, prediction)
    network.compile(optimizer="adam", loss="mae", metrics=["mae"])
    return network

In [28]:
# The sequences have different lengths and cannot be stacked into one array,
# so the model is fed from generators that yield one sequence at a time.

def data_generator():
    """Yield (one-hot sequence, label) training pairs forever.

    Reads the notebook-level ``sequences_one_hot`` and ``labels`` on every
    step and wraps around to the first sample after the last one.
    """
    cursor = 0
    while True:
        # wrap around once the end of the training data is reached
        cursor = 0 if cursor == len(sequences_one_hot) else cursor
        yield sequences_one_hot[cursor], labels[cursor]
        cursor += 1
        
def validation_data_generator():
    """Yield (one-hot sequence, label) validation pairs forever.

    Reads the notebook-level ``validation_set`` and ``validation_labels`` on
    every step and restarts from the first sample after the last one.
    """
    cursor = 0
    while True:
        # restart from the beginning after a full pass
        cursor = 0 if cursor == len(validation_set) else cursor
        yield validation_set[cursor], validation_labels[cursor]
        cursor += 1
    
class validation_data_generator_class:
    """Finite, restartable iterator over the validation samples.

    Each call to ``iter()`` rewinds to the first sample; iteration then yields
    ``(validation_set[i], validation_labels[i])`` pairs in order and stops
    after the last one. The data is read from the notebook-level
    ``validation_set`` and ``validation_labels`` variables.
    """

    def __init__(self):
        # position of the next sample to hand out
        self.idx = 0

    def __iter__(self):
        # rewind so that every pass starts from the first sample
        self.idx = 0
        return self

    def __next__(self):
        """Return the next (sequence, label) pair.

        Raises:
            StopIteration: once every validation sample has been returned.
        """
        global validation_set
        global validation_labels

        if self.idx >= len(validation_set):
            print("Done once")
            raise StopIteration

        # Advance *before* returning: the increment used to sit after the
        # `return`, was never executed, and the iterator repeated sample 0
        # forever.
        item = validation_set[self.idx], validation_labels[self.idx]
        self.idx += 1
        return item

In [29]:
# Variable-length sequences: the time axis of the input is left as None.
model = get_model((None, len(alphabet)))

# Stop as soon as the validation loss fails to improve for one epoch.
es = EarlyStopping(monitor="val_loss", patience=1, mode="auto")

data_gen = data_generator()
val_gen = validation_data_generator()

# NOTE(review): the generators yield one sequence per step, so with
# steps_per_epoch = n_train // BATCH_SIZE every "epoch" only visits about
# 1/BATCH_SIZE of the training data -- confirm this is intended.
model.fit(
    data_gen,
    epochs=EPOCHS,
    steps_per_epoch=len(sequences_one_hot) // BATCH_SIZE,
    validation_data=val_gen,
    validation_steps=len(validation_set),  # one full pass over the validation set
    callbacks=[es],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


<keras.src.callbacks.History at 0x7f723e886ec0>

In [None]:
# Persist the trained model under the file name configured in MODEL_NAME.
model.save(MODEL_NAME)