In [4]:
import numpy as np

import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

import pickle as pk

# Training hyperparameters; these are passed to model.fit() in the training cell.
BATCH_SIZE = 32
EPOCHS = 5  # upper bound on epochs; the EarlyStopping callback may end training sooner
VALIDATION_SPLIT = 0.02  # passed as validation_split to model.fit()
    
# Path of the training data file read by the loading cell, and the file name
# the trained model is saved under.
INPUT_DATA = "../data/problem_1_train.dat"
MODEL_NAME = "model_problem_1.h5"

In [5]:
# Parse the training file into two parallel lists (one label and one symbol
# sequence per non-blank line) and collect the set of distinct symbols seen.
labels = list()
sequences = list()
alphabet = set()

with open(INPUT_DATA, "rt") as inf:
    inf.readline() # kill the header
    for i, line in enumerate(inf):
        fields = line.split()
        if not fields:
            # Whitespace-only line: nothing to parse.
            continue
        labels.append(int(fields[0]))
        # fields[1] is not used -- presumably the sequence length; verify
        # against the data format. The symbols start at column 2.
        # Parse the symbols once and reuse the list for both the sequence
        # and the alphabet, instead of converting every token twice.
        symbols = [int(x) for x in fields[2:]]
        sequences.append(symbols)
        # In-place update avoids allocating a fresh set on every line.
        alphabet.update(symbols)
        if i % 10_000 == 0:
            print("Line: ", i)
len(labels), len(sequences), alphabet

Line:  0
Line:  10000
Line:  20000
Line:  30000


(39907, 39907, {0, 1})

In [6]:
# Convert the parsed lists to arrays and build the symbol -> index mapping
# used for one-hot encoding; the mapping is also pickled to disk.
sequences = np.array(sequences)
labels = np.array(labels).reshape(-1, 1)  # column vector: one label per row
# Sort the symbols so the index assignment is deterministic and does not
# depend on set iteration order (the mapping is persisted, so it must be
# reproducible across runs).
alphabet_map = {symbol: index for index, symbol in enumerate(sorted(alphabet))}

# Context manager guarantees the file is flushed and closed; passing a bare
# open(...) to pk.dump leaves the handle open until garbage collection.
with open("alphabet_mapping.pk", "wb") as mapping_file:
    pk.dump(alphabet_map, mapping_file)

sequences.shape, labels.shape, alphabet_map

((39907, 8), (39907, 1), {0: 0, 1: 1})

In [7]:
# One-hot encode every symbol: the result has one extra trailing axis of size
# len(alphabet), with a 1 at the position given by alphabet_map[symbol].
# Translate symbols to their indices in one vectorised pass (a missing symbol
# still raises KeyError, as with a direct dict lookup) ...
symbol_indices = np.vectorize(alphabet_map.__getitem__, otypes=[np.intp])(sequences)
# ... then pick the matching rows of the identity matrix. This yields the same
# float64 array as filling np.zeros element by element, without the nested
# Python loop over every (sequence, position) pair.
sequences_one_hot = np.eye(len(alphabet))[symbol_indices]
print("Before: ", sequences[:3])
print("Shape: ", sequences.shape)
print("After: ", sequences_one_hot[:3])
print("Shape: ", sequences_one_hot.shape)

Before:  [[1 0 1 1 0 1 1 1]
 [1 0 0 1 1 1 1 1]
 [1 1 1 0 0 1 1 0]]
Shape:  (39907, 8)
After:  [[[0. 1.]
  [1. 0.]
  [0. 1.]
  [0. 1.]
  [1. 0.]
  [0. 1.]
  [0. 1.]
  [0. 1.]]

 [[0. 1.]
  [1. 0.]
  [1. 0.]
  [0. 1.]
  [0. 1.]
  [0. 1.]
  [0. 1.]
  [0. 1.]]

 [[0. 1.]
  [0. 1.]
  [0. 1.]
  [1. 0.]
  [1. 0.]
  [0. 1.]
  [0. 1.]
  [1. 0.]]]
Shape:  (39907, 8, 2)


In [9]:
def get_model(input_shape, lstm_units=4):
    """Build and compile a bidirectional-LSTM binary classifier.

    The network is: Input -> Bidirectional(LSTM(lstm_units)) -> Dense(1, sigmoid).
    It is compiled with binary cross-entropy loss and the Adam optimizer.

    Args:
        input_shape: Shape of one input sample, passed to ``Input(shape=...)``.
            The training cell passes ``(sequence_length, alphabet_size)``.
        lstm_units: Number of units in each direction of the LSTM. Defaults
            to 4, the value that was previously hard-coded.

    Returns:
        The compiled Keras ``Model``.
    """
    output_dim = 1  # single sigmoid unit for the binary label

    input_layer = Input(shape=input_shape)

    x = Bidirectional(LSTM(lstm_units))(input_layer)
    x_out = Dense(output_dim, activation="sigmoid")(x)

    model = Model(input_layer, x_out)

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        # NOTE(review): this metric is the same quantity as the loss; kept
        # unchanged so the reported metric names stay the same.
        metrics=["binary_crossentropy"]
    )
    return model

In [10]:
# Build the model for inputs of shape (sequence_length, alphabet_size) and
# train it with early stopping on the validation loss.
model = get_model(( sequences_one_hot.shape[1], sequences_one_hot.shape[2] ))

es = EarlyStopping(
    monitor="val_loss",
    patience=1,
    mode="auto",
    # With patience=1 training stops right after an epoch that got worse;
    # without this flag the model would keep (and later save) the weights of
    # that worse epoch instead of the best one observed.
    restore_best_weights=True)

# Keep the History object so the per-epoch losses can be inspected afterwards,
# instead of leaving only its repr as the cell output.
history = model.fit(
    sequences_one_hot, 
    labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=VALIDATION_SPLIT,
    callbacks=[es]
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7faa104b47f0>

In [12]:
# Persist the trained model to the file named by MODEL_NAME (set in the config cell).
model.save(MODEL_NAME)

  saving_api.save_model(
