In [1]:
import numpy as np
import pickle as pk

import tensorflow.keras
from tensorflow.keras.models import load_model
    
# Paths to the evaluation data file and the saved Keras model.
INPUT_DATA = "../data/problem_1_test.dat"
MODEL_NAME = "model_problem_1.keras"

# Symbol -> one-hot column index mapping; its length is used as the one-hot
# width further down in this notebook.
# SECURITY: unpickling executes arbitrary code embedded in the file, so only
# load an "alphabet_mapping.pk" that comes from a trusted source.
# The context manager guarantees the file handle is closed after loading
# (the bare open(...) call previously leaked it).
with open("alphabet_mapping.pk", "rb") as mapping_file:
    alphabet_mapping = pk.load(mapping_file)

# Restore the saved model from disk.
model = load_model(MODEL_NAME)

2023-08-09 13:00:17.562231: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Parse the data file into two parallel lists: one integer label per record
# and one int32 array of symbols per record.
labels = []
sequences = []

with open(INPUT_DATA, "rt") as inf:
    inf.readline()  # discard the header line
    for i, line in enumerate(inf):
        # Blank lines carry no record (they still advance the counter `i`).
        if not line.strip():
            continue
        fields = line.split()
        # Field 0 is the label; field 1 is skipped; the symbols start at field 2.
        labels.append(int(fields[0]))
        sequences.append(np.array([int(tok) for tok in fields[2:]], dtype=np.int32))
        # Progress report every 10,000 lines.
        if i % int(10e3) == 0:
            print("Line: ", i)
len(labels), len(sequences)

Line:  0
Line:  10000
Line:  20000
Line:  30000


(35566, 35566)

In [3]:
# Turn the labels into a column vector of shape (n, 1), then show its shape
# next to the first two raw sequences as a quick sanity check.
labels = np.reshape(np.array(labels), (-1, 1))
labels.shape, sequences[0], sequences[1]

((35566, 1),
 array([0, 0, 0, 1, 0, 0, 0, 0], dtype=int32),
 array([1, 1, 1, 0, 0, 0, 1], dtype=int32))

In [4]:
# One-hot encode every sequence separately. Each encoded item has shape
# (1, len(seq), len(alphabet_mapping)): a leading axis of size 1, one row per
# position and one column per alphabet symbol.
n_symbols = len(alphabet_mapping)
sequences_one_hot = []
for seq in sequences:
    encoded = np.zeros((1, len(seq), n_symbols), dtype=np.int32)
    # Column to switch on at each position, looked up through the mapping.
    columns = np.array([alphabet_mapping[sym] for sym in seq], dtype=np.intp)
    encoded[0, np.arange(len(seq)), columns] = 1
    sequences_one_hot.append(encoded)

print("Before: ", len(sequences))
print("Shape: ", sequences[:2])
print("After: ", len(sequences_one_hot))
print("Shape: ", sequences_one_hot[:2])

Before:  35566
Shape:  [array([0, 0, 0, 1, 0, 0, 0, 0], dtype=int32), array([1, 1, 1, 0, 0, 0, 1], dtype=int32)]
After:  35566
Shape:  [array([[[1, 0],
        [1, 0],
        [1, 0],
        [0, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0]]], dtype=int32), array([[[0, 1],
        [0, 1],
        [0, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [0, 1]]], dtype=int32)]


In [5]:
def data_generator(data=None):
    """Yield the items of ``data`` one at a time, in order.

    Parameters
    ----------
    data : iterable, optional
        Items to yield. When omitted (or ``None``), the notebook-level
        ``sequences_one_hot`` list is used, which keeps existing
        ``data_generator()`` calls working unchanged.

    Yields
    ------
    object
        Each element of ``data``, unmodified.
    """
    if data is None:
        # Reading a module-level name needs no ``global`` declaration; the
        # lookup happens lazily, when the generator is first advanced.
        data = sequences_one_hot
    yield from data

In [6]:
# The generator yields one encoded sequence per step, so the number of steps
# equals the number of sequences.
n_steps = len(sequences_one_hot)
data_gen = data_generator()

y_pred = model.predict(data_gen, steps=n_steps)

# Show the first two predictions next to their true labels.
y_pred[:2], labels[:2]



(array([[9.998781e-01],
        [3.785926e-04]], dtype=float32),
 array([[1],
        [0]]))

In [7]:
def map_to_int(x):
    """Threshold a score at 0.5: return 1 if ``x >= 0.5``, otherwise 0."""
    if x >= 0.5:
        return 1
    return 0

# Convert each prediction row into a hard 0/1 class label.
y_pred_mapped = np.array([map_to_int(row) for row in y_pred])
y_pred_mapped[:2], y_pred_mapped.shape

(array([1, 0]), (35566,))

In [8]:
from sklearn.metrics import balanced_accuracy_score, accuracy_score

# Plain accuracy first, then balanced accuracy, on the thresholded predictions.
acc = accuracy_score(labels, y_pred_mapped)
balanced_acc = balanced_accuracy_score(labels, y_pred_mapped)
acc, balanced_acc

(1.0, 1.0)