In [6]:
import numpy as np
import pickle as pk

import tensorflow.keras
from tensorflow.keras.models import load_model
    
# Input locations: the data file to evaluate on and the saved Keras model.
INPUT_DATA = "../data/problem_1_test.dat"
MODEL_NAME = "model_problem_1.h5"

# Lookup used to turn each sequence symbol into a one-hot column index.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load a mapping file that comes from a trusted source.
# A context manager is used so the file handle is closed as soon as the
# mapping has been read.
with open("alphabet_mapping.pk", "rb") as mapping_file:
    alphabet_mapping = pk.load(mapping_file)

model = load_model(MODEL_NAME)

In [8]:
# Parse the data file into two parallel lists: one integer label and one
# integer symbol sequence per non-blank row.
labels = []
sequences = []

with open(INPUT_DATA, "rt") as data_file:
    data_file.readline()  # discard the header row
    for line_no, raw_line in enumerate(data_file):
        fields = raw_line.split()
        if not fields:
            # whitespace-only line: nothing to parse
            continue
        labels.append(int(fields[0]))
        # fields[1] is not used; the symbols start at the third column
        sequences.append([int(token) for token in fields[2:]])
        # progress report every 10,000 lines
        if line_no % 10000 == 0:
            print("Line: ", line_no)
len(labels), len(sequences)

Line:  0
Line:  10000
Line:  20000
Line:  30000


(39870, 39870)

In [10]:
# Convert the parsed Python lists to NumPy arrays; the labels are reshaped
# into a single column (one row per sample).
sequences = np.array(sequences)
labels = np.reshape(np.array(labels), (-1, 1))

In [11]:
# One-hot encode the sequences: the output has one row per sequence, one
# entry per position, and one channel per entry of alphabet_mapping.
one_hot_shape = (sequences.shape[0], sequences.shape[1], len(alphabet_mapping))
sequences_one_hot = np.zeros(one_hot_shape)
for (row, col), sym in np.ndenumerate(sequences):
    # alphabet_mapping gives the channel that must be switched on for this symbol
    sequences_one_hot[row, col, alphabet_mapping[sym]] = 1

# Show the first three sequences before and after encoding, with full shapes.
for tag, arr in (("Before: ", sequences), ("After: ", sequences_one_hot)):
    print(tag, arr[:3])
    print("Shape: ", arr.shape)

Before:  [[1 0 0 0 0 0 1 1]
 [1 1 0 1 0 1 0 1]
 [0 1 1 1 0 1 1 1]]
Shape:  (39870, 8)
After:  [[[0. 1.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [0. 1.]
  [0. 1.]]

 [[0. 1.]
  [0. 1.]
  [1. 0.]
  [0. 1.]
  [1. 0.]
  [0. 1.]
  [1. 0.]
  [0. 1.]]

 [[1. 0.]
  [0. 1.]
  [0. 1.]
  [0. 1.]
  [1. 0.]
  [0. 1.]
  [0. 1.]
  [0. 1.]]]
Shape:  (39870, 8, 2)


In [12]:
# Run the loaded model over every one-hot encoded sequence.
y_pred = model.predict(sequences_one_hot)
# Quick sanity check: first two raw predictions next to their true labels.
(y_pred[:2], labels[:2])



(array([[0.9936825 ],
        [0.00306792]], dtype=float32),
 array([[1],
        [0]]))

In [15]:
def map_to_int(x, threshold=0.5):
    """Convert a score into a hard binary class label.

    Args:
        x: Value to classify. Anything that supports ``>=`` against a number
            and yields an unambiguous truth value works, e.g. a float or a
            one-element NumPy array.
        threshold: Cutoff at or above which the positive class is returned.
            Defaults to 0.5.

    Returns:
        1 if ``x >= threshold``, otherwise 0.
    """
    return 1 if x >= threshold else 0

# Threshold every prediction row into a hard 0/1 class label.
y_pred_mapped = np.array([map_to_int(prob) for prob in y_pred])
# Peek at the first two labels and confirm the flattened shape.
(y_pred_mapped[:2], y_pred_mapped.shape)

(array([1, 0]), (39870,))

In [17]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Report plain accuracy first, then balanced accuracy, for the thresholded
# predictions against the true labels.
(
    accuracy_score(labels, y_pred_mapped),
    balanced_accuracy_score(labels, y_pred_mapped),
)

(1.0, 1.0)