In [1]:
import os, sys
import pickle

import numpy as np
import matplotlib.pyplot as plt

import tensorflow.keras
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, Reshape
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional
from tensorflow.keras.layers import Lambda, Flatten, Embedding
import keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

EPOCHS = 10
BATCH_SIZE = 32
VALIDATION_SPLIT=0.05

MAX_LINESIZE = 20 # this is the maximum length of a line. We need this for zero padding
ZERO_PADDING_CHAR = "X"
INPUT_READER_BATCH_SIZE = int(1e5)
NEUTRAL_LABEL = "0"
EMBEDDING_DIM = 50

TRAIN_FILEPATH = os.path.join("july_week_5_train.dat.encoded.dat.extracted_sequences.dat")
TEST_FILEPATH = os.path.join("july_week_5_test.dat.encoded.dat.extracted_sequences.dat")
LABEL_FILEPATH = os.path.join("july_week_5_test.dat.labels.txt")

2023-07-25 12:09:21.125833: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


def get_batch_of_input(infile, n_lines):
    """Only if a file is too large to fit into memory"""
    res = list()
    for _ in range(n_lines):
        line = infile.readline()
        if line=="":
            return None
        res.append(np.array( line.split()[2:] ))
    return np.array(res)

In [2]:
def get_label(line):
    symbols = set( line.split() )
    if len(symbols) > 1 or not NEUTRAL_LABEL in symbols:
        return 1
    return 0

def read_input(infile, label_file=None, input_alph=None, MAXLINES=int(1e6)):
    res = list()
    labels = None 
    if label_file:
        labels = list()
    
    alphabet = dict() if input_alph is None else input_alph
    if not ZERO_PADDING_CHAR in alphabet:
        alphabet[ZERO_PADDING_CHAR] = len(alphabet)
                
    for i, line in enumerate(infile):
        if i == MAXLINES:
            break
            
        linesplit = line.split()[2:]
        line = list()
        for x in linesplit:
            if not x in alphabet:
                alphabet[x] = len(alphabet)
            line.append( alphabet[x] )
        if len(line) < MAX_LINESIZE:
            line.extend( [ alphabet[ZERO_PADDING_CHAR] ] * (MAX_LINESIZE - len(line)) )
        res.append( np.array(line) )
        
        if label_file:
            label_line = label_file.readline()
            labels.append( get_label(label_line) )

    reverse_alphabet = {v:k for k,v in alphabet.items()}
    labels = np.array(labels) if labels is not None else labels
    return np.array(res), alphabet, reverse_alphabet, labels

In [3]:
trainfile = open(TRAIN_FILEPATH, "rt")
train_header = trainfile.readline()

x_train, alphabet, _, _ = read_input(infile=trainfile, MAXLINES=int(3e6))
ALPHABET_SIZE = len(alphabet)

print(x_train[:3], x_train.shape, ALPHABET_SIZE)

[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 2 3 1 4 5 1 5 6 1 1 7 5 5 6 1 6 8 4 1]
 [2 3 1 4 5 1 5 6 1 1 7 5 5 6 1 6 8 4 1 6]] (3000000, 20) 76


x_train_target_one_hot = np.zeros((x_train.shape[0], x_train.shape[1] - 1, len(alphabet)), dtype="float32")

for i, row in enumerate(x_train[:, 1:]):
    for j, mapped_symbol in enumerate(row):
        x_train_target_one_hot[i, j, mapped_symbol] = 1

In [4]:
x_train_target_one_hot = np.zeros((x_train.shape[0], len(alphabet)), dtype="float32")

for i, row in enumerate(x_train):
    mapped_symbol = row[-1]
    x_train_target_one_hot[i, mapped_symbol] = 1

In [5]:
def get_model(OUTPUT_DIM, N_SEQUENCES):
    INPUT_SHAPE = (N_SEQUENCES)

    input_layer = Input(shape=(N_SEQUENCES,))
    embedding_layer = Embedding(OUTPUT_DIM, EMBEDDING_DIM, trainable=True)
    rnn_layer = LSTM(EMBEDDING_DIM)
    hidden_layer_1 = Dense(EMBEDDING_DIM)
    output_layer = Dense(OUTPUT_DIM, activation="softmax")

    x = embedding_layer(input_layer)
    x = rnn_layer(x)
    x = hidden_layer_1(x)
    x = output_layer(x)

    model = Model(input_layer, x)
    model.compile(
      loss="categorical_crossentropy",
      optimizer="adam",
      metrics=["categorical_crossentropy"]
    )
    return model


model = get_model(ALPHABET_SIZE, MAX_LINESIZE-1)

es = EarlyStopping(
    monitor="val_loss",
    patience=1,
    mode="auto"
)

In [6]:
history = model.fit(
  x_train[:, :-1], 
  x_train_target_one_hot,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  validation_split=0.1
)

2023-07-25 12:09:38.277109: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 410400000 exceeds 10% of free system memory.


Epoch 1/10


2023-07-25 12:09:38.834133: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 820800000 exceeds 10% of free system memory.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
del x_train
del x_train_target_one_hot

In [8]:
testfile = open(TRAIN_FILEPATH, "rt")
test_header = testfile.readline()

label_infile = open(LABEL_FILEPATH, "rt")
x_test, alphabet, reverse_alphabet, y_test = read_input(infile=testfile, label_file=label_infile, input_alph=alphabet, MAXLINES=int(2e6))

In [9]:
alphabet_symbols = set( range(ALPHABET_SIZE) )

TP, FP, FN, TN = 0, 0, 0, 0

idx_to_delete = list()
for i, (row, label) in enumerate(zip(x_test, y_test)):
    for x in row:
        if not x in alphabet_symbols:
            # we see a new symbol. What to do with that one now?
            if label == 1:
                TP += 1
            else:
                FP += 1
            idx_to_delete.append(i)
TP, FP

(0, 0)

In [10]:
x_test = np.delete(x_test, idx_to_delete, axis=0)
y_test = np.delete(y_test, idx_to_delete, axis=0)

In [11]:
x_test_pred = model.predict(x_test[:, :-1])

2023-07-25 13:06:55.663820: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 304000000 exceeds 10% of free system memory.




2023-07-25 13:08:54.694014: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 608000000 exceeds 10% of free system memory.


In [12]:
x_test_target_one_hot = np.zeros((x_test.shape[0], len(alphabet)), dtype="float32")

for i, row in enumerate(x_test):
    mapped_symbol = row[-1]
    x_test_target_one_hot[i, mapped_symbol] = 1

In [13]:
K = 3

for label, pred, target in zip(y_test, x_test_pred, x_test_target_one_hot):
    true_idx = np.where(target==1)
    pred_sort_idx = np.argsort(pred)
    if true_idx in pred_sort_idx[-K:]:
        if label == 0:
            TN += 1
        else:
            FN += 1
    else:
        if label == 1:
            TP += 1
        else:
            FP += 1
TP, FP, FN, TN

(1459, 93171, 152301, 1753069)