# HAPPENN - CNN

## Load data

In [1]:
import numpy as np

def load_sequences(file_path):
    """Read peptide sequences and binary hemolysis labels from a FASTA file.

    Every record is a header line starting with '>' followed by one or more
    sequence lines. The label is the 8th '|'-separated field of the header
    (after the leading '>'): 'hemolytic' maps to 1, 'non-hemolytic' to 0.

    Blank lines are skipped and a sequence wrapped over several lines is
    joined into a single string, so the returned sequences and labels always
    line up one-to-one.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A tuple ``(sequences, labels)`` where ``sequences`` is a list of
        strings and ``labels`` is a 1-D ``np.int32`` array of the same length.

    Raises:
        ValueError: If a header carries a label other than 'hemolytic' or
            'non-hemolytic', or if sequence data appears before any header.
        IndexError: If a header has fewer than 8 '|'-separated fields.
    """
    sequences = []
    labels = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                label = line.strip()[1:].split('|')[7]
                if label == 'hemolytic':
                    labels.append(1)
                elif label == 'non-hemolytic':
                    labels.append(0)
                else:
                    raise ValueError(f"Unknown label: {label}")
                # Open a new record; its sequence lines are appended below.
                sequences.append('')
            else:
                residues = line.strip()
                if not residues:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                if not sequences:
                    raise ValueError("Sequence data found before the first header")
                # Join wrapped sequence lines into the current record.
                sequences[-1] += residues
    return sequences, np.array(labels, dtype=np.int32)

# Parse the whole dataset once; the split and tokenization below reuse it.
sequences, labels = load_sequences(file_path='data/HAPPENN_dataset.fasta')

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for validation. Stratifying on the labels keeps
# the class ratio the same in both splits; the fixed seed makes the split
# repeatable.
train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

## Tokenize

In [3]:
# Index 0 is reserved for '_' (used as padding); the remaining entries are the
# distinct characters found in the sequences, in sorted order.
vocab = ['_'] + sorted(set(''.join(sequences)))
vocab

['_',
 'A',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'K',
 'L',
 'M',
 'N',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'V',
 'W',
 'Y']

In [4]:
import numpy as np

# Forward lookup (character -> integer id) and its inverse.
token2id = dict(zip(vocab, range(len(vocab))))
id2token = dict(zip(token2id.values(), token2id.keys()))

def pad_and_truncate(sequence, max_len):
    """Encode a sequence as a list of token ids with a fixed length.

    The sequence is cut to at most ``max_len`` tokens, each token is looked up
    in the module-level ``token2id`` mapping, and the result is right-padded
    with 0 when the sequence is shorter than ``max_len``.

    Args:
        sequence: String of tokens to encode.
        max_len: Length of the returned list.

    Returns:
        List of integer token ids.

    Raises:
        KeyError: If a token is missing from ``token2id``.
    """
    encoded = [token2id[token] for token in sequence[:max_len]]
    n_missing = max(0, max_len - len(sequence))
    padding = [0] * n_missing
    return encoded + padding

# Encode both splits the same way: every sequence becomes 32 token ids.
X_train, X_val = (
    np.array([pad_and_truncate(seq, 32) for seq in split])
    for split in (train_sequences, val_sequences)
)

X_train[0]

array([ 9, 19,  9, 18,  5,  9,  9,  1,  4,  9, 11,  8, 15, 12,  8, 15, 12,
        9,  8, 18,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

## Build model

In [5]:
import keras

# Width shared by the embedding vectors and every convolution.
dim = 64

model = keras.Sequential()

# Token ids -> dense vectors.
model.add(keras.layers.Embedding(input_dim=len(vocab), output_dim=dim))

# Convolution stack: the dilation rate doubles at each stage (1 by default,
# then 2, 4, 8) and dropout grows from 0.1 to 0.3.
model.add(keras.layers.Conv1D(filters=dim, kernel_size=3, activation='relu', padding='same'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.1))

model.add(keras.layers.Conv1D(filters=dim, kernel_size=3, activation='relu', dilation_rate=2, padding='same'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.2))

model.add(keras.layers.Conv1D(filters=dim, kernel_size=3, activation='relu', dilation_rate=4, padding='same'))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.Dropout(0.3))

model.add(keras.layers.Conv1D(filters=dim, kernel_size=3, activation='relu', dilation_rate=8, padding='same'))

# Average over positions, then a single sigmoid unit for the binary label.
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(1, activation='sigmoid'))

# Smoke test: run the untrained model on a batch holding one example.
model(X_train[:1])

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.5011651]], dtype=float32)>

## Train

In [6]:
batch_size = 128

# Cosine learning-rate decay starting from 0.01.
# NOTE(review): decay_steps covers 120 epochs' worth of (floor-divided) steps
# while fit() below runs 100 epochs — confirm the mismatch is intentional.
lr_schedule = keras.optimizers.schedules.CosineDecay(
    initial_learning_rate=0.01,
    decay_steps=len(X_train) // batch_size * 120,
)
optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.fit(
    X_train,
    train_labels,
    validation_data=(X_val, val_labels),
    epochs=100,
    batch_size=batch_size,
)

Epoch 1/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.6361 - loss: 0.6356 - val_accuracy: 0.5856 - val_loss: 0.7317
Epoch 2/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7500 - loss: 0.5278 - val_accuracy: 0.5869 - val_loss: 0.8579
Epoch 3/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7463 - loss: 0.4952 - val_accuracy: 0.5869 - val_loss: 0.9604
Epoch 4/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.7797 - loss: 0.4681 - val_accuracy: 0.5869 - val_loss: 0.8038
Epoch 5/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.8018 - loss: 0.4368 - val_accuracy: 0.5869 - val_loss: 1.2410
Epoch 6/100
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.8180 - loss: 0.4066 - val_accuracy: 0.5869 - val_loss: 0.9623
Epoch 7/100
[1m24/24[0m [

<keras.src.callbacks.history.History at 0x1692b1f50>

In [9]:
from sklearn.metrics import accuracy_score

# Sigmoid outputs above 0.5 count as the positive class (label 1).
predictions = model.predict(X_val)
accuracy_score(
    y_true=val_labels,
    y_pred=(predictions > 0.5).astype(np.int32),
)

[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


0.8368983957219251

Validation accuracy reached about 83.7%, an improvement of more than 1% over the earlier result.
Further tuning of the model could improve the accuracy even more.