# HAPPENN - Bag of words

## Load data

In [1]:
def load_sequences(file_path):
    """Read a labelled FASTA file into parallel lists of sequences and labels.

    A record starts with a header line beginning with '>'. The header text
    after the '>' is split on '|' and the field at index 7 selects the label:
    'hemolytic' -> 1, 'non-hemolytic' -> 0. All non-blank lines between one
    header and the next are joined into a single sequence, so a record that
    is wrapped over several lines still yields exactly one sequence and stays
    aligned with its label. Blank lines are ignored.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A tuple ``(sequences, labels)`` of two lists of equal length, where
        ``labels[i]`` is the 0/1 label of ``sequences[i]``.

    Raises:
        ValueError: If a header carries an unknown label, if a header is not
            followed by any sequence line, or if sequence data appears before
            the first header.
        IndexError: If a header has fewer than eight '|'-separated fields.
    """
    label_ids = {'hemolytic': 1, 'non-hemolytic': 0}
    sequences = []
    labels = []
    parts = []  # sequence lines collected for the record currently being read

    def flush():
        # Close the record opened by the most recent header.
        if not parts:
            raise ValueError(f"Record {len(labels)} has no sequence")
        sequences.append(''.join(parts))
        parts.clear()

    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; treating them as sequences would
                # shift every later sequence against its label.
                continue
            if line.startswith('>'):
                if labels:
                    flush()
                label = line[1:].split('|')[7]
                if label not in label_ids:
                    raise ValueError(f"Unknown label: {label}")
                labels.append(label_ids[label])
            else:
                if not labels:
                    raise ValueError("Sequence data found before the first header")
                parts.append(line)

    if labels:
        flush()
    return sequences, labels

# Load the dataset as two parallel lists: the sequence strings and their
# labels (1 = hemolytic, 0 = non-hemolytic).
sequences, labels = load_sequences('data/HAPPENN_dataset.fasta')

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the records for validation. Stratifying on the labels keeps
# the class ratio the same in both splits; the fixed seed makes the split
# reproducible.
train_sequences, val_sequences, train_labels, val_labels = train_test_split(
    sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

## Bag of words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Character-level unigrams: each feature column is the count of one character
# in the sequence.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1))

# The vocabulary is learned from the training split only and then reused for
# the validation split.
train_counts = vectorizer.fit_transform(train_sequences)
val_counts = vectorizer.transform(val_sequences)

# Convert the sparse count matrices to dense arrays.
X_train = train_counts.toarray()
X_val = val_counts.toarray()
X_train.shape, X_val.shape

((2990, 20), (748, 20))

## Train model

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fit a 100-tree random forest on the training counts (seeded for
# reproducibility), then score it on the held-out validation split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, train_labels)

y_pred = model.predict(X_val)
print(classification_report(val_labels, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85       439
           1       0.79      0.75      0.77       309

    accuracy                           0.82       748
   macro avg       0.81      0.81      0.81       748
weighted avg       0.82      0.82      0.82       748



This is a good baseline: 82% validation accuracy using only per-character counts and a 100-tree random forest.

For comparison, the state-of-the-art (SOTA) accuracy reported in the paper is below 86%.