# HemoPI - Bag of words

## Load data

In [1]:
def load_sequences(file_path):
    """Read the sequences stored in a FASTA-style text file.

    Lines starting with '>' are record headers and are not returned.
    Sequence lines that follow a header are concatenated, so a record
    wrapped over several lines yields a single sequence. Blank lines are
    ignored, and a header with no sequence lines yields nothing. Lines
    that appear before the first header (e.g. a header-less file with one
    sequence per line) are each returned as their own sequence.

    Args:
        file_path: Path of the file to read.

    Returns:
        A list of sequence strings in file order.
    """
    sequences = []
    # Fragments of the record currently being read; None until a header is seen.
    current = None
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines would otherwise become empty-string "sequences".
                continue
            if line.startswith('>'):
                # A new header closes the previous record, if it had any residues.
                if current:
                    sequences.append(''.join(current))
                current = []
            elif current is None:
                # No header seen yet: treat the line as a standalone sequence.
                sequences.append(line)
            else:
                current.append(line)
    # Flush the last record, which has no following header to close it.
    if current:
        sequences.append(''.join(current))
    return sequences

# Read the positive and negative sequences of each split.
train_pos_sequences, train_neg_sequences = (
    load_sequences('data/hemopi/train/pos.fa'),
    load_sequences('data/hemopi/train/neg.fa'),
)
val_pos_sequences, val_neg_sequences = (
    load_sequences('data/hemopi/val/pos.fa'),
    load_sequences('data/hemopi/val/neg.fa'),
)

# Positives are placed first in each split, so the labels are a run of 1s
# (one per positive sequence) followed by a run of 0s (one per negative).
train_sequences = [*train_pos_sequences, *train_neg_sequences]
train_labels = [1 for _ in train_pos_sequences] + [0 for _ in train_neg_sequences]

val_sequences = [*val_pos_sequences, *val_neg_sequences]
val_labels = [1 for _ in val_pos_sequences] + [0 for _ in val_neg_sequences]

In [2]:
# Peek at the first four training sequences.
train_sequences[0:4]

['GIFGKILGVGKKVLCGLSGVC',
 'KWKSFLKTFKSLKKTVLHTLLKLISS',
 'KFFKFFKFF',
 'LLKKLLKKLLKKLLKK']

In [3]:
# Labels of the same four sequences (positives are listed first).
train_labels[0:4]

[1, 1, 1, 1]

## Bag of words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Character-level unigram counts: each sequence becomes a vector of
# per-character occurrence counts.
vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 1))

# Learn the vocabulary on the training split only, then reuse it for validation.
train_counts = vectorizer.fit_transform(train_sequences)
val_counts = vectorizer.transform(val_sequences)

# Convert the count matrices to dense arrays.
X_train = train_counts.toarray()
X_val = val_counts.toarray()

In [5]:
# (number of sequences, vocabulary size) for each split.
(X_train.shape, X_val.shape)

((884, 20), (220, 20))

In [6]:
# Vocabulary learned by the vectorizer; entry i names column i of X_train / X_val.
vectorizer.get_feature_names_out()

array(['a', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'p',
       'q', 'r', 's', 't', 'v', 'w', 'y'], dtype=object)

In [7]:
# Count vectors of the first four training sequences.
X_train[:4]

array([[0, 2, 0, 0, 1, 6, 0, 2, 3, 3, 0, 0, 0, 0, 0, 1, 0, 3, 0, 0],
       [0, 0, 0, 0, 2, 0, 1, 1, 7, 6, 0, 0, 0, 0, 0, 4, 3, 1, 1, 0],
       [0, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

## Train model

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# One hidden layer of 100 units, at most 100 training iterations, fixed seed.
clf = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=100,
    random_state=42,
).fit(X_train, train_labels)

# Predict on the validation split and print per-class precision/recall/F1.
y_pred = clf.predict(X_val)
report = classification_report(val_labels, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       110
           1       1.00      0.95      0.98       110

    accuracy                           0.98       220
   macro avg       0.98      0.98      0.98       220
weighted avg       0.98      0.98      0.98       220





A simple bag-of-words model already reaches 98% accuracy on the validation set, so this dataset is too easy to be a useful benchmark for more advanced methods.