In [1]:
import json
import os
import urllib
import urllib.request

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Single seed for the whole notebook: it seeds NumPy's global RNG here and is
# also passed as `random_state` to the scikit-learn splits and estimators below.
seed = 1
np.random.seed(seed)

In [2]:
# Remote location of each dataset, keyed by the local file name it is cached under.
data_sources = {
    "positive.npz": "https://github.com/ur-whitelab/peptide-dashboard/raw/master/ml/data/hemo-positive.npz",
    "negative.npz": "https://github.com/ur-whitelab/peptide-dashboard/raw/master/ml/data/hemo-negative.npz",
}

# Download each file only once so re-running the notebook does not hit the
# network again. Delete the local .npz files to force a fresh download.
for filename, url in data_sources.items():
    if not os.path.exists(filename):
        # Fetch to a temporary name and rename afterwards, so an interrupted
        # download is never mistaken for a complete cached file.
        partial_path = filename + ".part"
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, filename)


def _load_first_array(path):
    """Return the first array stored in the .npz archive at `path`."""
    with np.load(path) as archive:
        return archive[archive.files[0]]


pos_data = _load_first_array("positive.npz")
neg_data = _load_first_array("negative.npz")

# Create labels (1 = positive, 0 = negative) and stitch everything into one
# tensor. Labels are kept as an (n, 1) column vector.
labels = np.concatenate(
    (
        np.ones((pos_data.shape[0], 1)),
        np.zeros((neg_data.shape[0], 1)),
    ),
    axis=0,
)

features = np.concatenate((pos_data, neg_data), axis=0)

# Number of non-zero entries per row.
# NOTE(review): presumably 0 is the padding token, making this the peptide
# length -- confirm against the dataset encoding.
pos_data_lengths = np.count_nonzero(pos_data, axis=1)
neg_data_lengths = np.count_nonzero(neg_data, axis=1)

In [3]:
# Report the size of each class and its share of the full dataset (in percent).
n_pos = pos_data.shape[0]
n_neg = neg_data.shape[0]
n_total = n_pos + n_neg
for class_name, class_count in (('Positive data', n_pos), ('Negative data', n_neg)):
    print(class_name, class_count, class_count / n_total * 100)


Positive data 1826 19.600686990124515
Negative data 7490 80.39931300987548


In [4]:
from sklearn.model_selection import train_test_split

# Positives and negatives were concatenated in order, so permute the rows with
# NumPy's global (seeded) RNG before splitting.
i = np.arange(len(labels))
np.random.shuffle(i)
shuffled_features, shuffled_labels = features[i], labels[i]

# Hold out 10% of the data as the test set ...
X_temp, X_test, y_temp, y_test = train_test_split(
    shuffled_features,
    shuffled_labels,
    test_size=0.1,
    random_state=seed,
)

# ... then 10% of what remains as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.1,
    random_state=seed,
)

print(f'Train size: {X_train.shape[0]}, Val size: {X_val.shape[0]}, Test size: {X_test.shape[0]}')

Train size: 7545, Val size: 839, Test size: 932


Now we use classical ML models to predict the hemolytic activity of peptides from their sequences, represented as integer-encoded vectors.

## Random forests

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Random-forest baseline: 500 trees, seeded for reproducibility.
clf = RandomForestClassifier(n_estimators=500, random_state=seed)

# The labels were built as an (n, 1) column vector, but scikit-learn expects a
# 1-D target. Flattening here avoids the DataConversionWarning without changing
# the fitted model.
clf.fit(X_train, y_train.ravel())

# Column 1 of predict_proba is the probability of the positive class (label 1).
y_hat_val = clf.predict_proba(X_val)[:, 1]

auroc = roc_auc_score(y_val, y_hat_val)
print(f"Validation AUROC: {auroc:.4f}")

  return fit_method(estimator, *args, **kwargs)


Validation AUROC: 0.8045


In [6]:
# Score the held-out test set with the classifier fitted in the previous cell.
test_proba = clf.predict_proba(X_test)
y_hat_test = test_proba[:, 1]  # probability of the positive class

auroc = roc_auc_score(y_true=y_test, y_score=y_hat_test)
print(f"Test AUROC: {auroc:.4f}")

Test AUROC: 0.7851


## SVM

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# Support-vector classifier with default kernel settings; probability=True
# enables predict_proba so AUROC can be computed from class probabilities.
clf = SVC(probability=True, random_state=seed)

# The labels were built as an (n, 1) column vector, but scikit-learn expects a
# 1-D target. Flattening here avoids the DataConversionWarning without changing
# the fitted model.
clf.fit(X_train, y_train.ravel())

# Predict probabilities for the positive class
y_hat_val = clf.predict_proba(X_val)[:, 1]

auroc = roc_auc_score(y_val, y_hat_val)
print(f"Validation AUROC: {auroc:.4f}")

  y = column_or_1d(y, warn=True)


Validation AUROC: 0.7055


In [8]:
# Score the held-out test set with the classifier fitted in the previous cell.
test_proba = clf.predict_proba(X_test)
y_hat_test = test_proba[:, 1]  # probability of the positive class

auroc = roc_auc_score(y_true=y_test, y_score=y_hat_test)
print(f"Test AUROC: {auroc:.4f}")

Test AUROC: 0.6814
