In [1]:

!pip install sumonet

Collecting sumonet
  Downloading sumonet-0.1.5.tar.gz (787 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.9/787.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting epitopepredict (from sumonet)
  Downloading epitopepredict-0.5.0.tar.gz (11.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting biopython (from sumonet)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting loguru (from sumonet)
  Downloading loguru-0.7.2-py3-none-any.whl (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for c

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the labelled dataset (CSV); point this at your own file.
file_path = 'data.csv'
df = pd.read_csv(file_path)

# Pull the model inputs and targets out as plain Python lists.
# Expected columns: 'Sequence' (protein sequences) and 'Label' (target).
X = df['Sequence'].to_list()
y = df['Label'].to_list()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [38]:
from sumonet.utils.encodings import Encoding
# Build a sequence encoder configured for 'one-hot' encoding and apply it to the training sequences.
encoder = Encoding(encoderType='one-hot')
# The encoded result exposes `.shape`; its trailing dimensions are used later as the model's input shape.
X_train_encoded = encoder.encode_data(X_train)

In [39]:
import numpy as np
# One-hot encode the training labels into two columns (column k is 1 where the
# label equals k, for k in {0, 1}); any other label value yields an all-zero row.
y_train = np.asarray(y_train)
# Encode only while the labels are still 1-D, so re-running this cell does not
# broadcast an already-encoded (n, 2) array into a 3-D (n, 1, 2) array.
if y_train.ndim == 1:
    y_train = (y_train[:, None] == np.arange(2)).astype(int)

In [41]:
from sumonet.model.architecture import SUMOnet
# Build the SUMOnet model; the input shape is the encoded data's shape with its
# first axis dropped (presumably the sample axis — TODO confirm).
model = SUMOnet(input_shape = X_train_encoded.shape[1:] )
# Categorical cross-entropy is paired with the two-column one-hot targets; accuracy is tracked while training.
model.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['accuracy'])
# Train for 3 epochs. As the last expression of the cell, the return value of fit() is what gets displayed.
# NOTE(review): no random seed is set in the visible cells, so results may differ between runs.
model.fit(X_train_encoded,y_train,epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7e1caf5c1c30>

In [42]:
# Encode the held-out sequences with a freshly built encoder that uses the same
# 'one-hot' configuration as the encoder applied to the training sequences.
encoder = Encoding(encoderType='one-hot')
X_test_encoded = encoder.encode_data(X_test)
# Model outputs for the test set. The metric helpers index column 1 and take an
# argmax over the last axis, so presumably shape (n_samples, 2) — TODO confirm.
y_preds = model.predict(X_test_encoded)



In [43]:
from sklearn import metrics

def f1_score(y_true, y_probs):
    """Return the F1 score of the arg-max class predictions.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to ``metrics.f1_score``.
    y_probs
        Per-class scores; the predicted label for each row is the index of
        the largest entry along the last axis.

    Returns
    -------
    The value returned by ``metrics.f1_score``.
    """
    hard_labels = y_probs.argmax(-1)
    score = metrics.f1_score(y_true, hard_labels)
    return score

def mcc(y_true, y_probs):
    """Return the Matthews correlation coefficient of the arg-max predictions.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to ``metrics.matthews_corrcoef``.
    y_probs
        Per-class scores; the predicted label for each row is the index of
        the largest entry along the last axis.

    Returns
    -------
    The value returned by ``metrics.matthews_corrcoef``.
    """
    hard_labels = y_probs.argmax(-1)
    coefficient = metrics.matthews_corrcoef(y_true, hard_labels)
    return coefficient

def rocAuc_score(y_true, y_probs):
    """Return the area under the ROC curve built from column 1 of ``y_probs``.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to ``metrics.roc_curve``.
    y_probs
        Two-dimensional score array; column index 1 is used as the score
        for the curve.

    Returns
    -------
    The value of ``metrics.auc`` over the (false-positive rate,
    true-positive rate) points returned by ``metrics.roc_curve``.
    """
    column_one_scores = y_probs[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(y_true, column_one_scores)
    area = metrics.auc(false_pos_rate, true_pos_rate)
    return area

def aupr_score(y_true, y_probs):
    """Return the area under the precision-recall curve from column 1 of ``y_probs``.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to
        ``metrics.precision_recall_curve``.
    y_probs
        Two-dimensional score array; column index 1 is used as the score
        for the curve.

    Returns
    -------
    The value of ``metrics.auc`` with recall on the x axis and precision
    on the y axis.
    """
    column_one_scores = y_probs[:, 1]
    pr_precision, pr_recall, _thresholds = metrics.precision_recall_curve(y_true, column_one_scores)
    area = metrics.auc(pr_recall, pr_precision)
    return area

def accuracy(y_true, y_probs):
    """Return the accuracy of the arg-max class predictions.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to ``metrics.accuracy_score``.
    y_probs
        Per-class scores; the predicted label for each row is the index of
        the largest entry along the last axis.

    Returns
    -------
    The value returned by ``metrics.accuracy_score``.
    """
    return metrics.accuracy_score(y_true, y_probs.argmax(-1))

def auc_score(y_true, y_probs):
    """Return the ROC AUC computed directly from column 1 of ``y_probs``.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to ``metrics.roc_auc_score``.
    y_probs
        Two-dimensional score array; column index 1 is used as the score.

    Returns
    -------
    The value returned by ``metrics.roc_auc_score``.
    """
    column_one_scores = y_probs[:, 1]
    area = metrics.roc_auc_score(y_true, column_one_scores)
    return area

def evaluate(y_true, y_probs, metrics_):
    """Compute one or more named evaluation metrics.

    Parameters
    ----------
    y_true
        Ground-truth labels, forwarded unchanged to each metric function.
    y_probs
        Model scores, forwarded unchanged to each metric function.
    metrics_ : str or list/tuple of str
        Metric name(s), matched case-insensitively. Supported names are
        'f1', 'mcc', 'roc', 'aupr', 'accuracy' and 'auc'.

    Returns
    -------
    dict
        Maps each requested metric name (lower-cased) to its computed value.

    Raises
    ------
    ValueError
        If any requested name is not a supported metric. All names are
        checked before any metric is computed.
    TypeError
        If ``metrics_`` is not a string or a list/tuple of strings.
    """
    metricsDict = {'f1': f1_score, 'mcc': mcc, 'roc': rocAuc_score, 'aupr': aupr_score, 'accuracy': accuracy, 'auc': auc_score}

    # Normalise the request to a list of names so both input forms share one code path.
    if isinstance(metrics_, str):
        requested = [metrics_]
    elif isinstance(metrics_, (list, tuple)):
        requested = list(metrics_)
    else:
        raise TypeError("metrics must be a string or list of strings")

    if not all(isinstance(name, str) for name in requested):
        raise TypeError("metrics must be a string or list of strings")

    metricNames = [name.lower() for name in requested]

    # Reject unknown names up front so no metric is computed for a request that will fail.
    for metricName in metricNames:
        if metricName not in metricsDict:
            raise ValueError(f"{metricName} is not supported! f1, mcc, roc, aupr, accuracy, and auc are supported evaluation metrics")

    return {metricName: metricsDict[metricName](y_true, y_probs) for metricName in metricNames}


In [44]:
# One-hot encode the test labels into two columns (column k is 1 where the label equals k).
y_test = np.asarray(y_test)
# Encode only while the labels are still 1-D, so re-running this cell does not
# broadcast an already-encoded (n, 2) array into a 3-D one.
if y_test.ndim == 1:
    y_test = (y_test[:, None] == np.arange(2)).astype(int)

# The metric helpers take integer class labels, recovered here from the one-hot rows.
y_test_labels = y_test.argmax(-1)

# The F1 and MCC results use *_result names: binding them to `f1_score` / `mcc`
# would overwrite the metric functions that `evaluate` looks up on every call,
# making any later `evaluate(..., 'f1')` or `evaluate(..., 'mcc')` fail.
f1_result = evaluate(y_test_labels, y_preds, 'f1')
mcc_result = evaluate(y_test_labels, y_preds, 'mcc')
roc = evaluate(y_test_labels, y_preds, 'roc')
aupr = evaluate(y_test_labels, y_preds, 'aupr')

print(f1_result)
print(mcc_result)
print(roc)
print(aupr)

{'f1': 0.18061674008810572}
{'mcc': 0.025829157350258414}
{'roc': 0.5554008828625453}
{'aupr': 0.5505524605183632}


In [45]:
# Accuracy of the arg-max predictions. y_test is one-hot at this point, so
# argmax(-1) recovers the integer class labels that the metric helpers take.
acc = evaluate(y_test.argmax(-1),y_preds,'accuracy')
acc

{'accuracy': 0.5013404825737265}

In [47]:
# ROC AUC via the 'auc' metric, which calls metrics.roc_auc_score on column 1 of y_preds.
# The recorded value is identical to the one reported for the 'roc' metric.
auc = evaluate(y_test.argmax(-1),y_preds,'auc')
auc

{'auc': 0.5554008828625453}