<a href="https://colab.research.google.com/github/zainabbio/Data_Science/blob/main/HLA_DR_ML_DDE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Dipeptide Deviation from Expected Mean**

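Dipeptide Deviation from Expected Mean (DDE) is an effective protein feature representation method proposed by Saravanan et al. [23] for linear B-cell epitope prediction. DDE considers the consecutive pairs of amino acids (AA) in a peptide, i.e. local sequence information, and generates a 400-dimensional feature vector (one feature per possible dipeptide). These dipeptides have associated properties that influence the protein's function and structure. The DDE descriptor relies on three parameters: dipeptide composition (DPC), theoretical mean ($T_m$), and theoretical variance ($T_v$). Each feature is the standardized deviation of the observed composition from its expectation, $\mathrm{DDE}(i) = \dfrac{\mathrm{DPC}(i) - T_m(i)}{\sqrt{T_v(i)}}$.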



Step 1: Import Libraries

In [None]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import scipy.io as sio
import numpy as np
from sklearn.preprocessing import scale

# Install catboost if not already installed
!pip install catboost

from catboost import CatBoostClassifier
import lightgbm as lgb

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
import math
import os
import pickle
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier as xgbc, XGBClassifier
from sklearn.model_selection import train_test_split, KFold, GridSearchCV

Load and preprocess the data

In [None]:
# Load the DDE feature tables for the positive and negative classes.
train_esm1b_P = pd.read_csv('/content/HLA_main_pos_DDE.csv')
train_esm1b_N = pd.read_csv('/content/HLA_main_neg_DDE.csv')

# Stack the positive rows on top of the negative rows.
# np.vstack replaces np.row_stack, which is a deprecated alias of it.
train_esm1b = np.vstack((train_esm1b_P, train_esm1b_N))
m1, n1 = np.shape(train_esm1b)

# Build the labels from the actual row counts (1 = positive, 0 = negative)
# instead of hard-coded sizes, so they cannot drift out of sync with the CSVs.
label1 = np.ones((len(train_esm1b_P), 1))
label2 = np.zeros((len(train_esm1b_N), 1))
label = np.append(label1, label2)  # np.append flattens to a 1-D vector
assert label.shape[0] == m1, "label count must match the number of feature rows"

Standardize the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the full feature matrix.
# NOTE(review): `scaler` is fitted but never applied in the visible cells —
# confirm whether it is still needed.
scaler = MinMaxScaler().fit(train_esm1b)

# Standardize the stacked feature matrix with sklearn's `scale`.
# NOTE(review): this runs on the full dataset before the train/test split, so
# test rows contribute to the scaling statistics — consider fitting the scaling
# on the training split only.
shu = scale(train_esm1b)

# Feature matrix and target vector used by the following cells.
X, y = shu, label

Split the dataset into training and test sets

In [None]:
# Hold out 20% of the samples as a test set.
# random_state makes the split reproducible across kernel restarts (7 matches
# the seed used by the estimators in this notebook); stratify=y keeps the
# positive/negative ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=7, stratify=y
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(32448, 400)
(8112, 400)
(32448,)
(8112,)


In [None]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix

In [None]:
# Soft-voting ensemble: the final prediction averages the class probabilities
# predicted by six heterogeneous base learners.
#
# A CatBoostClassifier used to be fitted into `model` just before this
# statement; it was removed because `model` was immediately overwritten by the
# ensemble, so that fit only cost training time and flooded the cell output.
model = VotingClassifier(
    estimators=[
        # probability=True makes SVC expose predict_proba, which soft voting
        # needs. It is seeded like the other estimators for reproducibility.
        ('svc', SVC(probability=True, kernel='rbf', random_state=7)),
        ('rf', RandomForestClassifier(n_estimators=300, max_depth=7, random_state=7)),
        ('xgb', xgbc(n_estimators=800, max_depth=5, random_state=7)),
        ('lr', LogisticRegression(solver='liblinear', random_state=7)),
        ('knn', KNeighborsClassifier(n_neighbors=6)),
        # Seeded so that weight initialisation is reproducible.
        ('mlp', MLPClassifier(hidden_layer_sizes=[64, 32], max_iter=1000, random_state=7)),
    ],
    voting='soft',
)
model.fit(X_train, y_train)

Learning rate set to 0.045521
0:	learn: 0.6914182	total: 102ms	remaining: 1m 42s
1:	learn: 0.6897138	total: 146ms	remaining: 1m 12s
2:	learn: 0.6882710	total: 189ms	remaining: 1m 2s
3:	learn: 0.6869738	total: 237ms	remaining: 58.9s
4:	learn: 0.6855824	total: 280ms	remaining: 55.7s
5:	learn: 0.6841393	total: 326ms	remaining: 54.1s
6:	learn: 0.6828115	total: 369ms	remaining: 52.3s
7:	learn: 0.6814136	total: 413ms	remaining: 51.2s
8:	learn: 0.6802801	total: 459ms	remaining: 50.5s
9:	learn: 0.6790524	total: 502ms	remaining: 49.7s
10:	learn: 0.6778831	total: 558ms	remaining: 50.2s
11:	learn: 0.6768429	total: 602ms	remaining: 49.6s
12:	learn: 0.6755491	total: 649ms	remaining: 49.2s
13:	learn: 0.6743002	total: 706ms	remaining: 49.7s
14:	learn: 0.6733179	total: 749ms	remaining: 49.2s
15:	learn: 0.6723550	total: 793ms	remaining: 48.8s
16:	learn: 0.6712158	total: 839ms	remaining: 48.5s
17:	learn: 0.6702406	total: 883ms	remaining: 48.2s
18:	learn: 0.6692478	total: 934ms	remaining: 48.2s
19:	learn

Save the model

In [None]:
import joblib

# Serialize the fitted model to disk; joblib.dump returns the list of files
# it wrote, which the notebook displays as the cell output.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

5-fold cross-validation

In [None]:
def confusion_matrix_scorer(clf, X_train, y_train):
    """Score a fitted binary classifier by its confusion-matrix counts.

    Intended as the ``scoring`` callable of ``cross_validate``. Despite their
    names, ``X_train`` / ``y_train`` are simply whatever data the caller passes
    in for scoring.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_train : feature matrix to predict on.
    y_train : true labels for ``X_train`` (0 = negative, 1 = positive).

    Returns
    -------
    dict
        Counts under the keys ``'tn'``, ``'fp'``, ``'fn'`` and ``'tp'``.
    """
    y_pred = clf.predict(X_train)
    # labels=[0, 1] pins the matrix to 2x2 even when only one class occurs in
    # the data being scored; without it the unpacking below would fail.
    tn, fp, fn, tp = confusion_matrix(y_train, y_pred, labels=[0, 1]).ravel()
    return {'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}

# Run 5-fold cross-validation on the training split, collecting the
# confusion-matrix counts produced by the custom scorer for every fold.
cv_results = cross_validate(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring=confusion_matrix_scorer,
    cv=5,
)

In [None]:
# Average each confusion-matrix count over the cross-validation folds:
# true positives, false negatives, false positives and true negatives.
TP, FN, FP, TN = (
    cv_results[key].mean()
    for key in ('test_tp', 'test_fn', 'test_fp', 'test_tn')
)

Cross-validation metrics (training split)

In [None]:
import math

# Marginal totals of the fold-averaged confusion matrix.
predicted_pos = TP + FP
actual_pos = TP + FN
actual_neg = TN + FP
predicted_neg = TN + FN

# Summary metrics derived from the averaged counts.
acurracy = (TP+TN) / (TP+TN+FP+FN)
F1_score = 2*TP / ((2*TP) + (FP + FN))
precision = TP / predicted_pos
specificity = TN / actual_neg
sensitivity_recall = TP / actual_pos
# Matthews correlation coefficient.
MCC = ((TP*TN) - (FP*FN)) / math.sqrt((predicted_pos*actual_pos)*(actual_neg*predicted_neg))

# Report every metric on its own line.
for metric_name, metric_value in (
    ("Accuracy: ", acurracy),
    ("F1_score: ", F1_score),
    ("Precision: ", precision),
    ("Specificity: ", specificity),
    ("Sensitivity/Recall: ", sensitivity_recall),
    ("MCC: ", MCC),
):
    print(metric_name, metric_value)

Held-out test set metrics

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report
import math


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Predict on the held-out test split with the fitted model.
pred_test = model.predict(X_test)

# labels=[0, 1] pins the matrix to 2x2 even if only one class shows up.
conf = confusion_matrix(y_test, pred_test, labels=[0, 1])
# Convert the counts to Python ints: the MCC denominator multiplies four
# marginal totals, which can overflow fixed-width numpy integers on large
# test sets. ravel() on a 2x2 matrix yields tn, fp, fn, tp in that order.
TN, FP, FN, TP = (int(count) for count in conf.ravel())

# Each metric is NaN (rather than a warning or an exception) when undefined.
acurracy = _safe_ratio(TP+TN, TP+TN+FP+FN)
F1_score = _safe_ratio(2*TP, (2*TP) + (FP + FN))
precision = _safe_ratio(TP, TP + FP)
specificity = _safe_ratio(TN, FP + TN)
sensitivity_recall = _safe_ratio(TP, TP + FN)

# Matthews correlation coefficient.
MCC = _safe_ratio(
    (TP*TN) - (FP*FN),
    math.sqrt(((TP+FP)*(TP+FN))*((TN+FP)*(TN+FN))),
)

print("Accuracy: ", acurracy)
print("F1_score: ", F1_score)
print("Precision: ", precision)
print("Specificity: ", specificity)
print("Sensitivity/Recall: ", sensitivity_recall)
print("MCC: ", MCC)