In [2]:
pip install fair-esm pandas scikit-learn torch lazypredict


Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: fair-esm, lazypredict
Successfully installed fair-esm-2.0.0 lazypredict-0.2.12
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import torch
import esm
import numpy as np

**ESM Feature Extraction**

In [2]:


# Extract one fixed-size ESM-1b embedding per sequence by mean-pooling the
# layer-33 token representations, then save embeddings + labels to CSV.


def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Parameters
    ----------
    token_embeddings : torch.Tensor
        Per-token representations (presumably (batch, tokens, dim) -- TODO confirm).
    attention_mask : torch.Tensor
        Boolean/0-1 mask with one entry per token; positions equal to 0/False
        contribute nothing to the average.

    Returns
    -------
    torch.Tensor
        Masked mean of ``token_embeddings`` along dimension 1.
    """
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp so a fully masked row divides by a tiny number instead of zero
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


# Load the CSV file
file_path = '/kaggle/input/combined-dataset/combined_peptides.csv'
data = pd.read_csv(file_path)

# Load the pretrained ESM model
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()  # Adjust model as needed
batch_converter = alphabet.get_batch_converter()

# Move model to GPU when one is available, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Switch to inference mode so any dropout layers are disabled and the
# extracted embeddings are deterministic
model.eval()

# Prepare the sequence data
sequences = data['sequence'].tolist()
labels = data['label'].tolist()

batch_size = 100  # Adjust the batch size based on your memory capacity
all_embeddings = []

for i in range(0, len(sequences), batch_size):
    batch_sequences = sequences[i:i+batch_size]

    data_tuples = [(f"sequence_{j}", seq) for j, seq in enumerate(batch_sequences)]
    _, _, batch_tokens = batch_converter(data_tuples)

    # Move batch tokens to the same device as the model
    batch_tokens = batch_tokens.to(device)

    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[33])

    # Bring the layer-33 representations back to CPU for pooling
    token_embeddings = results["representations"][33].cpu()

    # Every non-padding token is averaged.
    # NOTE(review): this includes any special tokens the batch converter adds
    # (e.g. begin/end-of-sequence) -- confirm that is intended.
    attention_mask = batch_tokens.ne(alphabet.padding_idx).cpu()
    sequence_embeddings = mean_pooling(token_embeddings, attention_mask)

    all_embeddings.append(sequence_embeddings.numpy())

# Concatenate all the embeddings
all_embeddings = np.concatenate(all_embeddings, axis=0)

# Convert the embeddings to a DataFrame
embedding_df = pd.DataFrame(all_embeddings)
embedding_df['label'] = labels

# Save the DataFrame to a CSV file
embedding_df.to_csv('/kaggle/working/embeddings.csv', index=False)





Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm1b_t33_650M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S-contact-regression.pt


In [4]:
from sklearn.model_selection import train_test_split

# Read the precomputed embedding table (feature columns plus a 'label' column)
embedding_df = pd.read_csv('/kaggle/input/embeddings/embeddings_esm.csv')

# Separate the target from the features
y = embedding_df['label']
X = embedding_df.drop(columns=['label'])

# First cut: 60% training, 40% held out.
# Second cut: halve the held-out part into validation and test (20% each).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Persist every split so later cells can reload them from disk
for split_frame, split_path in (
    (X_train, '/kaggle/working/X_train.csv'),
    (X_test, '/kaggle/working/X_test.csv'),
    (y_train, '/kaggle/working/y_train.csv'),
    (y_test, '/kaggle/working/y_test.csv'),
    (X_val, '/kaggle/working/X_val.csv'),
    (y_val, '/kaggle/working/y_val.csv'),
):
    split_frame.to_csv(split_path, index=False)



In [5]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

def _load_labels(csv_path):
    """Read a label CSV and flatten its values into a 1-D array."""
    return pd.read_csv(csv_path).values.ravel()


# Reload the feature splits written by the previous cell
X_train = pd.read_csv('/kaggle/working/X_train.csv')
X_test = pd.read_csv('/kaggle/working/X_test.csv')
X_val = pd.read_csv('/kaggle/working/X_val.csv')

# Reload the matching label vectors
y_train = _load_labels('/kaggle/working/y_train.csv')
y_test = _load_labels('/kaggle/working/y_test.csv')
y_val = _load_labels('/kaggle/working/y_val.csv')

# Screen many off-the-shelf classifiers in one go
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit on the training split and score on the validation split
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the leaderboard
print(models)


 97%|█████████▋| 28/29 [02:31<00:04,  4.43s/it]

[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.043809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745


100%|██████████| 29/29 [02:42<00:00,  5.61s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.93               0.93     0.93      0.93   
LGBMClassifier                     0.93               0.93     0.93      0.93   
ExtraTreesClassifier               0.93               0.93     0.93      0.93   
SVC                                0.93               0.93     0.93      0.93   
RandomForestClassifier             0.93               0.93     0.93      0.93   
RidgeClassifierCV                  0.92               0.92     0.92      0.92   
LinearDiscriminantAnalysis         0.91               0.91     0.91      0.91   
RidgeClassifier                    0.91               0.91     0.91      0.91   
CalibratedClassifierCV             0.91               0.91     0.91      0.91   
PassiveAggressiveClassifier        0.91               0.91     0.91      0.91   
BaggingClassifier           




In [6]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report

# Candidate models, keyed by display name.
# Every estimator that accepts a seed gets random_state=42 (the same seed used
# for train_test_split) so repeated runs produce the same metrics.
model_dict = {
    'LGBMClassifier': LGBMClassifier(random_state=42),
    'XGBClassifier': XGBClassifier(random_state=42),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    # SVC needs probability=True for AUROC; the probability calibration is
    # randomized, hence the seed
    'SVC': SVC(probability=True, random_state=42),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
}

# Function to calculate additional metrics
def calculate_metrics(y_true, y_pred, y_prob=None):
    """Compute binary-classification metrics from hard and (optionally) soft predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    y_prob : array-like or None, optional
        Two-dimensional class probabilities; column 1 is used as the score of
        the positive class. When None, AUROC and AUPRC are not computed.

    Returns
    -------
    dict
        Keys: 'accuracy', 'f1_score', 'mcc', 'sensitivity', 'specificity',
        'fdr', 'auroc', 'auprc'. 'auroc' and 'auprc' are None when ``y_prob``
        is None. A ratio whose denominator is zero is reported as NaN.

    Raises
    ------
    ValueError
        If the confusion matrix of ``y_true`` vs ``y_pred`` is not 2x2, i.e.
        the inputs do not contain exactly two distinct classes between them.
    """
    def _ratio(numerator, denominator):
        # A 0/0 ratio (e.g. FDR when nothing is predicted positive) becomes
        # NaN explicitly instead of triggering a numpy divide warning.
        return numerator / denominator if denominator else float('nan')

    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # Without this check the 4-way unpacking below fails with an opaque
        # "not enough values to unpack" message.
        raise ValueError(
            f"calculate_metrics expects exactly two classes, got a confusion matrix of shape {cm.shape}"
        )
    tn, fp, fn, tp = cm.ravel()

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    sensitivity = _ratio(tp, tp + fn)  # Sensitivity (SN)
    specificity = _ratio(tn, tn + fp)  # Specificity (SP)
    fdr = _ratio(fp, fp + tp)  # False Discovery Rate (FDR)

    if y_prob is not None:
        positive_scores = y_prob[:, 1]
        auroc = roc_auc_score(y_true, positive_scores)
        precision, recall, _ = precision_recall_curve(y_true, positive_scores)
        # Area under the precision-recall curve
        auprc = auc(recall, precision)
    else:
        auroc = None
        auprc = None

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'mcc': mcc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'fdr': fdr,
        'auroc': auroc,
        'auprc': auprc
    }

def _format_metric(value):
    """Render a metric with 4 decimals, or 'N/A' when it was not computed (None)."""
    return "N/A" if value is None else f"{value:.4f}"


# Fit every candidate on the training split, score it on validation and test,
# and remember which one has the highest validation accuracy.
best_model_name = None
best_accuracy = 0
results = {}

# The loop variable is named `estimator` so it does not overwrite the ESM
# network that an earlier cell stored in `model`.
for model_name, estimator in model_dict.items():
    # Fit the model
    estimator.fit(X_train, y_train)

    # Make predictions
    val_predictions = estimator.predict(X_val)
    test_predictions = estimator.predict(X_test)

    # Probability predictions for ROC and AUPRC
    if hasattr(estimator, "predict_proba"):
        val_prob = estimator.predict_proba(X_val)
        test_prob = estimator.predict_proba(X_test)
    else:
        val_prob = None
        test_prob = None

    # Calculate metrics for the validation and test sets
    val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
    test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

    # Store results
    results[model_name] = {
        'validation_metrics': val_metrics,
        'test_metrics': test_metrics
    }

    # Print metrics; auroc/auprc are None for models without predict_proba,
    # so format through _format_metric instead of a bare ':.4f'
    print(f"\n{model_name} Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {_format_metric(value)}")

    print(f"\n{model_name} Test Metrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {_format_metric(value)}")

    # Track the best model by validation accuracy
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")


[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.042624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745

LGBMClassifier Validation Metrics:
accuracy: 0.9297
f1_score: 0.9341
mcc: 0.8589
sensitivity: 0.9272
specificity: 0.9327
fdr: 0.0588
auroc: 0.9802
auprc: 0.9838

LGBMClassifier Test Metrics:
accuracy: 0.9289
f1_score: 0.9350
mcc: 0.8566
sensitivity: 0.9319
specificity: 0.9252
fdr: 0.0620
auroc: 0.9734
auprc: 0.9780

XGBClassifier Validation Metrics:
accuracy: 0.9306
f1_score: 0.9351
mcc: 0.8606
sensitivity: 0.9305
specificity: 0.9308
fdr: 0.0602
auroc: 0.9797
auprc: 0.9840

XGBClassifier Test Metrics:

In [7]:
from sklearn.ensemble import StackingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression

# Pick XGBoost's tree builder to match the available hardware, mirroring the
# CUDA/CPU fallback used for the ESM model: keep the original GPU settings when
# a CUDA device is present, otherwise use the CPU histogram method.
# NOTE(review): 'gpu_hist'/'gpu_id' are deprecated in newer XGBoost releases in
# favour of tree_method='hist' with device='cuda' -- confirm the installed version.
xgb_device_params = (
    {'tree_method': 'gpu_hist', 'gpu_id': 0}
    if torch.cuda.is_available()
    else {'tree_method': 'hist'}
)

# Define base models for stacking (seeded with 42, like the data split, so the
# ensemble is reproducible)
base_models = [
    ('lgbm', LGBMClassifier(random_state=42)),  # LightGBM model
    ('extra_trees', ExtraTreesClassifier(random_state=42)),  # Extra Trees Classifier
    ('xgb', XGBClassifier(random_state=42, **xgb_device_params))  # XGBoost model
]

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # You can choose any classifier here
    cv=5  # Use 5-fold cross-validation
)

# Fit the stacking model
stacking_model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.045064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745
[LightGBM] [Info] Number of positive: 1449, number of negative: 1248
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 2697, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537264 -> initscore=0.149331
[LightGBM] [Info] Start training from score 0.149331
[LightGBM] [In

In [8]:
# Make predictions on validation and test sets
val_predictions = stacking_model.predict(X_val)
test_predictions = stacking_model.predict(X_test)

# Probability predictions for ROC and AUPRC
if hasattr(stacking_model, "predict_proba"):
    val_prob = stacking_model.predict_proba(X_val)
    test_prob = stacking_model.predict_proba(X_test)
else:
    val_prob = None
    test_prob = None

# Calculate metrics for the validation and test sets
val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

# Print stacking model metrics. Metrics that were not computed (None, which
# happens for auroc/auprc when no probabilities are available) are shown as
# 'N/A' rather than crashing the ':.4f' format.
for heading, split_metrics in (
    ("\nStacking Model Validation Metrics:", val_metrics),
    ("\nStacking Model Test Metrics:", test_metrics),
):
    print(heading)
    for metric, value in split_metrics.items():
        if value is None:
            print(f"{metric}: N/A")
        else:
            print(f"{metric}: {value:.4f}")



Stacking Model Validation Metrics:
accuracy: 0.9270
f1_score: 0.9310
mcc: 0.8542
sensitivity: 0.9156
specificity: 0.9404
fdr: 0.0531
auroc: 0.9782
auprc: 0.9831

Stacking Model Test Metrics:
accuracy: 0.9289
f1_score: 0.9342
mcc: 0.8573
sensitivity: 0.9206
specificity: 0.9390
fdr: 0.0518
auroc: 0.9745
auprc: 0.9800


In [10]:
from sklearn.model_selection import cross_val_score

In [11]:
# Number of cross-validation folds applied to the training split
n_folds = 5  # You can adjust this number

# Refit and score the stacking ensemble once per fold, measuring accuracy
cv_scores = cross_val_score(
    stacking_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=n_folds,
)

# Summarise the per-fold accuracies
average_cv_score, std_cv_score = cv_scores.mean(), cv_scores.std()

# Report the fold scores followed by mean ± standard deviation
print("\nCross-Validation Scores for Stacking Model:")
print(f"Scores: {cv_scores}")
print(f"Average Score: {average_cv_score:.4f} ± {std_cv_score:.4f}")

[LightGBM] [Info] Number of positive: 1449, number of negative: 1248
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 2697, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537264 -> initscore=0.149331
[LightGBM] [Info] Start training from score 0.149331
[LightGBM] [Info] Number of positive: 1159, number of negative: 998
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 2157, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537320 -> initscore=0.149560
[LightGBM] [Info] Start training from score 0.149560
[LightGBM] [Inf