In [1]:
pip install fair-esm pandas scikit-learn torch lazypredict


Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: fair-esm, lazypredict
Successfully installed fair-esm-2.0.0 lazypredict-0.2.12
Note: you may need to restart the kernel to use updated packages.


# # Protein

In [2]:
import pandas as pd
import torch
import esm
import numpy as np
import gc
from torch.cuda.amp import autocast

# Load the CSV file (the 'sequence' and 'label' columns are read below)
file_path = '/kaggle/input/combineddb/combined_protein.csv'
data = pd.read_csv(file_path)

# Load the pretrained ESM model together with its alphabet (tokenizer)
model, alphabet = esm.pretrained.esm1b_t33_650M_UR50S()
batch_converter = alphabet.get_batch_converter()

# Move model to GPU when one is available, otherwise stay on CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# The model is only used for feature extraction, so switch it to inference
# mode (as the fair-esm quick-start does) to keep the embeddings deterministic.
model.eval()

# Prepare the sequence data
sequences = data['sequence'].tolist()
labels = data['label'].tolist()

batch_size = 10  # Reduce the batch size to fit within the available GPU memory
all_embeddings = []  # one (batch, hidden) NumPy array is appended per processed batch

def mean_pooling(token_embeddings, attention_mask):
    """Average token embeddings over the positions selected by a mask.

    Args:
        token_embeddings: tensor whose dim 1 is the token axis and whose last
            dim is the embedding axis (the mask is expanded to this shape).
        attention_mask: tensor one dim smaller than ``token_embeddings``;
            non-zero / True entries mark the tokens that count.

    Returns:
        Tensor with the token axis reduced away: the sum of the selected
        embeddings divided by the number of selected tokens. The count is
        clamped to 1e-9, so a fully masked row yields zeros instead of NaN.
    """
    # Broadcast the mask across the embedding axis and cast it to float weights
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_total = (token_embeddings * weights).sum(dim=1)
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

# Ceil division: the progress denominator stays exact even when
# len(sequences) is an exact multiple of batch_size.
num_batches = (len(sequences) + batch_size - 1) // batch_size

for i in range(0, len(sequences), batch_size):
    print(f"Processing batch {i//batch_size + 1}/{num_batches}")
    batch_sequences = sequences[i:i+batch_size]

    # The converter takes (name, sequence) pairs; the names are placeholders
    # and only the token tensor it returns is used.
    data_tuples = [(f"sequence_{j}", seq) for j, seq in enumerate(batch_sequences)]
    _, _, batch_tokens = batch_converter(data_tuples)
    print("Batch tokens:", batch_tokens.shape)

    # Move batch tokens to GPU
    batch_tokens = batch_tokens.to(device)

    # No gradients are needed for feature extraction; autocast enables mixed precision
    with torch.no_grad():
        with autocast():
            results = model(batch_tokens, repr_layers=[33])
    token_embeddings = results["representations"][33]

    # Mean pooling to get fixed-size feature vectors.
    # The mask keeps every non-padding position, so any special tokens the
    # converter adds are part of the average — NOTE(review): confirm intended.
    attention_mask = batch_tokens.ne(alphabet.padding_idx)
    sequence_embeddings = mean_pooling(token_embeddings, attention_mask)

    # Move sequence embeddings to CPU before converting to numpy
    all_embeddings.append(sequence_embeddings.cpu().numpy())

    # Clear the cache
    torch.cuda.empty_cache()
    gc.collect()  # Invoke garbage collector to free up memory

# Concatenate all the embeddings into one (n_sequences, hidden) array
all_embeddings = np.concatenate(all_embeddings, axis=0)

# Convert the embeddings to a DataFrame, one row per sequence, labels in the same order
embedding_df = pd.DataFrame(all_embeddings)
embedding_df['label'] = labels

# Save the DataFrame to a CSV file
embedding_df.to_csv('/kaggle/working/embeddings_protien.csv', index=False)


Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm1b_t33_650M_UR50S.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm1b_t33_650M_UR50S-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm1b_t33_650M_UR50S-contact-regression.pt


Processing batch 1/906
Batch tokens: torch.Size([10, 845])
Processing batch 2/906
Batch tokens: torch.Size([10, 563])
Processing batch 3/906
Batch tokens: torch.Size([10, 1004])
Processing batch 4/906
Batch tokens: torch.Size([10, 367])
Processing batch 5/906
Batch tokens: torch.Size([10, 918])
Processing batch 6/906
Batch tokens: torch.Size([10, 762])
Processing batch 7/906
Batch tokens: torch.Size([10, 513])
Processing batch 8/906
Batch tokens: torch.Size([10, 455])
Processing batch 9/906
Batch tokens: torch.Size([10, 741])
Processing batch 10/906
Batch tokens: torch.Size([10, 691])
Processing batch 11/906
Batch tokens: torch.Size([10, 711])
Processing batch 12/906
Batch tokens: torch.Size([10, 657])
Processing batch 13/906
Batch tokens: torch.Size([10, 798])
Processing batch 14/906
Batch tokens: torch.Size([10, 399])
Processing batch 15/906
Batch tokens: torch.Size([10, 790])
Processing batch 16/906
Batch tokens: torch.Size([10, 815])
Processing batch 17/906
Batch tokens: torch.Size

In [3]:
from sklearn.model_selection import train_test_split

# Reload the pooled embeddings written by the embedding cell
embedding_df = pd.read_csv('/kaggle/working/embeddings_protien.csv')

# Target is the 'label' column; every other column is a feature
y = embedding_df['label']
X = embedding_df.drop(columns=['label'])

# 60% goes to training; the remaining 40% is then halved, giving
# 20% validation and 20% test. Both splits use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# Persist every split (without the index) so later cells can reload them
for split_frame, split_path in (
    (X_train, '/kaggle/working/X_train_pro.csv'),
    (X_test, '/kaggle/working/X_test_pro.csv'),
    (y_train, '/kaggle/working/y_train_pro.csv'),
    (y_test, '/kaggle/working/y_test_pro.csv'),
    (X_val, '/kaggle/working/X_val_pro.csv'),
    (y_val, '/kaggle/working/y_val_pro.csv'),
):
    split_frame.to_csv(split_path, index=False)



In [4]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Reload the persisted feature splits
X_train, X_val, X_test = (
    pd.read_csv(path)
    for path in (
        '/kaggle/working/X_train_pro.csv',
        '/kaggle/working/X_val_pro.csv',
        '/kaggle/working/X_test_pro.csv',
    )
)

# Reload the targets; each single-column file is flattened to a 1-D array
y_train, y_val, y_test = (
    pd.read_csv(path).to_numpy().ravel()
    for path in (
        '/kaggle/working/y_train_pro.csv',
        '/kaggle/working/y_val_pro.csv',
        '/kaggle/working/y_test_pro.csv',
    )
)

# Quiet LazyClassifier run with no custom metric
clf = LazyClassifier(custom_metric=None, ignore_warnings=True, verbose=0)

# Fit every candidate on the training split and score it on the validation split
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Show the score table
print(models)


 97%|█████████▋| 28/29 [03:46<00:04,  4.84s/it]

[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.067080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574


100%|██████████| 29/29 [03:59<00:00,  8.27s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.97               0.97     0.97      0.97   
LogisticRegression                 0.97               0.97     0.97      0.97   
SVC                                0.97               0.97     0.97      0.97   
SGDClassifier                      0.97               0.96     0.96      0.97   
KNeighborsClassifier               0.96               0.96     0.96      0.96   
LGBMClassifier                     0.97               0.96     0.96      0.97   
RidgeClassifierCV                  0.96               0.96     0.96      0.96   
LinearDiscriminantAnalysis         0.96               0.96     0.96      0.96   
RidgeClassifier                    0.96               0.96     0.96      0.96   
ExtraTreesClassifier               0.97               0.96     0.96      0.97   
PassiveAggressiveClassifier 




In [5]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Define the model dictionary including the specified models.
# The randomized tree ensembles get a fixed random_state (same seed as the
# data split) so the reported accuracies and the selected "best" model are
# reproducible across re-runs.
model_dict = {
    'XGBClassifier': XGBClassifier(),
    'LGBMClassifier': LGBMClassifier(),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
}

# Fit and evaluate each model
best_model_name = None
best_accuracy = 0

for model_name, model in model_dict.items():
    model.fit(X_train, y_train)
    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)
    val_accuracy = accuracy_score(y_val, val_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"{model_name} validation accuracy: {val_accuracy:.4f}, test accuracy: {test_accuracy:.4f}")

    # Selection uses validation accuracy only; test accuracy is just reported
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")

XGBClassifier validation accuracy: 0.9691, test accuracy: 0.9763
[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.070243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574
LGBMClassifier validation accuracy: 0.9647, test accuracy: 0.9768
ExtraTreesClassifier validation accuracy: 0.9663, test accuracy: 0.9774
SVC validation accuracy: 0.9431, test accuracy: 0.9432
RandomForestClassifier validation accuracy: 0.9641, test accuracy: 0.9752

Best Model Name: XGBClassifier


In [6]:
from lightgbm import LGBMClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report
from sklearn.svm import SVC
from xgboost import XGBClassifier

# Define the model dictionary including AdaBoostClassifier.
# Estimators with internal randomness get a fixed random_state (same seed as
# the data split) so the metrics below are reproducible across re-runs.
model_dict = {
    'LGBMClassifier': LGBMClassifier(),
    'XGBClassifier': XGBClassifier(),
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(),
    # SVC needs probability=True for AUROC; the probability calibration is
    # randomized, hence the seed.
    'SVC': SVC(probability=True, random_state=42),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
    'ExtraTreesClassifier': ExtraTreesClassifier(random_state=42),
}

# Function to calculate additional metrics
def calculate_metrics(y_true, y_pred, y_prob=None):
    """Compute binary-classification metrics from hard predictions and optional scores.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    y_prob : array-like or None, optional
        Per-class scores; column 1 is used as the positive-class score for
        AUROC and AUPRC. When None, both are returned as None.

    Returns
    -------
    dict
        Keys 'accuracy', 'f1_score', 'mcc', 'sensitivity', 'specificity',
        'fdr', 'auroc' and 'auprc'. A ratio whose denominator is zero is NaN.
    """
    def _ratio(numerator, denominator):
        # A zero denominator (e.g. no positive predictions for FDR) yields NaN
        # explicitly instead of relying on a NumPy divide warning.
        return numerator / denominator if denominator else float('nan')

    # Unpacking four cells requires a 2x2 matrix, i.e. both classes must
    # appear somewhere in y_true / y_pred.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    sensitivity = _ratio(tp, tp + fn)  # Sensitivity (SN)
    specificity = _ratio(tn, tn + fp)  # Specificity (SP)
    fdr = _ratio(fp, fp + tp)  # False Discovery Rate (FDR)

    if y_prob is not None:
        positive_scores = y_prob[:, 1]
        auroc = roc_auc_score(y_true, positive_scores)
        # AUPRC is the area under the precision-recall curve (recall on the x-axis)
        precision, recall, _ = precision_recall_curve(y_true, positive_scores)
        auprc = auc(recall, precision)
    else:
        auroc = None
        auprc = None

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'mcc': mcc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'fdr': fdr,
        'auroc': auroc,
        'auprc': auprc
    }

def _format_metric(value):
    """Render a metric with 4 decimals, or 'n/a' when it was not computed (None)."""
    return "n/a" if value is None else f"{value:.4f}"


# Fit and evaluate each model
best_model_name = None
best_accuracy = 0
results = {}  # model name -> {'validation_metrics': ..., 'test_metrics': ...}

for model_name, model in model_dict.items():
    # Fit the model
    model.fit(X_train, y_train)

    # Make predictions
    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)

    # Probability predictions for ROC and AUPRC
    if hasattr(model, "predict_proba"):
        val_prob = model.predict_proba(X_val)
        test_prob = model.predict_proba(X_test)
    else:
        val_prob = None
        test_prob = None

    # Calculate metrics for the validation and test sets
    val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
    test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

    # Store results
    results[model_name] = {
        'validation_metrics': val_metrics,
        'test_metrics': test_metrics
    }

    # Print metrics. auroc/auprc are None for models without predict_proba,
    # so they are formatted through _format_metric rather than ':.4f' directly.
    print(f"\n{model_name} Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {_format_metric(value)}")

    print(f"\n{model_name} Test Metrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {_format_metric(value)}")

    # Track the best model by validation accuracy
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")

[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 326400
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 1280
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574

LGBMClassifier Validation Metrics:
accuracy: 0.9647
f1_score: 0.9508
mcc: 0.9233
sensitivity: 0.9552
specificity: 0.9699
fdr: 0.0535
auroc: 0.9948
auprc: 0.9898

LGBMClassifier Test Metrics:
accuracy: 0.9768
f1_score: 0.9688
mcc: 0.9503
sensitivity: 0.9716
specificity: 0.9799
fdr: 0.0341
auroc: 0.9973
auprc: 0.9955

XGBClassifier Validation Metrics:
accuracy: 0.9691
f1_score: 0.9573
mcc: 0.9333
sensitivity: 0.9691
specificity: 0.9690
fdr: 0.0542
auroc: 0.9942
auprc: 0.9884

XGBClassifier Test Metric

In [7]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Use XGBoost's GPU histogram method only when CUDA is actually available,
# mirroring the CPU fallback used for the ESM model; otherwise use the CPU
# histogram method.
# NOTE(review): newer XGBoost releases deprecate 'gpu_hist'/'gpu_id' in favour
# of device='cuda' — confirm against the installed version.
xgb_params = (
    {'tree_method': 'gpu_hist', 'gpu_id': 0}
    if torch.cuda.is_available()
    else {'tree_method': 'hist'}
)

# Define base models for stacking
base_models = [
    ('xgb', XGBClassifier(**xgb_params)),  # XGBoost model (GPU when available)
    # Support Vector Classifier with probability enabled; seeded because the
    # probability calibration is randomized
    ('svc', SVC(probability=True, random_state=42)),
    ('log_reg', LogisticRegression())  # Logistic Regression
]

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # Final meta-classifier
    cv=5  # Use 5-fold cross-validation
)

# Fit the stacking model
stacking_model.fit(X_train, y_train)


In [8]:
# Make predictions on validation and test sets
val_predictions = stacking_model.predict(X_val)
test_predictions = stacking_model.predict(X_test)

# Probability predictions for ROC and AUPRC
if hasattr(stacking_model, "predict_proba"):
    val_prob = stacking_model.predict_proba(X_val)
    test_prob = stacking_model.predict_proba(X_test)
else:
    val_prob = None
    test_prob = None

# Calculate metrics for the validation and test sets
val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

# Print stacking model metrics. auroc/auprc are None when no probabilities
# were available, so None is printed as 'n/a' instead of failing on ':.4f'.
print("\nStacking Model Validation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: " + ("n/a" if value is None else f"{value:.4f}"))

print("\nStacking Model Test Metrics:")
for metric, value in test_metrics.items():
    print(f"{metric}: " + ("n/a" if value is None else f"{value:.4f}"))


Stacking Model Validation Metrics:
accuracy: 0.9691
f1_score: 0.9573
mcc: 0.9332
sensitivity: 0.9676
specificity: 0.9699
fdr: 0.0529
auroc: 0.9952
auprc: 0.9909

Stacking Model Test Metrics:
accuracy: 0.9779
f1_score: 0.9703
mcc: 0.9527
sensitivity: 0.9746
specificity: 0.9799
fdr: 0.0340
auroc: 0.9950
auprc: 0.9929


In [9]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validated accuracy of the stacking model, estimated on the training split only
n_folds = 5  # You can adjust this number
cv_scores = cross_val_score(
    stacking_model,
    X_train,
    y_train,
    scoring='accuracy',
    cv=n_folds,
)

# Summarise the per-fold accuracies as mean and standard deviation
average_cv_score, std_cv_score = cv_scores.mean(), cv_scores.std()

# Report the per-fold scores followed by the summary
print("\nCross-Validation Scores for Stacking Model:")
print(f"Scores: {cv_scores}")
print(f"Average Score: {average_cv_score:.4f} ± {std_cv_score:.4f}")