In [1]:
pip install transformers pandas scikit-learn torch lazypredict


Collecting lazypredict
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Installing collected packages: lazypredict
Successfully installed lazypredict-0.2.12
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

## peptides

In [3]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, BertModel

# Load the CSV file
# file_path = 'path/to/your/combined_peptides.csv'
data = pd.read_csv('/kaggle/input/new-combined-data/combined_peptides.csv')

# Load the pretrained ProtBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)
model = BertModel.from_pretrained('Rostlab/prot_bert')
model.to('cuda')

def add_spaces(s):
    return ' '.join(s)

# Prepare the sequence data
sequences = data['sequence'].tolist()
sequences = [add_spaces(s) for s in sequences]
labels = data['label'].tolist()

# Tokenize sequences
def tokenize_sequences(sequences, tokenizer, max_length=512):
    tokenized_sequences = []
    for seq in sequences:
        # Tokenize and encode the sequence
        inputs = tokenizer(seq, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
        tokenized_sequences.append(inputs)
    return tokenized_sequences

tokenized_sequences = tokenize_sequences(sequences, tokenizer)

# Extract features
# Extract features
def extract_features(tokenized_sequences, model):
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for tokenized_sequence in tokenized_sequences:
            # Move tokenized sequence to CUDA
            tokenized_sequence = {key: val.to('cuda') for key, val in tokenized_sequence.items()}
            # Get model outputs
            outputs = model(**tokenized_sequence)
            # Compute mean pooling on the token embeddings
            sequence_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            all_embeddings.append(sequence_embeddings)
    return all_embeddings

sequence_embeddings = extract_features(tokenized_sequences, model)

# Convert the embeddings to a DataFrame
embedding_df = pd.DataFrame(sequence_embeddings)
embedding_df['label'] = labels

embedding_df.to_csv('/kaggle/working/embeddings.csv', index=False)


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/81.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.68G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [4]:
from sklearn.model_selection import train_test_split

# Load the embeddings
embedding_df = pd.read_csv('/kaggle/working/embeddings.csv')

# Split the data into features and labels
X = embedding_df.drop(columns=['label'])
y = embedding_df['label']

# First split into training and temporary (test + validation) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split to divide temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save the split datasets
X_train.to_csv('/kaggle/working/X_train.csv', index=False)
X_test.to_csv('/kaggle/working/X_test.csv', index=False)
y_train.to_csv('/kaggle/working/y_train.csv', index=False)
y_test.to_csv('/kaggle/working/y_test.csv', index=False)
X_val.to_csv('/kaggle/working/X_val.csv', index=False)
y_val.to_csv('/kaggle/working/y_val.csv', index=False)


In [5]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Load the split datasets
X_train = pd.read_csv('/kaggle/working/X_train.csv')
X_test = pd.read_csv('/kaggle/working/X_test.csv')
X_val = pd.read_csv('/kaggle/working/X_val.csv')
y_train = pd.read_csv('/kaggle/working/y_train.csv').values.ravel()
y_test = pd.read_csv('/kaggle/working/y_test.csv').values.ravel()
y_val = pd.read_csv('/kaggle/working/y_val.csv').values.ravel()

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Train and test the models
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Display the results
print(models)

 97%|█████████▋| 28/29 [01:58<00:03,  3.83s/it]

[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.035018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745


100%|██████████| 29/29 [02:07<00:00,  4.41s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
QuadraticDiscriminantAnalysis      0.91               0.91     0.91      0.91   
SVC                                0.89               0.89     0.89      0.89   
LGBMClassifier                     0.89               0.89     0.89      0.89   
RidgeClassifierCV                  0.88               0.88     0.88      0.88   
XGBClassifier                      0.89               0.88     0.88      0.89   
RidgeClassifier                    0.88               0.88     0.88      0.88   
LinearDiscriminantAnalysis         0.88               0.88     0.88      0.88   
ExtraTreesClassifier               0.88               0.88     0.88      0.88   
RandomForestClassifier             0.88               0.87     0.87      0.88   
LogisticRegression                 0.87               0.87     0.87      0.87   
CalibratedClassifierCV      




In [6]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report

# Define the model dictionary including AdaBoostClassifier
model_dict = {
    'LGBMClassifier': LGBMClassifier(),
    'XGBClassifier': XGBClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(probability=True),  # SVC needs probability=True for AUROC
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
}

# Function to calculate additional metrics
def calculate_metrics(y_true, y_pred, y_prob=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    
    sensitivity = tp / (tp + fn)  # Sensitivity (SN)
    specificity = tn / (tn + fp)  # Specificity (SP)
    fdr = fp / (fp + tp)  # False Discovery Rate (FDR)
    
    if y_prob is not None:
        auroc = roc_auc_score(y_true, y_prob[:, 1])
        precision, recall, _ = precision_recall_curve(y_true, y_prob[:, 1])
        auprc = auc(recall, precision)
    else:
        auroc = None
        auprc = None

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'mcc': mcc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'fdr': fdr,
        'auroc': auroc,
        'auprc': auprc
    }

# Fit and evaluate each model
best_model_name = None
best_accuracy = 0
results = {}

for model_name, model in model_dict.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)
    
    # Probability predictions for ROC and AUPRC
    if hasattr(model, "predict_proba"):
        val_prob = model.predict_proba(X_val)
        test_prob = model.predict_proba(X_test)
    else:
        val_prob = None
        test_prob = None
    
    # Calculate metrics for validation set
    val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
    test_metrics = calculate_metrics(y_test, test_predictions, test_prob)
    
    # Store results
    results[model_name] = {
        'validation_metrics': val_metrics,
        'test_metrics': test_metrics
    }
    
    # Print metrics
    print(f"\n{model_name} Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    print(f"\n{model_name} Test Metrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Track the best model by accuracy
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")


[LightGBM] [Info] Number of positive: 1812, number of negative: 1560
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.037674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 3372, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.537367 -> initscore=0.149745
[LightGBM] [Info] Start training from score 0.149745

LGBMClassifier Validation Metrics:
accuracy: 0.8906
f1_score: 0.9012
mcc: 0.7805
sensitivity: 0.9288
specificity: 0.8462
fdr: 0.1248
auroc: 0.9643
auprc: 0.9667

LGBMClassifier Test Metrics:
accuracy: 0.8862
f1_score: 0.8978
mcc: 0.7700
sensitivity: 0.9109
specificity: 0.8563
fdr: 0.1150
auroc: 0.9573
auprc: 0.9618

XGBClassifier Validation Metrics:
accuracy: 0.8861
f1_score: 0.8978
mcc: 0.7720
sensitivity: 0.9305
specificity: 0.8346
fdr: 0.1327
auroc: 0.9633
auprc: 0.9694

XGBClassifier Test Metrics:

In [18]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [19]:
# Define base models for stacking
base_models = [
    ('qda', QuadraticDiscriminantAnalysis()),
    ('svc', SVC(probability=True)),
    ('lgbm', LGBMClassifier())
]

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # You can choose any classifier here
    cv=5  # Use 5-fold cross-validation
)

# Fit the stacking model
stacking_model.fit(X_train, y_train)



[LightGBM] [Info] Number of positive: 2733, number of negative: 4511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.145838 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 7244, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377278 -> initscore=-0.501119
[LightGBM] [Info] Start training from score -0.501119
[LightGBM] [Info] Number of positive: 2187, number of negative: 3608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.200358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5795, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377394 -> initscore=-0.500623
[LightGBM] [Info] Start training from score -0.500623
[LightGBM]

In [20]:
# Make predictions on validation and test sets
val_predictions = stacking_model.predict(X_val)
test_predictions = stacking_model.predict(X_test)

# Probability predictions for ROC and AUPRC
if hasattr(stacking_model, "predict_proba"):
    val_prob = stacking_model.predict_proba(X_val)
    test_prob = stacking_model.predict_proba(X_test)
else:
    val_prob = None
    test_prob = None

# Calculate metrics for validation set
val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

# Print stacking model metrics
print("\nStacking Model Validation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nStacking Model Test Metrics:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")



Stacking Model Validation Metrics:
accuracy: 0.9735
f1_score: 0.9624
mcc: 0.9419
sensitivity: 0.9624
specificity: 0.9796
fdr: 0.0376
auroc: 0.9959
auprc: 0.9924

Stacking Model Test Metrics:
accuracy: 0.9862
f1_score: 0.9813
mcc: 0.9704
sensitivity: 0.9806
specificity: 0.9895
fdr: 0.0179
auroc: 0.9988
auprc: 0.9976


In [21]:
# Perform cross-validation for stacking model using the training set
n_folds = 5  # You can adjust this number
cv_scores = cross_val_score(stacking_model, X_train, y_train, cv=n_folds, scoring='accuracy')

# Calculate average cross-validation score
average_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()

# Print cross-validation results
print("\nCross-Validation Scores for Stacking Model:")
print(f"Scores: {cv_scores}")
print(f"Average Score: {average_cv_score:.4f} ± {std_cv_score:.4f}")

[LightGBM] [Info] Number of positive: 2187, number of negative: 3608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.116157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5795, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377394 -> initscore=-0.500623
[LightGBM] [Info] Start training from score -0.500623
[LightGBM] [Info] Number of positive: 1749, number of negative: 2887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089489 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 4636, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377265 -> initscore=-0.501174
[LightGBM] [Info] Start training from score -0.501174
[LightGBM]

## proteins

In [7]:
import torch
from transformers import BertForMaskedLM, BertTokenizer, BertModel

# Load the CSV file
# file_path = 'path/to/your/combined_peptides.csv'
data = pd.read_csv('/kaggle/input/new-combined-data/combined_protein.csv')

# Load the pretrained ProtBERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('Rostlab/prot_bert', do_lower_case=False)
model = BertModel.from_pretrained('Rostlab/prot_bert')
model.to('cuda')

def add_spaces(s):
    return ' '.join(s)

# Prepare the sequence data
sequences = data['sequence'].tolist()
sequences = [add_spaces(s) for s in sequences]
labels = data['label'].tolist()

# Tokenize sequences
def tokenize_sequences(sequences, tokenizer, max_length=512):
    tokenized_sequences = []
    for seq in sequences:
        # Tokenize and encode the sequence
        inputs = tokenizer(seq, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
        tokenized_sequences.append(inputs)
    return tokenized_sequences

tokenized_sequences = tokenize_sequences(sequences, tokenizer)

# Extract features
# Extract features
def extract_features(tokenized_sequences, model):
    model.eval()
    all_embeddings = []
    with torch.no_grad():
        for tokenized_sequence in tokenized_sequences:
            # Move tokenized sequence to CUDA
            tokenized_sequence = {key: val.to('cuda') for key, val in tokenized_sequence.items()}
            # Get model outputs
            outputs = model(**tokenized_sequence)
            # Compute mean pooling on the token embeddings
            sequence_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
            all_embeddings.append(sequence_embeddings)
    return all_embeddings

sequence_embeddings = extract_features(tokenized_sequences, model)

# Convert the embeddings to a DataFrame
embedding_df = pd.DataFrame(sequence_embeddings)
embedding_df['label'] = labels

embedding_df.to_csv('/kaggle/working/embeddings_pro.csv', index=False)


In [8]:
from sklearn.model_selection import train_test_split

# Load the embeddings
embedding_df = pd.read_csv('/kaggle/working/embeddings_pro.csv')

# Split the data into features and labels
X = embedding_df.drop(columns=['label'])
y = embedding_df['label']

# First split into training and temporary (test + validation) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)

# Second split to divide temporary set into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Save the split datasets
X_train.to_csv('/kaggle/working/X_train_pro.csv', index=False)
X_test.to_csv('/kaggle/working/X_test_pro.csv', index=False)
y_train.to_csv('/kaggle/working/y_train_pro.csv', index=False)
y_test.to_csv('/kaggle/working/y_test_pro.csv', index=False)
X_val.to_csv('/kaggle/working/X_val_pro.csv', index=False)
y_val.to_csv('/kaggle/working/y_val_pro.csv', index=False)


In [9]:
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score

# Load the split datasets
X_train = pd.read_csv('/kaggle/working/X_train_pro.csv')
X_test = pd.read_csv('/kaggle/working/X_test_pro.csv')
X_val = pd.read_csv('/kaggle/working/X_val_pro.csv')
y_train = pd.read_csv('/kaggle/working/y_train_pro.csv').values.ravel()
y_test = pd.read_csv('/kaggle/working/y_test_pro.csv').values.ravel()
y_val = pd.read_csv('/kaggle/working/y_val_pro.csv').values.ravel()

# Initialize LazyClassifier
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Train and test the models
models, predictions = clf.fit(X_train, X_val, y_train, y_val)

# Display the results
print(models)

 97%|█████████▋| 28/29 [03:07<00:04,  4.01s/it]

[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.107432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574


100%|██████████| 29/29 [03:18<00:00,  6.86s/it]

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
SVC                                0.97               0.97     0.97      0.97   
XGBClassifier                      0.97               0.97     0.97      0.97   
LGBMClassifier                     0.97               0.97     0.97      0.97   
LinearDiscriminantAnalysis         0.97               0.97     0.97      0.97   
RidgeClassifier                    0.97               0.97     0.97      0.97   
RidgeClassifierCV                  0.97               0.97     0.97      0.97   
KNeighborsClassifier               0.96               0.97     0.97      0.96   
ExtraTreesClassifier               0.97               0.96     0.96      0.97   
RandomForestClassifier             0.97               0.96     0.96      0.97   
PassiveAggressiveClassifier        0.96               0.96     0.96      0.96   
LogisticRegression          




In [10]:
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef, roc_auc_score, precision_recall_curve, auc, confusion_matrix, classification_report

# Define the model dictionary including AdaBoostClassifier
model_dict = {
    'LGBMClassifier': LGBMClassifier(),
    'XGBClassifier': XGBClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(probability=True),  # SVC needs probability=True for AUROC
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
}

# Function to calculate additional metrics
def calculate_metrics(y_true, y_pred, y_prob=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # Calculate metrics
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    
    sensitivity = tp / (tp + fn)  # Sensitivity (SN)
    specificity = tn / (tn + fp)  # Specificity (SP)
    fdr = fp / (fp + tp)  # False Discovery Rate (FDR)
    
    if y_prob is not None:
        auroc = roc_auc_score(y_true, y_prob[:, 1])
        precision, recall, _ = precision_recall_curve(y_true, y_prob[:, 1])
        auprc = auc(recall, precision)
    else:
        auroc = None
        auprc = None

    return {
        'accuracy': accuracy,
        'f1_score': f1,
        'mcc': mcc,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'fdr': fdr,
        'auroc': auroc,
        'auprc': auprc
    }

# Fit and evaluate each model
best_model_name = None
best_accuracy = 0
results = {}

for model_name, model in model_dict.items():
    # Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions
    val_predictions = model.predict(X_val)
    test_predictions = model.predict(X_test)
    
    # Probability predictions for ROC and AUPRC
    if hasattr(model, "predict_proba"):
        val_prob = model.predict_proba(X_val)
        test_prob = model.predict_proba(X_test)
    else:
        val_prob = None
        test_prob = None
    
    # Calculate metrics for validation set
    val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
    test_metrics = calculate_metrics(y_test, test_predictions, test_prob)
    
    # Store results
    results[model_name] = {
        'validation_metrics': val_metrics,
        'test_metrics': test_metrics
    }
    
    # Print metrics
    print(f"\n{model_name} Validation Metrics:")
    for metric, value in val_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    print(f"\n{model_name} Test Metrics:")
    for metric, value in test_metrics.items():
        print(f"{metric}: {value:.4f}")
    
    # Track the best model by accuracy
    if val_metrics['accuracy'] > best_accuracy:
        best_accuracy = val_metrics['accuracy']
        best_model_name = model_name

print(f"\nBest Model Name: {best_model_name}")


[LightGBM] [Info] Number of positive: 2053, number of negative: 3380
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.109615 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5433, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377876 -> initscore=-0.498574
[LightGBM] [Info] Start training from score -0.498574

LGBMClassifier Validation Metrics:
accuracy: 0.9680
f1_score: 0.9557
mcc: 0.9308
sensitivity: 0.9660
specificity: 0.9690
fdr: 0.0544
auroc: 0.9938
auprc: 0.9862

LGBMClassifier Test Metrics:
accuracy: 0.9719
f1_score: 0.9620
mcc: 0.9397
sensitivity: 0.9642
specificity: 0.9764
fdr: 0.0401
auroc: 0.9962
auprc: 0.9930

XGBClassifier Validation Metrics:
accuracy: 0.9685
f1_score: 0.9565
mcc: 0.9320
sensitivity: 0.9676
specificity: 0.9690
fdr: 0.0543
auroc: 0.9932
auprc: 0.9856

XGBClassifier Test Metric

In [24]:

# Define base models for stacking
base_models = [
    ('lgbm', LGBMClassifier()),  # LightGBM model
    ('xgb', XGBClassifier(tree_method='gpu_hist', gpu_id=0)),  # XGBoost model with GPU
    ('svc', SVC(probability=True))  # Support Vector Classifier
]

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(),  # You can choose any classifier here
    cv=5  # Use 5-fold cross-validation
)

# Fit the stacking model
stacking_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 2733, number of negative: 4511
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.121897 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 7244, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377278 -> initscore=-0.501119
[LightGBM] [Info] Start training from score -0.501119
[LightGBM] [Info] Number of positive: 2187, number of negative: 3608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093791 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5795, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377394 -> initscore=-0.500623
[LightGBM] [Info] Start training from score -0.500623
[LightGBM]

In [25]:
# Make predictions on validation and test sets
val_predictions = stacking_model.predict(X_val)
test_predictions = stacking_model.predict(X_test)

# Probability predictions for ROC and AUPRC
if hasattr(stacking_model, "predict_proba"):
    val_prob = stacking_model.predict_proba(X_val)
    test_prob = stacking_model.predict_proba(X_test)
else:
    val_prob = None
    test_prob = None

# Calculate metrics for validation set
val_metrics = calculate_metrics(y_val, val_predictions, val_prob)
test_metrics = calculate_metrics(y_test, test_predictions, test_prob)

# Print stacking model metrics
print("\nStacking Model Validation Metrics:")
for metric, value in val_metrics.items():
    print(f"{metric}: {value:.4f}")

print("\nStacking Model Test Metrics:")
for metric, value in test_metrics.items():
    print(f"{metric}: {value:.4f}")



Stacking Model Validation Metrics:
accuracy: 0.9752
f1_score: 0.9648
mcc: 0.9456
sensitivity: 0.9655
specificity: 0.9804
fdr: 0.0360
auroc: 0.9948
auprc: 0.9895

Stacking Model Test Metrics:
accuracy: 0.9868
f1_score: 0.9821
mcc: 0.9716
sensitivity: 0.9806
specificity: 0.9904
fdr: 0.0165
auroc: 0.9984
auprc: 0.9967


In [26]:
# Perform cross-validation for stacking model using the training set
n_folds = 5  # You can adjust this number
cv_scores = cross_val_score(stacking_model, X_train, y_train, cv=n_folds, scoring='accuracy')

# Calculate average cross-validation score
average_cv_score = cv_scores.mean()
std_cv_score = cv_scores.std()

# Print cross-validation results
print("\nCross-Validation Scores for Stacking Model:")
print(f"Scores: {cv_scores}")
print(f"Average Score: {average_cv_score:.4f} ± {std_cv_score:.4f}")

[LightGBM] [Info] Number of positive: 2187, number of negative: 3608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.093673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 5795, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377394 -> initscore=-0.500623
[LightGBM] [Info] Start training from score -0.500623
[LightGBM] [Info] Number of positive: 1749, number of negative: 2887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.058930 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 4636, number of used features: 1024
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.377265 -> initscore=-0.501174
[LightGBM] [Info] Start training from score -0.501174
[LightGBM]