Save fold details

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut


# Record which lncRNA IDs (and their labels) fall into the train and test
# split of every cross-validation fold, and save one row per fold to CSV.
import os  # imported here because this cell's import list does not include it

# Change the following variables according to your needs
species = "human"
tissue = "heart"

esslnc_path = f"../../data/benchmark/{species}/ess_lnc.csv"
nonesslnc_path = f"../../data/benchmark/{species}/noness_lnc.csv"

esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc["lncRNA_id"])
nonesslnc_id = set(nonesslnc["lncRNA_id"])

# Embedding table: read without a header row, first column becomes the index
lncRNA_path = f"../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv"
lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)

# Keep only the rows whose index appears in one of the benchmark ID sets
lnc_ess = lnc[lnc.index.isin(esslnc_id)]
lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

# Prepare data arrays
X_positive = lnc_ess.values
X_negative = lnc_noness.values
ids_positive = lnc_ess.index
ids_negative = lnc_noness.index

# Combine datasets: positives (label 1) stacked above negatives (label 0)
X_all = np.vstack((X_positive, X_negative))
y_all = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))
ids_all = np.hstack((ids_positive, ids_negative))

# Choose CV strategy
if species == 'mouse':
    cv = LeaveOneOut()
else:
    cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Collect one dict per fold and build the DataFrame once at the end instead of
# re-concatenating a growing frame on every iteration. Only the split indices
# are needed here, so the feature matrix itself is never sliced.
fold_rows = []
for fold, (train_index, test_index) in enumerate(cv.split(X_all)):
    fold_rows.append({
        "Fold": fold + 1,  # folds are numbered from 1 in the output
        "Train_IDs": list(ids_all[train_index]),
        "Train_Labels": list(y_all[train_index]),
        "Test_IDs": list(ids_all[test_index]),
        "Test_Labels": list(y_all[test_index]),
    })
fold_records = pd.DataFrame(fold_rows)

# Make sure the destination exists so the save does not fail on a fresh checkout
fold_dir = "../fold_details"
os.makedirs(fold_dir, exist_ok=True)
fold_records.to_csv(
    f"{fold_dir}/{species}_fold_details.csv",
    index=False
)


### Tuning layer size for HinSAGE

In [35]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, auc, precision_recall_curve
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Grid of HinSAGE layer sizes to compare: every (first, second) combination is
# scored with the same fixed RBF-SVM under cross-validation.
layer_size_1 = [16, 32, 64]
layer_size_2 = [32, 64, 128, 256]

# Change the following variables according to your needs
species = "mouse"
tissue = "heart"

esslnc_path = f"../../data/benchmark/{species}/ess_lnc.csv"
nonesslnc_path = f"../../data/benchmark/{species}/noness_lnc.csv"

# Benchmark ID sets, taken from the "lncRNA_id" column of each file
esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)
esslnc_id = set(esslnc["lncRNA_id"])
nonesslnc_id = set(nonesslnc["lncRNA_id"])

# Create output directories
os.makedirs(f"./performance/{species}", exist_ok=True)

# One dict of metrics per configuration; turned into a DataFrame after the loop
metric_rows = []

for i, j in [(first, second) for first in layer_size_1 for second in layer_size_2]:
    # Embeddings produced with this layer-size pair (no header, IDs as index)
    lnc = pd.read_csv(
        f"../../HinSAGE/{species}/layer_size/lncRNA_embeddings_{tissue}_{i}_{j}.csv",
        index_col=0,
        header=None,
    )

    lnc_ess = lnc[lnc.index.isin(esslnc_id)]
    lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

    # Positives (label 1) stacked above negatives (label 0)
    X_all = np.vstack((lnc_ess.values, lnc_noness.values))
    y_all = np.hstack((np.ones(len(lnc_ess)), np.zeros(len(lnc_noness))))

    # Leave-one-out for mouse, shuffled 10-fold otherwise
    cv = (
        LeaveOneOut()
        if species == 'mouse'
        else KFold(n_splits=10, shuffle=True, random_state=42)
    )

    # Held-out labels and decision scores pooled over all folds
    pooled_labels = []
    pooled_scores = []
    for train_index, test_index in cv.split(X_all):
        # Scaling statistics come from the training part of the fold only
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X_all[train_index])
        test_scaled = scaler.transform(X_all[test_index])

        # RBF-SVM with fixed parameters: C=10, gamma="scale"
        svm = SVC(kernel="rbf", C=10, gamma="scale")
        svm.fit(train_scaled, y_all[train_index])

        pooled_labels.extend(y_all[test_index])
        pooled_scores.extend(svm.decision_function(test_scaled))

    all_true_labels = np.array(pooled_labels)
    all_decision_scores = np.array(pooled_scores)

    # Hard predictions: a decision score of 0 or above counts as positive
    predicted = (all_decision_scores >= 0).astype(int)
    tn, fp, fn, tp = confusion_matrix(all_true_labels, predicted).ravel()

    # Each ratio falls back to 0 when its denominator is empty
    n_actual_pos = tp + fn
    n_actual_neg = tn + fp
    n_called_pos = tp + fp
    n_total = tp + tn + fp + fn
    sensitivity = tp / n_actual_pos if n_actual_pos > 0 else 0
    specificity = tn / n_actual_neg if n_actual_neg > 0 else 0
    ppv = tp / n_called_pos if n_called_pos > 0 else 0
    accuracy = (tp + tn) / n_total if n_total > 0 else 0
    if (ppv + sensitivity) > 0:
        f1_score = 2 * (ppv * sensitivity) / (ppv + sensitivity)
    else:
        f1_score = 0
    mcc = matthews_corrcoef(all_true_labels, predicted)

    # Threshold-free summaries computed from the raw decision scores
    fpr, tpr, _ = roc_curve(all_true_labels, all_decision_scores)
    roc_auc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(all_true_labels, all_decision_scores)
    pr_auc = auc(recall, precision)

    metric_rows.append({
        "layer_size_1": i,
        "layer_size_2": j,
        "Sensitivity": round(sensitivity, 4),
        "Specificity": round(specificity, 4),
        "PPV": round(ppv, 4),
        "F1 Score": round(f1_score, 4),
        "Accuracy": round(accuracy, 4),
        "MCC": round(mcc, 4),
        "ROC AUC": round(roc_auc, 4),
        "PR AUC": round(pr_auc, 4),
    })

# Save metrics DataFrame
metrics_df = pd.DataFrame(metric_rows)
metrics_df.to_csv(f"./performance/{species}/hinsage_layersize_{tissue}_metrics_RBF_fixed.csv", index=False)

print("✅ Done!")
print(f"Metrics saved to: ./performance/{species}/hinsage_layersize_{tissue}_metrics_RBF_fixed.csv")


✅ Done!
Metrics saved to: ./performance/mouse/hinsage_layersize_heart_metrics_RBF_fixed.csv


### Tuning the number of samples for HinSAGE

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut 
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, auc, precision_recall_curve
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Compare HinSAGE sample-number settings: every (samples_num_1, samples_num_2)
# pair is scored with the same fixed RBF-SVM under cross-validation and the
# pooled metrics are written to one CSV.
import os  # imported here because this cell's import list does not include it

samples_num_1 = [5, 10, 15, 20]
samples_num_2 = [10, 15, 20, 25]

# Change the following variables according to your needs
species = 'mouse'
tissue = 'heart'

esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'

esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

# Create the output directory up front so a missing folder is noticed before
# the cross-validation runs rather than at the final save.
os.makedirs(f"./performance/{species}", exist_ok=True)

# One dict of metrics per configuration; turned into a DataFrame after the loop
metric_rows = []

for i in samples_num_1:
    for j in samples_num_2:

        # Embeddings for this sample-number pair. The tissue comes from the
        # `tissue` variable so the input always matches the output filename.
        lncRNA_path = f'../../HinSAGE/{species}/samples_num/lncRNA_embeddings_{tissue}_{i}_{j}.csv'
        lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)

        lnc_ess = lnc[lnc.index.isin(esslnc_id)]
        lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

        # Prepare data arrays
        X_positive = lnc_ess.values
        X_negative = lnc_noness.values

        # Combine datasets: positives (label 1) stacked above negatives (label 0)
        X_all = np.vstack((X_positive, X_negative))
        y_all = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))

        # Held-out labels and decision scores pooled over all folds
        all_true_labels = []
        all_decision_scores = []

        if species == 'mouse':
            cv = LeaveOneOut()
        else:
            cv = KFold(n_splits=10, shuffle=True, random_state=42)

        for train_index, test_index in cv.split(X_all):
            X_train, X_test = X_all[train_index], X_all[test_index]
            y_train, y_test = y_all[train_index], y_all[test_index]

            # Apply StandardScaler, fitted on the training part of the fold only
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # RBF-SVM with fixed parameters: C=10, gamma="scale"
            svm = SVC(kernel="rbf", C=10, gamma="scale")
            svm.fit(X_train_scaled, y_train)

            all_true_labels.extend(y_test)
            all_decision_scores.extend(svm.decision_function(X_test_scaled))

        # Convert lists to arrays for performance evaluation
        all_true_labels = np.array(all_true_labels)
        all_decision_scores = np.array(all_decision_scores)

        # Hard predictions: a decision score of 0 or above counts as positive
        y_pred = (all_decision_scores >= 0).astype(int)
        tn, fp, fn, tp = confusion_matrix(all_true_labels, y_pred).ravel()

        # Compute performance metrics; each ratio falls back to 0 when its
        # denominator is empty, as in the layer-size tuning cell.
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
        f1_score = 2 * (ppv * sensitivity) / (ppv + sensitivity) if (ppv + sensitivity) > 0 else 0
        mcc = matthews_corrcoef(all_true_labels, y_pred)

        # ROC curve & AUC
        fpr, tpr, _ = roc_curve(all_true_labels, all_decision_scores)
        roc_auc = auc(fpr, tpr)

        # Precision-Recall curve & AUC
        precision, recall, _ = precision_recall_curve(all_true_labels, all_decision_scores)
        pr_auc = auc(recall, precision)

        metric_rows.append({
            # Columns are named after what is tuned in this cell (sample
            # numbers), not layer sizes.
            "samples_num_1": i,
            "samples_num_2": j,
            "Sensitivity": round(sensitivity, 4),
            "Specificity": round(specificity, 4),
            "PPV": round(ppv, 4),
            "F1 Score": round(f1_score, 4),
            "Accuracy": round(accuracy, 4),
            "MCC": round(mcc, 4),
            "ROC AUC": round(roc_auc, 4),
            "PR AUC": round(pr_auc, 4),
        })

# Save metrics DataFrame to CSV
metrics_df = pd.DataFrame(metric_rows)
metrics_df.to_csv(f"./performance/{species}/hinsage_samplesnum_{tissue}_metrics_RBF_fixed.csv", index=False)

print("✅ Done!")
print(f"Metrics saved to: ./performance/{species}/hinsage_samplesnum_{tissue}_metrics_RBF_fixed.csv")

✅ Done!
Metrics saved to: ./performance/mouse/hinsage_samplesnum_heart_metrics_RBF_fixed.csv


### Tune C and gamma for SVM

In [24]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, auc, precision_recall_curve
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Grid search over (C, gamma) for an RBF-SVM on the HinSAGE embeddings.
# Each pair is scored by cross-validation with held-out decision scores pooled
# over all folds; the table of metrics is sorted by PPV and saved as CSV.

# Change the following variables according to your needs
species = "human"
tissue = "heart"

esslnc_path = f"../../data/benchmark/{species}/ess_lnc.csv"
nonesslnc_path = f"../../data/benchmark/{species}/noness_lnc.csv"
lncRNA_path = f"../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv"

# Output directory
out_dir = f"./performance/{species}"
os.makedirs(out_dir, exist_ok=True)

# Read data (embedding table has no header row; first column is the index)
lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)
esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc["lncRNA_id"])
nonesslnc_id = set(nonesslnc["lncRNA_id"])

lnc_ess = lnc[lnc.index.isin(esslnc_id)]
lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

# Prepare data arrays
X_positive = lnc_ess.values
X_negative = lnc_noness.values
ids_positive = lnc_ess.index
ids_negative = lnc_noness.index

# Combine datasets: positives (label 1) stacked above negatives (label 0)
X_all = np.vstack((X_positive, X_negative))
y_all = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))
ids_all = np.hstack((ids_positive, ids_negative))

# Define grid search space for RBF-SVM
C_values = [1, 10, 100]
gamma_values = [0.1, 0.01, 0.001]

# Prepare to store results (one dict per (C, gamma) pair)
results = []

# Initialize CV. The KFold splitter has a fixed random_state, so every
# (C, gamma) pair is evaluated on the same splits.
if species == "mouse":
    cv = LeaveOneOut()
else:
    cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Grid search over (C, gamma)
for C in C_values:
    for gamma in gamma_values:
        all_true_labels = []
        all_decision_scores = []

        for train_index, test_index in cv.split(X_all):
            X_train, X_test = X_all[train_index], X_all[test_index]
            y_train, y_test = y_all[train_index], y_all[test_index]

            # StandardScaler fitted on the training part of the fold only
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # RBF-SVM with the current grid point
            svm = SVC(kernel="rbf", C=C, gamma=gamma)
            svm.fit(X_train_scaled, y_train)

            decision_scores = svm.decision_function(X_test_scaled)

            all_true_labels.extend(y_test)
            all_decision_scores.extend(decision_scores)

        # Convert lists to arrays
        all_true_labels = np.array(all_true_labels)
        all_decision_scores = np.array(all_decision_scores)
        # A decision score of 0 or above counts as a positive prediction
        y_pred = (all_decision_scores >= 0).astype(int)

        # Confusion matrix
        tn, fp, fn, tp = confusion_matrix(all_true_labels, y_pred).ravel()

        # Each ratio falls back to 0 when its denominator is empty
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
        f1 = 2 * (ppv * sensitivity) / (ppv + sensitivity) if (ppv + sensitivity) > 0 else 0
        mcc = matthews_corrcoef(all_true_labels, y_pred)

        # ROC AUC
        fpr, tpr, _ = roc_curve(all_true_labels, all_decision_scores)
        roc_auc = auc(fpr, tpr)

        # PR AUC
        precision, recall, _ = precision_recall_curve(all_true_labels, all_decision_scores)
        pr_auc = auc(recall, precision)

        results.append({
            "C": C,
            "gamma": gamma,
            "Sensitivity": round(sensitivity, 4),
            "Specificity": round(specificity, 4),
            "PPV": round(ppv, 4),
            # Must be the F1 computed above for this grid point
            "F1 Score": round(f1, 4),
            "Accuracy": round(accuracy, 4),
            "MCC": round(mcc, 4),
            "ROC AUC": round(roc_auc, 4),
            "PR AUC": round(pr_auc, 4),
        })

# Save all results to CSV, best PPV first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="PPV", ascending=False)
out_file = f"{out_dir}/rbf_svm_grid.csv"
results_df.to_csv(out_file, index=False)


### Cross-validation

In [28]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, auc, precision_recall_curve
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import LeaveOneOut 
import os

# Final cross-validated evaluation of the RBF-SVM for one species/tissue.
# Held-out decision scores are pooled over all folds, summary metrics are
# appended to a per-tissue CSV, and the ROC / PR curve points are saved.
species = 'human'
tissue = 'stomach'

# File paths
esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'

lncRNA_path = f'../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv'

# Embedding table has no header row; its first column is used as the index
lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)

esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

lnc_ess = lnc[lnc.index.isin(esslnc_id)]
lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

# Prepare data arrays
X_positive = lnc_ess.values
X_negative = lnc_noness.values
ids_positive = lnc_ess.index
ids_negative = lnc_noness.index

# Combine datasets: positives (label 1) stacked above negatives (label 0)
X_all = np.vstack((X_positive, X_negative))
y_all = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))
ids_all = np.hstack((ids_positive, ids_negative))

# Initialize cross-validation and the species-specific SVM hyper-parameters
if species == 'mouse':
    cv = LeaveOneOut()
    C = 10
    gamma = 0.01
else:
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    C = 100
    gamma = 0.001

# Held-out labels and decision scores pooled over all folds
all_true_labels = []
all_decision_scores = []

# cross-validation
for train_index, test_index in cv.split(X_all):
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all[train_index], y_all[test_index]

    # StandardScaler fitted on the training part of the fold only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # RBF-SVM
    svm = SVC(kernel="rbf", C=C, gamma=gamma)
    svm.fit(X_train_scaled, y_train)

    all_true_labels.extend(y_test)
    all_decision_scores.extend(svm.decision_function(X_test_scaled))

# Convert lists to arrays for performance evaluation
all_true_labels = np.array(all_true_labels)
all_decision_scores = np.array(all_decision_scores)

# Hard predictions: a decision score of 0 or above counts as positive
y_pred = (all_decision_scores >= 0).astype(int)
tn, fp, fn, tp = confusion_matrix(all_true_labels, y_pred).ravel()

# Compute performance metrics; each ratio falls back to 0 when its denominator
# is empty, matching the tuning cells.
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) > 0 else 0
f1_score = 2 * (ppv * sensitivity) / (ppv + sensitivity) if (ppv + sensitivity) > 0 else 0
mcc = matthews_corrcoef(all_true_labels, y_pred)

# ROC curve points and AUC
fpr, tpr, _ = roc_curve(all_true_labels, all_decision_scores)
roc_auc = auc(fpr, tpr)
roc_curve_df = pd.DataFrame({'FPR': fpr, 'TPR': tpr})

# Precision-Recall curve points and AUC
precision, recall, _ = precision_recall_curve(all_true_labels, all_decision_scores)
pr_auc = auc(recall, precision)
pr_curve_df = pd.DataFrame({'Recall': recall, 'Precision': precision})

metrics_row = {
    'Model': 'SVM',
    'Tissue': f'{tissue}',
    'Sensitivity': round(sensitivity, 4),
    'Specificity': round(specificity, 4),
    'PPV': round(ppv, 4),
    'F1 Score': round(f1_score, 4),
    'Accuracy': round(accuracy, 4),
    'MCC': round(mcc, 4),
    'ROC AUC': round(roc_auc, 4),
    'PR AUC': round(pr_auc, 4)
}

metrics_df = pd.DataFrame([metrics_row])

metrics_output_path = f'./performance/{species}/svm_{tissue}_summary.csv'
os.makedirs(os.path.dirname(metrics_output_path), exist_ok=True)

# Append to an existing summary; write the header only when creating the file
write_header = not os.path.exists(metrics_output_path)
metrics_df.to_csv(metrics_output_path, mode='a', header=write_header, index=False)

# The curve files live in their own sub-directory, which must exist before
# writing into it.
curve_dir = f'./performance/{species}/curve'
os.makedirs(curve_dir, exist_ok=True)
roc_curve_df.to_csv(f'{curve_dir}/roc_curve_{tissue}.csv', index=False)
pr_curve_df.to_csv(f'{curve_dir}/pr_curve_{tissue}.csv', index=False)



### Predict step

In [31]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# Train the RBF-SVM on all benchmark lncRNAs of one species/tissue, then score
# and label every lncRNA in the embedding table and save the predictions.
import os  # imported here because this cell's import list does not include it

species = 'human'
tissue = 'stomach'

# Species-specific SVM hyper-parameters
if species == 'mouse':
    C = 10
    gamma = 0.01
else:
    C = 100
    gamma = 0.001

# File paths
esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'

esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

# Embedding table has no header row; its first column is used as the index
all_samples_path = f'../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv'
all_lnc = pd.read_csv(all_samples_path, index_col=0, header=None)

lnc_ess = all_lnc[all_lnc.index.isin(esslnc_id)]
lnc_noness = all_lnc[all_lnc.index.isin(nonesslnc_id)]

# Prepare training data: positives (label 1) stacked above negatives (label 0)
X_positive = lnc_ess.values
X_negative = lnc_noness.values

X_train = np.vstack((X_positive, X_negative))
y_train = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))

# Apply StandardScaler; its statistics come from the benchmark rows only and
# are reused unchanged for the full table below.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train SVM model
svm = SVC(kernel="rbf", C=C, gamma=gamma)
svm.fit(X_train_scaled, y_train)

# Score and predict for all data (benchmark rows included)
X_all = all_lnc.values
X_all_scaled = scaler.transform(X_all)
ids_all = all_lnc.index

scores = svm.decision_function(X_all_scaled)
# Same decision rule as the cross-validation cells: score >= 0 means positive
labels = np.where(scores >= 0, 1, 0)

# Generate results DataFrame
results_df = pd.DataFrame({'lncRNA_id': ids_all, 'Score': scores, 'Pre_Label': labels})

# Save results to CSV file, creating the results directory if it is missing
results_dir = f'../../results/{species}'
os.makedirs(results_dir, exist_ok=True)
results_df.to_csv(f'{results_dir}/SVM_predictions_{tissue}.csv', index=False)

