### Tune layer size.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import confusion_matrix, matthews_corrcoef, roc_curve, auc, precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

# Grid search over two-hidden-layer MLP sizes (second layer never larger than
# the first). Each configuration is scored by cross-validation and one row of
# pooled metrics per configuration is written to a CSV file.

# Local import: `os` is needed only to create the output directory.
import os

# Change the following variables according to your needs
species = 'mouse'
tissue = 'heart'

# -------------------- Load Data --------------------
esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'
lncRNA_path = f'../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv'

# The embedding file is read without a header row; its first column becomes
# the index that the benchmark ids are matched against below.
lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)
esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

lnc_ess = lnc[lnc.index.isin(esslnc_id)]
lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

# Fail early with a clear message instead of a cryptic error deep inside the
# metric computation when one class matched no embedding rows.
if len(lnc_ess) == 0 or len(lnc_noness) == 0:
    raise ValueError(
        f"No embeddings matched for at least one class "
        f"(positives={len(lnc_ess)}, negatives={len(lnc_noness)}); "
        f"check {lncRNA_path} against the benchmark id files."
    )

X_positive = lnc_ess.to_numpy()
X_negative = lnc_noness.to_numpy()
ids_positive = np.array(lnc_ess.index)
ids_negative = np.array(lnc_noness.index)

# Positives first, then negatives; labels and ids follow the same order.
X_all = np.vstack((X_positive, X_negative))
y_all = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))
ids_all = np.hstack((ids_positive, ids_negative))

# -------------------- Config --------------------

hidden_layer_sizes = [32, 64, 128, 256]
alpha = 1e-3
learning_rate = 0.01
input_dim = X_all.shape[1]  # informational only; not passed to the classifier

# Create the output directory up front so a missing folder is reported before
# the (potentially long) grid search rather than after it.
output_path = f'./performance/{species}/mlp_layersize_metrics.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# The splitter does not depend on the layer sizes, so build it once.
if species == 'mouse':
    cv = LeaveOneOut()
else:
    cv = KFold(n_splits=10, shuffle=True, random_state=42)

# One dict per configuration; turned into a DataFrame once at the end instead
# of re-concatenating a growing frame on every iteration.
metrics_rows = []

# -------------------- Grid Search --------------------
for i in hidden_layer_sizes:
    for j in hidden_layer_sizes:
        # Only evaluate architectures whose second layer is not wider than
        # the first.
        if j > i:
            continue

        # Predictions are pooled over all folds and scored once per
        # configuration.
        all_true_labels = []
        all_pred_labels = []
        all_pred_probs = []

        for train_index, test_index in cv.split(X_all):
            X_train, X_test = X_all[train_index], X_all[test_index]
            y_train, y_test = y_all[train_index], y_all[test_index]

            # Fit the scaler on the training fold only, then apply it to the
            # held-out fold.
            scaler = MinMaxScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Use the config variables above so that editing them actually
            # changes the model.
            mlp = MLPClassifier(
                hidden_layer_sizes=(i, j),
                activation='relu',
                alpha=alpha,
                learning_rate_init=learning_rate,
                max_iter=200,
                random_state=42
            )
            mlp.fit(X_train_scaled, y_train)

            # Column 1 of predict_proba is the probability of label 1.
            prob = mlp.predict_proba(X_test_scaled)[:, 1]
            pred = mlp.predict(X_test_scaled)

            all_true_labels.extend(y_test.tolist())
            all_pred_labels.extend(pred.tolist())
            all_pred_probs.extend(prob.tolist())

        # -------------------- Metrics --------------------
        tn, fp, fn, tp = confusion_matrix(all_true_labels, all_pred_labels).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        f1 = 2 * (ppv * sensitivity) / (ppv + sensitivity) if (ppv + sensitivity) > 0 else 0
        mcc = matthews_corrcoef(all_true_labels, all_pred_labels)

        # Compute ROC curve and PR curve data
        fpr, tpr, _ = roc_curve(all_true_labels, all_pred_probs)
        roc_auc = auc(fpr, tpr)

        precision, recall, _ = precision_recall_curve(all_true_labels, all_pred_probs)
        pr_auc = auc(recall, precision)

        metrics_rows.append({
            'layer_size_1': i,
            'layer_size_2': j,
            "Sensitivity": round(sensitivity, 4),
            "Specificity": round(specificity, 4),
            "PPV": round(ppv, 4),
            "F1 Score": round(f1, 4),
            "Accuracy": round(accuracy, 4),
            "MCC": round(mcc, 4),
            "ROC AUC": round(roc_auc, 4),
            "PR AUC": round(pr_auc, 4),
        })
        print(f"Finished structure: ({i}, {j})")

# -------------------- Save Results --------------------
metrics_df = pd.DataFrame(metrics_rows)
metrics_df.to_csv(output_path, index=False)
print("All configurations completed and saved.")

Finished structure: (32, 32)
Finished structure: (64, 32)
Finished structure: (64, 64)
Finished structure: (128, 32)
Finished structure: (128, 64)
Finished structure: (128, 128)
Finished structure: (256, 32)
Finished structure: (256, 64)
Finished structure: (256, 128)
Finished structure: (256, 256)
All configurations completed and saved.


### Cross validation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import LeaveOneOut, KFold
from sklearn.metrics import (
    confusion_matrix, matthews_corrcoef, roc_curve,
    auc, precision_recall_curve
)
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
import os

# Cross-validate the MLP with the layer sizes chosen for the species, then
# save the pooled ROC/PR curves and append one summary row of metrics.
species = 'human'
tissue = 'heart'

print(f"Running MLP for {species} {tissue}...")

# ---------------------- 1. Load Data ----------------------
esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'

lncRNA_path = f'../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv'

# The embedding file is read without a header row; its first column becomes
# the index that the benchmark ids are matched against below.
lnc = pd.read_csv(lncRNA_path, index_col=0, header=None)
esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

lnc_ess = lnc[lnc.index.isin(esslnc_id)]
lnc_noness = lnc[lnc.index.isin(nonesslnc_id)]

# Fail early with a clear message instead of a cryptic error deep inside the
# metric computation when one class matched no embedding rows.
if len(lnc_ess) == 0 or len(lnc_noness) == 0:
    raise ValueError(
        f"No embeddings matched for at least one class "
        f"(positives={len(lnc_ess)}, negatives={len(lnc_noness)}); "
        f"check {lncRNA_path} against the benchmark id files."
    )

X_pos = lnc_ess.to_numpy()
X_neg = lnc_noness.to_numpy()
y_pos = np.ones(len(X_pos))
y_neg = np.zeros(len(X_neg))

# Positives first, then negatives; labels and ids follow the same order.
X_all = np.vstack([X_pos, X_neg])
y_all = np.hstack([y_pos, y_neg])
ids_all = np.hstack([lnc_ess.index, lnc_noness.index])

# ---------------------- 2. Output Locations ----------------------
# Create the curve directory (and thereby its parent, which holds the summary
# file) before training, so a missing folder cannot make the run fail after
# cross-validation has already finished.
metrics_output_path = f'./performance/{species}/mlp_{tissue}_summary.csv'
roc_output_path = f'./performance/{species}/curve/roc_curve_{tissue}.csv'
pr_output_path = f'./performance/{species}/curve/pr_curve_{tissue}.csv'
os.makedirs(os.path.dirname(roc_output_path), exist_ok=True)

# ---------------------- 3. Cross-validation ----------------------
# Predictions are pooled over all folds and scored once at the end.
all_true_labels = []
all_pred_labels = []
all_pred_probs = []

# Splitter and architecture depend on the species.
if species == 'mouse':
    cv = LeaveOneOut()
    layer_size = (256, 32)
else:
    cv = KFold(n_splits=10, shuffle=True, random_state=42)
    layer_size = (128, 64)

for train_idx, test_idx in cv.split(X_all):
    X_train, X_test = X_all[train_idx], X_all[test_idx]
    y_train, y_test = y_all[train_idx], y_all[test_idx]

    # Fit the scaler on the training fold only, then apply it to the
    # held-out fold.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    mlp = MLPClassifier(
        hidden_layer_sizes=layer_size,
        activation='relu',
        alpha=1e-3,
        learning_rate_init=0.01,
        max_iter=200,
        random_state=42
    )
    mlp.fit(X_train_scaled, y_train)

    # Column 1 of predict_proba is the probability of label 1.
    prob = mlp.predict_proba(X_test_scaled)[:, 1]
    pred = mlp.predict(X_test_scaled)

    all_true_labels.extend(y_test.tolist())
    all_pred_labels.extend(pred.tolist())
    all_pred_probs.extend(prob.tolist())

# ---------------------- 4. Evaluate ----------------------
tn, fp, fn, tp = confusion_matrix(all_true_labels, all_pred_labels).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
ppv = tp / (tp + fp) if (tp + fp) > 0 else 0
accuracy = (tp + tn) / (tp + tn + fp + fn)
f1 = 2 * (ppv * sensitivity) / (ppv + sensitivity) if (ppv + sensitivity) > 0 else 0
mcc = matthews_corrcoef(all_true_labels, all_pred_labels)

# ---------------------- 5. Save Results ----------------------
# ROC & PR curve
fpr, tpr, _ = roc_curve(all_true_labels, all_pred_probs)
roc_auc = auc(fpr, tpr)
pd.DataFrame({'FPR': fpr, 'TPR': tpr}).to_csv(roc_output_path, index=False)

precision, recall, _ = precision_recall_curve(all_true_labels, all_pred_probs)
pr_auc = auc(recall, precision)
pd.DataFrame({'Recall': recall, 'Precision': precision}).to_csv(pr_output_path, index=False)

metrics_row = {
    'Model': 'MLP',
    'Cell_Line': f'{tissue}',
    'Sensitivity': round(sensitivity, 4),
    'Specificity': round(specificity, 4),
    'PPV': round(ppv, 4),
    'F1 Score': round(f1, 4),
    'Accuracy': round(accuracy, 4),
    'MCC': round(mcc, 4),
    'ROC AUC': round(roc_auc, 4),
    'PR AUC': round(pr_auc, 4)
}

metrics_df = pd.DataFrame([metrics_row])

# Append to an existing summary file; write the header only when the file is
# being created.
if os.path.exists(metrics_output_path):
    metrics_df.to_csv(metrics_output_path, mode='a', header=False, index=False)
else:
    metrics_df.to_csv(metrics_output_path, mode='w', header=True, index=False)



Running MLP for human heart...


### Predict Step

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier

# Train the MLP on the labelled benchmark lncRNAs and score every lncRNA in
# the tissue embedding file, writing one row per lncRNA to a CSV file.

# Local import: `os` is needed only to create the output directory.
import os

species = 'human'
tissue = 'stomach'

# Architecture depends on the species.
if species == 'mouse':
    layer_size = (256, 32)
else:
    layer_size = (128, 64)
# ---------------------- 1. Load Data ----------------------

# File paths
esslnc_path = f'../../data/benchmark/{species}/ess_lnc.csv'
nonesslnc_path = f'../../data/benchmark/{species}/noness_lnc.csv'

esslnc = pd.read_csv(esslnc_path)
nonesslnc = pd.read_csv(nonesslnc_path)

esslnc_id = set(esslnc['lncRNA_id'])
nonesslnc_id = set(nonesslnc['lncRNA_id'])

# The embedding file is read without a header row; its first column becomes
# the index that the benchmark ids are matched against below.
all_samples_path = f'../../HinSAGE/{species}/lncRNA_embeddings_{tissue}.csv'
all_lnc = pd.read_csv(all_samples_path, index_col=0, header=None)

lnc_ess = all_lnc[all_lnc.index.isin(esslnc_id)]
lnc_noness = all_lnc[all_lnc.index.isin(nonesslnc_id)]

# Fail early with a clear message when one class matched no embedding rows.
if len(lnc_ess) == 0 or len(lnc_noness) == 0:
    raise ValueError(
        f"No embeddings matched for at least one class "
        f"(positives={len(lnc_ess)}, negatives={len(lnc_noness)}); "
        f"check {all_samples_path} against the benchmark id files."
    )

# Prepare labelled data: positives first, then negatives.
X_positive = lnc_ess.values
X_negative = lnc_noness.values

X_labeled = np.vstack((X_positive, X_negative))
y_labeled = np.hstack((np.ones(len(X_positive)), np.zeros(len(X_negative))))

# ---------------------- 2. Train ----------------------
# Hold out 20% of the labelled samples; the model is fitted on the other 80%.
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y_labeled, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only and reuse it for every later
# transform.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

mlp = MLPClassifier(
    hidden_layer_sizes=layer_size,
    activation='relu',
    alpha=1e-3,
    learning_rate_init=0.01,
    max_iter=200,
    random_state=42
)
mlp.fit(X_train_scaled, y_train)

# Actually use the held-out split: report mean accuracy on it so the quality
# of the model behind the predictions is visible.
X_test_scaled = scaler.transform(X_test)
val_accuracy = mlp.score(X_test_scaled, y_test)
print(f"Held-out validation accuracy: {val_accuracy:.4f}")

# ---------------------- 3. Predict ----------------------
# Predict on all rows of the embedding file; this includes the labelled
# lncRNAs the model was trained on.
X_all = all_lnc.values
ids_all = all_lnc.index

X_all_scaled = scaler.transform(X_all)

# Column 1 of predict_proba is the probability of label 1.
scores = mlp.predict_proba(X_all_scaled)[:, 1]
predictions = mlp.predict(X_all_scaled).astype(int)  # Convert predictions to int

# Generate results DataFrame
results_df = pd.DataFrame({'lncRNA_id': ids_all, 'Score': scores, 'Pre_Label': predictions})

# Save results to CSV file, creating the output directory if it is missing.
results_path = f'../../results/{species}/MLP_predictions_{tissue}.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.to_csv(results_path, index=False)
