In [26]:
import os
import random
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import Model
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, roc_curve, auc, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [27]:
# Global random seed shared by every stochastic component of this notebook.
SEED = 17


def set_seed(seed=SEED):
    """Seed the Python, NumPy and TensorFlow/Keras RNGs and enable deterministic TF ops.

    Args:
        seed: Integer seed applied to `random`, `numpy.random` and
            `tf.keras.utils.set_random_seed`. Defaults to the module-level SEED.

    Side effects:
        Sets the TF_DETERMINISTIC_OPS and PYTHONHASHSEED environment variables
        and calls `tf.config.experimental.enable_op_determinism()`.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes — confirm.
    os.environ.update({
        'TF_DETERMINISTIC_OPS': '1',
        'PYTHONHASHSEED': str(seed),
    })

    # Same seed for every RNG, applied in a fixed order.
    for seeder in (random.seed, np.random.seed, tf.keras.utils.set_random_seed):
        seeder(seed)

    tf.config.experimental.enable_op_determinism()

# Seed all random number generators before any data or model work happens.
set_seed(seed=SEED)

# Set the environment variable / logger level used to suppress TensorFlow warning output.
# NOTE(review): tensorflow is already imported at this point; TF_CPP_MIN_LOG_LEVEL
# may need to be set before the import to take full effect — confirm.
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '2'})
tf.get_logger().setLevel('ERROR')

In [28]:
# Load the expression tables (indexed by their first CSV column) and the
# label tables (re-indexed by the 'sample' column so they can be joined).
# set_index is applied without inplace=True so each name is bound exactly once.
train_data = pd.read_csv('inputs/tpm_train.csv', index_col=0)
train_labels_df = pd.read_csv('inputs/label_train.csv').set_index('sample')
test_data = pd.read_csv('inputs/tpm_test.csv', index_col=0)
test_labels_df = pd.read_csv('inputs/label_test.csv').set_index('sample')

# Selected feature names: first column of the file, skipping its first row.
selected_features = pd.read_csv("inputs/selected_features_tpm_300.csv", header=None).iloc[1:, 0].tolist()

# Keep only the selected features and attach label / subject / batch by sample id.
train_data = train_data[selected_features].join(train_labels_df[['label', 'subject', 'batch']])
test_data = test_data[selected_features].join(test_labels_df[['label', 'subject', 'batch']])

# The join is a left join on the index: a sample id missing from the label table
# would silently yield a NaN label. Fail here with a clear message instead.
assert train_data['label'].notna().all(), "training samples without a matching row in label_train.csv"
assert test_data['label'].notna().all(), "test samples without a matching row in label_test.csv"

# Feature matrices and label vectors used by every model below.
X_train = train_data[selected_features]
X_test = test_data[selected_features]
y_train = train_data['label'].values
y_test = test_data['label'].values

In [29]:
# Helper that computes the evaluation metrics recorded for each model.
def calculate_metrics(y_true, y_pred, y_proba):
    """Compute the classification metrics reported for one model.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels.
        y_proba: Predicted scores/probabilities used for the ranking metrics.

    Returns:
        list: [accuracy, sensitivity, specificity, AUROC, precision, F1, AUPRC],
        in that order. Specificity is the recall computed with pos_label=0;
        AUPRC is the area under the precision-recall curve computed with `auc`.
    """
    scores = [
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred),               # sensitivity
        recall_score(y_true, y_pred, pos_label=0),  # specificity
        roc_auc_score(y_true, y_proba),
        precision_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    ]

    # Area under the precision-recall curve (recall on the x axis).
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_proba)
    scores.append(auc(pr_recall, pr_precision))
    return scores

In [30]:
# Train and evaluate the logistic regression baseline on the selected features.
# random_state is passed explicitly (as for the RF and SVM below) so the
# liblinear fit does not depend on the global NumPy RNG state or on the order
# in which cells were executed.
best_params_lr = {'C': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
lr_model = LogisticRegression(**best_params_lr, random_state=SEED)
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict(X_test)
lr_proba = lr_model.predict_proba(X_test)[:, 1]  # column 1 of predict_proba
lr_metrics = calculate_metrics(y_test, lr_predictions, lr_proba)

# Train and evaluate the random forest baseline.
best_params_rf = {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
rf_model = RandomForestClassifier(**best_params_rf, random_state=SEED)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_proba = rf_model.predict_proba(X_test)[:, 1]
rf_metrics = calculate_metrics(y_test, rf_predictions, rf_proba)

# Train and evaluate the support vector machine baseline
# (probability=True is required for predict_proba).
best_params_svm = {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
svm_model = SVC(**best_params_svm, probability=True, random_state=SEED)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)
svm_proba = svm_model.predict_proba(X_test)[:, 1]
svm_metrics = calculate_metrics(y_test, svm_predictions, svm_proba)

In [31]:
# Distance and loss functions (needed as custom objects when loading the saved model).
def euclidean_distance_squared(vectors):
    """Return the squared Euclidean distance between two batches of vectors.

    Args:
        vectors: A pair (x, y) of tensors that can be subtracted element-wise.

    Returns:
        Tensor of squared differences summed over axis 1, with that axis kept
        (keepdims=True).
    """
    first, second = vectors
    squared_diff = tf.square(first - second)
    return tf.reduce_sum(squared_diff, axis=1, keepdims=True)

# Soft triplet loss factory.
def soft_triplet_loss(margin):
    """Build a soft (softplus-smoothed) triplet loss with the given margin.

    Args:
        margin: Value added to (positive distance - negative distance) before
            the softplus.

    Returns:
        A `loss(y_true, y_pred)` callable. Column 0 of `y_pred` is read as the
        positive distance and column 1 as the negative distance; `y_true` is
        not used by the computation.
    """
    def loss(y_true, y_pred):
        d_pos, d_neg = y_pred[:, 0], y_pred[:, 1]
        violation = d_pos - d_neg + margin
        return tf.reduce_mean(tf.nn.softplus(violation))
    return loss

# Load the best Siamese network, supplying the custom functions it was saved with.
best_siamese_model = tf.keras.models.load_model(
    'best_siamese_model.keras',
    custom_objects={
        'soft_triplet_loss': soft_triplet_loss,
        'euclidean_distance_squared': euclidean_distance_squared,
    },
)

# The base network is taken to be the layer at index 3 of the loaded model.
# NOTE(review): presumably indices 0-2 are the input layers — confirm against the saved model.
base_network = best_siamese_model.get_layer(index=3)

# Wrap the base network as a stand-alone model and compute the learned
# representations for the training and test sets (progress bar silenced).
feature_extractor = Model(inputs=base_network.input, outputs=base_network.output)
train_features, test_features = (
    feature_extractor.predict(split, verbose=0) for split in (X_train, X_test)
)

# Wrapper that chains the feature extractor with a downstream classifier.
class CombinedModel:
    """End-to-end model: feature extractor followed by a fitted classifier.

    Args:
        feature_extractor: Object exposing `predict(data, verbose=...)`
            (the Keras feature extractor in this notebook).
        classifier: Fitted classifier exposing `predict` and `predict_proba`.
        verbose: Verbosity forwarded to `feature_extractor.predict`. Defaults
            to 0, matching the other feature-extraction calls in this notebook,
            so no progress bar is printed on every prediction.
    """

    def __init__(self, feature_extractor, classifier, verbose=0):
        self.feature_extractor = feature_extractor
        self.classifier = classifier
        self.verbose = verbose

    def _extract(self, data):
        """Run the feature extractor on `data` with the configured verbosity."""
        return self.feature_extractor.predict(data, verbose=self.verbose)

    def predict(self, data):
        """Return the classifier's predicted labels for raw input `data`."""
        return self.classifier.predict(self._extract(data))

    def predict_proba(self, data):
        """Return column 1 of the classifier's `predict_proba` output for `data`.

        Note: this is a 1-D slice, not the full per-class probability matrix.
        """
        proba = self.classifier.predict_proba(self._extract(data))
        return proba[:, 1]

In [32]:
# Train and evaluate logistic regression on the Siamese-network features.
# random_state is passed explicitly (as for the RF and SVM below) so the
# liblinear fit does not depend on the global NumPy RNG state or on the order
# in which cells were executed.
# Note: lr_model / rf_model / svm_model are rebound here and from now on refer
# to the classifiers fitted on the extracted features.
best_params_lr = {'C': 10000, 'penalty': 'l1', 'solver': 'liblinear'}
lr_model = LogisticRegression(**best_params_lr, random_state=SEED)
lr_model.fit(train_features, y_train)
combined_lr_model = CombinedModel(feature_extractor, lr_model)
lr_combined_predictions = combined_lr_model.predict(X_test)
lr_combined_proba = combined_lr_model.predict_proba(X_test)
lr_combined_metrics = calculate_metrics(y_test, lr_combined_predictions, lr_combined_proba)

# Train and evaluate a random forest on the Siamese-network features.
best_params_rf = {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
rf_model = RandomForestClassifier(**best_params_rf, random_state=SEED)
rf_model.fit(train_features, y_train)
combined_rf_model = CombinedModel(feature_extractor, rf_model)
rf_combined_predictions = combined_rf_model.predict(X_test)
rf_combined_proba = combined_rf_model.predict_proba(X_test)
rf_combined_metrics = calculate_metrics(y_test, rf_combined_predictions, rf_combined_proba)

# Train and evaluate a support vector machine on the Siamese-network features.
best_params_svm = {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
svm_model = SVC(**best_params_svm, probability=True, random_state=SEED)
svm_model.fit(train_features, y_train)
combined_svm_model = CombinedModel(feature_extractor, svm_model)
svm_combined_predictions = combined_svm_model.predict(X_test)
svm_combined_proba = combined_svm_model.predict_proba(X_test)
svm_combined_metrics = calculate_metrics(y_test, svm_combined_predictions, svm_combined_proba)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [33]:
# Collect each model's metric list, keyed by the row label used in the summary table.
model_metrics = {
    'LR': lr_metrics,
    'RF': rf_metrics,
    'SVM': svm_metrics,
    'SNN-LR': lr_combined_metrics,
    'SNN-RF': rf_combined_metrics,
    'SNN-SVM': svm_combined_metrics,
}

# One row per model; the column order must match the list returned by calculate_metrics.
metrics_df = pd.DataFrame(
    list(model_metrics.values()),
    index=list(model_metrics.keys()),
    columns=['Accuracy', 'Sensitivity', 'Specificity', 'AUROC', 'Precision', 'F1 Score', 'AUPRC'],
)
print(metrics_df)

         Accuracy  Sensitivity  Specificity   AUROC  Precision  F1 Score  \
LR           0.68         0.76         0.60  0.7616   0.655172  0.703704   
RF           0.76         0.84         0.68  0.8448   0.724138  0.777778   
SVM          0.86         0.88         0.84  0.9264   0.846154  0.862745   
SNN-LR       0.90         0.92         0.88  0.8960   0.884615  0.901961   
SNN-RF       0.86         0.88         0.84  0.8896   0.846154  0.862745   
SNN-SVM      0.88         0.92         0.84  0.8784   0.851852  0.884615   

            AUPRC  
LR       0.797487  
RF       0.803837  
SVM      0.907588  
SNN-LR   0.903456  
SNN-RF   0.887951  
SNN-SVM  0.855238  
