In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from preprocess_smote import *
from sklearn.preprocessing import StandardScaler

In [19]:
# 药物列表
drugs = [
    'Gefitinib', 'Afatinib', 'AR-42', 'Cetuximab', 'Etoposide', 
    'NVP-TAE684', 'PLX4720', 'PLX4720_451Lu', 'Sorafenib', 'Vorinostat'
]

In [20]:
# 使用随机森林进行分类
def train_rf_model(features, labels):
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(features, labels)
    return rf_model

In [21]:
# 评估模型
def evaluate_rf_model(rf_model, features, labels):
    preds = rf_model.predict(features)
    preds_proba = rf_model.predict_proba(features)
    
    # 计算准确率、AUC 和 AUPR
    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUC
    aupr = average_precision_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUPR
    
    return accuracy, auc, aupr

In [22]:
# 主函数 - 执行实验
def run_rf_comparison(DRUG):
    # 训练：使用源数据集
    adj_s, features_s, labels_s, knn_s, n_features_s = load_data_drug('source', DRUG)
    
    # 测试：使用目标数据集
    adj_t, features_t, labels_t, knn_t, n_features_t = load_data_drug('target', DRUG)
    
    # 训练随机森林
    rf_model = train_rf_model(features_s, labels_s)
    
    # 测试：使用目标数据集
    accuracy, auc, aupr = evaluate_rf_model(rf_model, features_t, labels_t)
    
    # 输出结果
    print(f"Random Forest Performance for {DRUG}:")
    print(f"Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, AUPR: {aupr:.4f}")
    return accuracy, auc, aupr

In [23]:
# 循环进行对比实验
for drug in drugs:
    acc, auc, aupr = run_rf_comparison(drug)
    print(f"Results for {drug} - Accuracy: {acc:.4f}, AUC: {auc:.4f}, AUPR: {aupr:.4f}")

当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
Random Forest Performance for Gefitinib:
Accuracy: 0.5152, AUC: 0.6129, AUPR: 0.6393
Results for Gefitinib - Accuracy: 0.5152, AUC: 0.6129, AUPR: 0.6393
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
Random Forest Performance for Afatinib:
Accuracy: 0.6083, AUC: 0.6997, AUPR: 0.7054
Results for Afatinib - Accuracy: 0.6083, AUC: 0.6997, AUPR: 0.7054
当前的采样率为: 100%
[0.91021324 0.08978676]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
Random Forest Performance for AR-42:
Accuracy: 0.5000, AUC: 0.7507, AUPR: 0.7755
Results for AR-42 - Accuracy: 0.5000, AUC: 0.7507, AUPR: 0.7755
当前的采样率为: 100%
[0.8583043 0.1416957]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 554
Random Forest Performance for Cetuximab:
Accuracy: 0.5417, AUC: 0.6643, AUPR: 0.6675
Results for Cetuximab - Accuracy: 0.5417, AUC: 0.6643, AUPR: 0.6675
当前的采样率为: 100%
[0.93865741 0.06134259]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
Random Forest Performance

In [32]:
import pandas as pd
from preprocess_smote import *  # 直接从 preprocess_smote 导入所有内容
from sklearn.linear_model import LogisticRegression  # 逻辑回归
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # 随机森林，AdaBoost
from sklearn.svm import SVC  # SVM支持向量机
from sklearn.tree import DecisionTreeClassifier  # 决策树
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

In [33]:
# 药物列表
drugs = [
    'Gefitinib', 'Afatinib', 'AR-42', 'Cetuximab', 'Etoposide', 
    'NVP-TAE684', 'PLX4720', 'PLX4720_451Lu', 'Sorafenib', 'Vorinostat'
]

In [34]:
# 定义训练和评估模型的通用函数
def train_model(model, features, labels):
    model.fit(features, labels)
    return model

def evaluate_model(model, features, labels):
    preds = model.predict(features)
    preds_proba = model.predict_proba(features)
    
    # 计算准确率、AUC 和 AUPR
    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUC
    aupr = average_precision_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUPR
    
    return accuracy, auc, aupr

In [35]:
# 主函数 - 执行实验
def run_comparison(DRUG, model):
    # 训练：使用源数据集
    adj_s, features_s, labels_s, knn_s, n_features_s = load_data_drug('source', DRUG)
    
    # 测试：使用目标数据集
    adj_t, features_t, labels_t, knn_t, n_features_t = load_data_drug('target', DRUG)
    
    # 训练模型
    model = train_model(model, features_s, labels_s)
    
    # 测试：使用目标数据集
    accuracy, auc, aupr = evaluate_model(model, features_t, labels_t)
    
    # 返回结果
    return DRUG, model.__class__.__name__, accuracy, auc, aupr

In [36]:
# 循环进行对比实验
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    LogisticRegression(max_iter=1000, random_state=42),
    SVC(probability=True, random_state=42),  # SVM支持向量机，设置 probability=True 以获得预测概率
    DecisionTreeClassifier(random_state=42),
    AdaBoostClassifier(n_estimators=100, random_state=42)
]

In [37]:
# 存储所有实验结果的列表
results = []

In [38]:
# 循环进行对比实验
for drug in drugs:
    for model in models:
        drug_name, model_name, acc, auc, aupr = run_comparison(drug, model)
        results.append([drug_name, model_name, acc, auc, aupr])

当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
当前的采样率为: 100%
[0.91021324 0.08978676]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
当前的采样率为: 100%
[0.91021324 0.08978676]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
当前的采样率为: 100%
[0.91021324 0.08978676]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
当前的采样率为: 100%
[0.91021324

In [39]:
# 将结果转换为 DataFrame
df_results = pd.DataFrame(results, columns=['Drug', 'Model', 'Accuracy', 'AUC', 'AUPR'])

In [40]:
# 输出结果到屏幕
print(df_results)

# 保存结果到 CSV 文件
df_results.to_csv('machineLearning_comparison_results.csv', index=False)

             Drug                   Model  Accuracy       AUC      AUPR
0       Gefitinib  RandomForestClassifier  0.545455  0.692837  0.677104
1       Gefitinib      LogisticRegression  0.636364  0.772268  0.728021
2       Gefitinib                     SVC  0.530303  0.466483  0.525563
3       Gefitinib  DecisionTreeClassifier  0.666667  0.666667  0.608108
4       Gefitinib      AdaBoostClassifier  0.484848  0.370064  0.435032
5        Afatinib  RandomForestClassifier  0.641667  0.770417  0.773821
6        Afatinib      LogisticRegression  0.558333  0.584722  0.576832
7        Afatinib                     SVC  0.533333  0.511806  0.581152
8        Afatinib  DecisionTreeClassifier  0.525000  0.525000  0.512963
9        Afatinib      AdaBoostClassifier  0.466667  0.492222  0.547226
10          AR-42  RandomForestClassifier  0.500000  0.761708  0.717803
11          AR-42      LogisticRegression  0.484848  0.651974  0.638351
12          AR-42                     SVC  0.484848  0.599633  0

In [41]:
# 四舍五入保留三位有效数字
df_results['Accuracy'] = df_results['Accuracy'].round(3)
df_results['AUC'] = df_results['AUC'].round(3)
df_results['AUPR'] = df_results['AUPR'].round(3)

# 输出修改后的结果
print(df_results)

# 如果需要，可以保存到 CSV 文件
df_results.to_csv('comparison_results_rounded.csv', index=False)

             Drug                   Model  Accuracy    AUC   AUPR
0       Gefitinib  RandomForestClassifier     0.545  0.693  0.677
1       Gefitinib      LogisticRegression     0.636  0.772  0.728
2       Gefitinib                     SVC     0.530  0.466  0.526
3       Gefitinib  DecisionTreeClassifier     0.667  0.667  0.608
4       Gefitinib      AdaBoostClassifier     0.485  0.370  0.435
5        Afatinib  RandomForestClassifier     0.642  0.770  0.774
6        Afatinib      LogisticRegression     0.558  0.585  0.577
7        Afatinib                     SVC     0.533  0.512  0.581
8        Afatinib  DecisionTreeClassifier     0.525  0.525  0.513
9        Afatinib      AdaBoostClassifier     0.467  0.492  0.547
10          AR-42  RandomForestClassifier     0.500  0.762  0.718
11          AR-42      LogisticRegression     0.485  0.652  0.638
12          AR-42                     SVC     0.485  0.600  0.602
13          AR-42  DecisionTreeClassifier     0.485  0.485  0.493
14        

In [42]:
from preprocess_smote import *  # 直接从 preprocess_smote 导入所有内容
from sklearn.naive_bayes import GaussianNB  # 导入朴素贝叶斯（高斯朴素贝叶斯）
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score

# 药物列表
drugs = [
    'Gefitinib', 'Afatinib', 'AR-42', 'Cetuximab', 'Etoposide', 
    'NVP-TAE684', 'PLX4720', 'PLX4720_451Lu', 'Sorafenib', 'Vorinostat'
]

# 使用朴素贝叶斯进行分类
def train_nb_model(features, labels):
    nb_model = GaussianNB()  # 使用高斯朴素贝叶斯（适用于连续数据）
    nb_model.fit(features, labels)
    return nb_model

# 评估模型
def evaluate_nb_model(nb_model, features, labels):
    preds = nb_model.predict(features)
    preds_proba = nb_model.predict_proba(features)
    
    # 计算准确率、AUC 和 AUPR
    accuracy = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUC
    aupr = average_precision_score(labels, preds_proba[:, 1])  # 使用第二列的概率值计算 AUPR
    
    return accuracy, auc, aupr

# 主函数 - 执行实验
def run_nb_comparison(DRUG):
    # 训练：使用源数据集
    adj_s, features_s, labels_s, knn_s, n_features_s = load_data_drug('source', DRUG)
    
    # 测试：使用目标数据集
    adj_t, features_t, labels_t, knn_t, n_features_t = load_data_drug('target', DRUG)
    
    # 训练朴素贝叶斯
    nb_model = train_nb_model(features_s, labels_s)
    
    # 测试：使用目标数据集
    accuracy, auc, aupr = evaluate_nb_model(nb_model, features_t, labels_t)
    
    # 输出结果
    print(f"Naive Bayes Performance for {DRUG}:")
    print(f"Accuracy: {accuracy:.4f}, AUC: {auc:.4f}, AUPR: {aupr:.4f}")
    return accuracy, auc, aupr

# 循环进行对比实验
for drug in drugs:
    acc, auc, aupr = run_nb_comparison(drug)
    print(f"Results for {drug} - Accuracy: {acc:.4f}, AUC: {auc:.4f}, AUPR: {aupr:.4f}")

当前的采样率为: 100%
[0.86127865 0.13872135]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 535
Naive Bayes Performance for Gefitinib:
Accuracy: 0.5152, AUC: 0.5533, AUPR: 0.5272
Results for Gefitinib - Accuracy: 0.5152, AUC: 0.5533, AUPR: 0.5272
当前的采样率为: 100%
[0.81971154 0.18028846]
类别为1的样本占总样本的比例: 0.4287
类别为1的样本数: 511
Naive Bayes Performance for Afatinib:
Accuracy: 0.5167, AUC: 0.5350, AUPR: 0.5153
Results for Afatinib - Accuracy: 0.5167, AUC: 0.5350, AUPR: 0.5153
当前的采样率为: 100%
[0.91021324 0.08978676]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
Naive Bayes Performance for AR-42:
Accuracy: 0.5303, AUC: 0.6795, AUPR: 0.6676
Results for AR-42 - Accuracy: 0.5303, AUC: 0.6795, AUPR: 0.6676
当前的采样率为: 100%
[0.8583043 0.1416957]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 554
Naive Bayes Performance for Cetuximab:
Accuracy: 0.5000, AUC: 0.4978, AUPR: 0.4952
Results for Cetuximab - Accuracy: 0.5000, AUC: 0.4978, AUPR: 0.4952
当前的采样率为: 100%
[0.93865741 0.06134259]
类别为1的样本占总样本的比例: 0.4288
类别为1的样本数: 608
Naive Bayes Performance for Etopo