In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import auc, precision_recall_curve, roc_curve
from tensorflow import keras

## Organize training data

In [None]:
Train_file_PAT = r'..//train_dataset//REG_dataset_XY.tsv'

# Parse the tab-separated training table (a lone '.' is read as NaN), discard
# the four leading columns, then fill the remaining gaps with each column's mean.
train_ds = (
    pd.read_csv(Train_file_PAT, sep='\t', na_values=['.'], header=0)
    .pipe(lambda frame: frame.drop(columns=frame.columns[[0, 1, 2, 3]]))
    .pipe(lambda frame: frame.fillna(frame.mean()))
)

# NumPy view of the table: every column except the last goes to x_train,
# the last column goes to y_train.
train_np = train_ds.values
x_train, y_train = train_np[:, :-1], train_np[:, -1]

## Organize testing data

In [None]:
test_dataset_filename = ['Brown_eQTL', 'GTEx_eQTL']  # 19-dimensional features
x_test = {}
y_test = {}

# Load every test table and split it into features / last column, keyed by the
# position of its name in test_dataset_filename.
for test_index, test_file_name in enumerate(test_dataset_filename, start=0):
    test_file = r'..//test_dataset//' + test_file_name + '_dataset_XY.tsv'
    # A lone '.' in the file is read as a missing value.
    test_ds = pd.read_csv(test_file, sep='\t', na_values=['.'], header=0)

    # The imputation runs while the leading columns are still present, so the
    # mean is restricted to numeric columns: pandas >= 2.0 raises a TypeError
    # when DataFrame.mean() meets a non-numeric column.
    if test_ds.shape[1] == 19:
        test_ds = test_ds.fillna(test_ds.mean(numeric_only=True))
        test_ds = test_ds.drop(test_ds.columns[range(4)], axis=1)
    elif test_ds.shape[1] == 30:
        test_ds = test_ds.fillna(test_ds.mean(numeric_only=True))
        test_ds = test_ds.drop(test_ds.columns[range(9)], axis=1)
        # Positions 14-19 of the already-trimmed table are removed as well.
        test_ds = test_ds.drop(test_ds.columns[14:20], axis=1)
    # NOTE(review): a table with any other column count is passed through
    # unchanged (no imputation, no column removal) - confirm this is intended.

    test_np = test_ds.values
    x_test[test_index] = test_np[:, :-1]
    y_test[test_index] = test_np[:, -1]
   

### Load the trained models

In [None]:
import joblib

# Every persisted REG model shares this file extension.
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
_MODEL_EXT = ".model"

# Base learners.
REG_GBDT = joblib.load("../model/REG/REG_GBDT" + _MODEL_EXT)
REG_KNN = joblib.load("../model/REG/REG_KNN" + _MODEL_EXT)
REG_LR = joblib.load("../model/REG/REG_LR" + _MODEL_EXT)
REG_XGBoost = joblib.load("../model/REG/REG_XGBoost" + _MODEL_EXT)
REG_RF = joblib.load("../model/REG/REG_RF" + _MODEL_EXT)
REG_SVM = joblib.load("../model/REG/REG_SVM" + _MODEL_EXT)

# Stacking model.
REG_Stacking = joblib.load("../model/REG/REG_Stacking" + _MODEL_EXT)

In [None]:
# Loaded models in plotting order; the plotting helpers zip this list with the
# display names, so the positions here must line up with that name list.
REG_model_List = [
    REG_GBDT,
    REG_KNN,
    REG_LR,
    REG_XGBoost,
    REG_SVM,
    REG_RF,
    REG_Stacking,
]

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score,auc

def multi_models_roc(names, sampling_methods, colors, X_test, y_test, save_name, save=False, dpin=100):
    """
    Draw the ROC curve of every model on one shared figure.

    Args:
        names: list of str, legend label for each model.
        sampling_methods: list of fitted models exposing ``predict_proba``;
            column 1 of its output is used as the positive-class score.
        colors: list of matplotlib colour specs, one per model.
        X_test: feature matrix handed to each model's ``predict_proba``.
        y_test: true labels; the positive class is ``1``.
        save_name: text inserted into the output file name when ``save`` is true.
        save: when true, write the figure to
            ``'REG_auroc' + save_name + '_baseClf.png'``.
        dpin: figure resolution in dots per inch.

    Returns:
        The ``matplotlib.pyplot`` module (``plt``); its current figure is the
        one drawn here.
    """
    plt.figure(figsize=(5, 5), dpi=dpin)

    # zip() stops at the shortest input, so surplus colours are simply ignored.
    for name, method, colorname in zip(names, sampling_methods, colors):
        # Only the probabilities are needed; no hard-label predict() pass.
        y_test_predprob = method.predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_test_predprob, pos_label=1)
        plt.plot(fpr, tpr, lw=2, label='{} (AUC={:.4f})'.format(name, auc(fpr, tpr)), color=colorname)

    # The decoration is the same for every model, so it is applied once after
    # all curves are drawn instead of once per curve.
    plt.plot([0, 1], [0, 1], '--', lw=5, color='grey')
    plt.axis('square')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate', fontsize=13)
    plt.ylabel('True Positive Rate', fontsize=13)
    plt.title('ROC Curve', fontsize=8)
    plt.legend(loc='lower right', fontsize=8)

    if save:
        plt.savefig('REG_auroc' + save_name + '_baseClf.png')

    return plt



In [None]:
from sklearn.metrics import roc_curve,roc_auc_score,auc

def multi_models_auprc(names, sampling_methods, colors, X_test, y_test, save_name, save=False, dpin=100):
    """
    Draw the precision-recall curve of every model on one shared figure.

    Args:
        names: list of str, legend label for each model.
        sampling_methods: list of fitted models exposing ``predict_proba``;
            column 1 of its output is used as the positive-class score.
        colors: list of matplotlib colour specs, one per model.
        X_test: feature matrix handed to each model's ``predict_proba``.
        y_test: true labels.
        save_name: text inserted into the output file name when ``save`` is true.
        save: when true, write the figure to
            ``'REG_multi_models_aupr' + save_name + '_baseClf.png'``.
        dpin: figure resolution in dots per inch.

    Returns:
        The ``matplotlib.pyplot`` module (``plt``); its current figure is the
        one drawn here.
    """
    plt.figure(figsize=(5, 5), dpi=dpin)

    # zip() stops at the shortest input, so surplus colours are simply ignored.
    for name, method, colorname in zip(names, sampling_methods, colors):
        y_test_predprob = method.predict_proba(X_test)[:, 1]
        # Only the precision-recall points are needed; no ROC pass is computed.
        precision, recall, _ = precision_recall_curve(y_test, y_test_predprob)
        plt.plot(recall, precision, lw=2, label='{} (AUC={:.4f})'.format(name, auc(recall, precision)), color=colorname)

    # The decoration is the same for every model, so it is applied once after
    # all curves are drawn instead of once per curve.
    # NOTE(review): the grey diagonal is kept from the original figure; for a
    # precision-recall plot the chance level is the positive-class prevalence.
    plt.plot([0, 1], [0, 1], '--', lw=5, color='grey')
    plt.axis('square')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('Recall', fontsize=13)
    plt.ylabel('Precision', fontsize=13)
    plt.title('AUPR Curve', fontsize=8)
    plt.legend(loc='lower right', fontsize=8)

    if save:
        plt.savefig('REG_multi_models_aupr' + save_name + '_baseClf.png')

    return plt



#### get predictions

In [None]:
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
# Legend labels, listed in the same order as the models in REG_model_List.
names = ['GBDT', 'KNN', 'LR', 'XGBoost', 'SVM', 'RF', 'Stacking']

# Curve colours; the plotting helpers zip names, models and colours together,
# so a surplus colour at the end is never used.
colors = [
    'crimson', 'orange', 'gold', 'mediumseagreen',
    'steelblue', 'mediumpurple', 'green', 'black',
]

# ROC and precision-recall figures for test set 0 (first entry of
# test_dataset_filename). `save` is left at its default, so nothing is written.
REG_ROC_graph = multi_models_roc(
    names=names,
    sampling_methods=REG_model_List,
    colors=colors,
    X_test=x_test[0],
    y_test=y_test[0],
    save_name="base_model_Brown_eQTL",
)
REG_AUPRC_graph = multi_models_auprc(
    names=names,
    sampling_methods=REG_model_List,
    colors=colors,
    X_test=x_test[0],
    y_test=y_test[0],
    save_name="base_model_Brown_eQTL",
)

## Other methods

In [None]:
test_dataset_filename = ['Brown_eQTL', 'GTEx_eQTL']

x_test = {}
y_test = {}

# Unlike x_test / y_test, test_ds keeps the labelled DataFrame of each test set
# so individual score columns can still be selected by name afterwards.
test_ds = {}
for test_index, test_file_name in enumerate(test_dataset_filename, start=0):
    # Path is relative to the notebook (not a machine-specific absolute path)
    # so the cell runs on any checkout of the project.
    test_file = r'..//test_dataset//' + test_file_name + '_dataset_XY.tsv'
    # A lone '.' in the file is read as a missing value.
    frame = pd.read_csv(test_file, sep='\t', na_values=['.'], header=0)
    # Imputation happens before the leading columns are removed, so the mean is
    # limited to numeric columns (pandas >= 2.0 raises on non-numeric ones).
    frame = frame.fillna(frame.mean(numeric_only=True))
    test_ds[test_index] = frame.drop(frame.columns[range(4)], axis=1)

    # Every column except the last goes to x_test, the last column to y_test.
    test_np = test_ds[test_index].values
    x_test[test_index] = test_np[:, :-1]
    y_test[test_index] = test_np[:, -1]
    print(x_test[test_index].shape, y_test[test_index].shape)

## CADD,CScape,DANN,FATHMM-MKL,FIRE

In [None]:
colors = ['crimson',
          'orange',
          'gold',
          'mediumseagreen',
          'steelblue',
]

# Score columns read from each test table; the column order of method_data
# follows this list.
method_list = ['CADD', 'CScape', 'DANN', 'FATHMM-MKL', 'FIRE']

# One (n_rows, len(method_list)) matrix per loaded test set. Iterating over
# test_ds itself keeps this in step with however many sets were loaded, rather
# than indexing a fixed number of entries.
method_data = {
    test_index: np.column_stack([frame[method_name].values for method_name in method_list])
    for test_index, frame in test_ds.items()
}

# NOTE(review): `label` is not read by any cell visible here - confirm it is
# still needed.
label = train_ds['label']

print(method_data[0].shape)
print(method_data[1].shape)

### load DeepGS & regBase

In [None]:
from tensorflow.keras.models import load_model

# DeepGS is stored as a Keras HDF5 file, regBase as a joblib dump.
model_DeepGS = load_model(filepath='../model/REG/REG_DeepGS.h5')
model_regBase = joblib.load(filename="../model/REG/REG_regBase" + ".model")



In [None]:
# Sanity check: shape of the score matrix of each of the two test sets.
for dataset_idx in (0, 1):
    print(method_data[dataset_idx].shape)

In [None]:
# --- DeepGS -----------------------------------------------------------------
y_test_predprob_DeepGS = {}
# Register the method name only once and remember its column position, so that
# re-running this cell refreshes the existing column instead of appending a
# duplicate name and a duplicate column.
if 'DeepGS' not in method_list:
    method_list.append('DeepGS')
deepgs_column = method_list.index('DeepGS')
for test_index in range(len(test_dataset_filename)):
    print(test_index)
    # Column 1 of the network output is used as the score
    # (assumes a two-column output - TODO confirm against the saved model).
    y_test_predprob_DeepGS[test_index] = model_DeepGS.predict(x_test[test_index])[:, 1]
    print(y_test_predprob_DeepGS[test_index].shape)
    if method_data[test_index].shape[1] > deepgs_column:
        # Column already exists from an earlier run: overwrite it.
        method_data[test_index][:, deepgs_column] = y_test_predprob_DeepGS[test_index]
    else:
        method_data[test_index] = np.append(
            method_data[test_index], y_test_predprob_DeepGS[test_index].reshape(-1, 1), axis=1)

# --- regBase ----------------------------------------------------------------
y_test_predprob_regBase = {}
if 'regBase' not in method_list:
    method_list.append('regBase')
regbase_column = method_list.index('regBase')
for test_index in range(len(test_dataset_filename)):
    print(test_index)
    # Column 1 of predict_proba is used as the positive-class score.
    y_test_predprob_regBase[test_index] = model_regBase.predict_proba(x_test[test_index])[:, 1]
    print(y_test_predprob_regBase[test_index].shape)
    if method_data[test_index].shape[1] > regbase_column:
        # Column already exists from an earlier run: overwrite it.
        method_data[test_index][:, regbase_column] = y_test_predprob_regBase[test_index]
    else:
        method_data[test_index] = np.append(
            method_data[test_index], y_test_predprob_regBase[test_index].reshape(-1, 1), axis=1)

## Stacking

In [None]:
y_test_predprob_stacking = {}
# Register the method name only once and remember its column position, so that
# re-running this cell refreshes the existing column instead of appending a
# duplicate name and a duplicate column.
if 'Stacking' not in method_list:
    method_list.append('Stacking')
stacking_column = method_list.index('Stacking')
for test_index in range(len(test_dataset_filename)):
    print(test_index)
    # REG_Stacking is the stacking model loaded with the other REG models.
    # Its scores go into their own dict so the regBase predictions stay intact.
    y_test_predprob_stacking[test_index] = REG_Stacking.predict_proba(x_test[test_index])[:, 1]
    print(y_test_predprob_stacking[test_index].shape)
    if method_data[test_index].shape[1] > stacking_column:
        # Column already exists from an earlier run: overwrite it.
        method_data[test_index][:, stacking_column] = y_test_predprob_stacking[test_index]
    else:
        method_data[test_index] = np.append(
            method_data[test_index], y_test_predprob_stacking[test_index].reshape(-1, 1), axis=1)

In [None]:
# Sanity check: the method names, then the score-matrix shape of each test set
# (the column count should equal the number of names).
print(method_list)
for dataset_idx in (0, 1):
    print(method_data[dataset_idx].shape)

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score,auc,precision_recall_curve
from sklearn import metrics

def multi_method_AUROC(names, colors, X_test, y_test, save_name, save=False, dpin=100):
    """
    Draw the ROC curve of every scoring method on one shared figure.

    Args:
        names: list of str, method names; ``names[i]`` labels column ``i`` of
            ``X_test``.
        colors: list of matplotlib colour specs, indexed like ``names`` (needs
            at least ``len(names)`` entries).
        X_test: 2-D array of precomputed scores, one column per method.
        y_test: true labels; the positive class is ``1``.
        save_name: text inserted into the output file name when ``save`` is true.
        save: when true, write the figure to
            ``'REG_multi_models_auroc' + save_name + '_otherMethod.png'``.
        dpin: figure resolution in dots per inch.

    Returns:
        The ``matplotlib.pyplot`` module (``plt``); its current figure is the
        one drawn here.
    """
    plt.figure(figsize=(5, 5), dpi=dpin)

    for method_index, method_name in enumerate(names, start=0):
        # The scores are already computed: just pick this method's column.
        y_test_predprob = X_test[:, method_index]
        fpr, tpr, _ = roc_curve(y_test, y_test_predprob, pos_label=1)
        print(method_name)

        plt.plot(fpr, tpr, lw=2, label='{} (AUC={:.4f})'.format(method_name, auc(fpr, tpr)), color=colors[method_index])

    # The decoration is the same for every method, so it is applied once after
    # all curves are drawn instead of once per curve.
    plt.plot([0, 1], [0, 1], '--', lw=5, color='grey')
    plt.axis('square')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate', fontsize=13)
    plt.ylabel('True Positive Rate', fontsize=13)
    plt.legend(loc='lower right', fontsize=8)

    if save:
        plt.savefig('REG_multi_models_auroc' + save_name + '_otherMethod.png')

    return plt

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score,auc,precision_recall_curve
from sklearn import metrics

def multi_method_AUPRC(names, colors, X_test, y_test, save_name, save=False, dpin=100):
    """
    Draw the precision-recall curve of every scoring method on one shared figure.

    Args:
        names: list of str, method names; ``names[i]`` labels column ``i`` of
            ``X_test``.
        colors: list of matplotlib colour specs, indexed like ``names`` (needs
            at least ``len(names)`` entries).
        X_test: 2-D array of precomputed scores, one column per method.
        y_test: true labels.
        save_name: text inserted into the output file name when ``save`` is true.
        save: when true, write the figure to
            ``'REG_multi_models_aupr_' + save_name + 'otherMethod.png'``.
        dpin: figure resolution in dots per inch.

    Returns:
        The ``matplotlib.pyplot`` module (``plt``); its current figure is the
        one drawn here.
    """
    plt.figure(figsize=(5, 5), dpi=dpin)

    for method_index, method_name in enumerate(names, start=0):
        # The scores are already computed: just pick this method's column.
        y_test_predprob = X_test[:, method_index]
        print(method_name)

        # Only the precision-recall points are needed; no ROC pass is computed.
        precision, recall, _ = precision_recall_curve(y_test, y_test_predprob)
        plt.plot(recall, precision, lw=2, label='{} (AUC={:.4f})'.format(method_name, auc(recall, precision)), color=colors[method_index])

    # The decoration is the same for every method, so it is applied once after
    # all curves are drawn instead of once per curve.
    # NOTE(review): the grey diagonal is kept from the original figure; for a
    # precision-recall plot the chance level is the positive-class prevalence.
    plt.plot([0, 1], [0, 1], '--', lw=5, color='grey')
    plt.axis('square')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    # x carries recall and y carries precision, matching what is plotted.
    plt.xlabel('Recall', fontsize=13)
    plt.ylabel('Precision', fontsize=13)
    plt.legend(loc='lower right', fontsize=8)

    if save:
        plt.savefig('REG_multi_models_aupr_' + save_name + 'otherMethod.png')

    return plt

In [None]:
# Curve colours, indexed in step with method_list by the plotting helpers.
colors = [
    'crimson', 'orange', 'gold', 'mediumseagreen',
    'steelblue', 'mediumpurple', 'blue', 'black',
]

# Precision-recall and ROC figures for test set 0. `save` is left at its
# default, so no file is written.
multi_method_AUPRC(
    names=method_list,
    colors=colors,
    X_test=method_data[0],
    y_test=y_test[0],
    save_name='other_Brown',
)
multi_method_AUROC(
    names=method_list,
    colors=colors,
    X_test=method_data[0],
    y_test=y_test[0],
    save_name='other_Brown',
)


In [None]:
# Same two figures for test set 1. `save` is left at its default, so no file
# is written.
multi_method_AUPRC(
    names=method_list,
    colors=colors,
    X_test=method_data[1],
    y_test=y_test[1],
    save_name='GTEx',
)
multi_method_AUROC(
    names=method_list,
    colors=colors,
    X_test=method_data[1],
    y_test=y_test[1],
    save_name='GTEx',
)