In [None]:
#import package
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from PyBioMed import Pyprotein
from PyBioMed.PyProtein import CTD
from sklearn.decomposition import PCA
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from deepctr.models import AutoInt
from deepctr.feature_column import SparseFeat,DenseFeat,get_feature_names
from tensorflow.python.keras.callbacks import EarlyStopping
from tensorflow.python.keras.optimizers import Adam,Adagrad,Adamax
from tensorflow import keras
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
def load_data(i):
    """Read the train/test CSVs of cross-validation fold ``i`` (0-based).

    Files are looked up as ``data_path + 'train_fold_<i+1>.csv'`` and
    ``data_path + 'test_fold_<i+1>.csv'``; ``data_path`` and ``kg`` are
    notebook-level globals that must be defined before this is called.

    Returns:
        train:     the fold's training rows (head, relation, tail, label).
        train_pos: the rows of ``train`` whose label equals 1.
        test:      the fold's test rows (head, relation, tail, label).
        data:      ``train_pos`` stacked on top of ``kg``, restricted to the
                   (head, relation, tail) columns.
    """
    triple_cols = ['head','relation','tail']
    wanted_cols = triple_cols + ['label']
    fold_no = i+1  # files on disk are numbered from 1

    train = pd.read_csv(f"{data_path}train_fold_{fold_no}.csv")[wanted_cols]
    test = pd.read_csv(f"{data_path}test_fold_{fold_no}.csv")[wanted_cols]

    positive_mask = train['label']==1
    train_pos = train[positive_mask]
    data = pd.concat([train_pos,kg])[triple_cols]
    return train,train_pos,test,data

def roc_auc(y,pred):
    """Return the area under the ROC curve of scores ``pred`` against labels ``y``.

    The curve comes from ``metrics.roc_curve`` and is integrated with
    ``metrics.auc``; the thresholds are not used.
    """
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(y, pred)
    return metrics.auc(false_pos_rate, true_pos_rate)

def pr_auc(y, pred):
    """Return the area under the precision-recall curve of ``pred`` against ``y``.

    The curve comes from ``metrics.precision_recall_curve`` and is integrated
    with ``metrics.auc`` using recall as x and precision as y; the thresholds
    are not used.
    """
    prec, rec, _ = metrics.precision_recall_curve(y, pred)
    return metrics.auc(rec, prec)

def get_features(data,fp_df,prodes_df,pca=None,n_components=400):
    """Build the PCA-reduced drug+protein descriptor matrix for the rows of ``data``.

    Each row of ``data`` is left-joined to its drug descriptors (``tail`` ->
    ``fp_df['drug_id']``) and protein descriptors (``head`` ->
    ``prodes_df['pro_id']``); the two descriptor blocks are concatenated
    column-wise and projected with PCA.

    Args:
        data: DataFrame with ``head`` and ``tail`` columns. The positional
            slices below start at column 4, i.e. they assume ``data`` has
            exactly three columns followed by the merged id column
            (callers pass ``[['head','relation','tail']]``).
        fp_df: DataFrame with a ``drug_id`` column followed by drug descriptors.
        prodes_df: DataFrame with a ``pro_id`` column followed by protein descriptors.
        pca: optional PCA-like object. If it is already fitted (has a
            ``components_`` attribute) it is only used to ``transform``; if it
            is unfitted it is fitted on this call. Passing the same object for
            the training call and then the test call projects both sets with
            the components learned on the training data. ``None`` (default)
            fits a brand-new PCA on every call, which is the original behavior.
        n_components: number of components for the PCA created when ``pca`` is
            ``None`` (default 400, the previously hard-coded value).

    Returns:
        2-D numpy array with one row per row of ``data``.
    """
    drug_features = pd.merge(data,fp_df,how='left',left_on='tail',right_on='drug_id').iloc[:,4:1029].values
    pro_features = pd.merge(data,prodes_df,how='left',left_on='head',right_on='pro_id').iloc[:,4:105].values
    feature = np.concatenate([drug_features,pro_features],axis=1)
    if pca is None:
        pca = PCA(n_components=n_components)
    # A fitted projector must only transform: re-fitting on test data would put
    # train and test features in different, incomparable component spaces.
    if hasattr(pca,'components_'):
        return pca.transform(feature)
    return pca.fit_transform(feature)

# 'DenseFeat("des",train_des.shape[1])' and "'des':train_des" are used for NFM training
def get_input(re_train_all,re_test_all,train_des,test_des,embedding_dim):
    """Assemble the feature columns and the train/test input dicts for the model.

    The descriptor matrices are min-max scaled with the notebook-level ``mms``
    scaler (fitted on ``train_des``, then applied to ``test_des``), and the
    ``head``/``tail`` ids are converted to integer indices with the
    notebook-level ``head_le``/``tail_le`` encoders.

    Args:
        re_train_all: training rows with ``head`` and ``tail`` columns.
        re_test_all: test rows with ``head`` and ``tail`` columns.
        train_des: descriptor matrix for the training rows.
        test_des: descriptor matrix for the test rows.
        embedding_dim: embedding size used for both sparse features.

    Returns:
        (feature_columns, train_model_input, test_model_input)
    """
    train_all_feats_scaled = mms.fit_transform(train_des)
    test_all_feats_scaled = mms.transform(test_des)
    # The vocabulary size must cover every index the encoders can emit. The
    # encoders are fitted on the full id lists, so counting only the ids that
    # occur in this training fold can yield indices outside the embedding table.
    n_heads = len(head_le.classes_)
    n_tails = len(tail_le.classes_)
    feature_columns = [SparseFeat('head',n_heads,embedding_dim=embedding_dim),
                        SparseFeat('tail',n_tails,embedding_dim=embedding_dim),
                        DenseFeat("feats",train_all_feats_scaled.shape[1]),
                        #DenseFeat("des",train_des.shape[1])
                        ]
    train_model_input = {'head':head_le.transform(re_train_all['head'].values),
                    'tail':tail_le.transform(re_train_all['tail'].values),
                     'feats':train_all_feats_scaled,
                     #'des':train_des
                    }
    test_model_input = {'head':head_le.transform(re_test_all['head'].values),
                    'tail':tail_le.transform(re_test_all['tail'].values),
                    'feats':test_all_feats_scaled,
                    # 'des':test_des
                    }
    return feature_columns,train_model_input,test_model_input


In [None]:
"""Load data"""
#1.Get all dt-drug target
# Tab-separated file without a header row; columns are named here.
dt = pd.read_csv("data/dt_drug.txt",delimiter='\t',header=None)
# 'relation' was previously misspelled as 'realtion', which made this frame
# inconsistent with `kg` and with the fold CSV columns.
dt.columns = ['head','relation','tail']

#2.Get dt KG data
# Same three-column layout; load_data() concatenates `kg` with the positive training rows.
kg = pd.read_csv('data/KG/dt_graph.txt',delimiter='\t',header=None)
kg.columns = ['head','relation','tail']

In [None]:
"""Normalization"""
# Shared scaler mapping each feature to [0, 1]. get_input() re-fits it on the
# training descriptors of every fold and then applies it to the test descriptors.
mms = MinMaxScaler(feature_range=(0,1))

In [None]:
#3. Load drug and protein descriptors
# Drug side: id table plus a comma-separated numeric matrix (the file name
# suggests Morgan fingerprints). The rows are assumed to be in the same order
# as drug_smi.txt, since the two are glued together by position -- TODO confirm.
drug = pd.read_csv('data/drug_smi.txt', sep='\t')
fp_id = drug['drug_id']
drug_feats = np.loadtxt('data/drug_morganfp.txt', delimiter=',')
fp_df = pd.concat([fp_id, pd.DataFrame(drug_feats)], axis=1)

# Protein side: id/sequence table plus a comma-separated numeric matrix (the
# file name suggests CTD descriptors), joined by position in the same way.
df_proseq = pd.read_csv('data/transport_pro_seq.txt', sep='\t')
df_proseq.columns = ['pro_id', 'seq']
pro_id = df_proseq['pro_id']
pro_feats = np.loadtxt('data/dt_pro_ctd.txt', delimiter=',')
prodes_df = pd.concat([pro_id, pd.DataFrame(pro_feats)], axis=1)

In [None]:
#4. Encode head and tail for input
# One encoder per id space, fitted on the complete protein / drug id lists so
# that every id appearing in any fold can be transformed to an integer index.
head_le = LabelEncoder().fit(df_proseq['pro_id'].values)
tail_le = LabelEncoder().fit(drug['drug_id'].values)

In [None]:
#5. Data path (for the unbalanced dataset, change data_path)
# load_data() builds file names by plain string concatenation with this prefix,
# so the trailing slash is required.
data_path = "data/data_folds/1_1/"

In [None]:
# Run settings shared by the cross-validation cells.
# The two *_num_neg values are not referenced by any cell visible in this notebook.
test_num_neg, train_num_neg = 10, 10
embedding_dim = 50   # embedding size handed to get_input() for the sparse features
patience = 10        # EarlyStopping patience (epochs without improvement)

In [None]:
# 10-fold cross-validation of the AutoInt model: one model is trained from
# scratch per fold and its ROC-AUC / PR-AUC on the fold's test set is printed.
# NOTE(review): get_features() is called separately for train and test, so each
# call fits its own PCA -- confirm this is intended.
for i in range(10):
    #print("———————————— Cross_valid "+ str(i) +"——————————————")
    train,train_pos,test,data = load_data(i)
    columns = ['head','relation','tail']
    re_train_all,re_test_all = train[columns],test[columns]
    train_label,test_label = train['label'],test['label'].values

    # Descriptor features and model inputs for this fold.
    train_des = get_features(re_train_all,fp_df,prodes_df)
    test_des = get_features(re_test_all,fp_df,prodes_df)
    feature_columns,train_model_input,test_model_input = get_input(
        re_train_all,re_test_all,train_des,test_des,embedding_dim)

    # The same feature columns are passed for both positional arguments.
    autoint_model = AutoInt(feature_columns,feature_columns,
                            att_layer_num=9,att_head_num=4,l2_reg_embedding=0.001)
    autoint_model.compile("adam","binary_crossentropy",
                          metrics=[keras.metrics.Precision(name='precision')])

    # Stop on the *training* loss (no validation split is used) and roll back
    # to the best weights seen.
    es = EarlyStopping(monitor='loss',patience=patience,min_delta=0.0001,
                       mode='min',restore_best_weights=True)
    history = autoint_model.fit(train_model_input,train_label,
                                batch_size=64,epochs=2000,verbose=2,
                                callbacks=[es])

    # predict() returns a 2-D array; column 0 holds the score used for the curves.
    pred_y = autoint_model.predict(test_model_input,batch_size=64)
    roc_nfm = roc_auc(test_label,pred_y[:,0])
    pr_nfm = pr_auc(test_label,pred_y[:,0])

    print(f"ROC:{roc_nfm!s}")
    print(f"PROC:{pr_nfm!s}")

In [None]:
# 10-fold cross-validation of a logistic-regression baseline on the scaled
# descriptor features ('feats'); prints ROC-AUC and PR-AUC per fold.
# NOTE(review): get_features() is called separately for train and test, so each
# call fits its own PCA -- confirm this is intended.
for i in range(10):
    print("———————————— Cross_valid "+ str(i) +"——————————————")
    train,train_pos,test,data = load_data(i)
    columns = ['head','relation','tail']
    re_train_all = train[columns]
    re_test_all = test[columns]
    train_label = train['label']
    test_label = test['label'].values
    train_des = get_features(re_train_all,fp_df,prodes_df)
    test_des = get_features(re_test_all,fp_df,prodes_df)
    
    # get_input() is used here only for its min-max scaled 'feats' matrices.
    feature_columns,train_model_input,test_model_input = get_input(re_train_all,re_test_all,
                                                               train_des,test_des,
                                                                    embedding_dim)
    
    #Logisitc
    lr_model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                             intercept_scaling=1, max_iter=100,
                             n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
                             tol=0.0001, verbose=0, warm_start=False)

    lr_model.fit(train_model_input['feats'],train_label)
    # ROC / PR curves need continuous scores. Hard 0/1 labels from predict()
    # collapse each curve to a single operating point and make the AUC
    # incomparable with the probability-based AutoInt evaluation.
    # classes_ is sorted, so with 0/1 labels column 1 is P(label == 1).
    y_pred = lr_model.predict_proba(test_model_input['feats'])[:,1]

    roc = roc_auc(test_label, y_pred)
    pr = pr_auc(test_label, y_pred)

    print("ROC:"+str(roc))
    print("PROC:"+str(pr))

In [None]:
# 10-fold cross-validation of a random-forest baseline on the scaled
# descriptor features ('feats'); prints ROC-AUC and PR-AUC per fold.
# NOTE(review): get_features() is called separately for train and test, so each
# call fits its own PCA -- confirm this is intended.
for i in range(10):
    print("———————————— Cross_valid "+ str(i) +"——————————————")
    train,train_pos,test,data = load_data(i)
    columns = ['head','relation','tail']
    re_train_all = train[columns]
    re_test_all = test[columns]
    train_label = train['label']
    test_label = test['label'].values
    train_des = get_features(re_train_all,fp_df,prodes_df)
    test_des = get_features(re_test_all,fp_df,prodes_df)
    
    # get_input() is used here only for its min-max scaled 'feats' matrices.
    feature_columns,train_model_input,test_model_input = get_input(re_train_all,re_test_all,
                                                               train_des,test_des,
                                                                embedding_dim)
    
    #RamdomForest
    # max_features='sqrt' is what 'auto' meant for classifiers; 'auto' and the
    # min_impurity_split argument are rejected by newer scikit-learn releases,
    # so the former is spelled out and the latter (previously None) is dropped.
    rf_model = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            min_samples_leaf=10, min_samples_split=80,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

    rf_model.fit(train_model_input['feats'],train_label)
    # ROC / PR curves need continuous scores. Hard 0/1 labels from predict()
    # collapse each curve to a single operating point and make the AUC
    # incomparable with the probability-based AutoInt evaluation.
    # classes_ is sorted, so with 0/1 labels column 1 is P(label == 1).
    y_pred = rf_model.predict_proba(test_model_input['feats'])[:,1]

    roc = roc_auc(test_label, y_pred)
    pr = pr_auc(test_label, y_pred)

    print("ROC:"+str(roc))
    print("PROC:"+str(pr))

In [None]:
# 10-fold cross-validation of an RBF-kernel SVM baseline on the scaled
# descriptor features ('feats'); prints ROC-AUC and PR-AUC per fold.
# NOTE(review): get_features() is called separately for train and test, so each
# call fits its own PCA -- confirm this is intended.
for i in range(10):
    print("———————————— Cross_valid "+ str(i) +"——————————————")
    train,train_pos,test,data = load_data(i)
    columns = ['head','relation','tail']
    re_train_all = train[columns]
    re_test_all = test[columns]
    train_label = train['label']
    test_label = test['label'].values
    train_des = get_features(re_train_all,fp_df,prodes_df)
    test_des = get_features(re_test_all,fp_df,prodes_df)
    
    # get_input() is used here only for its min-max scaled 'feats' matrices.
    feature_columns,train_model_input,test_model_input = get_input(re_train_all,re_test_all,
                                                               train_des,test_des,
                                                                embedding_dim)
    
    #SVM
    svm_model = SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
                    decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
                    max_iter=-1, probability=False, random_state=None, shrinking=True,
                    tol=0.001, verbose=False)

    svm_model.fit(train_model_input['feats'],train_label)
    # ROC / PR curves need continuous scores. Hard 0/1 labels from predict()
    # collapse each curve to a single operating point. The model is built with
    # probability=False, so the signed distance to the separating surface from
    # decision_function() is used as the ranking score.
    y_pred = svm_model.decision_function(test_model_input['feats'])

    roc = roc_auc(test_label, y_pred)
    pr = pr_auc(test_label, y_pred)

    print("ROC:"+str(roc))
    print("PROC:"+str(pr))