In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
import numpy as np
import os
import re
import pandas as pd
import scipy.sparse as sp
import torch as th

#import dgl
#from dgl.data.utils import download, extract_archive, get_download_dir

from itertools import product
from collections import Counter
from copy import deepcopy
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import accuracy_score

import random
random.seed(1234)
np.random.seed(1234)


In [21]:
def load_data(directory):
    D_SSM = np.loadtxt(directory + '/D_SM.txt')


    M_FSM = np.loadtxt(directory + '/M_SM.txt')

    print('D_SSM',D_SSM)
    print('M_FSM',M_FSM)

    ID = np.zeros(shape=(D_SSM.shape[0], D_SSM.shape[1]))
    IM = np.zeros(shape=(M_FSM.shape[0], M_FSM.shape[1]))
    for i in range(D_SSM.shape[0]):
        for j in range(D_SSM.shape[1]):
            if D_SSM[i][j] == 0:
                ID[i][j] = D_GSM[i][j]###
            else:
                ID[i][j] = D_SSM[i][j]
    for i in range(M_FSM.shape[0]):
        for j in range(M_FSM.shape[1]):
            if M_FSM[i][j] == 0:
                IM[i][j] = M_GSM[i][j]##3
            else:
                IM[i][j] = M_FSM[i][j]
                
    ID = pd.DataFrame(ID).reset_index()
    IM = pd.DataFrame(IM).reset_index()
    print('ID',ID)
    print('IM',IM)
    ID.rename(columns = {'index':'id'}, inplace = True)
    IM.rename(columns = {'index':'id'}, inplace = True)
    ID['id'] = ID['id'] + 1
    IM['id'] = IM['id'] + 1
    print('ID',ID)
    print('IM',IM)
    #print(ID.shape)
    #print(IM.shape)
    return ID, IM


In [22]:
def sample(directory, random_seed):
    all_associations = pd.read_csv(directory + '/drug_mutation_pairs.csv', names=['Drug', 'Mutation', 'label'])
    known_associations = all_associations.loc[all_associations['label'] == 1]
    unknown_associations = all_associations.loc[all_associations['label'] == 0]
    random_negative = unknown_associations.sample(n=known_associations.shape[0], random_state=random_seed, axis=0)

    sample_df = known_associations.append(random_negative)
    sample_df.reset_index(drop=True, inplace=True)
    #print(sample_df)
                 
    return sample_df

In [23]:
def obtain_data(directory, isbalance):
    ID, IM = load_data(directory)
    
    if isbalance:
        dtp = sample(directory, random_seed = 1234)
    else:
        dtp = pd.read_csv(directory + '/drug_mutation_pairs.csv', names=['Drug', 'Mutation', 'label'])
        
    mirna_ids = list(set(dtp['Drug']))
    disease_ids = list(set(dtp['Mutation']))
    
    print('mirna_ids',len(mirna_ids))
    print('disease_ids',len(disease_ids))
    random.shuffle(mirna_ids)
    random.shuffle(disease_ids)
    print('# Drug = {} | Mutation = {}'.format(len(mirna_ids), len(disease_ids)))

    mirna_test_num = int(len(mirna_ids) / 5)
    disease_test_num = int(len(disease_ids) / 5)
    print('# Test: Drug = {} | Mutation = {}'.format(mirna_test_num, disease_test_num))
    
    #print(ID.shape)
    #print(IM.shape)
    #print('dtp',dtp)
    #cf=pd.merge(dtp, IM, left_on = 'miRNA', right_on = 'id')
    #print('cf',cf)
    #print('ID',ID)
    #df=pd.merge(cf, ID, left_on = 'disease', right_on = 'id')
    #print('df',df)
    #print(dtp.to_csv('C:/Users/Administrator/Desktop/图采样data/text/dtp.csv'))
    #print(ID.to_csv('C:/Users/Administrator/Desktop/图采样data/text/ID.csv'))
    #print(IM.to_csv('C:/Users/Administrator/Desktop/图采样data/text/IM.csv'))
    
    knn_x = pd.merge(pd.merge(dtp, ID, left_on = 'Drug', right_on = 'id'), IM, left_on = 'Mutation', right_on = 'id')
    #print('knn_x',knn_x)
    label = dtp['label']
    knn_x.drop(labels = ['Drug', 'Mutation', 'label', 'id_x', 'id_y'], axis = 1, inplace = True)
    assert ID.shape[0] + IM.shape[0] == knn_x.shape[1]
    #print(knn_x.shape, Counter(label))
    #print(label.shape)
    return ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, knn_x, label

In [24]:
def generate_task_Tp_train_test_idx(knn_x):
    kf = KFold(n_splits = 5, shuffle = True, random_state = 1234)

    train_index_all, test_index_all, n = [], [], 0
    train_id_all, test_id_all = [], []
    fold = 0
    for train_idx, test_idx in tqdm(kf.split(knn_x)): #train_index与test_index为下标
        print('-------Fold ', fold)
        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)

        train_id_all.append(np.array(dtp.iloc[train_idx][['Drug', 'Mutation']]))
        test_id_all.append(np.array(dtp.iloc[test_idx][['Drug', 'Mutation']]))

        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        fold += 1
    return train_index_all, test_index_all, train_id_all, test_id_all

In [25]:
def generate_task_Tm_Td_train_test_idx(item, ids, dtp):
    
    test_num = int(len(ids) / 5)
    
    train_index_all, test_index_all = [], []
    train_id_all, test_id_all = [], []
    
    for fold in range(5):
        print('-------Fold ', fold)
        if fold != 4:
            test_ids = ids[fold * test_num : (fold + 1) * test_num]
        else:
            test_ids = ids[fold * test_num :]

        train_ids = list(set(ids) ^ set(test_ids))
        print('# {}: Train = {} | Test = {}'.format(item, len(train_ids), len(test_ids)))

        test_idx = dtp[dtp[item].isin(test_ids)].index.tolist()
        train_idx = dtp[dtp[item].isin(train_ids)].index.tolist()
        random.shuffle(test_idx)
        random.shuffle(train_idx)
        print('# Pairs: Train = {} | Test = {}'.format(len(train_idx), len(test_idx)))
        assert len(train_idx) + len(test_idx) == len(dtp)

        train_index_all.append(train_idx) 
        test_index_all.append(test_idx)
        
        train_id_all.append(train_ids)
        test_id_all.append(test_ids)
        
    return train_index_all, test_index_all, train_id_all, test_id_all

# KNN

In [26]:
from sklearn.neighbors import KNeighborsClassifier

In [27]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report

In [28]:
def generate_knn_graph_save(knn_x, label, n_neigh, train_index_all, test_index_all, pwd, task, balance):
    
    fold = 0
    for train_idx, test_idx in zip(train_index_all, test_index_all): 
        print('-------Fold ', fold)
        
        knn_y = deepcopy(label)  ###深层复制label
        knn_y[test_idx] = 0
        print('Label: ', Counter(label))
        print('knn_y: ', Counter(knn_y))

        knn = KNeighborsClassifier(n_neighbors = n_neigh)
        knn.fit(knn_x, knn_y)

        knn_y_pred = knn.predict(knn_x)
        knn_y_prob = knn.predict_proba(knn_x)
        knn_neighbors_graph = knn.kneighbors_graph(knn_x, n_neighbors = n_neigh)
        #print(knn_neighbors_graph)
        #print(knn_y)
        #print(accuracy_score(knn_y, knn_y_pred))

        #prec_reca_f1_supp_report = classification_report(knn_y, knn_y_pred, target_names = ['label_0', 'label_1'])
        #tn, fp, fn, tp = confusion_matrix(knn_y, knn_y_pred).ravel()        
        
        
        #prec_reca_f1_supp_report = classification_report(knn_y, knn_y_pred, target_names = ['label_0', 'label_1','label_-1'])
        #print(prec_reca_f1_supp_report)
        #cf=confusion_matrix(knn_y, knn_y_pred,labels=["0", "1","-1"])
        #print(cf)
        #tn, fp, fn, tp = confusion_matrix(knn_y, knn_y_pred,labels=['label_0', 'label_1','label_-1']).ravel()

        #pos_acc = tp / sum(knn_y)
        #neg_acc = tn / (len(knn_y_pred) - sum(knn_y_pred)) # [y_true=0 & y_pred=0] / y_pred=0
        #accuracy = (tp+tn)/(tn+fp+fn+tp)

        #recall = tp / (tp+fn)
        #precision = tp / (tp+fp)
        #f1 = 2*precision*recall / (precision+recall)

        #roc_auc = roc_auc_score(knn_y, knn_y_prob[:, 1])
        #prec, reca, _ = precision_recall_curve(knn_y, knn_y_prob[:, 1])
        #aupr = auc(reca, prec)

        #print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}|pos_acc={:.4f}|neg_acc={:.4f}'.format(accuracy, precision, recall, f1, roc_auc, aupr, pos_acc, neg_acc))
        #print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
        #print('y_pred: ', Counter(knn_y_pred))
        #print('y_true: ', Counter(knn_y))
#         print('knn_score = {:.4f}'.format(knn.score(knn_x, knn_y)))

        sp.save_npz(pwd + 'task_' + task + balance + '__testlabel0_knn' + str(n_neigh) + 'neighbors_edge__fold' + str(fold) + '.npz', knn_neighbors_graph)
        fold += 1
    return knn_x, knn_y, knn, knn_neighbors_graph

# Run

In [29]:

for isbalance in [True]:
    print('************isbalance = ', isbalance)
    
    #for task in ['Tp', 'Td', 'Tm']:
    for task in ['Tp','Td', 'Tm']:
        print('=================task = ', task)
        
        ID, IM, dtp, mirna_ids, disease_ids, mirna_test_num, disease_test_num, knn_x, label = obtain_data('C:/Users/xs/Desktop/图采样data/last data', isbalance)

        if task == 'Tp':
            train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tp_train_test_idx(knn_x)
        #elif task == 'Tm':
        #    item = 'Drug'
        #    ids = mirna_ids
        #    train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)
        #elif task == 'Td':
        #    item = 'Mutation'
        #    ids = disease_ids
        #    train_index_all, test_index_all, train_id_all, test_id_all = generate_task_Tm_Td_train_test_idx(item, ids, dtp)

        if isbalance:
            balance = ''
        else:
            balance = '__nobalance'

        #np.savez_compressed('F:/graph data/' + task + balance + '__testlabel0_knn_edge_train_test_index_all.npz', 
        #                       train_index_all = train_index_all, 
        #                       test_index_all = test_index_all,
        #                       train_id_all = train_id_all, 
        #                       test_id_all = test_id_all)

        #pwd = 'F:/graph data/'
        for n_neigh in [1]: 
            print('--------------------------n_neighbors = ', n_neigh)
            #knn_x, knn_y, knn, knn_neighbors_graph = generate_knn_graph_save(knn_x, label, n_neigh, train_index_all, test_index_all, pwd, task, balance)
directory='C:/Users/xs/Desktop/图采样data/last data'
#ID,IM=load_data(directory)
#print(ID)
#dtp.to_csv('C:/Users/Administrator/Desktop/图采样data/text/dtp.csv')
#node_feature_label = pd.concat([dtp, knn_x], axis = 1)
#node_feature_label



#pwd = 'F:/graph data/平衡数据/'
#node_feature_label.to_csv(pwd + 'node_feature_label.csv')

************isbalance =  True
D_SSM [[1.         0.89766196 0.9695392  ... 0.93836794 0.94563883 0.93286166]
 [0.89766196 1.         0.97386686 ... 0.99178252 0.98852405 0.9811674 ]
 [0.9695392  0.97386686 1.         ... 0.98927445 0.9959657  0.98758327]
 ...
 [0.93836794 0.99178252 0.98927445 ... 1.         0.9946604  0.9929528 ]
 [0.94563883 0.98852405 0.9959657  ... 0.9946604  1.         0.99153622]
 [0.93286166 0.9811674  0.98758327 ... 0.9929528  0.99153622 1.        ]]
M_FSM [[1.         0.90766937 0.17608963 ... 0.89524413 0.01611704 0.03146066]
 [0.90766937 1.         0.0183536  ... 0.97583942 0.08079069 0.07906328]
 [0.17608963 0.0183536  1.         ... 0.04580561 0.83450014 0.84573587]
 ...
 [0.89524413 0.97583942 0.04580561 ... 1.         0.10737693 0.12459924]
 [0.01611704 0.08079069 0.83450014 ... 0.10737693 1.         0.99315765]
 [0.03146066 0.07906328 0.84573587 ... 0.12459924 0.99315765 1.        ]]
ID      index         0         1         2         3         4       

mirna_ids 184
disease_ids 578
# Drug = 184 | Mutation = 578
# Test: Drug = 36 | Mutation = 115


5it [00:00, 626.71it/s]

-------Fold  0
# Pairs: Train = 1336 | Test = 334
-------Fold  1
# Pairs: Train = 1336 | Test = 334
-------Fold  2
# Pairs: Train = 1336 | Test = 334
-------Fold  3
# Pairs: Train = 1336 | Test = 334
-------Fold  4
# Pairs: Train = 1336 | Test = 334
--------------------------n_neighbors =  1





D_SSM [[1.         0.89766196 0.9695392  ... 0.93836794 0.94563883 0.93286166]
 [0.89766196 1.         0.97386686 ... 0.99178252 0.98852405 0.9811674 ]
 [0.9695392  0.97386686 1.         ... 0.98927445 0.9959657  0.98758327]
 ...
 [0.93836794 0.99178252 0.98927445 ... 1.         0.9946604  0.9929528 ]
 [0.94563883 0.98852405 0.9959657  ... 0.9946604  1.         0.99153622]
 [0.93286166 0.9811674  0.98758327 ... 0.9929528  0.99153622 1.        ]]
M_FSM [[1.         0.90766937 0.17608963 ... 0.89524413 0.01611704 0.03146066]
 [0.90766937 1.         0.0183536  ... 0.97583942 0.08079069 0.07906328]
 [0.17608963 0.0183536  1.         ... 0.04580561 0.83450014 0.84573587]
 ...
 [0.89524413 0.97583942 0.04580561 ... 1.         0.10737693 0.12459924]
 [0.01611704 0.08079069 0.83450014 ... 0.10737693 1.         0.99315765]
 [0.03146066 0.07906328 0.84573587 ... 0.12459924 0.99315765 1.        ]]
ID      index         0         1         2         3         4         5  \
0        0  1.000000  0

D_SSM [[1.         0.89766196 0.9695392  ... 0.93836794 0.94563883 0.93286166]
 [0.89766196 1.         0.97386686 ... 0.99178252 0.98852405 0.9811674 ]
 [0.9695392  0.97386686 1.         ... 0.98927445 0.9959657  0.98758327]
 ...
 [0.93836794 0.99178252 0.98927445 ... 1.         0.9946604  0.9929528 ]
 [0.94563883 0.98852405 0.9959657  ... 0.9946604  1.         0.99153622]
 [0.93286166 0.9811674  0.98758327 ... 0.9929528  0.99153622 1.        ]]
M_FSM [[1.         0.90766937 0.17608963 ... 0.89524413 0.01611704 0.03146066]
 [0.90766937 1.         0.0183536  ... 0.97583942 0.08079069 0.07906328]
 [0.17608963 0.0183536  1.         ... 0.04580561 0.83450014 0.84573587]
 ...
 [0.89524413 0.97583942 0.04580561 ... 1.         0.10737693 0.12459924]
 [0.01611704 0.08079069 0.83450014 ... 0.10737693 1.         0.99315765]
 [0.03146066 0.07906328 0.84573587 ... 0.12459924 0.99315765 1.        ]]
ID      index         0         1         2         3         4         5  \
0        0  1.000000  0

In [None]:
node_feature_label = pd.concat([dtp, knn_x], axis = 1)
node_feature_label

In [None]:
pwd = 'F:/图采样实验数据/'
node_feature_label.to_csv(pwd + 'node_feature_label.csv')