In [1]:
import numpy as np
import random
import xlrd
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import MiniBatchKMeans

In [2]:
data_path='./data/'
DSS1 = np.loadtxt(data_path + 'DSS1.txt')
DSS2 = np.loadtxt(data_path + 'DSS2.txt')
#Integrated semantic similarity for diseases
DSS = (DSS1 + DSS2) / 2
#Gaussian interaction profile kernel similarity for diseases
DGS = np.loadtxt(data_path + 'DGS.txt')
#Integrated similarity for diseases
IDS = np.zeros(shape = (DSS.shape[0], DSS.shape[1]))
for i in range(DSS.shape[0]):
    for j in range(DSS.shape[1]):
        if DSS[i][j] == 0:
            IDS[i][j] = DGS[i][j]
        else:       
            IDS[i][j] = DSS[i][j]
#Functional similarity for miRNAs          
MFS = np.loadtxt(data_path + 'MFS.txt')
#Gaussian interaction profile kernel similarity for miRNAs
MGS = np.loadtxt(data_path + 'MGS.txt')
#Integrated similarity for miRNAs
IMS = np.zeros(shape = (MFS.shape[0], MFS.shape[1]))
for i in range(MFS.shape[0]):
    for j in range(MFS.shape[1]):
        if MFS[i][j] == 0:
            IMS[i][j] = MGS[i][j]
        else:
            IMS[i][j] = MFS[i][j]
#miRNA-disease associations matrix
MD = np.zeros(shape = (DSS.shape[0], MFS.shape[0]))
asso_file =  xlrd.open_workbook(data_path + 'Human miRNA-disease associations.xlsx')
asso_pairs = asso_file.sheets()[0]
for i in range(asso_pairs.nrows):
    asso = asso_pairs.row_values(i)
    m = int(asso[0])
    n = int(asso[1])
    MD[n-1,m-1]=1
#Verified miRNA disease pair
known=[]
#Unverified miRNA disease pair
unknown=[]
for x in range(MD.shape[0]):
    for y in range(MD.shape[1]):
        if MD[x,y]==0:
            unknown.append((x,y))
        else:
            known.append((x,y))

In [3]:
#position sample set
posi_list = []
#unlabeled sample set
unlabelled_list = []
#total sample set
all_list = []

for i in range(len(known)):
    posi=IDS[known[i][0],:].tolist() + IMS[known[i][1],:].tolist()
    posi_list.append(posi)
    all_list.append(posi)

for i in range(len(unknown)):
    unlabelled=IDS[unknown[i][0],:].tolist() + IMS[unknown[i][1],:].tolist()
    unlabelled_list.append(unlabelled)
    all_list.append(unlabelled)

#The total sample set is disordered to avoid the influence of order on the clustering results
random.shuffle(all_list)

In [4]:
#Implement MiniBatchKMeans clustering algorithm ten times for total sample set

#Record the number of times each sample is divided into a certain cluster
sum = np.zeros(len(all_list),dtype=int)
#Record the final cluster of each sample
final = np.zeros(len(all_list),dtype=int)
for i in range(10):
    #According to our experiment, we set the number of clusters to 2
    cls = MiniBatchKMeans(n_clusters=2,batch_size=3072).fit(all_list)
    yhat = cls.predict(all_list)
    #According to the multiple clustering results of the total sample set, the total sample set is divided into subsets A and B
    #Among the two subsets A and B, subset A (B) is always greater than the size of subset A (B)
    #In order to ensure that subsets a (b) can obtain the same label after each clustering: 
    #suppose that the label of small subset is 1 and the label of large subset is 0
    #After clustering, when the number of samples with label 1 is greater than the number of samples with label 0, 
    #the labels of subsets A and B are exchanged
    if len(yhat[yhat==1]) > len(yhat[yhat==0]):
        trans = yhat==0
        yhat[yhat==1] = 0
        yhat[trans] = 1
    sum = sum + yhat
    
#When the number of times a sample gets a label of 1 is greater than or equal to 9, 
#it is considered that the label of this sample is 1
final[sum<9] = 0
final[sum>=9] = 1

In [5]:
#Organize the clustering results
clusters = np.unique(final)
subsets={}
for i in clusters:
    subset=[]
    for j in range(len(all_list)):
        if final[j] == i:
            subset.append(all_list[j])
    subsets[i]=subset

In [6]:
#Store positive sample index in each subset
index_lists=[]
#Store the number of positive samples in each subset
posi_cnt =[] 

for i in clusters:
    index_list=[]
    cnt=0
    for j in range(len(subsets[i])):
        if posi_list.__contains__(subsets[i][j]):
            cnt=cnt+1
            index_list.append(j)
    index_lists.append(index_list)
    posi_cnt.append(cnt)

In [7]:
#Find the subset with the least proportion of positive samples
min_per=1
min_idx=0
print('The number of the total sample set is %d, of which the number of positive samples is %d' %(len(all_list),len(posi_list)))
for i in range(len(posi_cnt)):
    t_per=posi_cnt[i]/len(subsets[i])
    print('The total number of samples in subset%d is %d, of which the number of positive samples is %d, and the proportion of positive samples is %f' %(i,len(subsets[i]),posi_cnt[i],t_per))
    if t_per < min_per:
        min_per=t_per
        min_idx=i
print('Subset%d has the least proportion of positive samples, accounting for %f' %(min_idx,min_per))

The number of the total sample set is 189585, of which the number of positive samples is 5430
The total number of samples in subset0 is 121411, of which the number of positive samples is 895, and the proportion of positive samples is 0.007372
The total number of samples in subset1 is 68174, of which the number of positive samples is 4535, and the proportion of positive samples is 0.066521
Subset0 has the least proportion of positive samples, accounting for 0.007372


In [8]:
#Remove the positive samples from the subset with the least number of positive samples, 
#and the remaining unmarked samples in the subset are regarded as negative samples
new_nega=np.delete(subsets[min_idx], index_lists[min_idx], axis=0)
new_nega=new_nega.tolist()

In [9]:
def base_xgb_learners(base_learner_num):
    clfs = []
    for i in range(base_learner_num):
        clfs.append(XGBClassifier(max_depth=6,learning_rate=0.4,n_estimators=100))
    return clfs

In [10]:
# Feature selection based on random forest feature importance score
def feature_ranking_by_rf(data, label, sel_fea_num, sel_hp1, sel_hp2):
    fs_rf = RandomForestClassifier(n_estimators=sel_hp1, max_depth=sel_hp2, random_state=0)
    #Training random forest model
    fs_rf.fit(data, label)
    importances = fs_rf.feature_importances_
    #Sort all features in the reverse order of feature importance scores, and return the sorted index value
    indices = np.argsort(importances)[::-1]
    #Extract the top sel_fea_num features with the highest feature importance score
    most_imp = indices[:sel_fea_num]
    return most_imp

In [11]:
sel_fea = 'RF'
prop = 0.75
base_learner_num = 10
sel_fea_num = int(prop*len(posi_list[0]))
sel_hp1 = 300
sel_hp2 = 30

posi_data=posi_list
nega_data=new_nega
posi_num = len(posi_data)

base_learners = base_xgb_learners(base_learner_num)
trained_clfs = []
most_imps_list = []

posi_train_data=np.array(posi_data)
for i in range(base_learner_num):
    print('the', i+1, 'th individual learner training\n')
    samples = random.sample(nega_data, posi_num)
    nega_train_data=np.array(samples)
    X_base_train = np.concatenate((posi_train_data, nega_train_data))
    y_base_train = np.concatenate((np.ones(posi_train_data.shape[0]), np.zeros(nega_train_data.shape[0])))
    if sel_fea == 'RF':
        print('Feature selection in progress\n')
        most_imps = feature_ranking_by_rf(X_base_train, y_base_train, sel_fea_num, sel_hp1, sel_hp2)
        most_imps_list.append(most_imps)
        X_base_train = X_base_train[:,most_imps]
    clf = base_learners[i]
    clf.fit(X_base_train,y_base_train)
    trained_clfs.append(clf)

the 1 th individual learner training

Feature selection in progress





the 2 th individual learner training

Feature selection in progress





the 3 th individual learner training

Feature selection in progress





the 4 th individual learner training

Feature selection in progress





the 5 th individual learner training

Feature selection in progress





the 6 th individual learner training

Feature selection in progress





the 7 th individual learner training

Feature selection in progress





the 8 th individual learner training

Feature selection in progress





the 9 th individual learner training

Feature selection in progress





the 10 th individual learner training

Feature selection in progress







In [12]:
miRNA_file = xlrd.open_workbook(data_path + 'miRNA number.xlsx').sheets()[0]
miRNA = {}
for i in range(miRNA_file.nrows):
    miRNA[i] = miRNA_file.row_values(i)[1]

In [14]:
disease_file = xlrd.open_workbook(data_path + 'disease number.xlsx').sheets()[0]
disease = {}
for i in range(disease_file.nrows):
    disease[disease_file.row_values(i)[1]] = i

In [75]:
cancer_list = ['Breast Neoplasms', 'Colon Neoplasms', 'Lung Neoplasms']

miRNA_num = len(miRNA)
for cancer in cancer_list:
    cancer_index = disease[cancer]
    test_sample_list = []
    for i in range(miRNA_num):
        test_sample = IDS[cancer_index,:].tolist() + IMS[i,:].tolist()
        test_sample_list.append(test_sample)
    test_sample_list = np.array(test_sample_list)
    #The prediction score of each individual learner on the prediction sample
    prob_list = []
    for i,clf in enumerate(trained_clfs):
        most_imp = most_imps_list[i]
        test_data = test_sample_list[:,most_imp]
        prob = clf.predict_proba(test_data)
        prob_list.append(prob[:,1])
    prob_list = np.array(prob_list)
    base_probs = np.transpose(prob_list)
    #Soft voting strategy
    pred_final = []
    prob_final = []
    for prob in base_probs:
        mean_prob = np.mean(prob)
        prob_final.append(mean_prob)
        if mean_prob > 0.5:
            pred_final.append(1)
        else:
            pred_final.append(0)
    pred_final = np.array(pred_final)
    prob_final = np.array(prob_final)
    prob_final = prob_final[pred_final == 1]
    sort_index = np.argsort(prob_final)[::-1]
    pred_miRNA = []
    for i in sort_index:
        pred_miRNA.append(miRNA[i])
    result = 'The top 20 miRNAs most likely to be associated with ' + cancer + ' predicted by CSMDA are:\n'
    cnt = 1
    for i in range(20):
        result = result + pred_miRNA[i] + '\t'
        if cnt % 5 == 0:
            result = result + '\n'
        cnt += 1
    print(result)

The top 20 miRNAs most likely to be associated with Breast Neoplasms predicted by CSMDA are:
hsa-mir-146a	hsa-mir-92a	hsa-mir-29b	hsa-mir-484	hsa-mir-127	
hsa-mir-505	hsa-mir-133a	hsa-mir-148b	hsa-mir-424	hsa-mir-128	
hsa-mir-1290	hsa-mir-139	hsa-mir-181b	hsa-mir-223	hsa-mir-1972	
hsa-mir-221	hsa-mir-16	hsa-let-7a	hsa-mir-29a	hsa-mir-1273a	

The top 20 miRNAs most likely to be associated with Colon Neoplasms predicted by CSMDA are:
hsa-mir-1	hsa-mir-18b	hsa-let-7e	hsa-mir-1246	hsa-mir-483	
hsa-mir-21	hsa-mir-221	hsa-mir-106a	hsa-mir-152	hsa-mir-198	
hsa-mir-124	hsa-mir-18a	hsa-mir-141	hsa-mir-338	hsa-mir-16	
hsa-mir-28	hsa-mir-151a	hsa-mir-345	hsa-mir-29a	hsa-mir-210	

The top 20 miRNAs most likely to be associated with Lung Neoplasms predicted by CSMDA are:
hsa-mir-183	hsa-let-7b	hsa-mir-34c	hsa-mir-150	hsa-mir-486	
hsa-mir-203	hsa-mir-223	hsa-mir-424	hsa-mir-382	hsa-mir-101	
hsa-mir-132	hsa-mir-196b	hsa-mir-340	hsa-mir-92a	hsa-mir-18a	
hsa-mir-29a	hsa-mir-137	hsa-mir-362	hsa-mir-191	