In [None]:
import glob

import numpy as np
import sklearn.metrics.pairwise
from sklearn.model_selection import KFold

import nbimporter
from assistant import triplets_to_tensor, vectorize_smile

In [None]:
# from sklearn.model_selection import KFold
# import matplotlib.pyplot as plt
# import copy

## Split

In [None]:
def Split_T1(labels_tensor, dirname, n_splits, random_state=None):
    """
    Setting T1: individual [drug, drug, feature] triplets are randomly assigned to folds.

    Positive (label == 1) and negative (label == 0) index triplets are split
    independently into ``n_splits`` shuffled folds. Only the held-out (test)
    triplets of each fold are saved, as ``.npy`` files:
    ``<dirname>/T1_test_pos_<fold>.npy`` and ``<dirname>/T1_test_neg_<fold>.npy``.
    Negative triplets whose first two indices are equal are dropped before
    splitting.

    Parameters
    ----------
    labels_tensor : array-like
        Label tensor; presumably 3D and indexed as [drug, drug, feature]
        -- TODO confirm against ``triplets_to_tensor``.
    dirname : str
        Output directory, e.g. "../Final_Experiments/" + "TDC", "DCDB" or "NCI".
    n_splits : int
        Number of folds passed to ``KFold``.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` for reproducible folds. ``None`` (the
        default) keeps the previous unseeded behaviour.

    Returns
    -------
    None
    """
    labels = np.asarray(labels_tensor)

    # Keep the compact int16 index dtype whenever every index fits; fall back to
    # int32 instead of silently wrapping around on larger tensors.
    largest_index = max(labels.shape, default=1) - 1
    index_dtype = np.int16 if largest_index <= np.iinfo(np.int16).max else np.int32

    # Index triplets of the positive and negative entries.
    pos_idx = np.argwhere(labels == 1).astype(index_dtype)
    neg_idx = np.argwhere(labels == 0).astype(index_dtype)
    # Discard negatives that pair an index with itself on the first two axes.
    neg_idx = neg_idx[neg_idx[:, 0] != neg_idx[:, 1]]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Split positive labels; only the test part of each fold is stored.
    for fold, (_, test_rows) in enumerate(kf.split(pos_idx)):
        np.save(dirname + "/T1_test_pos_{}".format(fold), pos_idx[test_rows])

    # Split negative labels in the same way.
    for fold, (_, test_rows) in enumerate(kf.split(neg_idx)):
        np.save(dirname + "/T1_test_neg_{}".format(fold), neg_idx[test_rows])
        

In [None]:
def Split_T2(labels_tensor, dirname, n_splits, random_state=None):
    """
    Setting T2: drug-drug pairs (combinations of drugs) are randomly assigned to folds.

    Every unordered pair ``(i, j)`` with ``i < j`` over ``len(labels_tensor)``
    drugs is enumerated once (only one of the two symmetric versions), and the
    pairs are split into ``n_splits`` shuffled folds. For each fold the train
    and test pairs are saved as ``<dirname>/T2_train_<fold>.npy`` and
    ``<dirname>/T2_test_<fold>.npy``.

    Only the length of ``labels_tensor`` is used; its values are not read.

    Parameters
    ----------
    labels_tensor : sequence or array
        Its length gives the number of drugs.
    dirname : str
        Output directory.
    n_splits : int
        Number of folds passed to ``KFold``.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` for reproducible folds. ``None`` (the
        default) keeps the previous unseeded behaviour.

    Returns
    -------
    None
    """
    n_drugs = len(labels_tensor)

    # Strict upper triangle in row-major order: (0,1), (0,2), ..., (1,2), ...
    # This is the same order a nested "for i / for j in range(i+1, n)" loop produces.
    first, second = np.triu_indices(n_drugs, k=1)
    drug_pairs = np.column_stack((first, second))

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Save train and test pairs of every fold.
    for fold, (train_rows, test_rows) in enumerate(kf.split(drug_pairs)):
        np.save(dirname + "/T2_train_{}".format(fold), drug_pairs[train_rows])
        np.save(dirname + "/T2_test_{}".format(fold), drug_pairs[test_rows])

In [None]:
def Split_T3(labels_tensor, dirname, n_splits, random_state=None):
    """
    Setting T3: single drugs are randomly assigned to folds
    (this split is also applicable to the splitting of setting 4).

    The drug indices ``0 .. len(labels_tensor) - 1`` are split into
    ``n_splits`` shuffled folds. For each fold the train and test drug indices
    are saved as ``<dirname>/T3_train_<fold>.npy`` and
    ``<dirname>/T3_test_<fold>.npy``.

    Only the length of ``labels_tensor`` is used; its values are not read.

    Parameters
    ----------
    labels_tensor : sequence or array
        Its length gives the number of drugs.
    dirname : str
        Output directory.
    n_splits : int
        Number of folds passed to ``KFold``.
    random_state : int or None, optional
        Seed forwarded to ``KFold`` for reproducible folds. ``None`` (the
        default) keeps the previous unseeded behaviour.

    Returns
    -------
    None
    """
    drugs = np.arange(len(labels_tensor))
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # KFold yields positions into `drugs`; since drugs == arange(n) those
    # positions are the drug indices themselves, so they are saved directly.
    for fold, (train_drug, test_drug) in enumerate(kf.split(drugs)):
        np.save(dirname + "/T3_train_{}".format(fold), train_drug)
        np.save(dirname + "/T3_test_{}".format(fold), test_drug)

#### Compute the similarity kernels that use the training labels as features

In [None]:
def Kernels_T2(labels_tensor, trains, tests, dirname, effect_name):
    """
    Build the label (effect) similarity kernels for every T2 training fold.

    For each fold, every slice along the third axis of ``labels_tensor`` (one
    per effect) is described by its labels on that fold's training drug pairs.
    Two effect-by-effect similarity matrices are computed from those feature
    vectors and saved as ``.npy`` files:

    * cosine similarity  -> ``<dirname>/T2_train_K_<effect_name>_cos_<fold>.npy``
    * Jaccard similarity -> ``<dirname>/T2_train_K_<effect_name>_jacc_<fold>.npy``

    ``<fold>`` is the position of the file in ``trains``, so the list must be
    ordered by fold number.

    Parameters
    ----------
    labels_tensor : array-like, 3D
        Label tensor; presumably indexed as [drug, drug, effect]
        -- TODO confirm against ``triplets_to_tensor``.
    trains : list of str
        Paths of the saved training pair arrays (shape ``(n_pairs, 2)``), one per fold.
    tests : list of str
        Paths of the saved test pair arrays. Kept for interface compatibility;
        the kernels are built from training pairs only, so these files are not read.
    dirname : str
        Output directory.
    effect_name : str
        Tag inserted into the output file names (e.g. "SE").

    Returns
    -------
    None
    """
    labels = np.asarray(labels_tensor)

    for fold, train_path in enumerate(trains):
        train_pairs = np.load(train_path)

        # Use the train pairs as side information: gather the label vector of
        # every training pair at once, then transpose so that
        # rows = effects and columns = training pairs.
        effectfeatures = np.ascontiguousarray(
            labels[train_pairs[:, 0], train_pairs[:, 1], :].T
        )

        # cosine similarity
        K_Y_cos = sklearn.metrics.pairwise.cosine_similarity(effectfeatures, effectfeatures)
        np.save(dirname + "/T2_train_K_{}_cos_{}".format(effect_name, fold), K_Y_cos)

        # jaccard similarity (1 - Jaccard distance)
        K_Y_jacc = 1 - sklearn.metrics.pairwise_distances(effectfeatures, metric='jaccard')
        np.save(dirname + "/T2_train_K_{}_jacc_{}".format(effect_name, fold), K_Y_jacc)

In [None]:
def Kernels_T3(labels_tensor, trains, tests, dirname, effect_name):
    """
    Build the label (effect) similarity kernels for every T3 training fold.

    For each fold, all pairs ``(i, j)`` of training drugs with ``i > j`` are
    formed (one of the two symmetric versions). Every slice along the third
    axis of ``labels_tensor`` (one per effect) is described by its labels on
    those pairs. Two effect-by-effect similarity matrices are computed and
    saved as ``.npy`` files:

    * cosine similarity  -> ``<dirname>/T3_train_K_<effect_name>_cos_<fold>.npy``
    * Jaccard similarity -> ``<dirname>/T3_train_K_<effect_name>_jacc_<fold>.npy``

    ``<fold>`` is the position of the file in ``trains``, so the list must be
    ordered by fold number.

    Parameters
    ----------
    labels_tensor : array-like, 3D
        Label tensor; presumably indexed as [drug, drug, effect] and symmetric
        in its first two axes -- TODO confirm against ``triplets_to_tensor``.
    trains : list of str
        Paths of the saved 1D arrays of training drug indices, one per fold.
    tests : list of str
        Paths of the saved test drug arrays. Kept for interface compatibility;
        the kernels are built from training drugs only, so these files are not read.
    dirname : str
        Output directory.
    effect_name : str
        Tag inserted into the output file names (e.g. "SE").

    Returns
    -------
    None
    """
    labels = np.asarray(labels_tensor)

    for fold, train_path in enumerate(trains):
        train_drugs = np.load(train_path)

        # Train pairs without symmetry: larger[a, b] is True when
        # train_drugs[b] > train_drugs[a]. np.nonzero walks the mask row by row,
        # i.e. outer loop over the smaller drug j, inner loop over the larger
        # drug i -- the same order as "for j in train for i in train if i > j".
        larger = train_drugs[np.newaxis, :] > train_drugs[:, np.newaxis]
        j_pos, i_pos = np.nonzero(larger)
        first, second = train_drugs[i_pos], train_drugs[j_pos]

        # Use the train pairs as side information: rows = effects,
        # columns = training pairs.
        effectfeatures = np.ascontiguousarray(labels[first, second, :].T)

        # cosine similarity
        K_Y_cos = sklearn.metrics.pairwise.cosine_similarity(effectfeatures, effectfeatures)
        np.save(dirname + "/T3_train_K_{}_cos_{}".format(effect_name, fold), K_Y_cos)

        # jaccard similarity (1 - Jaccard distance)
        K_Y_jacc = 1 - sklearn.metrics.pairwise_distances(effectfeatures, metric='jaccard')
        np.save(dirname + "/T3_train_K_{}_jacc_{}".format(effect_name, fold), K_Y_jacc)

### TDC

In [None]:
# TDC label triplets -> Yabc (3D label tensor).
labels_triplets = np.loadtxt("../Final_DF/TDC_Label.txt")
labels_triplets = labels_triplets.astype(int)
# NOTE(review): 645 and 1317 are presumably the tensor dimensions
# (number of drugs, number of side effects) -- confirm against triplets_to_tensor.
labels_tensor = triplets_to_tensor(labels_triplets, 645, 1317)

In [None]:
# Write the 10 T1 (random triplet) test folds for TDC.
Split_T1(
    labels_tensor,
    dirname="../Final_Experiments/TDC",
    n_splits=10,
)

In [None]:
# Write the 10 T2 (random drug-pair) train/test folds for TDC.
Split_T2(
    labels_tensor,
    dirname="../Final_Experiments/TDC",
    n_splits=10,
)

In [None]:
# Write the 10 T3 (random single-drug) train/test folds for TDC.
Split_T3(
    labels_tensor,
    dirname="../Final_Experiments/TDC",
    n_splits=10,
)

In [None]:
# Collect the saved T2 fold files for TDC in sorted filename order,
# then build the side-effect kernels of every training fold.
trains = glob.glob('../Final_Experiments/TDC/T2_train_[0-9]*')
trains.sort()
tests = glob.glob('../Final_Experiments/TDC/T2_test_[0-9]*')
tests.sort()

Kernels_T2(
    labels_tensor,
    trains,
    tests,
    dirname="../Final_Experiments/TDC",
    effect_name="SE",
)

In [None]:
# Collect the saved T3 fold files for TDC in sorted filename order,
# then build the side-effect kernels of every training fold.
trains = glob.glob('../Final_Experiments/TDC/T3_train_[0-9]*')
trains.sort()
tests = glob.glob('../Final_Experiments/TDC/T3_test_[0-9]*')
tests.sort()

Kernels_T3(
    labels_tensor,
    trains,
    tests,
    dirname="../Final_Experiments/TDC",
    effect_name="SE",
)

### DCDB

In [None]:
# DCDB label triplets -> Yabc (3D label tensor).
labels_triplets = np.loadtxt("../Final_DF/DCDB_Label.txt")
labels_triplets = labels_triplets.astype(int)
# NOTE(review): 546 and 268 are presumably the tensor dimensions
# (number of drugs, number of ICD10 labels) -- confirm against triplets_to_tensor.
labels_tensor = triplets_to_tensor(labels_triplets, 546, 268)

In [None]:
# Write the 10 T1 (random triplet) test folds for DCDB.
Split_T1(
    labels_tensor,
    dirname="../Final_Experiments/DCDB",
    n_splits=10,
)

In [None]:
# Write the 10 T2 (random drug-pair) train/test folds for DCDB.
Split_T2(
    labels_tensor,
    dirname="../Final_Experiments/DCDB",
    n_splits=10,
)

In [None]:
# Write the 10 T3 (random single-drug) train/test folds for DCDB.
Split_T3(
    labels_tensor,
    dirname="../Final_Experiments/DCDB",
    n_splits=10,
)

In [None]:
# Collect the saved T2 fold files for DCDB in sorted filename order,
# then build the ICD10 kernels of every training fold.
trains = glob.glob('../Final_Experiments/DCDB/T2_train_[0-9]*')
trains.sort()
tests = glob.glob('../Final_Experiments/DCDB/T2_test_[0-9]*')
tests.sort()

Kernels_T2(
    labels_tensor,
    trains,
    tests,
    dirname="../Final_Experiments/DCDB",
    effect_name="ICD10",
)

In [None]:
# Collect the saved T3 fold files for DCDB in sorted filename order,
# then build the ICD10 kernels of every training fold.
trains = glob.glob('../Final_Experiments/DCDB/T3_train_[0-9]*')
trains.sort()
tests = glob.glob('../Final_Experiments/DCDB/T3_test_[0-9]*')
tests.sort()

Kernels_T3(
    labels_tensor,
    trains,
    tests,
    dirname="../Final_Experiments/DCDB",
    effect_name="ICD10",
)

### NCI

In [None]:
# NCI label triplets -> Yabc (3D label tensor).
labels_triplets = np.loadtxt("../Final_DF/NCI_Label_filtered.txt").astype(int)
# Only the first three columns of the file are used as the triplet.
labels_triplets = labels_triplets[:, 0:3]
# NOTE(review): 103 and 60 are presumably the tensor dimensions
# (number of drugs, number of cell lines) -- confirm against triplets_to_tensor.
labels_tensor = triplets_to_tensor(labels_triplets, 103, 60)

In [None]:
## further experiments
# labels_triplets_10 = np.loadtxt("../Final_DF/NCI_Label_filtered_10.txt").astype(int)[:,0:3]
# labels_triplets_5 = np.loadtxt("../Final_DF/NCI_Label_filtered_5.txt").astype(int)[:,0:3]
# labels_triplets_2 = np.loadtxt("../Final_DF/NCI_Label_filtered_2.txt").astype(int)[:,0:3]
# labels_triplets_1 = np.loadtxt("../Final_DF/NCI_Label_filtered_1.txt").astype(int)[:,0:3]

In [None]:
# Write the 10 T1 (random triplet) test folds for NCI.
# The directory is given without a trailing slash, like every other call in this
# notebook: Split_T1 appends "/T1_..." itself, so a trailing slash would produce
# "NCI//T1_..." paths.
Split_T1(labels_tensor, "../Final_Experiments/NCI", 10)

In [None]:
# Write the 10 T2 (random drug-pair) train/test folds for NCI.
Split_T2(
    labels_tensor,
    dirname="../Final_Experiments/NCI",
    n_splits=10,
)

In [None]:
# Write the 10 T3 (random single-drug) train/test folds for NCI.
Split_T3(
    labels_tensor,
    dirname="../Final_Experiments/NCI",
    n_splits=10,
)

In [None]:
# Collect the saved T2 fold files for NCI in sorted filename order.
# The patterns must carry the "T2_" prefix used by Split_T2 when saving; without
# it they match none of those files and Kernels_T2 silently writes no kernels.
trains = sorted(glob.glob('../Final_Experiments/NCI/T2_train_[0-9]*'))
tests = sorted(glob.glob('../Final_Experiments/NCI/T2_test_[0-9]*'))

Kernels_T2(labels_tensor, trains, tests, "../Final_Experiments/NCI", "cellLine")

In [None]:
# Collect the saved T3 fold files for NCI in sorted filename order,
# then build the cell-line kernels of every training fold.
trains = glob.glob('../Final_Experiments/NCI/T3_train_[0-9]*')
trains.sort()
tests = glob.glob('../Final_Experiments/NCI/T3_test_[0-9]*')
tests.sort()

Kernels_T3(
    labels_tensor,
    trains,
    tests,
    dirname="../Final_Experiments/NCI",
    effect_name="cellLine",
)