In [1]:
import pandas as pd
import numpy as np
import itertools

### Assistant Functions

In [2]:
# Extract distinct drugs and Y [SE/Disease/CellLine]
def distinct(data, drug1, drug2, Y):
    """
    Collect the unique drugs and the unique Y values found in ``data``.

    data  : table indexable by column name (e.g. a pandas DataFrame)
    drug1 : name of the first drug column
    drug2 : name of the second drug column
    Y     : name of the Y column

    Returns a tuple ``(distinct_drugs, distinct_Y)`` of two sets; the drug
    set is the union of the values seen in either drug column.
    """
    first_drugs = set(data[drug1])
    second_drugs = set(data[drug2])
    y_values = set(data[Y])
    return (first_drugs | second_drugs, y_values)

In [3]:
# Get Index for drug, Y [SE/Disease/CellLine]
def drug_index(distinct_Drug):
    """
    Build a lookup table that assigns a consecutive integer index to each drug.

    distinct_Drug : collection of distinct drug identifiers. Ordered inputs
                    (list, array, ...) keep their order. A set/frozenset is
                    sorted first, because set iteration order is not stable
                    between interpreter sessions and would otherwise give
                    every run a different drug -> index assignment.

    Returns a DataFrame with columns ``Drug_ID`` and ``Drug_Index``.
    """
    if isinstance(distinct_Drug, (set, frozenset)):
        try:
            distinct_Drug = sorted(distinct_Drug)
        except TypeError:
            # Values of mixed, mutually unorderable types: fall back to a
            # deterministic ordering on their textual representation.
            distinct_Drug = sorted(distinct_Drug, key=repr)

    drug = pd.DataFrame(distinct_Drug, columns=['Drug_ID'])
    drug["Drug_Index"] = drug.index
    return drug

def y_index(distinct_Y, y_name):
    """
    Build a lookup table that assigns a consecutive integer index to each Y value.

    distinct_Y : collection of distinct Y values. Ordered inputs keep their
                 order. A set/frozenset is sorted first, because set
                 iteration order is not stable between interpreter sessions
                 and would otherwise make the value -> index assignment
                 differ from run to run.
    y_name     : name of the value column; the index column is named
                 ``y_name + "_Index"``.

    Returns a DataFrame with columns ``y_name`` and ``y_name + "_Index"``.
    """
    if isinstance(distinct_Y, (set, frozenset)):
        try:
            distinct_Y = sorted(distinct_Y)
        except TypeError:
            # Values of mixed, mutually unorderable types: fall back to a
            # deterministic ordering on their textual representation.
            distinct_Y = sorted(distinct_Y, key=repr)

    Y = pd.DataFrame(distinct_Y, columns=[y_name])
    Y[y_name + "_Index"] = Y.index
    return Y

In [4]:
# 3D tensor built (Yabc) 
def labels_to_tensor(labels_triplets, Drug, Y):
    """
    Build the symmetric binary label tensor of shape
    ``(len(Drug), len(Drug), len(Y))``.

    labels_triplets : integer index triplets ``[drug1, drug2, y]``, one per
                      row. Anything ``np.asarray`` turns into a 2-D integer
                      array is accepted (ndarray, list of lists, ...); an
                      empty input yields an all-zero tensor.
    Drug            : sized collection of drugs (only its length is used)
    Y               : sized collection of Y values (only its length is used)

    Returns a float ndarray where entries ``[d1, d2, y]`` and ``[d2, d1, y]``
    are 1 for every triplet and 0 elsewhere.
    """
    labels_tensor = np.zeros((len(Drug), len(Drug), len(Y)))

    triplets = np.asarray(labels_triplets)
    if triplets.size == 0:
        # Nothing to mark; also avoids column-indexing a 1-D empty array.
        return labels_tensor

    first, second, target = triplets[:, 0], triplets[:, 1], triplets[:, 2]

    # Classification (binary) problem: mark both drug orders so the tensor
    # is symmetric in its first two axes.
    labels_tensor[first, second, target] = 1
    labels_tensor[second, first, target] = 1

    return labels_tensor

In [5]:
# 3D tensor built (Yabc) 
def triplets_to_tensor(labels_triplets, Drug_num , Y_num):
    """
    Build the symmetric binary label tensor of shape (Drug_num, Drug_num, Y_num).

    labels_triplets = [[ Drug1, Drug2, Y], [ ] [ ] ,,, [ ]]
        integer index triplets, one per row; anything ``np.asarray`` turns
        into a 2-D integer array is accepted (ndarray, list of lists, ...).
        An empty input yields an all-zero tensor.
    Drug_num = distinct number of Drug
    Y_num = distinct number of Y

    Returns a float ndarray where entries [d1, d2, y] and [d2, d1, y] are 1
    for every triplet and 0 elsewhere.
    """
    labels_tensor = np.zeros((Drug_num, Drug_num, Y_num))

    triplets = np.asarray(labels_triplets)
    if triplets.size == 0:
        # Nothing to mark; also avoids column-indexing a 1-D empty array.
        return labels_tensor

    first, second, target = triplets[:, 0], triplets[:, 1], triplets[:, 2]

    # Mark both drug orders so the tensor is symmetric in its first two axes.
    labels_tensor[first, second, target] = 1
    labels_tensor[second, first, target] = 1
    return labels_tensor

In [6]:
def save(array, name, out_dir="../Final_DF"):
    """
    Write ``array`` as text to ``<out_dir>/<name>`` using ``np.savetxt``.

    array   : data to write (``np.savetxt`` accepts 1-D or 2-D array-likes)
    name    : file name inside the output directory
    out_dir : output directory; defaults to the original hard-coded
              "../Final_DF", so existing calls write to the same path.
              The directory is not created here and must already exist.
    """
    np.savetxt("{}/{}".format(out_dir, name), array)

In [8]:
def vectorize_smile(smiles):
    """
    One-hot encode SMILES strings character by character.

    smiles : iterable of strings (ndarray, pandas Series, list, ...)

    Returns an ``int8`` ndarray of shape
    ``(n_strings, longest_length + 1, n_distinct_characters)``.
    Character ``j`` of string ``i`` is written at position ``j + 1``, so
    position 0 of every string stays all-zero (presumably a start/padding
    slot -- confirm with the consumer of this encoding). The last axis is
    ordered by sorted character, so the encoding is identical on every run;
    iterating the raw character set would make the column order depend on
    the interpreter's string hashing.
    """
    smile_list = list(smiles)

    # Vocabulary: every distinct character, in a deterministic order
    charset = sorted(set("".join(smile_list)))
    char_to_int = dict((c, i) for i, c in enumerate(charset))

    # One extra leading position in addition to the longest string
    longest = max(len(smile) for smile in smile_list) if smile_list else 0
    embed = longest + 1

    ## one hot coding
    one_hot = np.zeros((len(smile_list), embed, len(charset)), dtype=np.int8)
    for i, smile in enumerate(smile_list):
        for j, c in enumerate(smile):
            one_hot[i, j + 1, char_to_int[c]] = 1
    return one_hot

In [9]:
def merge_smile(data_indexed, drug_idx, columns):
    """
    Merge Smile string into the drug_idx

    data_indexed : DataFrame holding, for each of the two drug positions, an
                   index column and a SMILES column
    drug_idx     : DataFrame with a "Drug_Index" column (one row per drug)
    columns      : ``[[drug1_index_col, drug1_smile_col],
                      [drug2_index_col, drug2_smile_col]]``

    Returns ``drug_idx`` outer-merged with the distinct (index, SMILES) rows
    of both drug positions. Missing values are filled with 0, and wherever
    one SMILES column is 0 it is copied over from the other one, so both
    SMILES columns carry the drug's string whenever either position had it.
    """
    first_cols, second_cols = columns[0], columns[1]
    idx1_col, smile1_col = first_cols[0], first_cols[1]
    idx2_col, smile2_col = second_cols[0], second_cols[1]

    data_drug1_indexed = data_indexed[first_cols].drop_duplicates().sort_values(idx1_col)
    data_drug2_indexed = data_indexed[second_cols].drop_duplicates().sort_values(idx2_col)

    # fillna(0) as each drug set do not contain all the distinct drugs
    Drug1 = drug_idx.merge(data_drug1_indexed, left_on="Drug_Index", right_on=idx1_col, how='outer').fillna(0)
    Drug_all = Drug1.merge(data_drug2_indexed, left_on="Drug_Index", right_on=idx2_col, how='outer').fillna(0)

    # Both masks are taken on Drug_all itself: row labels of the
    # intermediate frame stop lining up with Drug_all as soon as the second
    # merge adds rows, which would patch the wrong rows.
    missing_second = Drug_all[smile2_col] == 0
    missing_first = Drug_all[smile1_col] == 0

    # merge them into one
    Drug_all.loc[missing_second, smile2_col] = Drug_all.loc[missing_second, smile1_col]
    Drug_all.loc[missing_first, smile1_col] = Drug_all.loc[missing_first, smile2_col]

    return Drug_all

In [10]:
def drug_combi_index(drug_idx):
    """
    Enumerate every unordered pair of drug indices and number the pairs.

    drug_idx : DataFrame with a "Drug_Index" column

    Returns a DataFrame with columns "Drug1_Index", "Drug2_Index" and
    "Combi_Index", one row per 2-element combination in the order produced
    by ``itertools.combinations``; "Combi_Index" is the row number.
    """
    # every pair (a, b) with a taken before b in drug_idx["Drug_Index"]
    pair_rows = [pair for pair in itertools.combinations(drug_idx["Drug_Index"], 2)]

    pair_table = pd.DataFrame(pair_rows, columns=["Drug1_Index", "Drug2_Index"])

    # the running row number doubles as the combination id
    return pair_table.assign(Combi_Index=pair_table.index)

In [11]:
def data_indexed_combi(data_indexed, combi_idx):
    """
    Attach the drug-combination index to every row of ``data_indexed``.

    data_indexed : DataFrame with "Drug1_Index" and "Drug2_Index" columns
    combi_idx    : DataFrame with "Drug1_Index", "Drug2_Index" and
                   "Combi_Index" columns (one row per drug pair)

    A row is looked up in both drug orders, because ``combi_idx`` may list
    a pair as (a, b) while the data has it as (b, a). The two lookups are
    summed after replacing a miss with 0, so a row that matches in neither
    order ends up with "Combi_Index" 0.

    Returns ``data_indexed`` with an integer "Combi_Index" column appended.
    """
    # same pairs with the two drug columns exchanged, for the reverse lookup
    swapped_lookup = combi_idx.rename(
        columns={
            "Drug1_Index": "Drug2_Index",
            "Drug2_Index": "Drug1_Index",
            "Combi_Index": "Combi_Index2",
        }
    )[["Drug1_Index", "Drug2_Index", "Combi_Index2"]]

    # left-join in the listed order first, then in the exchanged order
    forward = data_indexed.merge(combi_idx, how='left')
    both_orders = forward.merge(swapped_lookup, how='left')

    forward_hit = both_orders["Combi_Index"].fillna(0).astype(int)
    reverse_hit = both_orders["Combi_Index2"].fillna(0).astype(int)

    resolved = both_orders.drop(columns=["Combi_Index", "Combi_Index2"])
    resolved["Combi_Index"] = forward_hit + reverse_hit
    return resolved

In [13]:
def vectorize_drugcombi(labels_triplets_drugcombi, combi_idx, Y_idx):
    """
    Vectorize Y against drug combinations (pair existence as side information).

    Input:
        labels_triplets_drugcombi : 2-D integer array; column 2 is read as
                                    the Y index and column 3 as the
                                    drug-combination index
        combi_idx                 : sized collection of drug combinations
                                    (only its length is used)
        Y_idx                     : sized collection of Y values
                                    (only its length is used)
    Output: 2-D array of shape [Y, Drug_combination] holding 1 wherever a
            (Y, combination) pair occurs in the input and 0 elsewhere
    """
    y_positions = labels_triplets_drugcombi[:, 2]
    combi_positions = labels_triplets_drugcombi[:, 3]

    # start from all zeros and switch on every observed (Y, combination) cell
    presence = np.zeros((len(Y_idx), len(combi_idx)))
    presence[y_positions, combi_positions] = 1
    return presence

In [14]:
def tanimoto_similarity(drug1, drug2):
    """
    Tanimoto (Jaccard) similarity of two binary fingerprints.

    drug1, drug2 : array-likes of the same shape holding 0/1 values

    Returns ``both / (onlyA + onlyB + both)`` where ``both`` counts the
    positions set to 1 in both inputs and ``onlyA`` / ``onlyB`` count the
    positions set to 1 in one input and 0 in the other. Positions holding
    any other value are ignored. Identical inputs return 1. When no position
    qualifies, 0.0 is returned instead of dividing by zero.

    Raises ValueError if the two inputs differ in shape.
    """
    # for diagonal
    if np.array_equal(drug1, drug2):
        return 1

    bits1 = np.asarray(drug1)
    bits2 = np.asarray(drug2)
    if bits1.shape != bits2.shape:
        raise ValueError(
            "fingerprints must have the same shape, got {} and {}".format(
                bits1.shape, bits2.shape
            )
        )

    on1, off1 = bits1 == 1, bits1 == 0
    on2, off2 = bits2 == 1, bits2 == 0

    both = int(np.count_nonzero(on1 & on2))
    onlyA = int(np.count_nonzero(on1 & off2))
    onlyB = int(np.count_nonzero(off1 & on2))

    total = onlyA + onlyB + both
    if total == 0:
        # Only reachable with non-binary values; nothing is shared.
        return 0.0
    return both / total

In [15]:
def tanimoto_similarity_matrix(drug_vectorized):
    """
    Pairwise Tanimoto similarity between the rows of a binary matrix.

    drug_vectorized : 2-D array-like, one 0/1 fingerprint per row

    Returns a float ndarray ``S`` of shape (n_rows, n_rows) where
    ``S[i, j] = both / (both + only_i + only_j)``: ``both`` counts positions
    set to 1 in both rows, ``only_i`` / ``only_j`` count positions set to 1
    in one row and 0 in the other. Identical rows (including the diagonal)
    score 1. A pair with no qualifying position that is not identical
    scores 0.

    All pair counts come from matrix products instead of a Python-level
    double loop over rows and bits.

    Raises ValueError if the input has rows but is not 2-D.
    """
    fingerprints = np.asarray(drug_vectorized)
    n_drugs = fingerprints.shape[0]
    if n_drugs == 0:
        return np.zeros((0, 0))
    if fingerprints.ndim != 2:
        raise ValueError("drug_vectorized must be 2-D (one fingerprint per row)")

    # Indicator matrices; float so the products are counts, not booleans.
    is_one = (fingerprints == 1).astype(np.float64)
    is_zero = (fingerprints == 0).astype(np.float64)

    both = is_one.dot(is_one.T)          # [i, j]: 1 in row i and 1 in row j
    only_first = is_one.dot(is_zero.T)   # [i, j]: 1 in row i and 0 in row j
    denominator = both + only_first + only_first.T

    similarity = np.zeros((n_drugs, n_drugs))
    np.divide(both, denominator, out=similarity, where=denominator > 0)

    # Zero denominator: no usable bit in the pair. Identical rows (e.g. two
    # all-zero fingerprints, or a row against itself) still count as 1.
    for i, j in np.argwhere(denominator == 0):
        if i == j or np.array_equal(fingerprints[i], fingerprints[j]):
            similarity[i, j] = 1

    return similarity