In [1]:
import csv
import sqlite3
import time
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
from pandas import DataFrame
from sklearn.decomposition import PCA
from torch.utils import data
warnings.filterwarnings("ignore")

In [2]:
# Value handed back unchanged as the third element of prepare_tr's result
# (presumably the number of interaction-event classes; confirm against the data).
event_num = 65

# Number of PCA components kept per similarity matrix in feature_vector.
vector_size = 572

# Drug-table columns that are turned into similarity features.
feature_list = [
    "target",
    "enzyme",
    "smile",
]

In [3]:
def _jaccard_similarity(indicator):
    """Return the pairwise Jaccard similarity between the rows of a 0/1 matrix.

    Entry (i, j) is |row_i AND row_j| / |row_i OR row_j|, computed as
    intersection / (count_i + count_j - intersection).
    """
    indicator = np.asarray(indicator, dtype=float)
    intersection = indicator @ indicator.T
    row_counts = indicator.sum(axis=1)
    union = row_counts[:, None] + row_counts[None, :] - intersection
    return intersection / union


def feature_vector(feature_name, df, vector_size):
    """Embed every drug as a PCA-reduced row of a Jaccard similarity matrix.

    ``df[feature_name]`` holds one '|'-separated string of feature tokens per
    drug (for example ``"P30556|P05412"`` when ``feature_name`` is "target").
    Each drug is encoded as a binary indicator vector over all distinct tokens,
    the pairwise Jaccard similarity of those vectors is computed, and PCA
    projects the resulting square matrix to ``vector_size`` components.

    Args:
        feature_name: Name of the column of ``df`` to read.
        df: DataFrame with one row per drug.
        vector_size: ``n_components`` passed to ``PCA``.

    Returns:
        The array produced by ``PCA.transform`` on the similarity matrix
        (one row per drug).
    """
    # str.split always yields at least one token, so every drug sets at least
    # one indicator column and the Jaccard denominator is never zero.
    token_lists = [entry.split('|') for entry in df[feature_name]]

    # Assign column indices in order of first appearance; the dict lookup
    # replaces a linear "not in list" scan per token.
    column_of = {}
    for tokens in token_lists:
        for token in tokens:
            if token not in column_of:
                column_of[token] = len(column_of)

    # Fill the indicator matrix directly in NumPy. The previous chained
    # ``df_feature[col].iloc[i] = 1`` assignment does not write through to the
    # frame when pandas Copy-on-Write is active.
    indicator = np.zeros((len(token_lists), len(column_of)), dtype=float)
    for row, tokens in enumerate(token_lists):
        indicator[row, [column_of[token] for token in tokens]] = 1

    # Plain ndarray (not np.matrix) so the result is accepted by PCA.
    sim_matrix = _jaccard_similarity(indicator)

    pca = PCA(n_components=vector_size)  # PCA dimension
    pca.fit(sim_matrix)
    return pca.transform(sim_matrix)


In [4]:
def prepare_tr(df_drug, feature_list, vector_size, mechanism, action, trmod,
               drugA=None, drugB=None):
    """Build pairwise drug features and integer event labels.

    Each interaction event is the string ``mechanism[i] + " " + action[i]``.
    Events are numbered by descending frequency (most frequent event -> 0).
    Every drug gets a vector made of the ``feature_vector`` output for each
    column in ``feature_list``, optionally followed by a pre-trained embedding
    loaded from disk; the feature of a drug pair is the concatenation of the
    two drug vectors.

    Args:
        df_drug: DataFrame with one row per drug; must contain a ``name``
            column plus every column named in ``feature_list``.
        feature_list: Column names passed one by one to ``feature_vector``.
        vector_size: PCA dimension forwarded to ``feature_vector``.
        mechanism: Per-event mechanism strings.
        action: Per-event action strings (e.g. increase / decrease).
        trmod: ``'char'`` appends the tensor stored in ``data/tr572.pt``,
            ``'word'`` appends the one in ``data/tr_word_572.pt``; any other
            value (e.g. ``'n'``) appends nothing and reads no file.
        drugA: Per-event name of the first drug. When omitted, the
            module-level ``drugA`` is used, as before.
        drugB: Per-event name of the second drug. When omitted, the
            module-level ``drugB`` is used, as before.

    Returns:
        Tuple ``(new_feature, new_label, event_num)`` where ``new_feature``
        has one row per event, ``new_label`` holds the integer event ids and
        ``event_num`` is the module-level ``event_num`` passed through
        unchanged (it is not recomputed from the data).
    """
    # Backward compatibility: earlier callers relied on notebook globals.
    if drugA is None:
        drugA = globals()["drugA"]
    if drugB is None:
        drugB = globals()["drugB"]

    # Event string per interaction: "<mechanism> <action>".
    d_event = [mech + " " + act for mech, act in zip(mechanism, action)]

    # Rank events by descending count; most_common() sorts stably, so ties keep
    # first-seen order exactly like sorted(..., reverse=True) on the counts.
    d_label = {
        event: rank
        for rank, (event, _) in enumerate(Counter(d_event).most_common())
    }

    drug_names = np.array(df_drug['name']).tolist()

    # Column blocks to concatenate, starting from an empty (n_drugs, 0) block.
    blocks = [np.zeros((len(drug_names), 0), dtype=float)]
    for feature_name in feature_list:
        blocks.append(feature_vector(feature_name, df_drug, vector_size))

    # Pre-trained per-drug vectors; only the file that is actually requested
    # is read. (The original notes say the 'word' variant performed best.)
    embedding_paths = {
        'char': 'data/tr572.pt',
        'word': 'data/tr_word_572.pt',
    }
    if trmod in embedding_paths:
        blocks.append(torch.load(embedding_paths[trmod]))

    vector = np.hstack(blocks)

    # Drug name -> feature vector (a later duplicate name overrides an earlier one).
    d_feature = dict(zip(drug_names, vector))

    # One concatenated [drugA | drugB] row and one label per event.
    new_feature = []
    new_label = []
    for name_a, name_b, event in zip(drugA, drugB, d_event):
        new_feature.append(np.hstack((d_feature[name_a], d_feature[name_b])))
        new_label.append(d_label[event])
    new_feature = np.array(new_feature)
    new_label = np.array(new_label)
    return (new_feature, new_label, event_num)


In [5]:
# Read the interaction events and the drug table from the SQLite database,
# then close the connection so the file handle is not left open for the
# lifetime of the kernel.
conn = sqlite3.connect('data/event.db')
try:
    extraction = pd.read_sql('select * from extraction;', conn)
    df_drug = pd.read_sql('select * from drug;', conn)
finally:
    conn.close()

# Per-event columns used to build labels and pair features.
mechanism = extraction['mechanism']
action = extraction['action']  # increase / decrease
drugA = extraction['drugA']
drugB = extraction['drugB']

# Drug table from CSV; later cells look up "smile" by "name" in it.
dgs = pd.read_csv('data/df_drug.csv')

In [6]:
# Build pair features and integer event labels; trmod='word' makes prepare_tr
# append the vectors stored in data/tr_word_572.pt.
new_feature, new_label, event_num = prepare_tr(
    df_drug,
    feature_list,
    vector_size,
    mechanism,
    action,
    trmod='word',
)

In [None]:
# Build the output data files


In [None]:
# ccatp_feature.pt: copy the pair-feature array into a tensor and persist it.
ccatp_feature = torch.tensor(new_feature)
torch.save(obj=ccatp_feature, f="ccatp_feature.pt")

In [7]:
# ddi.csv: one row per interaction event -> (smile of drug 1, smile of drug 2, label)

# Name -> smile lookup built once instead of scanning dgs for every event.
# setdefault keeps the first match, like the previous `.iloc[0]`.
smile_by_name = {}
for drug_name, drug_smile in zip(dgs["name"], dgs["smile"]):
    smile_by_name.setdefault(drug_name, drug_smile)

# newline="" lets the csv module control line endings (no blank rows on Windows).
with open("ddi.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["drug1","drug2","label"])
    for idx, row in enumerate(extraction.itertuples()):
        # row[0] is the frame index; positions 4 and 5 are presumably the
        # drugA / drugB columns of `extraction` - verify against the table schema.
        drug1 = smile_by_name[row[4]]
        drug2 = smile_by_name[row[5]]
        label = new_label[idx]
        writer.writerow([drug1, drug2, label])

In [8]:
# smile.csv: one (name, smile) row per drug in dgs.
# newline="" lets the csv module control line endings (no blank rows on Windows).
with open("smile.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["name","smile"])
    for row in dgs.itertuples():
        # row[0] is the frame index; positions 8 and 7 are presumably the
        # "name" and "smile" columns of dgs - verify against the CSV header.
        writer.writerow([row[8], row[7]])