In [1]:
import os
import glob
import pickle
import copy
import argparse
import itertools
import pandas as pd
import time
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem
import tqdm

In [2]:
def parse_DDI_input_file(input_file, output_file):
    drug_pair_info = {}
    drug_smiles_info = {}
    with open(input_file, 'r') as fp:
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug_name = sptlist[1].strip()
            smiles = sptlist[2].strip()

            drug_smiles_info[(prescription, drug_name)] = smiles
            # 0	drug d(vitamin c)	[H][C@@]1(OC(=O)C(O)=C1O)[C@@H](O)CO
            # 0	lemon(cholesterol)	CC(C)CCCC(C)C1CCC2C3CC=C4CC(O)CCC4(C)C3CCC12C

            # --> 0: drug d(vitamin c),lemon(cholesterol)
            if prescription not in drug_pair_info:
                drug_pair_info[prescription] = [drug_name]
            else:
                drug_pair_info[prescription].append(drug_name)


    out_fp = open(output_file, 'w')
    for each_prescription in drug_pair_info:
        drug_names = drug_pair_info[each_prescription]
        for each_set in itertools.combinations(drug_names, 2):
            drug1 = each_set[0].strip()
            drug1_smiles = drug_smiles_info[(each_prescription, drug1)]

            drug2 = each_set[1].strip()
            drug2_smiles = drug_smiles_info[(each_prescription, drug2)]
            out_fp.write('%s\t%s\t%s\t%s\t%s\n'%(each_prescription, drug1, drug1_smiles, drug2, drug2_smiles))
    out_fp.close()
    return

In [3]:
parse_DDI_input_file('data/parsed_input.txt', 'test/DDI_input.txt')
print("over")

over


In [4]:
drug_dir='./data/DrugBank5.0_Approved_drugs/' 
input_file='output/DDI_input.txt'
output='output/similarity_profile.csv' 
drug_list = []
with open('./data/DrugList.txt', 'r') as fp:
    for line in fp:
        drug_list.append(line.strip())

In [5]:
 # drug_dir:'./data/DrugBank5.0_Approved_drugs/' input_file:'output/DDI_input.txt'
    # output:'output/similarity_profile.csv' drug_list:  ./data/DrugList.txt
def calculate_structure_similarity(drug_dir, input_file, output_file, drug_list):
    drugbank_drugs = glob.glob(drug_dir + '*')
    all_input_drug_info = {}
    with open(input_file, 'r')as fp:
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug1 = sptlist[1].strip()
            smiles1 = sptlist[2].strip()
            drug2 = sptlist[3].strip()
            smiles2 = sptlist[4].strip()
            if drug1 not in all_input_drug_info:
                all_input_drug_info[drug1] = smiles1
            if drug2 not in all_input_drug_info:
                all_input_drug_info[drug2] = smiles2            
        
    drug_similarity_info = {}
    for input_drug_id in all_input_drug_info:   
        try:
            each_smiles = all_input_drug_info[input_drug_id]
            drug2_mol = Chem.MolFromSmiles(each_smiles)
            drug2_mol = AllChem.AddHs(drug2_mol)            
        except:
            continue
        drug_similarity_info[input_drug_id] = {}
        for each_drug_id1 in drugbank_drugs:            
            drugbank_id = os.path.basename(each_drug_id1).split('.')[0]
            
            drug1_mol = Chem.MolFromMolFile(each_drug_id1)        
            drug1_mol = AllChem.AddHs(drug1_mol)    
            
            fps = AllChem.GetMorganFingerprint(drug1_mol, 2)
            fps2 = AllChem.GetMorganFingerprint(drug2_mol, 2)
            score = DataStructs.TanimotoSimilarity(fps, fps2)
            drug_similarity_info[input_drug_id][drugbank_id] = score
            
    df = pd.DataFrame.from_dict(drug_similarity_info)
    print(df.head(10))
    df = df.T
    df = df[drug_list]
    df.to_csv(output_file)

In [6]:
calculate_structure_similarity(drug_dir, input_file, output, drug_list)

         current drug(vitamin c)  other drug a(vitamin a)  \
DB00006                 0.037940                 0.084088   
DB00014                 0.061810                 0.147303   
DB00027                 0.029455                 0.087447   
DB00035                 0.062670                 0.121359   
DB00050                 0.053892                 0.125704   
DB00080                 0.056673                 0.105442   
DB00091                 0.046559                 0.121905   
DB00093                 0.054645                 0.114355   
DB00104                 0.083565                 0.132678   
DB00106                 0.053571                 0.133459   

         other drug b(riboflavin)  other drug b(acetaminophen)  \
DB00006                  0.076129                     0.055172   
DB00014                  0.134298                     0.090909   
DB00027                  0.083333                     0.051205   
DB00035                  0.119804                     0.101983  

In [9]:
def generate_input_profile(input_file, pca_profile_file):    
    df = pd.read_csv(pca_profile_file, index_col=0)
    # df.index = df.index.map(str)
    
    all_drugs = []
    interaction_list = []
    with open(input_file, 'r') as fp:
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug1 = sptlist[1].strip()
            drug2 = sptlist[3].strip()
            all_drugs.append(drug1)
            all_drugs.append(drug2)
            if drug1 in df.index and drug2 in df.index:
                interaction_list.append([prescription, drug1, drug2])
                interaction_list.append([prescription, drug2, drug1])
    
    drug_feature_info = {}
    columns = ['PC_%d' % (i + 1) for i in range(50)]
    for row in df.itertuples():
        drug = row.Index
        feature = []
        drug_feature_info[drug] = {}
        for col in columns:
            val = getattr(row, col) #遍历columns列表中的每个列名col，使用getattr(row, col)获取该药物在当前特征列上的值，并将其存储到feature列表中
            feature.append(val)
            drug_feature_info[drug][col] = val

    new_col1 = ['1_%s'%(i) for i in columns]
    new_col2 = ['2_%s'%(i) for i in columns]
    
    DDI_input = {}
    for each_drug_pair in tqdm.tqdm(interaction_list):
        prescription = each_drug_pair[0]
        drug1 = each_drug_pair[1]
        drug2 = each_drug_pair[2]
        key = '%s_%s_%s' % (prescription, drug1, drug2)
        
        DDI_input[key] = {}
        for col in columns:
            new_col = '1_%s'%(col)
            DDI_input[key][new_col] = drug_feature_info[drug1][col]
            
        for col in columns:
            new_col = '2_%s'%(col)
            DDI_input[key][new_col] = drug_feature_info[drug2][col]

    new_columns = []
    for i in [1,2]:
        for j in range(1, 51):
            new_key = '%s_PC_%s'%(i, j)
            new_columns.append(new_key)
            
    df = pd.DataFrame.from_dict(DDI_input)
    df = df.T
    df = df[new_columns]
    print(df.head())
    # df.to_csv(output_file)
    return df

In [10]:
pca_similarity_profile = 'output/PCA_transformed_similarity_profile.csv'
generate_input_profile(input_file, pca_similarity_profile)

100%|██████████| 16/16 [00:00<00:00, 14788.20it/s]

                                                      1_PC_1    1_PC_2  \
0_current drug(vitamin c)_other drug a(vitamin a)   4.779081 -0.669763   
0_other drug a(vitamin a)_current drug(vitamin c)   1.157777  2.189545   
1_current drug(vitamin c)_other drug b(riboflavin)  4.779081 -0.669763   
1_other drug b(riboflavin)_current drug(vitamin c)  0.849620 -0.201319   
2_current drug(vitamin c)_other drug b(acetamin...  4.779081 -0.669763   

                                                      1_PC_3    1_PC_4  \
0_current drug(vitamin c)_other drug a(vitamin a)   0.315355 -1.068710   
0_other drug a(vitamin a)_current drug(vitamin c)   0.193845 -0.175013   
1_current drug(vitamin c)_other drug b(riboflavin)  0.315355 -1.068710   
1_other drug b(riboflavin)_current drug(vitamin c)  0.764182 -0.292304   
2_current drug(vitamin c)_other drug b(acetamin...  0.315355 -1.068710   

                                                      1_PC_5    1_PC_6  \
0_current drug(vitamin c)_other drug




Unnamed: 0,1_PC_1,1_PC_2,1_PC_3,1_PC_4,1_PC_5,1_PC_6,1_PC_7,1_PC_8,1_PC_9,1_PC_10,...,2_PC_41,2_PC_42,2_PC_43,2_PC_44,2_PC_45,2_PC_46,2_PC_47,2_PC_48,2_PC_49,2_PC_50
0_current drug(vitamin c)_other drug a(vitamin a),4.779081,-0.669763,0.315355,-1.06871,0.328521,-0.143517,0.341388,0.918898,-0.337638,-0.156341,...,0.508362,0.279498,0.232231,0.05207,0.04537,-0.05268,0.113405,-0.14138,0.162729,0.117012
0_other drug a(vitamin a)_current drug(vitamin c),1.157777,2.189545,0.193845,-0.175013,-0.644596,0.285035,-0.08224,-0.22479,0.170635,0.093212,...,0.000991,-0.01812,0.090887,-0.017237,0.080754,-0.075362,-0.097176,0.009142,0.000163,-0.087791
1_current drug(vitamin c)_other drug b(riboflavin),4.779081,-0.669763,0.315355,-1.06871,0.328521,-0.143517,0.341388,0.918898,-0.337638,-0.156341,...,0.22773,0.048207,0.137785,-0.146022,-0.097824,-0.071615,-0.031287,-0.004953,-0.051761,-0.10423
1_other drug b(riboflavin)_current drug(vitamin c),0.84962,-0.201319,0.764182,-0.292304,-0.206393,-0.273284,-0.238824,0.543933,-0.066288,-0.881632,...,0.000991,-0.01812,0.090887,-0.017237,0.080754,-0.075362,-0.097176,0.009142,0.000163,-0.087791
2_current drug(vitamin c)_other drug b(acetaminophen),4.779081,-0.669763,0.315355,-1.06871,0.328521,-0.143517,0.341388,0.918898,-0.337638,-0.156341,...,-0.079871,0.068774,0.096829,-0.03117,-0.051972,-0.046326,0.095257,-0.054974,-0.229733,-0.012452
2_other drug b(acetaminophen)_current drug(vitamin c),1.484298,-2.561948,-0.015537,-0.670673,0.813794,0.884796,-0.125139,-0.224663,-0.19337,0.417956,...,0.000991,-0.01812,0.090887,-0.017237,0.080754,-0.075362,-0.097176,0.009142,0.000163,-0.087791
3_current drug(vitamin c)_other drug c(formoterol),4.779081,-0.669763,0.315355,-1.06871,0.328521,-0.143517,0.341388,0.918898,-0.337638,-0.156341,...,0.065523,-0.058084,0.193115,-0.100984,0.00614,-0.125687,0.044221,-0.219984,-0.164081,0.067981
3_other drug c(formoterol)_current drug(vitamin c),-1.676893,-0.926036,0.668989,0.875413,-0.238925,0.440258,0.238083,0.266604,-0.640567,-0.417227,...,0.000991,-0.01812,0.090887,-0.017237,0.080754,-0.075362,-0.097176,0.009142,0.000163,-0.087791
4_current drug(ritonavir)_other drug a(vitamin a),0.694746,0.872233,0.647547,2.358324,0.463532,-0.29043,0.125962,-0.240189,0.070273,-0.011636,...,0.508362,0.279498,0.232231,0.05207,0.04537,-0.05268,0.113405,-0.14138,0.162729,0.117012
4_other drug a(vitamin a)_current drug(ritonavir),1.157777,2.189545,0.193845,-0.175013,-0.644596,0.285035,-0.08224,-0.22479,0.170635,0.093212,...,-0.004691,-0.081305,0.058039,0.02745,-0.246305,-0.072478,-0.012652,-0.072926,-0.077315,-0.102


In [48]:
def predict_DDI(output_file, df, trained_model, trained_weight, threshold, binarizer_file, model_type):  
    with open(binarizer_file, 'rb') as fid:
        lb = pickle.load(fid)
    
    #print(dir(lb)) 
    
    print(df.head(10))
    
    #df = pd.read_csv(ddi_input_file, index_col=0)    
    ddi_pairs = list(df.index)
    print("ddi_pairs:")
    print(len(ddi_pairs))
    X = df.values    
    
    json_file = open(trained_model, "r")
    loaded_model_json = json_file.read() 
    json_file.close()

    model = model_from_json(loaded_model_json)
    model.load_weights(trained_weight)

    mc_predictions = []
    iter_num = 10
    for i in range(iter_num):
        y_predicted = model.predict(X) #(16,113)
        mc_predictions.append(y_predicted)

    print("mc_predictions:")
    arr = np.asarray(mc_predictions)
    print(arr.shape)
    
    y_predicted_mean = np.mean(arr, axis=0)
    
    print(" y_predicted_mean_before:")
    print( y_predicted_mean.shape)
    
    y_predicted_std = np.std(arr, axis=0)
    
    original_predicted_ddi = copy.deepcopy(y_predicted_mean)
    original_predicted_ddi_std = copy.deepcopy(y_predicted_std)

    y_predicted_mean[y_predicted_mean >= threshold] = 1 # 根据阈值将预测概率大于threshold的项设置为1
    y_predicted_mean[y_predicted_mean < threshold] = 0
    
    print(" y_predicted_mean_after:")
    print(y_predicted_mean[1])
        
    y_predicted_inverse = lb.inverse_transform(y_predicted_mean)   # 将二进制的结果返回标签 即为1的项是对应的标签 所以预测的结果可能不止一个
    
    print("y_predicted_inverse:")
    print( y_predicted_inverse)
    
    fp = open(output_file, 'w')
    fp.write('Drug pair\tPredicted class\tScore\tSTD\n')
    for i in range(len(ddi_pairs)):
        predicted_ddi_score = original_predicted_ddi[i]
        predicted_ddi_std = original_predicted_ddi_std[i]
        predicted_ddi = y_predicted_inverse[i] # 使用之前加载的二值化器对象(lb)将二进制形式的预测结果转换回原始的分类标签
        each_ddi = ddi_pairs[i]           
        for each_predicted_ddi in predicted_ddi:
            if model_type == 'model2':
                fp.write('%s\t%s\t%s\t%s\n'%(each_ddi, each_predicted_ddi+113, predicted_ddi_score[each_predicted_ddi-1], predicted_ddi_std[each_predicted_ddi-1]))

            else:
                fp.write('%s\t%s\t%s\t%s\n'%(each_ddi, each_predicted_ddi, predicted_ddi_score[each_predicted_ddi-1], predicted_ddi_std[each_predicted_ddi-1]))

    fp.close()

In [49]:
from keras.models import model_from_json 
import numpy as np
output_file = 'test/DDI_result.txt'
ddi_trained_model = './data/models/ddi_model.json'
ddi_trained_model_weight='./data/models/ddi_model.h5'
model1_threshold=0.4
binarizer_file='./data/multilabelbinarizer.pkl'
model_type='model1'
pca_df = generate_input_profile(input_file, pca_similarity_profile)
predict_DDI(output_file, pca_df, ddi_trained_model, ddi_trained_model_weight, model1_threshold, binarizer_file, model_type)

100%|██████████| 16/16 [00:00<00:00, 21536.86it/s]


                                                      1_PC_1    1_PC_2  \
0_current drug(vitamin c)_other drug a(vitamin a)   4.779081 -0.669763   
0_other drug a(vitamin a)_current drug(vitamin c)   1.157777  2.189545   
1_current drug(vitamin c)_other drug b(riboflavin)  4.779081 -0.669763   
1_other drug b(riboflavin)_current drug(vitamin c)  0.849620 -0.201319   
2_current drug(vitamin c)_other drug b(acetamin...  4.779081 -0.669763   

                                                      1_PC_3    1_PC_4  \
0_current drug(vitamin c)_other drug a(vitamin a)   0.315355 -1.068710   
0_other drug a(vitamin a)_current drug(vitamin c)   0.193845 -0.175013   
1_current drug(vitamin c)_other drug b(riboflavin)  0.315355 -1.068710   
1_other drug b(riboflavin)_current drug(vitamin c)  0.764182 -0.292304   
2_current drug(vitamin c)_other drug b(acetamin...  0.315355 -1.068710   

                                                      1_PC_5    1_PC_6  \
0_current drug(vitamin c)_other drug

In [46]:
def calculate_structure_similarity(drug_dir, input_file, output_file, drug_list):
    drugbank_drugs = glob.glob(drug_dir + '*')
    print(len(drugbank_drugs)) # 2386
    all_input_drug_info = {}
    with open(input_file, 'r')as fp:
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug1 = sptlist[1].strip()
            smiles1 = sptlist[2].strip()
            drug2 = sptlist[3].strip()
            smiles2 = sptlist[4].strip()
            if drug1 not in all_input_drug_info:
                all_input_drug_info[drug1] = smiles1
            if drug2 not in all_input_drug_info:
                all_input_drug_info[drug2] = smiles2            
        
    drug_similarity_info = {}
    for input_drug_id in all_input_drug_info:   
        try:
            each_smiles = all_input_drug_info[input_drug_id]
            drug2_mol = Chem.MolFromSmiles(each_smiles)
            drug2_mol = AllChem.AddHs(drug2_mol)            
        except:
            continue
        drug_similarity_info[input_drug_id] = {}
        for each_drug_id1 in drugbank_drugs:            
            drugbank_id = os.path.basename(each_drug_id1).split('.')[0]
            
            drug1_mol = Chem.MolFromMolFile(each_drug_id1)        
            drug1_mol = AllChem.AddHs(drug1_mol)    
            
            fps = AllChem.GetMorganFingerprint(drug1_mol, 2)
            fps2 = AllChem.GetMorganFingerprint(drug2_mol, 2)
            score = DataStructs.TanimotoSimilarity(fps, fps2)
            drug_similarity_info[input_drug_id][drugbank_id] = score
            
    df = pd.DataFrame.from_dict(drug_similarity_info)
    df = df.T
    df = df[drug_list]
    df.to_csv(output_file)

In [47]:
drug_dir='./data/DrugBank5.0_Approved_drugs/' 
input_file='output/DDI_input.txt'
output='test/similarity_profile.csv'
calculate_structure_similarity(drug_dir, input_file, output_file, drug_list)

2386


In [33]:
import torch
import os
import glob
import pickle
import pandas as pd
import openpyxl
from tqdm import tqdm
from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit import Chem
import numpy as np

In [16]:
def concat_drkg(input_file):
    with open('./data/drkg/index.pkl','rb') as file:
        index = pickle.load(file)
    print(len(index.keys()))
    node_emb = np.load('./data/drkg/DRKG_TransE_l2_entity.npy')
    print(node_emb.shape)
    non = set()
    all_drugs = []
    interaction_list = []
    with open(input_file, 'r') as fp:
        next(fp)  # 跳过第一行
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug1 = sptlist[1].strip()
            drug2 = sptlist[3].strip()
            all_drugs.append(drug1)
            all_drugs.append(drug2)
            if drug1 in index.keys() and drug2 in index.keys():
                interaction_list.append([prescription, drug1, drug2])
                interaction_list.append([prescription, drug2, drug1])
            elif drug1 not in index.keys():
                non.add(drug1)
            elif drug2 not in index.keys():
                non.add(drug2)
    print(non)
    # with open('./non.pkl','wb') as file:
    #           pickle.dump(non,file)
    drug_feature_info = {}
    columns = ['PC_%d' % (i + 1) for i in range(50)]
    DDI_input = {}
    for each_drug_pair in tqdm(interaction_list):
        prescription = each_drug_pair[0]
        drug1 = each_drug_pair[1]
        drug2 = each_drug_pair[2]
        key = '%s_%s_%s' % (prescription, drug1, drug2)

        DDI_input[key] = {}
        count = 0
        for col in columns:
            new_col = '1_%s' % (col)
            DDI_input[key][new_col] = node_emb[int(index[drug1])][count]
            count+=1

        count = 0
        for col in columns:
            new_col = '2_%s' % (col)
            DDI_input[key][new_col] = node_emb[int(index[drug2])][count]
            count+=1


    new_columns = []
    for i in [1, 2]:
        for j in range(1, 51):
            new_key = '%s_PC_%s' % (i, j)
            new_columns.append(new_key)

    df = pd.DataFrame.from_dict(DDI_input)
    df = df.T
    df = df[new_columns] # 选择特定的列
    print("数据集长度：")
    print(df.shape)
    df.to_csv('./test/drkg.csv')
    return df

In [17]:
concat_drkg('./test/my_ddi_input.txt')
print("over")

97167
(97238, 50)
{'DB11255', 'DB09323', 'DB13116'}


100%|██████████| 443636/443636 [00:41<00:00, 10796.71it/s]


数据集长度：
(443636, 100)
over


In [None]:
print("over")

In [10]:
import pandas as pd
from tqdm import tqdm

# 读取CSV文件
df = pd.read_csv('./test/my_pca_merge_ddi.csv')
print(df.head())

# 将index列按下划线分割成三列
df[['col1', 'col2', 'col3']] = df['Unnamed: 0'].str.split('_', expand=True) # df[['col1', 'col2', 'col3']]：选择DataFrame中的列，并指定列名为 col1、col2 和 col3

# 创建一个集合，包含需要删除的值
values_to_delete = {'DB11255', 'DB09323', 'DB13116'}

# 创建一个空的DataFrame用于保存筛选后的数据
filtered_df = pd.DataFrame()

# 遍历每一行，并使用tqdm显示进度条
for _, row in tqdm(df.iterrows(), total=len(df), desc='Processing Rows'):
    if row['col2'] not in values_to_delete or row['col3'] not in values_to_delete:
        filtered_df = filtered_df.append(row)

# 重置索引
filtered_df = filtered_df.reset_index(drop=True)

# 删除不需要的列
filtered_df = filtered_df.drop(['col1', 'col2', 'col3'], axis=1)

print(filtered_df.shape)

# 保存结果到新的CSV文件
filtered_df.to_csv('./test/new_pca50_merge_ddi.csv', index=False)


          Unnamed: 0    1_PC_1    1_PC_2    1_PC_3    1_PC_4    1_PC_5  \
0  0_DB01115_DB08807 -2.398428 -1.182381  0.706467 -0.146689 -0.174887   
1  0_DB08807_DB01115 -3.971019 -0.445289  1.269032  1.903749 -0.258912   
2  1_DB01115_DB00187 -2.398428 -1.182381  0.706467 -0.146689 -0.174887   
3  1_DB00187_DB01115 -3.724583  0.188417  2.805852  0.289996 -0.195068   
4  2_DB01115_DB00871 -2.398428 -1.182381  0.706467 -0.146689 -0.174887   

     1_PC_6    1_PC_7    1_PC_8    1_PC_9  ...   2_PC_41   2_PC_42   2_PC_43  \
0 -0.128857 -0.292951 -0.375137  0.017772  ...  0.039695 -0.028915  0.152359   
1  0.101450  0.344638 -0.170433 -0.279006  ...  0.245272 -0.252027  0.369327   
2 -0.128857 -0.292951 -0.375137  0.017772  ... -0.097658 -0.082437  0.022612   
3  0.773993 -0.344943  0.091687 -0.876016  ...  0.245272 -0.252027  0.369327   
4 -0.128857 -0.292951 -0.375137  0.017772  ... -0.046064 -0.098426  0.153622   

    2_PC_44   2_PC_45   2_PC_46   2_PC_47   2_PC_48   2_PC_49   2_PC_50  


Processing Rows:   7%|▋         | 31310/444254 [08:08<1:47:20, 64.12it/s]


KeyboardInterrupt: 

In [13]:
non = {'DB11255', 'DB09323', 'DB13116'}
def generate_input_profile(input_file, pca_profile_file):
    df = pd.read_csv(pca_profile_file, index_col=0)
    all_drugs = []
    interaction_list = []
    with open(input_file, 'r') as fp:
        next(fp)  # 跳过第一行
        for line in fp:
            sptlist = line.strip().split('\t')
            prescription = sptlist[0].strip()
            drug1 = sptlist[1].strip()
            drug2 = sptlist[3].strip()
            all_drugs.append(drug1)
            all_drugs.append(drug2)
            if drug1 not in non and drug2 not in non:
                interaction_list.append([prescription, drug1, drug2])
                interaction_list.append([prescription, drug2, drug1])

    drug_feature_info = {}
    columns = ['PC_%d' % (i + 1) for i in range(50)]
    for row in df.itertuples():
        drug = row.Index
        feature = []
        drug_feature_info[drug] = {}
        for col in columns:
            val = getattr(row, col)  # 遍历columns列表中的每个列名col，使用getattr(row, col)获取该药物在当前特征列上的值，并将其存储到feature列表中
            feature.append(val)
            drug_feature_info[drug][col] = val

    new_col1 = ['1_%s' % (i) for i in columns]
    new_col2 = ['2_%s' % (i) for i in columns]

    DDI_input = {}
    for each_drug_pair in tqdm(interaction_list):
        prescription = each_drug_pair[0]
        drug1 = each_drug_pair[1]
        drug2 = each_drug_pair[2]
        key = '%s_%s_%s' % (prescription, drug1, drug2)

        DDI_input[key] = {}
        for col in columns:
            new_col = '1_%s' % (col)
            DDI_input[key][new_col] = drug_feature_info[drug1][col]

        for col in columns:
            new_col = '2_%s' % (col)
            DDI_input[key][new_col] = drug_feature_info[drug2][col]


    new_columns = []
    for i in [1, 2]:
        for j in range(1, 51):
            new_key = '%s_PC_%s' % (i, j)
            new_columns.append(new_key)

    df = pd.DataFrame.from_dict(DDI_input)
    df = df.T
    df = df[new_columns] # 选择特定的列
    print("数据集长度：")
    print(df.shape)
    df.to_csv('./test/new_pca50_merge_ddi.csv')
    return df


In [14]:
generate_input_profile('./test/my_ddi_input.txt', './data/drug_tanimoto_PCA50.csv')
print("over")

100%|██████████| 443636/443636 [00:21<00:00, 20800.84it/s]


数据集长度：
(443636, 100)
over


Processing: 100%|██████████| 222127/222127 [00:00<00:00, 560232.40it/s]


标签长度：


FileNotFoundError: [Errno 2] No such file or directory: '../test/new_label.csv'

In [15]:
def generate_label(input_file,pca_profile_file):
    df = pd.read_csv(pca_profile_file, index_col=0)
    Label = []
    with open(input_file, 'r') as fp:
        next(fp)
        total_lines = sum(1 for line in fp)  # 获取文件总行数
        fp.seek(0)  # 重置文件指针到开头
        next(fp)  # 跳过第一行
        for line in tqdm(fp, total=total_lines, desc="Processing"):  # 添加进度条
            sptlist = line.strip().split('\t')
            drug1 = sptlist[1].strip()
            drug2 = sptlist[3].strip()
            label1 = label2= sptlist[5].strip()
            # print(label1)
            if drug1 not in non and drug2 not in non:
                Label.append(label1)
                Label.append(label2)
        # 创建包含 Label 数据的 DataFrame
    label_df = pd.DataFrame(Label, columns=['label'])
    print("标签长度：")
    # 将 DataFrame 保存为 CSV 文件
    label_df.to_csv('./test/new_label.csv', index=False)
    return
generate_label('./test/my_ddi_input.txt', './data/drug_tanimoto_PCA50.csv')
print("over2")

Processing: 100%|██████████| 222127/222127 [00:00<00:00, 576695.77it/s]


标签长度：
over2


# 相加 DRKG 和 pca50

In [26]:
# 读取第一个DRKG
df1 = pd.read_csv('./test/drkg.csv')

# 读取第二个pca
df2 = pd.read_csv('./test/new_pca50_merge_ddi.csv')

# 将DataFrame转换为NumPy数组
array1 = df1.to_numpy()
array2 = df2.to_numpy()

# 对两个数组进行相加
result = array1 + array2

# 创建包含结果的DataFrame
result_df = pd.DataFrame(result)

# 将结果保存到新的CSV文件
result_df.to_csv('./test/embedding_merge.csv',index=False)
print("over")

over


In [29]:
df1 = pd.read_csv('./test/embedding_merge.csv')
print(df1.shape)

(443636, 101)


In [30]:
df1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0_DB01115_DB088070_DB01115_DB08807,-1.849719,-1.214734,-0.172227,-0.746839,-1.003626,-0.801818,-0.139167,-0.823107,1.01498,...,0.894547,-0.161951,-1.172961,1.06269,1.13053,1.168596,0.422025,-0.412213,-0.833719,1.34741
1,0_DB08807_DB011150_DB08807_DB01115,-3.249301,0.298642,0.640146,1.141439,-0.714042,-0.608179,0.585138,0.599142,0.900083,...,0.337215,0.557635,-0.716982,1.910869,0.773416,1.722536,0.788302,-0.682883,-0.881405,0.365298
2,1_DB01115_DB001871_DB01115_DB00187,-1.849719,-1.214734,-0.172227,-0.746839,-1.003626,-0.801818,-0.139167,-0.823107,1.01498,...,0.527571,1.311746,-0.60131,1.724721,1.048649,1.840106,0.676813,-0.959984,-0.356633,1.485138
3,1_DB00187_DB011151_DB00187_DB01115,-3.522489,-0.053816,1.850403,-1.225066,-2.050826,0.023559,0.509622,0.182884,0.397888,...,0.337215,0.557635,-0.716982,1.910869,0.773416,1.722536,0.788302,-0.682883,-0.881405,0.365298
4,2_DB01115_DB008712_DB01115_DB00871,-1.849719,-1.214734,-0.172227,-0.746839,-1.003626,-0.801818,-0.139167,-0.823107,1.01498,...,0.823506,1.224304,-1.024477,1.536554,0.62031,1.772415,1.305437,-0.871615,-0.449548,0.411876


In [28]:
df2 = pd.read_csv('./test/my_pca_merge_ddi.csv')
df2.head()

Unnamed: 0.1,Unnamed: 0,1_PC_1,1_PC_2,1_PC_3,1_PC_4,1_PC_5,1_PC_6,1_PC_7,1_PC_8,1_PC_9,...,2_PC_41,2_PC_42,2_PC_43,2_PC_44,2_PC_45,2_PC_46,2_PC_47,2_PC_48,2_PC_49,2_PC_50
0,0_DB01115_DB08807,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,0.039695,-0.028915,0.152359,-0.05443,-0.085126,0.047993,0.178879,0.047851,-0.019014,0.166939
1,0_DB08807_DB01115,-3.971019,-0.445289,1.269032,1.903749,-0.258912,0.10145,0.344638,-0.170433,-0.279006,...,0.245272,-0.252027,0.369327,0.238358,0.125903,0.21813,0.028721,0.017302,-0.108301,-0.030185
2,1_DB01115_DB00187,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,-0.097658,-0.082437,0.022612,-0.067926,-0.027426,0.181933,0.011387,-0.193106,-0.014879,0.050949
3,1_DB00187_DB01115,-3.724583,0.188417,2.805852,0.289996,-0.195068,0.773993,-0.344943,0.091687,-0.876016,...,0.245272,-0.252027,0.369327,0.238358,0.125903,0.21813,0.028721,0.017302,-0.108301,-0.030185
4,2_DB01115_DB00871,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,-0.046064,-0.098426,0.153622,-0.055052,-0.104575,-0.018069,-0.012775,-0.032028,-0.049096,0.093242


In [31]:
df3 = pd.read_csv('./test/drkg.csv')
df3.head()

Unnamed: 0.1,Unnamed: 0,1_PC_1,1_PC_2,1_PC_3,1_PC_4,1_PC_5,1_PC_6,1_PC_7,1_PC_8,1_PC_9,...,2_PC_41,2_PC_42,2_PC_43,2_PC_44,2_PC_45,2_PC_46,2_PC_47,2_PC_48,2_PC_49,2_PC_50
0,0_DB01115_DB08807,0.54871,-0.032353,-0.878695,-0.60015,-0.828739,-0.672961,0.153784,-0.44797,0.997208,...,0.854851,-0.133036,-1.32532,1.117119,1.215656,1.120603,0.243147,-0.460065,-0.814705,1.180471
1,0_DB08807_DB01115,0.721718,0.743931,-0.628886,-0.76231,-0.45513,-0.709629,0.2405,0.769575,1.179089,...,0.091943,0.809662,-1.086309,1.67251,0.647513,1.504406,0.759581,-0.700185,-0.773104,0.395483
2,1_DB01115_DB00187,0.54871,-0.032353,-0.878695,-0.60015,-0.828739,-0.672961,0.153784,-0.44797,0.997208,...,0.625229,1.394182,-0.623922,1.792647,1.076075,1.658172,0.665426,-0.766879,-0.341754,1.434189
3,1_DB00187_DB01115,0.202094,-0.242233,-0.955449,-1.515062,-1.855757,-0.750434,0.854565,0.091198,1.273904,...,0.091943,0.809662,-1.086309,1.67251,0.647513,1.504406,0.759581,-0.700185,-0.773104,0.395483
4,2_DB01115_DB00871,0.54871,-0.032353,-0.878695,-0.60015,-0.828739,-0.672961,0.153784,-0.44797,0.997208,...,0.86957,1.32273,-1.178099,1.591607,0.724884,1.790485,1.318212,-0.839588,-0.400452,0.318634


In [32]:
df4 = pd.read_csv('./test/new_pca50_merge_ddi.csv')
df4.head()

Unnamed: 0.1,Unnamed: 0,1_PC_1,1_PC_2,1_PC_3,1_PC_4,1_PC_5,1_PC_6,1_PC_7,1_PC_8,1_PC_9,...,2_PC_41,2_PC_42,2_PC_43,2_PC_44,2_PC_45,2_PC_46,2_PC_47,2_PC_48,2_PC_49,2_PC_50
0,0_DB01115_DB08807,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,0.039695,-0.028915,0.152359,-0.05443,-0.085126,0.047993,0.178879,0.047851,-0.019014,0.166939
1,0_DB08807_DB01115,-3.971019,-0.445289,1.269032,1.903749,-0.258912,0.10145,0.344638,-0.170433,-0.279006,...,0.245272,-0.252027,0.369327,0.238358,0.125903,0.21813,0.028721,0.017302,-0.108301,-0.030185
2,1_DB01115_DB00187,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,-0.097658,-0.082437,0.022612,-0.067926,-0.027426,0.181933,0.011387,-0.193106,-0.014879,0.050949
3,1_DB00187_DB01115,-3.724583,0.188417,2.805852,0.289996,-0.195068,0.773993,-0.344943,0.091687,-0.876016,...,0.245272,-0.252027,0.369327,0.238358,0.125903,0.21813,0.028721,0.017302,-0.108301,-0.030185
4,2_DB01115_DB00871,-2.398428,-1.182381,0.706467,-0.146689,-0.174887,-0.128857,-0.292951,-0.375137,0.017772,...,-0.046064,-0.098426,0.153622,-0.055052,-0.104575,-0.018069,-0.012775,-0.032028,-0.049096,0.093242


In [47]:

# 读取第一个DRKG
df1 = pd.read_csv('./test/drkg.csv')

# 读取第二个pca
df2 = pd.read_csv('./test/new_pca50_merge_ddi.csv')

# 拼接两个DataFrame
result_df = pd.concat([df1, df2], axis=1)  # 按列拼接


NameError: name 'component' is not defined

In [48]:
result_df = result_df.drop(result_df.columns[101],axis = 1)


Merged data saved.


In [51]:
# 创建要插入的新列数据
new_column_data = df1['Unnamed: 0']  # 根据实际需求替换为你的新列数据

# 插入新列作为第一列
result_df.insert(0, 'component', new_column_data)


In [52]:
result_df.shape

(443636, 201)

In [53]:
result_df.columns

Index(['component', '1_PC_1', '1_PC_2', '1_PC_3', '1_PC_4', '1_PC_5', '1_PC_6',
       '1_PC_7', '1_PC_8', '1_PC_9',
       ...
       '2_PC_41', '2_PC_42', '2_PC_43', '2_PC_44', '2_PC_45', '2_PC_46',
       '2_PC_47', '2_PC_48', '2_PC_49', '2_PC_50'],
      dtype='object', length=201)

In [56]:
result_df.columns[100]

'2_PC_50'

In [57]:
# 将结果保存到新的CSV文件
result_df.to_csv('./test/concat_data.csv', index=False)
print("Merged data saved.")

Merged data saved.


In [None]:
import pandas as pd
import numpy as np
import pickle

# 读取csv文件
df = pd.read_csv('./data/ddi_description_label.csv', header=None, names=['Drug A','Drug B','DDI sentence','num_of_words','label'])

# 划分训练集和测试集
train_ratio = 0.8
train_size = int(len(df) * train_ratio)
train_data = df[:train_size].reset_index(drop=True)
test_data = df[train_size:].reset_index(drop=True)

# 保存为pkl文件
data = {'train_data': train_data, 'test_data': test_data}
with open('data/ddi_label.pkl', 'wb') as f:
    pickle.dump(data, f)

print('数据已保存到 data/ddi_label.pkl')

In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split

# 读取数据集
data = pd.read_csv('./data/ddi_description_label.csv')

# 合并 Drug A 和 Drug B 列，并去重
all_drugs = pd.concat([data['Drug A'], data['Drug B']]).unique()

# 确保合并后的药品数量为 1861
print("Number of unique drugs: ", len(all_drugs))

# 按药品分组，并随机从每组中选择一定比例的样本放入训练集中
train_groups = []
test_groups = []
for drug in all_drugs:
    group = data[(data['Drug A'] == drug) | (data['Drug B'] == drug)]
    if len(group) > 1:
        train_group, test_group = train_test_split(group, test_size=0.2, train_size=0.8, random_state=42)
    else:
        train_group = group
        test_group = group
    train_groups.append(train_group)
    test_groups.append(test_group)

# 将所有的训练集和测试集合并
train_data = pd.concat(train_groups)
test_data = pd.concat(test_groups)

# 输出训练集和测试集的大小
print("Number of training examples: ", len(train_data))
print("Number of testing examples: ", len(test_data))

# 保存为 pkl 文件
data = {'train_data': train_data, 'test_data': test_data}
with open('data/ddi_label.pkl', 'wb') as f:
    pickle.dump(data, f)
