In [2]:
import pandas as pd

In [3]:
# Read the DTI edge list: a tab-separated file with no header row, whose three
# columns are labelled drug / relation / target on load.
dti_data = pd.read_csv(
    'dti.csv',
    sep='\t',
    header=None,
    names=['drug', 'relation', 'target'],
)
dti_data

Unnamed: 0,drug,relation,target
0,DB00001,inhibitor,P00734
1,DB00002,binder,P00533
2,DB00002,binder,O75015
3,DB00002,binder,P02745
4,DB00002,binder,P02746
...,...,...,...
30398,DB17851,binder,Q04609
30399,DB18236,inhibitor,Q07912
30400,DB18704,stimulator,Q99062
30401,DB18716,inhibitor,P37321


In [4]:
# Read the gzip-compressed, tab-separated drug/disease edge list. The file has
# no header row, so the columns are labelled drug / relation / disease on load.
drug_disease_data = pd.read_csv(
    'drug_disease.txt.gz',
    sep='\t',
    compression='gzip',
    header=None,
    names=['drug', 'relation', 'disease'],
)
drug_disease_data

Unnamed: 0,drug,relation,disease
0,DB01015,DRUG_DISEASE_ASSOCIATION,D010211
1,DB11251,DRUG_DISEASE_ASSOCIATION,D006529
2,DB01220,DRUG_DISEASE_ASSOCIATION,D006501
3,DB11231,DRUG_DISEASE_ASSOCIATION,D015430
4,DB00806,DRUG_DISEASE_ASSOCIATION,D014012
...,...,...,...
69432,DB01223,DRUG_DISEASE_ASSOCIATION,D015431
69433,DB01296,DRUG_DISEASE_ASSOCIATION,D010024
69434,DB01406,DRUG_DISEASE_ASSOCIATION,D052456
69435,DB00158,DRUG_DISEASE_ASSOCIATION,D000013


In [5]:
# Read the gzip-compressed, tab-separated PPI edge list. The file has no header
# row, so the columns are labelled protein1 / relation / protein2 on load.
ppi_data = pd.read_csv(
    'ppi.txt.gz',
    sep='\t',
    compression='gzip',
    header=None,
    names=['protein1', 'relation', 'protein2'],
)
ppi_data

Unnamed: 0,protein1,relation,protein2
0,P40305,PPI,P55056
1,O76024,PPI,Q68G74
2,Q96AN5,PPI,Q9UHE5
3,O14656,PPI,Q9NX07
4,P06396,PPI,Q8N302
...,...,...,...
131401,P01733,PPI,P19174
131402,Q8N2H9,PPI,Q9HAT8
131403,P10636,PPI,Q8TCE9
131404,P11388,PPI,P35222


In [6]:
# Read the gzip-compressed, tab-separated protein/disease edge list. The file
# has no header row, so the columns are labelled protein / relation / disease.
protein_disease_data = pd.read_csv(
    'protein_disease.txt.gz',
    sep='\t',
    compression='gzip',
    header=None,
    names=['protein', 'relation', 'disease'],
)
protein_disease_data

Unnamed: 0,protein,relation,disease
0,P30279,PROTEIN_DISEASE_ASSOCIATION,D003110
1,P61585,PROTEIN_DISEASE_ASSOCIATION,D017496
2,P17661,PROTEIN_DISEASE_ASSOCIATION,D001281
3,O96017,PROTEIN_DISEASE_ASSOCIATION,D011471
4,P24623,PROTEIN_DISEASE_ASSOCIATION,D001327
...,...,...,...
116672,P01308,PROTEIN_DISEASE_ASSOCIATION,D010292
116673,Q29056,PROTEIN_DISEASE_ASSOCIATION,D009103
116674,Q9ET77,PROTEIN_DISEASE_ASSOCIATION,D015179
116675,P12527,PROTEIN_DISEASE_ASSOCIATION,D001172


In [7]:
# Collect every distinct drug ID that appears in either drug-bearing table,
# re-indexed from 0 so the Series index is contiguous after de-duplication.
drug_entity = (
    pd.concat([dti_data['drug'], drug_disease_data['drug']], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
drug_entity.shape

(9018,)

In [8]:
# Collect every distinct protein ID across the DTI targets, both sides of the
# PPI table, and the protein/disease table; re-indexed from 0 afterwards.
protein_entity = (
    pd.concat(
        [
            dti_data['target'],
            ppi_data['protein1'],
            ppi_data['protein2'],
            protein_disease_data['protein'],
        ],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
protein_entity.shape

(41413,)

In [9]:
# Collect every distinct disease ID that appears in either disease-bearing
# table, re-indexed from 0 after de-duplication.
disease_entity = (
    pd.concat(
        [drug_disease_data['disease'], protein_disease_data['disease']],
        ignore_index=True,
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
disease_entity.shape

(5857,)

In [10]:
# Relabel the columns of each edge table (in place) to the shared
# head / relation / tail schema so the tables can be stacked row-wise.
# NOTE(review): ppi_data is neither renamed nor merged here (its rename was
# commented out originally) — confirm that leaving PPI edges out is intended.
for edge_table in (dti_data, drug_disease_data, protein_disease_data):
    edge_table.columns = ['head', 'relation', 'tail']

# Stack the three tables into one triple list with a fresh 0..n-1 index.
all_data = pd.concat(
    [dti_data, drug_disease_data, protein_disease_data],
    ignore_index=True,
)

# Show the merged shape and the first few rows as a sanity check.
print(all_data.shape)
print(all_data.head())

(216517, 3)
      head   relation    tail
0  DB00001  inhibitor  P00734
1  DB00002     binder  P00533
2  DB00002     binder  O75015
3  DB00002     binder  P02745
4  DB00002     binder  P02746


In [11]:
from sklearn.model_selection import train_test_split

# Stage 1: hold out 15% of all triples; the other 85% becomes the training set.
train_data, remaining_data = train_test_split(
    all_data,
    test_size=0.15,
    random_state=42,
)

# Stage 2: send one third of the hold-out to the test set and keep the rest for
# validation, i.e. roughly 85% / 10% / 5% train / valid / test overall.
val_data, test_data = train_test_split(
    remaining_data,
    test_size=1/3,
    random_state=42,
)

In [12]:
train_data

Unnamed: 0,head,relation,tail
76927,DB00537,DRUG_DISEASE_ASSOCIATION,D005334
73806,DB14085,DRUG_DISEASE_ASSOCIATION,D007333
68436,DB00997,DRUG_DISEASE_ASSOCIATION,D009336
73628,DB00945,DRUG_DISEASE_ASSOCIATION,D007003
109901,P22091,PROTEIN_DISEASE_ASSOCIATION,D054218
...,...,...,...
119879,P81187,PROTEIN_DISEASE_ASSOCIATION,D064419
103694,P14844,PROTEIN_DISEASE_ASSOCIATION,D007333
131932,P07711,PROTEIN_DISEASE_ASSOCIATION,D045169
146867,Q6YJI5,PROTEIN_DISEASE_ASSOCIATION,D005909


In [13]:
val_data

Unnamed: 0,head,relation,tail
151361,Q92734,PROTEIN_DISEASE_ASSOCIATION,C535717
31080,DB00493,DRUG_DISEASE_ASSOCIATION,D008586
174241,P35875,PROTEIN_DISEASE_ASSOCIATION,D000505
37768,DB00650,DRUG_DISEASE_ASSOCIATION,D009202
82330,DB01234,DRUG_DISEASE_ASSOCIATION,D009410
...,...,...,...
184745,Q6PZD9,PROTEIN_DISEASE_ASSOCIATION,D054198
129408,P51671,PROTEIN_DISEASE_ASSOCIATION,D004715
133803,O95661,PROTEIN_DISEASE_ASSOCIATION,D006528
43571,DB12802,DRUG_DISEASE_ASSOCIATION,D009101


In [14]:
test_data

Unnamed: 0,head,relation,tail
101521,Q0V8L6,PROTEIN_DISEASE_ASSOCIATION,D009976
187376,Q6PHW0,PROTEIN_DISEASE_ASSOCIATION,C562770
145021,Q9VH08,PROTEIN_DISEASE_ASSOCIATION,D006332
146689,Q15714,PROTEIN_DISEASE_ASSOCIATION,D002471
14745,DB03222,unknown,P07071
...,...,...,...
149280,P17917,PROTEIN_DISEASE_ASSOCIATION,D003110
7329,DB00857,substrate,P11712
87663,DB00531,DRUG_DISEASE_ASSOCIATION,D004802
196069,Q07817,PROTEIN_DISEASE_ASSOCIATION,D013274


In [15]:
import os

# Write the three splits as tab-separated triple files with no header row and
# no index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; exist_ok=True leaves an existing folder untouched.
os.makedirs('../data/PharmRG', exist_ok=True)
train_data.to_csv('../data/PharmRG/train', sep='\t', index=False, header=False)
val_data.to_csv('../data/PharmRG/valid', sep='\t', index=False, header=False)
test_data.to_csv('../data/PharmRG/test', sep='\t', index=False, header=False)