In [1]:
import csv #用于读写csv文件
import collections #用于高效实现数据类型
import xml.etree.ElementTree as ET #用于处理xml文件
import pandas #用于处理数据

In [2]:
#读取xml文件
xml_path = 'download/full database.xml' #xml文件路径
with open(xml_path) as f: #打开xml文件
    tree = ET.parse(f) #解析xml文件
root = tree.getroot() #获取根节点

提取药物信息

In [3]:
name1='{http://www.drugbank.ca}'    #命名空间

#提取xml文件中的数据
rows = list()
for drug in root:
    row = collections.OrderedDict()
    assert drug.tag == name1 + 'drug'
    row['id'] = drug.findtext(name1 + 'drugbank-id[@primary="true"]')
    row['name'] = drug.findtext(name1 + 'name')
    row['type'] = drug.get('type')
    row['groups'] = [group.text for group in 
        drug.findall("{ns}groups/{ns}group".format(ns = name1))]
    row['atc_codes']= [code.get('code') for code in
        drug.findall("{ns}atc-codes/{ns}atc-code".format(ns = name1))] 
    row['categories'] = [x.findtext(name1 + 'category') for x in
        drug.findall("{ns}categories/{ns}category".format(ns = name1))]
    row['smiles'] = drug.findtext( "{ns}calculated-properties/{ns}property[{ns}kind='SMILES']/{ns}value".format(ns = name1))
    row['pubmed_id'] = drug.findtext( "{ns}general-references/{ns}articles/{ns}article/{ns}pubmed-id".format(ns=name1))
    row['synonyms'] = [synonym.text for synonym in
        drug.findall("{ns}synonyms/{ns}synonym".format(ns=name1))]
    row['inchi'] = drug.findtext( "{ns}calculated-properties/{ns}property[{ns}kind='InChI']/{ns}value".format(ns=name1))
    row['inchikey'] = drug.findtext( "{ns}calculated-properties/{ns}property[{ns}kind='InChIKey']/{ns}value".format(ns=name1))
    row['description'] = drug.findtext(name1 + 'description')
    
    rows.append(row)

In [4]:
# 使用'|'合并多个值到一格中
def collapse_list_values(row):
    for key, value in row.items():
        if isinstance(value, list):
            row[key] = '|'.join(value)
    return row

rows = list(map(collapse_list_values, rows))

In [5]:
drug_df = pandas.DataFrame(rows, columns=row.keys())
drug_df.head()

Unnamed: 0,id,name,type,groups,atc_codes,categories,smiles,pubmed_id,synonyms,inchi,inchikey,description
0,DB00001,Lepirudin,biotech,approved|withdrawn,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,16244762,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...",,,Lepirudin is a recombinant hirudin formed by 6...
1,DB00002,Cetuximab,biotech,approved,L01FE01,"Amino Acids, Peptides, and Proteins|Antibodies...",,11752352,Cetuximab|Cétuximab|Cetuximabum,,,Cetuximab is a recombinant chimeric human/mous...
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,8792953,Deoxyribonuclease (human clone 18-1 protein mo...,,,Dornase alfa is a biosynthetic form of human d...
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,17187516,Denileukin|Denileukin diftitox|Interleukin-2/d...,,,A recombinant DNA-derived cytotoxic protein co...
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,27463856,Etanercept|etanercept-szzs|etanercept-ykro|Rec...,,,Dimeric fusion protein consisting of the extra...


In [6]:
#增加一列datasource数值全为drugbank
drug_df['datasource'] = 'drugbank'
drug_df.head()

Unnamed: 0,id,name,type,groups,atc_codes,categories,smiles,pubmed_id,synonyms,inchi,inchikey,description,datasource
0,DB00001,Lepirudin,biotech,approved|withdrawn,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,16244762,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...",,,Lepirudin is a recombinant hirudin formed by 6...,drugbank
1,DB00002,Cetuximab,biotech,approved,L01FE01,"Amino Acids, Peptides, and Proteins|Antibodies...",,11752352,Cetuximab|Cétuximab|Cetuximabum,,,Cetuximab is a recombinant chimeric human/mous...,drugbank
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,8792953,Deoxyribonuclease (human clone 18-1 protein mo...,,,Dornase alfa is a biosynthetic form of human d...,drugbank
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,17187516,Denileukin|Denileukin diftitox|Interleukin-2/d...,,,A recombinant DNA-derived cytotoxic protein co...,drugbank
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,27463856,Etanercept|etanercept-szzs|etanercept-ykro|Rec...,,,Dimeric fusion protein consisting of the extra...,drugbank


In [7]:
drug_slim_df = drug_df[
    drug_df.groups.map(lambda x: 'approved' in x)
]
drug_slim_df.head()

Unnamed: 0,id,name,type,groups,atc_codes,categories,smiles,pubmed_id,synonyms,inchi,inchikey,description,datasource
0,DB00001,Lepirudin,biotech,approved|withdrawn,B01AE02,"Amino Acids, Peptides, and Proteins|Anticoagul...",,16244762,"[Leu1, Thr2]-63-desulfohirudin|Desulfatohirudi...",,,Lepirudin is a recombinant hirudin formed by 6...,drugbank
1,DB00002,Cetuximab,biotech,approved,L01FE01,"Amino Acids, Peptides, and Proteins|Antibodies...",,11752352,Cetuximab|Cétuximab|Cetuximabum,,,Cetuximab is a recombinant chimeric human/mous...,drugbank
2,DB00003,Dornase alfa,biotech,approved,R05CB13,"Amino Acids, Peptides, and Proteins|Cough and ...",,8792953,Deoxyribonuclease (human clone 18-1 protein mo...,,,Dornase alfa is a biosynthetic form of human d...,drugbank
3,DB00004,Denileukin diftitox,biotech,approved|investigational,L01XX29,"ADP Ribose Transferases|Amino Acids, Peptides,...",,17187516,Denileukin|Denileukin diftitox|Interleukin-2/d...,,,A recombinant DNA-derived cytotoxic protein co...,drugbank
4,DB00005,Etanercept,biotech,approved|investigational,L04AB01,"Agents reducing cytokine levels|Amino Acids, P...",,27463856,Etanercept|etanercept-szzs|etanercept-ykro|Rec...,,,Dimeric fusion protein consisting of the extra...,drugbank


In [8]:
drug_df.to_csv('data/drug.csv', index=False)
drug_slim_df.to_csv('data/Approved_drug.csv', index=False)

提取靶点信息

In [9]:
target_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(name1 + "drugbank-id[@primary='true']")
    for category in ['target', 'enzyme', 'carrier', 'transporter']:
        proteins = drug.findall('{ns}{cat}s/{ns}{cat}'.format(ns=name1, cat=category))
        for protein in proteins:
            row['target_id'] = protein.findtext('{}id'.format(name1))
            row['target_name'] = protein.findtext('{}name'.format(name1))
            row = {'drugbank_id': drugbank_id, 'category': category}
            row['organism'] = protein.findtext('{}organism'.format(name1))
            row['known_action'] = protein.findtext('{}known-action'.format(name1))
            actions = protein.findall('{ns}actions/{ns}action'.format(ns=name1))
            row['actions'] = '|'.join(action.text for action in actions)
            row['pubmed_id'] = protein.findtext('{ns}references/{ns}articles/{ns}article/{ns}pubmed-id'.format(ns=name1))
            uniprot_ids = [polypep.text for polypep in protein.findall(
                "{ns}polypeptide/{ns}external-identifiers/{ns}external-identifier[{ns}resource='UniProtKB']/{ns}identifier".format(ns=name1))]            
            if len(uniprot_ids) != 1:
                continue
            row['uniprot_id'] = uniprot_ids[0]
            target_rows.append(row)

target_df1 = pandas.DataFrame.from_dict(target_rows)

In [10]:
target_df1

Unnamed: 0,drugbank_id,category,organism,known_action,actions,pubmed_id,uniprot_id,target_id,target_name
0,DB00001,target,Humans,yes,inhibitor,10505536,P00734,BE0000767,Epidermal growth factor receptor
1,DB00002,target,Humans,yes,binder,11752352,P00533,BE0000901,Low affinity immunoglobulin gamma Fc region re...
2,DB00002,target,Humans,unknown,binder,16336752,O75015,BE0002094,Complement C1q subcomponent subunit A
3,DB00002,target,Humans,unknown,binder,32117299,P02745,BE0002095,Complement C1q subcomponent subunit B
4,DB00002,target,Humans,unknown,binder,32117299,P02746,BE0002096,Complement C1q subcomponent subunit C
...,...,...,...,...,...,...,...,...,...
28320,DB17083,transporter,Humans,no,substrate,,Q9UNQ0,BE0001066,Solute carrier family 22 member 6
28321,DB17083,transporter,Humans,no,inhibitor,,Q4U2R8,BE0000968,Histamine H3 receptor
28322,DB17087,target,Humans,yes,antagonist,28424564,Q9Y5N1,BE0010203,Plasmepsin X
28323,DB17096,target,Plasmodium falciparum (isolate 3D7),yes,inhibitor|binder,36216349,Q8IAS0,BE0010204,Plasmepsin X


In [11]:
#重新排列列的顺序
target_df = target_df1[['target_id', 'target_name','drugbank_id', 'category', 'organism', 'known_action', 'actions','pubmed_id', 'uniprot_id']]
#改变列的名称
target_df.columns = ['id', 'target_name','drugbank_id', 'category', 'organism', 'known_action', 'actions', 'pubmed_id','uniprot_id']
target_df.to_csv('data/targetdf.csv', index=False)
target_df

Unnamed: 0,id,target_name,drugbank_id,category,organism,known_action,actions,pubmed_id,uniprot_id
0,BE0000767,Epidermal growth factor receptor,DB00001,target,Humans,yes,inhibitor,10505536,P00734
1,BE0000901,Low affinity immunoglobulin gamma Fc region re...,DB00002,target,Humans,yes,binder,11752352,P00533
2,BE0002094,Complement C1q subcomponent subunit A,DB00002,target,Humans,unknown,binder,16336752,O75015
3,BE0002095,Complement C1q subcomponent subunit B,DB00002,target,Humans,unknown,binder,32117299,P02745
4,BE0002096,Complement C1q subcomponent subunit C,DB00002,target,Humans,unknown,binder,32117299,P02746
...,...,...,...,...,...,...,...,...,...
28320,BE0001066,Solute carrier family 22 member 6,DB17083,transporter,Humans,no,substrate,,Q9UNQ0
28321,BE0000968,Histamine H3 receptor,DB17083,transporter,Humans,no,inhibitor,,Q4U2R8
28322,BE0010203,Plasmepsin X,DB17087,target,Humans,yes,antagonist,28424564,Q9Y5N1
28323,BE0010204,Plasmepsin X,DB17096,target,Plasmodium falciparum (isolate 3D7),yes,inhibitor|binder,36216349,Q8IAS0


In [12]:
#增加一列datasource数值全为drugbank,新建一个df删除drugbank_id列
target_df['datasource'] = 'drugbank'
target_df2=target_df.drop('drugbank_id',axis=1)
target_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_df['datasource'] = 'drugbank'


Unnamed: 0,id,target_name,category,organism,known_action,actions,pubmed_id,uniprot_id,datasource
0,BE0000767,Epidermal growth factor receptor,target,Humans,yes,inhibitor,10505536,P00734,drugbank
1,BE0000901,Low affinity immunoglobulin gamma Fc region re...,target,Humans,yes,binder,11752352,P00533,drugbank
2,BE0002094,Complement C1q subcomponent subunit A,target,Humans,unknown,binder,16336752,O75015,drugbank
3,BE0002095,Complement C1q subcomponent subunit B,target,Humans,unknown,binder,32117299,P02745,drugbank
4,BE0002096,Complement C1q subcomponent subunit C,target,Humans,unknown,binder,32117299,P02746,drugbank


In [13]:
#输出target.csv文件
target_df2.to_csv('data/target.csv', index=False)

提取药物-靶点关系信息

In [14]:
#提取出target_df中的id和drugbank_id信息
drug_target_df=target_df[['drugbank_id','id']]
drug_target_df.columns = ['src_vid','dst_vid']
drug_target_df.head()

Unnamed: 0,src_vid,dst_vid
0,DB00001,BE0000767
1,DB00002,BE0000901
2,DB00002,BE0002094
3,DB00002,BE0002095
4,DB00002,BE0002096


In [15]:
#输出drug_target_df到csv文件
drug_target_df.to_csv('drug_target.csv', index=False)

提取pathway信息

In [16]:
#提取pathway信息
pathway_rows = list()
for i, drug in enumerate(root):
    drugbank_id = drug.findtext(name1 + "drugbank-id[@primary='true']")
    pathways = drug.findall('{ns}pathways/{ns}pathway'.format(ns=name1))
    for pathway in pathways:
        row = {'drugbank_id': drugbank_id}
        row['smpdb_id'] =pathway.findtext('{}smpdb-id'.format(name1))
        row['pathway_name'] = pathway.findtext('{}name'.format(name1))
        row['pathway_category'] = pathway.findtext('{}category'.format(name1))
        pathway_rows.append(row)

pathway_df = pandas.DataFrame.from_dict(pathway_rows)

In [17]:
#加一个datasource列
pathway_df['datasource']='drugbank'
pathway_df

Unnamed: 0,drugbank_id,smpdb_id,pathway_name,pathway_category,datasource
0,DB00001,SMP0000278,Lepirudin Action Pathway,drug_action,drugbank
1,DB00002,SMP0000474,Cetuximab Action Pathway,drug_action,drugbank
2,DB00006,SMP0000277,Bivalirudin Action Pathway,drug_action,drugbank
3,DB00009,SMP0000280,Alteplase Action Pathway,drug_action,drugbank
4,DB00013,SMP0000284,Urokinase Action Pathway,drug_action,drugbank
...,...,...,...,...,...
3775,DB13679,SMP0056811,Dexchlorpheniramine H1-Antihistamine Action,drug_action,drugbank
3776,DB13711,SMP0062895,Tritoqualine H1-Antihistamine Action,drug_action,drugbank
3777,DB13808,SMP0061052,Mebhydrolin H1-Antihistamine Action,drug_action,drugbank
3778,DB13820,SMP0059738,Oxomemazine H1-Antihistamine Action,drug_action,drugbank


In [19]:
#提取信息
drug_pathway_df=pathway_df[['drugbank_id','smpdb_id']]
drug_pathway_df.columns = ['src_vid','dst_vid']
drug_pathway_df2=pathway_df[['smpdb_id','pathway_name','pathway_category','datasource']]
drug_pathway_df2.columns = ['id','pathway_name','pathway_category','datasource']

#输出pathway
drug_pathway_df.to_csv('drug-pathway.csv', index=False)
drug_pathway_df2.to_csv('pathway.csv', index=False)