In [1]:
import xml.etree.ElementTree as ET
import pandas as pd

# Load the DrugBank XML export into an ElementTree.
# The path is relative, so it is resolved against the current working directory.
tree = ET.parse("2024 drugbank full database.xml")

# Grab the root element of the parsed document; the cells below iterate over its
# <drug> children.
root = tree.getroot()

In [2]:
# Drug
#
# One row per <drug> element: the primary DrugBank id plus a handful of scalar
# child fields, written to Drug.csv.

# (CSV column, child path) pairs; the order here is the column order of the CSV.
drug_fields = [
    ("drugbank_id", "{http://www.drugbank.ca}drugbank-id[@primary='true']"),
    ("name", "{http://www.drugbank.ca}name"),
    ("description", "{http://www.drugbank.ca}description"),
    ("cas_number", "{http://www.drugbank.ca}cas-number"),
    ("average_mass", "{http://www.drugbank.ca}average-mass"),
    ("monoisotopic_mass", "{http://www.drugbank.ca}monoisotopic-mass"),
    ("state", "{http://www.drugbank.ca}state"),
]

# Walk the <drug> children of the root and look every field up once per drug.
drug_rows = []
for drug_elem in root.findall("{http://www.drugbank.ca}drug"):
    row = {}
    for column, path in drug_fields:
        node = drug_elem.find(path)
        # A missing child is stored as None (NULL).
        row[column] = None if node is None else node.text
    drug_rows.append(row)

# Assemble the table in one step, fixing the column order explicitly.
drug = pd.DataFrame(drug_rows, columns=[column for column, _ in drug_fields])

# Write the CSV without the index column.
drug.to_csv("Drug.csv", index=False)

In [3]:
# Calculated_properties
#
# One row per <property> found under a drug's <calculated-properties>, keyed by
# the drug's primary DrugBank id, written to Calculated_properties.csv.

cp_rows = []
for drug_elem in root.findall("{http://www.drugbank.ca}drug"):
    primary_id = drug_elem.find("{http://www.drugbank.ca}drugbank-id[@primary='true']")
    props_parent = drug_elem.find("{http://www.drugbank.ca}calculated-properties")

    # <calculated-properties> may be absent for a drug; such drugs add no rows.
    if props_parent is None:
        continue

    for prop in props_parent.findall("{http://www.drugbank.ca}property"):
        nodes = (
            primary_id,
            prop.find("{http://www.drugbank.ca}kind"),
            prop.find("{http://www.drugbank.ca}value"),
            prop.find("{http://www.drugbank.ca}source"),
        )
        # A missing element is stored as None (NULL).
        cp_rows.append(tuple(None if node is None else node.text for node in nodes))

# Build the table in one step; the column order matches the tuple order above.
cp = pd.DataFrame(cp_rows, columns=["drugbank_id", "kind", "value", "source"])

# Write the CSV without the index column.
cp.to_csv("Calculated_properties.csv", index=False)

In [4]:
# Reactions
#
# One row per <reaction> under a drug's <reactions>, with the reaction sequence
# and the id/name of its left and right elements, written to Reactions.csv.


def _reaction_field(parent, path):
    """Return the text of ``parent.find(path)``.

    Returns None when ``parent`` itself is None or when it has no matching
    child, so a missing <left-element>/<right-element> yields NULL columns
    instead of raising AttributeError.
    """
    if parent is None:
        return None
    node = parent.find(path)
    return None if node is None else node.text


reaction_rows = []
for drug_elem in root.findall("{http://www.drugbank.ca}drug"):
    primary_id = drug_elem.find("{http://www.drugbank.ca}drugbank-id[@primary='true']")
    reactions = drug_elem.find("{http://www.drugbank.ca}reactions")

    # find() returns None when the element is absent; skip such drugs rather
    # than failing on reactions.findall (same guard as the
    # Calculated_properties export).
    if reactions is None:
        continue

    for reaction in reactions.findall("{http://www.drugbank.ca}reaction"):
        left_element = reaction.find("{http://www.drugbank.ca}left-element")
        right_element = reaction.find("{http://www.drugbank.ca}right-element")

        # A missing value is stored as None (NULL).
        reaction_rows.append({
            "drugbank-id": None if primary_id is None else primary_id.text,
            "sequence": _reaction_field(reaction, "{http://www.drugbank.ca}sequence"),
            "left_element_id": _reaction_field(left_element, "{http://www.drugbank.ca}drugbank-id"),
            "left_element_name": _reaction_field(left_element, "{http://www.drugbank.ca}name"),
            "right_element_id": _reaction_field(right_element, "{http://www.drugbank.ca}drugbank-id"),
            "right_element_name": _reaction_field(right_element, "{http://www.drugbank.ca}name"),
        })

# NOTE(review): this header is "drugbank-id" (hyphen) while the other exports in
# this notebook use "drugbank_id"; kept unchanged so existing consumers of
# Reactions.csv keep working -- confirm before normalising.
ra = pd.DataFrame(
    reaction_rows,
    columns=[
        "drugbank-id",
        "sequence",
        "left_element_id",
        "left_element_name",
        "right_element_id",
        "right_element_name",
    ],
)

# Write the CSV without the index column.
ra.to_csv("Reactions.csv", index=False)


In [5]:
# General_reference
#
# Two exports from each drug's <general-references>:
#   * General_reference_article.csv -- one row per <article>
#   * General_reference_link.csv    -- one row per <link>
# Both are keyed by the drug's primary DrugBank id.

article_rows = []
link_rows = []

for drug_elem in root.findall("{http://www.drugbank.ca}drug"):
    primary_id = drug_elem.find("{http://www.drugbank.ca}drugbank-id[@primary='true']")
    drug_id = None if primary_id is None else primary_id.text

    general_references = drug_elem.find("{http://www.drugbank.ca}general-references")
    # find() returns None when the element is absent; skip such drugs instead of
    # failing with AttributeError on the lookups below.
    if general_references is None:
        continue

    articles = general_references.find("{http://www.drugbank.ca}articles")
    links = general_references.find("{http://www.drugbank.ca}links")

    # Article rows; a missing <articles> container simply contributes nothing.
    if articles is not None:
        for article in articles.findall("{http://www.drugbank.ca}article"):
            nodes = (
                article.find("{http://www.drugbank.ca}ref-id"),
                article.find("{http://www.drugbank.ca}pubmed-id"),
                article.find("{http://www.drugbank.ca}citation"),
            )
            # A missing element is stored as None (NULL).
            article_rows.append([drug_id] + [None if node is None else node.text for node in nodes])

    # Link rows; a missing <links> container simply contributes nothing.
    if links is not None:
        for link in links.findall("{http://www.drugbank.ca}link"):
            nodes = (
                link.find("{http://www.drugbank.ca}ref-id"),
                link.find("{http://www.drugbank.ca}title"),
                link.find("{http://www.drugbank.ca}url"),
            )
            link_rows.append([drug_id] + [None if node is None else node.text for node in nodes])

# Articles table, written without the index column.
ac = pd.DataFrame(article_rows, columns=["drugbank_id", "ref_id_a", "pubmed_id", "citation"])
ac.to_csv("General_reference_article.csv", index=False)

# Links table, written without the index column.
li = pd.DataFrame(link_rows, columns=["drugbank_id", "ref_id_l", "title", "url"])
li.to_csv("General_reference_link.csv", index=False)

In [7]:
# Patents
#
# One row per <patent> under a drug's <patents>, keyed by the drug's primary
# DrugBank id, written to Patents.csv.

patent_rows = []
for drug_elem in root.findall("{http://www.drugbank.ca}drug"):
    primary_id = drug_elem.find("{http://www.drugbank.ca}drugbank-id[@primary='true']")
    patents = drug_elem.find("{http://www.drugbank.ca}patents")

    # find() returns None when the element is absent; skip such drugs instead of
    # failing with AttributeError on patents.findall.
    if patents is None:
        continue

    for patent in patents.findall("{http://www.drugbank.ca}patent"):
        # Order matches the CSV columns declared below.
        nodes = (
            primary_id,
            patent.find("{http://www.drugbank.ca}number"),
            patent.find("{http://www.drugbank.ca}country"),
            patent.find("{http://www.drugbank.ca}approved"),
            patent.find("{http://www.drugbank.ca}expires"),
            patent.find("{http://www.drugbank.ca}pediatric-extension"),
        )
        # A missing element is stored as None (NULL).
        patent_rows.append([None if node is None else node.text for node in nodes])

# Build the table in one step with an explicit column order.
Patent = pd.DataFrame(
    patent_rows,
    columns=[
        "drugbank_id",
        "patent_number",
        "country",
        "approved",
        "expires",
        "pediatric_extension",
    ],
)

# Write the CSV without the index column.
Patent.to_csv("Patents.csv", index=False)