In [5]:
!pip install lxml
!pip install tqdm
from lxml import etree
import gzip
from collections import Counter
from tqdm import tqdm

# Path of the DBLP XML dump read by every cell below (relative to the
# notebook's working directory). context_iter() also handles a path ending in
# ".gz" by opening it through gzip.
dblp_path = "data/dblp.xml"

def context_iter(path):
    """Return an lxml ``iterparse`` stream of "end" events for an XML file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the XML document. When the name ends in ``.gz`` the file
        is opened through ``gzip`` and parsed from the decompressed stream.

    Returns
    -------
    The object returned by ``etree.iterparse``; iterating it yields
    ``(event, element)`` pairs, with ``event`` always ``"end"``.

    Notes
    -----
    * ``load_dtd=True`` asks lxml to load the document's DTD (presumably so
      the named character entities used by DBLP resolve -- TODO confirm the
      DTD file is shipped next to the XML).
    * For ``.gz`` input the gzip handle is not closed explicitly here; it is
      only released once the returned iterator is garbage collected.
    """
    import os  # local import: keeps this cell independent of other cells' imports

    # Accept pathlib.Path (and any other os.PathLike) as well as plain strings;
    # the bare ``path.endswith`` call would raise AttributeError for those.
    path = os.fspath(path)
    source = gzip.open(path) if path.endswith(".gz") else path
    return etree.iterparse(source, events=("end",), load_dtd=True)



In [6]:
# Count the booktitles of <inproceedings> records whose booktitle mentions
# "biomedical" or "health informatics" (case-insensitive match, original
# spelling kept as the counter key).
booktitle_counter = Counter()

for _, elem in tqdm(context_iter(dblp_path)):
    if elem.tag == "inproceedings":
        for child in elem:
            if child.tag == "booktitle" and child.text:
                text = child.text.lower()
                if "biomedical" in text or "health informatics" in text:
                    booktitle_counter[child.text] += 1

    # Release memory only once a complete top-level record (a direct child of
    # the root element) has ended. "end" events fire for children before their
    # parent, so clearing every element would wipe each <booktitle>'s text
    # before the enclosing <inproceedings> is inspected, leaving the counter
    # permanently empty.
    parent = elem.getparent()
    if parent is not None and parent.getparent() is None:
        elem.clear()
        # clear() leaves the emptied element attached to the root; drop the
        # already-processed siblings so the root does not grow without bound.
        while elem.getprevious() is not None:
            del parent[0]

110550387it [01:21, 1350406.03it/s]


In [11]:
from xml.etree.ElementTree import tostring

# Dump the raw XML of the first 100 <article> / <inproceedings> records,
# each followed by a separator line, to get a feel for the record layout.
count = 0
for _, elem in context_iter(dblp_path):
    if elem.tag == "article" or elem.tag == "inproceedings":
        # One print call emitting the record and the 80-character rule on
        # consecutive lines.
        print(tostring(elem, encoding="unicode"), "=" * 80, sep="\n")
        count += 1
    # Checked for every parsed element; stops as soon as 100 records were shown.
    if count == 100:
        break

<article mdate="2017-06-08" key="dblpnote/neverpublished" publtype="informal">
<title>(was never published)</title>
</article>
<article mdate="2017-06-08" key="dblpnote/error" publtype="informal">
<title>(error)</title>
</article>
<article mdate="2017-06-08" key="dblpnote/ellipsis" publtype="informal">
<title>…</title>
</article>
<inproceedings mdate="2022-10-02" key="series/sapere/Freed13">
<author orcid="0000-0002-4182-3228">Sam Freed</author>
<title>Practical Introspection as Inspiration for AI.</title>
<pages>167-177</pages>
<year>2011</year>
<booktitle>PT-AI</booktitle>
<ee>https://doi.org/10.1007/978-3-642-31674-6_12</ee>
<crossref>series/sapere/2013-5</crossref>
<url>db/series/sapere/sapere5.html#Freed13</url>
</inproceedings>
<inproceedings mdate="2023-03-21" key="series/sapere/Steiner13">
<author orcid="0000-0002-0041-5973">Pierre Steiner</author>
<title>C.S. Peirce and Artificial Intelligence: Historical Heritage and (New) Theoretical Stakes.</title>
<pages>265-276</pages>
<y

In [13]:
# Show the first 3 <inproceedings> records whose DBLP key starts with
# "conf/bhi", printing the key, the raw XML and a separator line.
count = 0
for _, elem in context_iter(dblp_path):
    if elem.tag == "inproceedings":
        key = elem.attrib.get("key", "")
        if key.startswith("conf/bhi"):
            print("KEY:", key)
            print(tostring(elem, encoding="unicode"))
            print("=" * 80)
            count += 1
            # count only changes here, so this is the only place the stop
            # condition can become true.
            if count == 3:
                break

    # The matching records may sit far into the dump; without releasing the
    # elements already parsed, the in-memory tree keeps growing for the whole
    # scan. Only finished top-level records (direct children of the root) are
    # cleared, so children are still intact when their parent record ends.
    parent = elem.getparent()
    if parent is not None and parent.getparent() is None:
        elem.clear()
        # Detach the already-processed siblings that are still attached to the root.
        while elem.getprevious() is not None:
            del parent[0]

KEY: conf/bhi/LiWWXW17
<inproceedings mdate="2018-11-28" key="conf/bhi/LiWWXW17">
<author orcid="0000-0002-3398-9738">Danping Li</author>
<author>Lei Wang 0079</author>
<author>Jiajun Wang</author>
<author>Zhong Xue</author>
<author>Stephen T. C. Wong</author>
<title>Transductive local fisher discriminant analysis for gene expression profile-based cancer classification.</title>
<pages>49-52</pages>
<year>2017</year>
<booktitle>BHI</booktitle>
<ee>https://doi.org/10.1109/BHI.2017.7897202</ee>
<crossref>conf/bhi/2017</crossref>
<url>db/conf/bhi/bhi2017.html#LiWWXW17</url>
</inproceedings>
KEY: conf/bhi/HoutteGSZ21
<inproceedings mdate="2025-04-01" key="conf/bhi/HoutteGSZ21">
<author>Jeroen Van Houtte</author>
<author>Xiaoru Gao</author>
<author orcid="0000-0003-4225-2487">Jan Sijbers</author>
<author>Guoyan Zheng</author>
<title>2D/3D Registration with a Statistical Deformation Model Prior Using Deep Learning.</title>
<pages>1-4</pages>
<year>2021</year>
<booktitle>BHI</booktitle>
<ee>ht

In [19]:
# Show the first 3 <article> records whose DBLP key starts with
# "journals/titb", printing the key, the raw XML and a separator line.
count = 0
for _, elem in context_iter(dblp_path):
    if elem.tag == "article":
        key = elem.attrib.get("key", "")
        if key.startswith("journals/titb"):
            print("KEY:", key)
            print(tostring(elem, encoding="unicode"))
            print("=" * 80)
            count += 1
            # count only changes here, so this is the only place the stop
            # condition can become true.
            if count == 3:
                break

    # The matching records may sit far into the dump; without releasing the
    # elements already parsed, the in-memory tree keeps growing for the whole
    # scan. Only finished top-level records (direct children of the root) are
    # cleared, so children are still intact when their parent record ends.
    parent = elem.getparent()
    if parent is not None and parent.getparent() is None:
        elem.clear()
        # Detach the already-processed siblings that are still attached to the root.
        while elem.getprevious() is not None:
            del parent[0]

KEY: journals/titb/DoustyZ21
<article mdate="2021-06-01" key="journals/titb/DoustyZ21">
<author orcid="0000-0001-6738-1651">Mehdy Dousty</author>
<author orcid="0000-0002-8842-745X">José Zariffa</author>
<title>Tenodesis Grasp Detection in Egocentric Video.</title>
<pages>1463-1470</pages>
<year>2021</year>
<volume>25</volume>
<journal>IEEE J. Biomed. Health Informatics</journal>
<number>5</number>
<ee>https://doi.org/10.1109/JBHI.2020.3003643</ee>
<ee>https://www.wikidata.org/entity/Q98200805</ee>
<url>db/journals/titb/titb25.html#DoustyZ21</url>
</article>
KEY: journals/titb/ZhuLHG21a
<article mdate="2022-03-16" key="journals/titb/ZhuLHG21a">
<author orcid="0000-0002-9782-3470">Taiyu Zhu</author>
<author orcid="0000-0003-3073-3128">Kezhi Li</author>
<author orcid="0000-0002-7088-5807">Pau Herrero</author>
<author orcid="0000-0003-2476-3857">Pantelis Georgiou</author>
<title>Deep Learning for Diabetes: A Systematic Review.</title>
<pages>2744-2757</pages>
<year>2021</year>
<volume>25<