In [21]:
import requests
from bs4 import BeautifulSoup
from requests_html import AsyncHTMLSession
from prettyprinter import pprint
import bibtexparser
import bib as bibHelper
from bibtexparser.bwriter import BibTexWriter

CDS URLs

In [2]:
url = "https://cds.cern.ch/record/2871986"
data = requests.get(url, timeout=20).text
soup = BeautifulSoup(data, "html.parser")

In [12]:
def get_cds_bibtext(cds_id: int):
    """Get the BibTeX entry from CDS."""
    url = f"https://cds.cern.ch/record/{cds_id}/export/hx"
    data = requests.get(url, timeout=20).text
    soup = BeautifulSoup(data, "html.parser")
    bib_text = soup.find("pre").get_text()
    # removing whitespaces at the beiginning of a line.
    # and convert double quotes to brakets.
    # bib_text = "\n".join([" " + line.strip().replace('"', "{").replace('"', "}") for line in bib_text.split("\n")])
    return bib_text

In [13]:
bib_text = get_cds_bibtext(2871986)

In [14]:
print(bib_text)


@techreport{Caillou:2871986,
      author        = "Caillou, Sylvain and Calafiura, Paolo and Farrell, Steven
                       Andrew and Ju, Xiangyang and Murnane, Daniel Thomas and
                       Pham, Minh Tuan and Rougier, Charline and Stark, Jan and
                       Vallier, Alexis",
      collaboration = "ATLAS",
      title         = "{Physics Performance of the ATLAS GNN4ITk Track
                       Reconstruction Chain}",
      institution   = "CERN",
      reportNumber  = "ATL-SOFT-PROC-2023-038",
      address       = "Geneva",
      year          = "2023",
      url           = "https://cds.cern.ch/record/2871986",
}



In [19]:
authors = bibHelper.get_author_list(bib_text)

In [20]:
authors

['Sylvain Caillou',
 'Paolo Calafiura',
 'Steven\nAndrew Farrell',
 'Xiangyang Ju',
 'Daniel Thomas Murnane',
 'Minh Tuan Pham',
 'Charline Rougier',
 'Jan Stark',
 'Alexis Vallier']

In [22]:
bib_writer = BibTexWriter()
new_bib_tex = bib_writer.write(bibtexparser.loads(bib_text))

In [23]:
print(new_bib_tex)

@techreport{Caillou:2871986,
 address = {Geneva},
 author = {Caillou, Sylvain and Calafiura, Paolo and Farrell, Steven
Andrew and Ju, Xiangyang and Murnane, Daniel Thomas and
Pham, Minh Tuan and Rougier, Charline and Stark, Jan and
Vallier, Alexis},
 collaboration = {ATLAS},
 institution = {CERN},
 reportnumber = {ATL-SOFT-PROC-2023-038},
 title = {{Physics Performance of the ATLAS GNN4ITk Track
Reconstruction Chain}},
 url = {https://cds.cern.ch/record/2871986},
 year = {2023}
}



In [21]:
table = soup.find_all("table")[1]
table_content = {}
if table:
    rows = table.find_all("tr")
    for ridx, row in enumerate(rows):
        cells = row.find_all("td")
        if len(cells) != 2:
            continue
        key = cells[0].get_text().strip()
        value = cells[1].get_text()
        if value:
            table_content[key] = value.strip()
        else:
            table_content[key] = "N/A"

In [22]:
pprint(table_content)

{
    'Report number': 'ATL-SOFT-PROC-2023-038',
    'Title': 'Physics Performance of the ATLAS GNN4ITk Track Reconstruction Chain',
    'Author(s)':
        'Caillou, Sylvain (Centre National de la Recherche Scientifique (FR)) '
        '; Calafiura, Paolo (Lawrence Berkeley National Lab. (US)) ; Farrell, '
        'Steven Andrew ; Ju, Xiangyang (Lawrence Berkeley National Lab. (US)) '
        '; Murnane, Daniel Thomas (Lawrence Berkeley National Lab. (US)) ; '
        'Pham, Minh Tuan (University of Wisconsin Madison (US)) ; Rougier, '
        'Charline (Centre National de la Recherche Scientifique (FR)) ; '
        'Stark, Jan (Centre National de la Recherche Scientifique (FR)) ; '
        'Vallier, Alexis (Centre National de la Recherche Scientifique (FR))',
    'Corporate \nAuthor(s)': 'The ATLAS collaboration',
    'Collaboration': 'ATLAS Collaboration',
    'Imprint': '21 Sep 2023. - 7 p.',
    'In:':
        '26th International Conference on Computing in High Energy & Nuclear '

In [23]:
def remove_instiution_name(authors: str):
    """Remove the institution name in parenthesis from the authors' names."""
    authors = authors.split(";")
    authors = [author.split("(")[0].strip() for author in authors]

In [27]:
bib_url = "https://cds.cern.ch/record/2871986/export/hx"
bib_data = requests.get(bib_url, timeout=20).text
bib_soup = BeautifulSoup(bib_data, "html.parser")
bib_text = bib_soup.find("pre").get_text()

In [29]:
print(bib_text)


@techreport{Caillou:2871986,
      author        = "Caillou, Sylvain and Calafiura, Paolo and Farrell, Steven
                       Andrew and Ju, Xiangyang and Murnane, Daniel Thomas and
                       Pham, Minh Tuan and Rougier, Charline and Stark, Jan and
                       Vallier, Alexis",
      collaboration = "ATLAS",
      title         = "{Physics Performance of the ATLAS GNN4ITk Track
                       Reconstruction Chain}",
      institution   = "CERN",
      reportNumber  = "ATL-SOFT-PROC-2023-038",
      address       = "Geneva",
      year          = "2023",
      url           = "https://cds.cern.ch/record/2871986",
}



In [30]:
def get_cds_bibtext(cds_id: int):
    """Get the BibTeX entry from CDS."""
    url = f"https://cds.cern.ch/record/{cds_id}/export/hx"
    data = requests.get(url, timeout=20).text
    soup = BeautifulSoup(data, "html.parser")
    bib_text = soup.find("pre").get_text()
    return bib_text

In [25]:
title = table_content.get("Title")
authors = remove_instiution_name(table_content.get("Author(s)"))
journal_ref = table_content.get("Report number")
paper_doi = "N/A"
category = "hep-ex"
preprint_date = table_content.get("Imprint").split(".")[0]
cite_count = "N/A"

paper_data = {
    "texkeys": "",
    "citation_count": -1,
    "citation_count_without_self_citations": -1,
    "doi": paper_doi,
    "arxiv_eprints": "N/A",
    "arxiv_category": category,
    "preprint_date": preprint_date,
    "title": title,
    "bibtex": get_cds_bibtext(2871986),
    "inspire_id": "N/A",
    "authors": authors,
    "cite_info": journal_ref,
}

In [23]:
recid = "2759110"
inspire_api = f"https://inspirehep.net/api/literature/{recid}"
data = requests.get(inspire_api).json()
bibtex_url = data["links"]["bibtex"]
bibtex_url

'https://inspirehep.net/api/literature/2759110?format=bibtex'

In [24]:
bibtex = requests.get(bibtex_url, timeout=20).content
bibtex

b'@inproceedings{Huang:2024voo,\n    author = "Huang, Andris and Melkani, Yash and Calafiura, Paolo and Lazar, Alina and Murnane, Daniel Thomas and Pham, Minh-Tuan and Ju, Xiangyang",\n    title = "{A Language Model for Particle Tracking}",\n    booktitle = "{Connecting The Dots 2023}",\n    eprint = "2402.10239",\n    archivePrefix = "arXiv",\n    primaryClass = "hep-ph",\n    reportNumber = "PROC-CTD2023-33",\n    month = "2",\n    year = "2024"\n}\n'

In [25]:
data["metadata"]["citation_count"]

1

In [4]:
url = "https://arxiv.org/abs/2210.12247"
data = requests.get(url).text
soup = BeautifulSoup(data, "html.parser")

In [26]:
[x.text for x in soup.find_all("div", "authors")[0].find_all("a")]

['xiangyang Ju',
 'Yunsong Wang',
 'Daniel Murnane',
 'Nicholas Choma',
 'Steven Farrell',
 'Paolo Calafiura']

In [22]:
soup.find_all("h1", "title mathjax")[0].contents[1]

'Benchmarking GPU and TPU Performance with Graph Neural Networks'

In [27]:
soup.find_all("td", "tablecell jref")

[]

In [55]:
journal_ref_list = soup.find_all("td", "tablecell jref")
journal_refs = ", ".join([x.get_text() for x in journal_ref_list])
journal_refs

'CMS-SUS-19-005, CERN-EP-2019-180, Eur. Phys. J. C 80 (2020) 3'

In [53]:
journal_refs

'CMS-SUS-19-005, CERN-EP-2019-180,Eur. Phys. J. C 80 (2020) 3'

In [56]:
soup.find_all("td", "tablecell label")

[<td class="tablecell label">Comments:</td>,
 <td class="tablecell label">Subjects:</td>,
 <td class="tablecell label">Report number:</td>,
 <td class="tablecell label">Cite as:</td>,
 <td class="tablecell label"> </td>,
 <td class="tablecell label"> </td>,
 <td class="tablecell label">Journal reference:</td>,
 <td class="tablecell label">
 <abbr title="Digital Object Identifier">Related DOI</abbr>:
           </td>]

In [93]:
soup.find_all("td", "tablecell label")[7].next_sibling.next_sibling.find_all("a")[0].get("href")

'https://doi.org/10.1140/epjc/s10052-019-7493-x'

In [96]:
soup.find_all("td", "tablecell label")[7].text

'\nRelated DOI:\n          '

In [63]:
soup.find_all("span", "descriptor")

[<span class="descriptor">Title:</span>,
 <span class="descriptor">Authors:</span>,
 <span class="descriptor">Abstract:</span>,
 <span class="descriptor">Full-text links:</span>]

In [76]:
j = soup.find_all("span", "descriptor")[0]
type(j.next_sibling.get_text())

str

In [66]:
soup.find_all("span", "descriptor")[1].next_sibling.contents[0]

'CMS Collaboration'

In [77]:
soup.find_all("span", "descriptor")[1].next_sibling.get_text()

'CMS Collaboration'

In [70]:
soup.find_all("span", "descriptor")[2].next_sibling.text

'Two related searches for phenomena beyond the standard model (BSM) are performed using events with hadronic jets and significant transverse momentum imbalance. The results are based on a sample of proton-proton collisions at a center-of-mass energy of 13 TeV, collected by the CMS experiment at the LHC in 2016-2018 and corresponding to an integrated luminosity of 137 fb$^{-1}$. The first search is inclusive, based on signal regions defined by the hadronic energy in the event, the jet multiplicity, the number of jets identified as originating from bottom quarks, and the value of the kinematic variable $M_\\mathrm{T2}$ for events with at least two jets. For events with exactly one jet, the transverse momentum of the jet is used instead. The second search looks in addition for disappearing tracks produced by BSM long-lived charged particles that decay within the volume of the tracking detector. No excess event yield is observed above the predicted standard model background. This is used t

In [74]:
soup.find_all("span", "descriptor")[3].next_sibling.next_sibling.next_sibling.next_sibling

<ul>
<div hidden="" id="download-button-info">
Download a PDF of the paper titled Searches for physics beyond the standard model with the $M_\mathrm{T2}$ variable in hadronic final states with and without disappearing tracks in proton-proton collisions at $\sqrt{s} =$ 13 TeV, by CMS Collaboration</div><li><a accesskey="f" aria-describedby="download-button-info" class="abs-button download-pdf" href="/pdf/1909.03460">Download PDF</a></li><li><a class="abs-button download-eprint" href="/src/1909.03460">TeX Source</a></li><li><a class="abs-button download-format" href="/format/1909.03460">Other Formats</a></li></ul>

In [109]:
soup.find_all("span", "primary-subject")[0].text

'High Energy Physics - Experiment (hep-ex)'

In [110]:
import re
a = soup.find_all("span", "primary-subject")[0].text
# find str inside ()
re.search(r'\((.*?)\)', a).group(1)

'hep-ex'

In [113]:
soup.find_all("div", "dateline")[0].text

'\n  [Submitted on 8 Sep 2019 (v1), last revised 7 Jan 2020 (this version, v2)]'

In [128]:
a = soup.find_all("div", "submission-history")[0].text
print(a)
# find the date in the string, date is in the format "dd mm yyyy"




Submission history From: The CMS Collaboration [view email]       [v1]
        Sun, 8 Sep 2019 13:22:01 UTC (6,664 KB)
[v2]
        Tue, 7 Jan 2020 13:34:15 UTC (6,665 KB)



In [133]:
re.search(r"(\d+ \w+ \d+)", soup.find_all("div", "submission-history")[0].text).group(1)

'8 Sep 2019'

In [134]:
arxiv_index = "1909.03460"
cite_url = f"https://scholar.google.com/scholar?q=arXiv:{arxiv_index}"
cite_data = requests.get(cite_url, timeout=20).text
cite_soup = BeautifulSoup(cite_data, "html.parser")
cite_count = cite_soup.find_all("div", "gs_ab_mdw")[0].text

In [144]:
b = [x for x in cite_soup.find_all("a") if "Cited by" in x.text]
int(b[0].text.split()[-1])

126

In [142]:
b[0].text.split()[-1]

[<a href="/scholar?cites=6530631809825731064&amp;as_sdt=2005&amp;sciodt=0,5&amp;hl=en&amp;oe=ASCII">Cited by 126</a>]

In [145]:
soup.find_all("td", "tablecell label")

[<td class="tablecell label">Comments:</td>,
 <td class="tablecell label">Subjects:</td>,
 <td class="tablecell label">Report number:</td>,
 <td class="tablecell label">Cite as:</td>,
 <td class="tablecell label"> </td>,
 <td class="tablecell label"> </td>,
 <td class="tablecell label">Journal reference:</td>,
 <td class="tablecell label">
 <abbr title="Digital Object Identifier">Related DOI</abbr>:
           </td>]

In [153]:
for label in soup.find_all("td", "tablecell label"):
    label_text = label.get_text()
    print(label_text, type(label_text))
    if "Journal" in label_text:
        print(label.next_sibling.next_sibling.get_text())
        print(label.next_sibling)
        print("HERE")
        journal_ref = label.next_sibling.next_sibling.get_text()
    if "DOI" in label_text:
        paper_doi = label.next_sibling.next_sibling.get_text()

Comments: <class 'str'>
Subjects: <class 'str'>
Report number: <class 'str'>
Cite as: <class 'str'>
  <class 'str'>
  <class 'str'>
Journal reference: <class 'str'>
Eur. Phys. J. C 80 (2020) 3


HERE

Related DOI:
           <class 'str'>


In [166]:
soup.find_all("td", "tablecell label")[-1].next_sibling.next_sibling.find_all("a")[0].get("href")

'https://doi.org/10.1140/epjc/s10052-019-7493-x'

In [189]:

inspire_url = f"https://inspirehep.net/arxiv/{arxiv_index}"
asession = AsyncHTMLSession()
r = await asession.get(inspire_url)

In [32]:
import asyncio

arxiv_index = "1909.03460"
inspire_url = f"https://inspirehep.net/arxiv/{arxiv_index}"

async def get_inspire_id(url):
    asession = AsyncHTMLSession()
    new_url = await asession.get(url)
    return new_url.url.split("/")[-1]

inspire_id = asyncio.run(get_inspire_id(inspire_url))

RuntimeError: asyncio.run() cannot be called from a running event loop