In [1]:
import pandas as pd
import requests
import xml.etree.ElementTree as ET

In [2]:
def get_citation_count(doi):
    """
    Given a DOI, fetch the number of citations from the OpenCitations Index API.
    Returns the citation count as an integer. If the DOI is not found, returns None.
    """
    if not doi:
        return None
    
    api_url = f"https://opencitations.net/index/api/v2/citation-count/doi:{doi}"
    response = requests.get(api_url)
    
    if response.status_code == 200:
        data = response.json()
        if data and 'count' in data[0]:
            return data[0]['count']
    
    return None

def get_dblp_publications(pid):
    """
    Given a DBLP PID (e.g. '01/2345'), fetch its XML page from DBLP
    and parse out only four items per publication:
      1) doi (without "https://doi.org/")
      2) title
      3) venue
      4) citation count
    Returns a list of dicts, one dict per publication.
    """
    url = f"https://dblp.org/pid/{pid}.xml"
    resp = requests.get(url)
    
    if resp.status_code != 200:
        print(f"Could not fetch data for PID={pid}, status code={resp.status_code}")
        return []
    
    root = ET.fromstring(resp.content)
    publications = []
    
    for r_element in root.findall('r'):
        if not len(r_element):
            continue
        
        pub_node = r_element[0]
        
        title_node = pub_node.find('title')
        title = title_node.text if title_node is not None else None
        
        journal_node = pub_node.find('journal')
        booktitle_node = pub_node.find('booktitle')
        venue = None
        if journal_node is not None:
            venue = journal_node.text
        elif booktitle_node is not None:
            venue = booktitle_node.text
        
        doi = None
        ee_nodes = pub_node.findall('ee')
        for ee in ee_nodes:
            if ee.text and "doi.org" in ee.text:
                doi = ee.text.replace("https://doi.org/", "")
                break
        
        citation_count = get_citation_count(doi) if doi else None
        
        publications.append({
            'doi': doi,
            'title': title,
            'venue': venue,
            'citation_count': citation_count
        })
    
    return publications

In [3]:
# URL of the CSV file
url = "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/all-researchers.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(url, names=['researcher', 'institution', 'pid'])

In [4]:
# Print out the first 5 rows
print(df.head())

                        researcher institution               pid
0  Abel Guilhermino da Silva Filho        UFPE           90/4151
1                   Adenauer Yamin       UFPEL  y/AdenauerCYamin
2                   Adenilso Simao    ICMC/USP            24/261
3               Adenilton da Silva        UFPE           03/8672
4              Adiel Almeida Filho        UFPE           51/8647


In [5]:
all_publications = get_dblp_publications('90/4151')

In [6]:
publication_df = pd.DataFrame(all_publications)

In [7]:
publication_df.head()

Unnamed: 0,doi,title,venue,citation_count
0,10.1007/978-3-031-47721-8_54,HyMO-RF: Automatic Hyperparameter Tuning for E...,IntelliSys (1),0.0
1,,Analysis of the Influence of Information Flow ...,SBRC,
2,10.1016/j.compeleceng.2018.11.012,Electromyography-controlled car: A proof of co...,Comput. Electr. Eng.,19.0
3,10.1016/j.neucom.2018.10.063,Deep convolutional extreme learning machines: ...,Neurocomputing,25.0
4,10.1016/j.micpro.2018.09.008,Autonomous power management in mobile devices ...,Microprocess. Microsystems,8.0


In [9]:
# # Example loop to retrieve & combine publications for all authors in df
# all_publications = []
# for idx, row in df.iterrows():
#     pid = row["pid"]
#     pubs_for_author = get_dblp_publications(pid)
#     all_publications.extend(pubs_for_author)

# # Convert to DataFrame, if desired
# publications_df = pd.DataFrame(all_publications)

# # Print a small sample
# print(publications_df.head())