In [6]:
#!/usr/bin/env python3
"""
This script loads CSIndexBR area mappings and then fetches DBLP publications along with their citation counts.
While fetching publications for a given DBLP PID, each publication is assigned a sub-area based on its venue.
Publications that do not match any sub-area are ignored.
"""

import re
import csv
import requests
import pandas as pd
import xml.etree.ElementTree as ET

# -------------------------------
# Functions to fetch DBLP publications and citations
# -------------------------------

def get_citation_count(doi, timeout=30):
    """
    Get the citation count for a given DOI using the OpenCitations API.

    Args:
        doi: DOI string (without a ``doi:`` prefix). Falsy values short-circuit.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``count`` field of the first record in the JSON response, passed
        through unchanged, or None when the DOI is empty, the request fails,
        the status is not 200, or the payload has no such field.
    """
    if not doi:
        return None
    api_url = f"https://opencitations.net/index/api/v2/citation-count/doi:{doi}"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        # Callers invoke this once per publication; one failed lookup must
        # not abort the whole run, so it is reported as "count unknown".
        print(f"Request failed when fetching citation count for {doi}: {exc}")
        return None
    if response.status_code != 200:
        return None
    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return None
    # Only a non-empty list of dict records can carry a count.
    if isinstance(data, list) and data and isinstance(data[0], dict):
        return data[0].get('count')
    return None

def _publication_fields(pub_node):
    """Return ``(title, venue, doi)`` read from one DBLP publication element."""
    title_node = pub_node.find('title')
    title = None
    if title_node is not None:
        # itertext() also collects text nested inside child elements, which
        # a plain ``.text`` would cut off at the first child tag.
        title = "".join(title_node.itertext()) or None

    # Venue comes from the journal tag, falling back to booktitle.
    venue = None
    for tag in ('journal', 'booktitle'):
        venue_node = pub_node.find(tag)
        if venue_node is not None and venue_node.text:
            venue = venue_node.text
            break

    # The DOI is whatever follows "doi.org/" in the first matching "ee" link;
    # this covers http/https and host variants such as dx.doi.org.
    doi = None
    for ee in pub_node.findall('ee'):
        link = (ee.text or "").strip()
        _, marker, tail = link.partition("doi.org/")
        if marker and tail:
            doi = tail
            break
    return title, venue, doi


def get_dblp_publications(pid, mapping, timeout=30):
    """
    Fetch publications from DBLP for the given PID and assign sub-area based on CSIndexBR mapping.
    Only publications with a matching sub-area (venue in mapping) are retained.

    Args:
        pid: DBLP person identifier, inserted into ``https://dblp.org/pid/{pid}.xml``.
        mapping: Dict from venue name to a list of areas; the first area is used.
        timeout: Seconds to wait for the DBLP response before giving up.

    Returns:
        A list of dicts with keys ``doi``, ``title``, ``venue``,
        ``citation_count`` and ``sub_area``. The list is empty when the PID or
        mapping is empty, or when the request or XML parsing fails.
    """
    if not pid or not mapping:
        # No venue can match an empty mapping, so the download would be wasted.
        print("No PID or empty CSIndexBR mapping supplied; nothing to fetch.")
        return []

    url = f"https://dblp.org/pid/{pid}.xml"
    print(f"\nFetching DBLP publications for PID: {pid}")
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Could not fetch data for PID={pid}: {exc}")
        return []
    if resp.status_code != 200:
        print(f"Could not fetch data for PID={pid}, status code={resp.status_code}")
        return []
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError as exc:
        print(f"Could not parse DBLP XML for PID={pid}: {exc}")
        return []

    publications = []
    for r_element in root.findall('r'):
        # Each <r> wraps one publication element; skip empty wrappers.
        if not len(r_element):
            continue
        title, venue, doi = _publication_fields(r_element[0])
        if not (doi and venue):
            continue
        # Check the venue before the citation lookup so that no API call is
        # spent on publications that will be discarded anyway.
        if venue not in mapping:
            print(f"Skipping publication '{title}' – venue '{venue}' not found in CSIndexBR mapping.")
            continue
        sub_area = mapping[venue][0]
        publications.append({
            'doi': doi,
            'title': title,
            'venue': venue,
            'citation_count': get_citation_count(doi),
            'sub_area': sub_area,
        })
        print(f"Added: '{title}' | Venue: '{venue}' | Sub-area: '{sub_area}'")
    print(f"\nTotal publications with assigned sub-area: {len(publications)}")
    return publications

def extract_doi(doi_str):
    """
    Pull the bare DOI out of an identifier string.

    Returns the run of non-whitespace characters that follows the first
    ``doi:`` marker. When no such marker (with at least one following
    character) is present, the input is returned unchanged.
    """
    found = re.search(r'doi:(\S+)', doi_str)
    if found is None:
        return doi_str
    return found[1]

def get_references(doi, timeout=30):
    """
    Fetch references (publications cited by the given DOI) using the OpenCitations API.

    Args:
        doi: DOI of the citing publication. Falsy values yield an empty list.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"origin_doi": doi, "target_doi": <cited DOI>}`` dicts,
        one per response record that has a non-empty ``cited`` field. The
        list is empty when the request fails or returns a non-200 status.
    """
    if not doi:
        return []
    url = f"https://opencitations.net/index/api/v2/references/doi:{doi}"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed when fetching references for {doi}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Error {response.status_code} when fetching references for {doi}")
        return []
    try:
        data = response.json()
    except ValueError:
        print(f"Invalid JSON when fetching references for {doi}")
        return []
    # The queried DOI is the origin of every edge; the cited work is the target.
    return [
        {"origin_doi": doi, "target_doi": extract_doi(record["cited"])}
        for record in data
        if record.get("cited")
    ]

def get_citations(doi, timeout=30):
    """
    Fetch citations (publications that cite the given DOI) using the OpenCitations API.

    Args:
        doi: DOI of the cited publication. Falsy values yield an empty list.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A list of ``{"origin_doi": <citing DOI>, "target_doi": doi}`` dicts,
        one per response record that has a non-empty ``citing`` field. The
        list is empty when the request fails or returns a non-200 status.
    """
    if not doi:
        return []
    url = f"https://opencitations.net/index/api/v2/citations/doi:{doi}"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed when fetching citations for {doi}: {exc}")
        return []
    if response.status_code != 200:
        print(f"Error {response.status_code} when fetching citations for {doi}")
        return []
    try:
        data = response.json()
    except ValueError:
        print(f"Invalid JSON when fetching citations for {doi}")
        return []
    # The citing work is the origin of every edge; the queried DOI is the target.
    return [
        {"origin_doi": extract_doi(record["citing"]), "target_doi": doi}
        for record in data
        if record.get("citing")
    ]

# -------------------------------
# Functions to load and merge CSIndexBR mappings
# -------------------------------

def load_csv_mapping(url, key_index=0, value_index=1, timeout=30):
    """
    Load a CSV from the given URL and return a mapping dictionary from key to list of values.

    Args:
        url: Location of the CSV file. A falsy URL yields an empty mapping.
        key_index: Column whose stripped text becomes the dictionary key.
        value_index: Column whose stripped text is appended to the key's list.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        Dict mapping each key to the list of values seen for it, in file
        order. The dict is empty when the download fails or returns a
        non-200 status.
    """
    mapping = {}
    if not url:
        print("No CSV URL supplied; returning an empty mapping.")
        return mapping
    print(f"Loading CSV mapping from: {url}")
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to load CSV from {url}: {exc}")
        return mapping
    if response.status_code != 200:
        print(f"Failed to load CSV from {url}")
        return mapping

    # A row must be long enough for BOTH columns; checking only value_index
    # would raise IndexError whenever key_index is the larger of the two.
    last_needed = max(key_index, value_index)
    for row in csv.reader(response.text.splitlines()):
        if len(row) <= last_needed:
            continue
        key = row[key_index].strip()
        value = row[value_index].strip()
        mapping.setdefault(key, []).append(value)
    return mapping

def load_all_area_mappings(areas=None, base_url=None):
    """
    Load the CSIndexBR conference and journal lists for each research area and
    merge them into one venue -> areas dictionary.

    For every area, three files are read in this order: ``{area}-confs.csv``,
    ``{area}-out-confs.csv`` and ``{area}-out-journals.csv``. The first column
    of each row is taken as the venue name.

    Args:
        areas: Iterable of area codes to load. None selects the built-in list.
        base_url: URL prefix the file names are appended to. None selects the
            CSIndex data directory on GitHub.

    Returns:
        Dict mapping each venue name to the list of areas it was found in,
        in load order and without duplicates.
    """
    print("\nLoading global CSIndexBR area mappings...")
    if base_url is None:
        base_url = "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/"
    if areas is None:
        areas = ["ai", "arch", "bio", "chi", "cse", "data", "dbis", "ds",
                 "formal", "graphics", "hardware", "ir", "net", "or", "pl", "robotics",
                 "se", "security", "theory", "vision"]

    global_mapping = {}
    for area in areas:
        # (file name, value column) pairs; the journal list is read with
        # value_index=0, i.e. only its first column is required.
        sources = [
            (f"{area}-confs.csv", 1),
            (f"{area}-out-confs.csv", 1),
            (f"{area}-out-journals.csv", 0),
        ]
        for file_name, value_index in sources:
            venue_mapping = load_csv_mapping(
                base_url + file_name, key_index=0, value_index=value_index
            )
            # Only the venue names matter here; the loaded values are unused.
            for venue in venue_mapping:
                tagged_areas = global_mapping.setdefault(venue, [])
                # A venue listed in more than one file of the same area must
                # not get that area recorded twice.
                if area not in tagged_areas:
                    tagged_areas.append(area)
    print("Completed loading CSIndexBR mappings.")
    return global_mapping

In [7]:
# -------------------------------
# MAIN PROCESS
# -------------------------------
if __name__ == "__main__":
    # All names below stay at module scope (no main() wrapper) because
    # publication_df is read again after this block.

    # Step 0: Load researchers' data (the CSV is read with explicit column names).
    researchers_url = "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/all-researchers.csv"
    researchers_df = pd.read_csv(researchers_url, names=['researcher', 'institution', 'pid'])

    # Step 1: Load CSIndexBR area mappings.
    print("Step 1: Loading CSIndexBR area mappings...")
    csindex_mapping = load_all_area_mappings()

    # Step 2: Fetch publications from DBLP for the first researcher's PID,
    #         assigning sub-area as part of the fetching process.
    pid = None
    publications = []
    if researchers_df.empty:
        print("No researchers were loaded; skipping the DBLP fetch.")
    else:
        # Positional access: take the first row whatever the index labels are.
        pid = researchers_df['pid'].iloc[0]
        print(f"\nStep 2: Fetching publications for DBLP PID: {pid} (only those with a matching sub-area)")
        publications = get_dblp_publications(pid, csindex_mapping)

    # Step 3: Display the final publications DataFrame.
    # The frame is built even when nothing was retrieved, so publication_df
    # is always defined (as an empty frame with the expected columns).
    publication_df = pd.DataFrame(
        publications,
        columns=['doi', 'title', 'venue', 'citation_count', 'sub_area'],
    )
    if publications:
        print("\nFinal Publication DataFrame (first 5 rows):")
        print(publication_df.head())
    else:
        print("No publications with a valid sub-area were retrieved.")

    # Step 4: (Optional) Build the citation edge list for a sample DOI.
    doi_input = "10.1016/j.neucom.2018.10.063"
    print(f"\nStep 4: Fetching citation data for DOI: {doi_input}")
    outgoing_edges = get_references(doi_input)
    incoming_edges = get_citations(doi_input)
    all_edges = outgoing_edges + incoming_edges
    # Always bound, for the same reason as publication_df.
    citations_df = pd.DataFrame(
        all_edges, columns=['origin_doi', 'target_doi']
    ).drop_duplicates()
    if all_edges:
        print("\nCitations DataFrame:")
        print(citations_df)
    else:
        print("No citation edges were retrieved.")

Step 1: Loading CSIndexBR area mappings...

Loading global CSIndexBR area mappings...
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/ai-confs.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/ai-out-confs.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/ai-out-journals.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/arch-confs.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/arch-out-confs.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/arch-out-journals.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/bio-confs.csv
Loading CSV mapping from: https://raw.githubusercontent.com/aserg-ufmg/CSIndex/r

In [8]:
publication_df

Unnamed: 0,doi,title,venue,citation_count,sub_area
0,10.1016/j.neucom.2018.10.063,Deep convolutional extreme learning machines: ...,Neurocomputing,25,ai
1,10.1016/j.asoc.2015.11.040,An adaptive semi-supervised Fuzzy GrowCut algo...,Appl. Soft Comput.,26,ai
2,10.1016/j.asoc.2016.09.006,Multi-objective optimization applied to unifie...,Appl. Soft Comput.,5,ai
3,10.1016/j.eswa.2016.08.016,A semi-supervised fuzzy GrowCut algorithm to s...,Expert Syst. Appl.,40,ai
4,10.1109/HiPC.2013.6799114,Performance and energy consumption analysis of...,HiPC,0,arch
5,10.1145/1046192.1046226,A petri-net based Pre-runtime scheduler for dy...,FPGA,0,hardware
6,10.1145/1046192.1046254,A partial reconfigurable FPGA implementation f...,FPGA,0,hardware
7,10.1109/IPDPS.2005.72,A Timed Petri Net Approach for Pre-Runtime Sch...,IPDPS,6,arch


In [9]:
publication_df['sub_area'].value_counts()

sub_area
ai          4
arch        2
hardware    2
Name: count, dtype: int64

In [10]:
publication_df['citation_count'].value_counts()

citation_count
0     3
25    1
26    1
5     1
40    1
6     1
Name: count, dtype: int64