In [None]:
"""
This script loads CSIndexBR area mappings and fetches DBLP publications along with citation counts.
Publications are assigned a sub-area based on venue using CSIndexBR mappings. Publications without
a matching sub-area are included with `sub_area` set to None.
"""

import re
import csv
from typing import Dict, List, Optional, Tuple
import requests
import pandas as pd
import xml.etree.ElementTree as ET

# -------------------------------
# Constants
# -------------------------------
CSINDEX_BASE_URL = (
    "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/"
)
RESEARCHERS_CSV_URL = (
    "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/all-researchers.csv"
)
OPENCITATIONS_API_BASE = "https://opencitations.net/index/api/v2"
DBLP_BASE_URL = "https://dblp.org/pid"

# CSIndexBR research areas (hardcoded as per original implementation)
CSINDEX_AREAS = [
    "ai", "arch", "bio", "chi", "cse", "data", "dbis", "ds", "formal", "graphics",
    "hardware", "ir", "net", "or", "pl", "robotics", "se", "security", "theory", "vision"
]

# -------------------------------
# API Clients
# -------------------------------

class DBLPClient:
    """Client for fetching and processing DBLP publication data."""
    
    @staticmethod
    def get_publications(pid: str, area_mapping: Dict[str, List[str]]) -> List[dict]:
        """
        Fetches publications for a DBLP PID and assigns sub-areas using CSIndexBR mappings.
        
        Args:
            pid: DBLP researcher PID
            area_mapping: Venue-to-subarea mapping from CSIndexBR
            
        Returns:
            List of publication dictionaries with DOI, title, venue, citations, and sub-area
        """
        url = f"{DBLP_BASE_URL}/{pid}.xml"
        print(f"\nFetching DBLP publications for PID: {pid}")
        
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to fetch data for PID={pid} (HTTP {response.status_code})")
            return []

        return DBLPClient._parse_publications(response.content, area_mapping)

    @staticmethod
    def _parse_publications(xml_content: bytes, area_mapping: Dict[str, List[str]]) -> List[dict]:
        """Parses XML response from DBLP and processes publication entries."""
        publications = []
        root = ET.fromstring(xml_content)
        
        for publication_element in root.findall('r'):
            if len(publication_element) == 0:  # Skip empty elements
                continue
            
            pub_data = DBLPClient._extract_publication_data(publication_element[0])
            if not pub_data:
                continue

            doi, title, venue = pub_data
            sub_area = area_mapping.get(venue, [None])[0]  # Take first sub-area if multiple
            
            publications.append({
                'doi': doi,
                'title': title,
                'venue': venue,
                'citation_count': OpenCitationsClient.get_citation_count(doi),
                'sub_area': sub_area
            })

        print(f"Processed {len(publications)} publications from DBLP response")
        return publications

    @staticmethod
    def _extract_publication_data(pub_node: ET.Element) -> Optional[Tuple[str, str, str]]:
        """Extracts DOI, title, and venue from a publication XML node."""
        title_node = pub_node.find('title')
        title = title_node.text if title_node is not None else None
        
        # Extract venue from journal or booktitle
        venue_node = pub_node.find('journal') or pub_node.find('booktitle')
        venue = venue_node.text if venue_node is not None else None
        
        # Extract DOI from ee elements
        doi = None
        for ee in pub_node.findall('ee'):
            if ee.text and "doi.org" in ee.text:
                doi = ee.text.replace("https://doi.org/", "")
                break

        return (doi, title, venue) if doi and venue else None


class OpenCitationsClient:
    """Client for interacting with OpenCitations API endpoints."""
    
    @staticmethod
    def get_citation_count(doi: Optional[str]) -> Optional[int]:
        """Retrieves citation count for a DOI from OpenCitations."""
        if not doi:
            return 0
            
        response = requests.get(
            f"{OPENCITATIONS_API_BASE}/citation-count/doi:{doi}"
        )
        if response.status_code == 200:
            return response.json()[0].get('count', None)
        return 0

    @staticmethod
    def get_references(doi: str) -> List[dict]:
        """Fethes references for a DOI and returns cleaned citation edges."""
        response = requests.get(
            f"{OPENCITATIONS_API_BASE}/references/doi:{doi}"
        )
        return OpenCitationsClient._process_citation_edges(response, "cited")

    @staticmethod
    def get_citations(doi: str) -> List[dict]:
        """Fetches citations for a DOI and returns cleaned citation edges."""
        response = requests.get(
            f"{OPENCITATIONS_API_BASE}/citations/doi:{doi}"
        )
        return OpenCitationsClient._process_citation_edges(response, "citing")

    @staticmethod
    def _process_citation_edges(response: requests.Response, key: str) -> List[dict]:
        """Helper to process citation/reference API responses."""
        if response.status_code != 200:
            print(f"API Error {response.status_code}")
            return []

        edges = []
        for record in response.json():
            raw_doi = record.get(key)
            if raw_doi:
                cleaned_doi = OpenCitationsClient._extract_doi(raw_doi)
                edges.append({"doi": cleaned_doi})
        return edges

    @staticmethod
    def _extract_doi(doi_str: str) -> str:
        """Extracts normalized DOI from string."""
        match = re.search(r'doi:(\S+)', doi_str)
        return match.group(1) if match else doi_str


# -------------------------------
# CSIndexBR Data Loading
# -------------------------------

class CSIndexLoader:
    """Handles loading and merging CSIndexBR research area mappings."""
    
    @staticmethod
    def load_mappings() -> Dict[str, List[str]]:
        """Loads and merges all conference/journal mappings from CSIndexBR."""
        print("\nLoading CSIndexBR area mappings...")
        merged_mapping = {}
        
        for area in CSINDEX_AREAS:
            CSIndexLoader._load_conference_mappings(area, merged_mapping)
            CSIndexLoader._load_journal_mappings(area, merged_mapping)
        
        print(f"Loaded mappings for {len(merged_mapping)} venues")
        return merged_mapping

    @staticmethod
    def _load_conference_mappings(area: str, merged_mapping: Dict[str, List[str]]):
        """Loads conference mappings for a research area."""
        for conf_type in ["confs", "out-confs"]:
            url = f"{CSINDEX_BASE_URL}{area}-{conf_type}.csv"
            for venue, areas in CSIndexLoader._load_csv_mapping(url, 0, 1).items():
                merged_mapping.setdefault(venue, []).extend(areas)

    @staticmethod
    def _load_journal_mappings(area: str, merged_mapping: Dict[str, List[str]]):
        """Loads journal mappings for a research area."""
        url = f"{CSINDEX_BASE_URL}{area}-out-journals.csv"
        for journal, areas in CSIndexLoader._load_csv_mapping(url, 0, 0).items():
            merged_mapping.setdefault(journal, []).extend(areas)

    @staticmethod
    def _load_csv_mapping(url: str, key_col: int, val_col: int) -> Dict[str, List[str]]:
        """Generic CSV loader for CSIndexBR mapping files."""
        mapping = {}
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Failed to load {url}")
            return mapping

        for row in csv.reader(response.text.splitlines()):
            if len(row) > max(key_col, val_col):
                key = row[key_col].strip()
                val = row[val_col].strip()
                mapping.setdefault(key, []).append(val)
        return mapping


# -------------------------------
# Citation Network Construction
# -------------------------------

def build_citation_network(publications: List[dict]) -> pd.DataFrame:
    """
    Constructs citation network from publications with sub-area annotations.
    
    Args:
        publications: List of publications with DOI and sub-area
        
    Returns:
        DataFrame with citation edges and sub-area information
    """
    doi_to_subarea = {pub["doi"]: pub["sub_area"] for pub in publications}
    edges = []

    for pub in publications:
        origin_doi = pub["doi"]
        origin_area = pub["sub_area"]

        # Process references (outgoing edges)
        for ref in OpenCitationsClient.get_references(origin_doi):
            edges.append({
                'origin_doi': origin_doi,
                'origin_sub-area': origin_area,
                'target_doi': ref["doi"],
                'target_sub-area': doi_to_subarea.get(ref["doi"], None)
            })

        # Process citations (incoming edges)
        for cit in OpenCitationsClient.get_citations(origin_doi):
            edges.append({
                'origin_doi': cit["doi"],
                'origin_sub-area': doi_to_subarea.get(cit["doi"], None),
                'target_doi': origin_doi,
                'target_sub-area': origin_area
            })

    return pd.DataFrame(edges).drop_duplicates()


# -------------------------------
# Main Execution
# -------------------------------


"""Main execution flow"""
# Load researcher data
researchers_df = pd.read_csv(RESEARCHERS_CSV_URL, names=['researcher', 'institution', 'pid'])
example_pid = researchers_df['pid'][0]  # Example implementation

# Load area mappings and fetch publications
area_mapping = CSIndexLoader.load_mappings()
publications = DBLPClient.get_publications(example_pid, area_mapping)

if not publications:
    print("No publications found with valid venue/DOI")
else:
    # Build and display citation network
    publication_df = pd.DataFrame(publications)
    citation_network = build_citation_network(publications)



Loading CSIndexBR area mappings...
Failed to load https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/or-out-confs.csv
Loaded mappings for 594 venues

Fetching DBLP publications for PID: 90/4151


  if not publication_element:  # Skip empty elements
  venue_node = pub_node.find('journal') or pub_node.find('booktitle')


Processed 53 publications from DBLP response


In [2]:
publication_df

Unnamed: 0,doi,title,venue,citation_count,sub_area
0,10.1007/978-3-031-47721-8_54,HyMO-RF: Automatic Hyperparameter Tuning for E...,IntelliSys (1),0.0,
1,10.1007/978-3-030-30241-2_40,Smart Home Appliances Usage Recommendation Usi...,EPIA (1),0.0,
2,10.5753/sbrc.2019.7374,"BlindMobi: A system for bus identification, ba...",SBRC,0.0,
3,10.1109/VTCFall.2019.8891367,Detecting Anomalies in the Engine Coolant Sens...,VTC Fall,1.0,
4,10.1109/ISCC.2018.8538607,Improving QoS in Vehicular ad-hoc Networks usi...,ISCC,0.0,
5,10.1109/LASCAS.2018.8399980,Identifying power consumption signatures in LT...,LASCAS,0.0,
6,10.1145/3272036.3272046,Multi-objective Approaches to Improve QoS in V...,DIVANet@MSWiM,3.0,
7,10.1007/978-3-319-57351-9_18,SmartHome Energy Saving Using a Multi-objectiv...,Canadian AI,2.0,
8,10.1109/DSD.2017.68,Autonomous Power Management for Embedded Syste...,DSD,3.0,
9,10.1007/978-3-319-46723-8_46,Evaluation-Oriented Training via Surrogate Met...,MICCAI (2),0.0,
