In [7]:
import re
import requests
import pandas as pd
import concurrent.futures
from time import time

# Global session to reuse HTTP connections.
# get_references() and get_citations() both send their requests through this
# single Session object.
# NOTE(review): main() runs those functions from ThreadPoolExecutor workers,
# so this object is shared across threads — confirm that sharing is acceptable.
session = requests.Session()


def extract_doi(doi_str: str) -> str:
    """
    Extract the DOI from a given string.

    Handles:
      - Resolver URLs in any letter case, with or without a scheme
        (e.g. "https://doi.org/10.xxx/yyy", "http://dx.doi.org/10.xxx/yyy",
        "doi.org/10.xxx/yyy").
      - Text containing a "doi:" prefix, optionally followed by whitespace
        (e.g. "doi:10.xxx/yyy", "DOI: 10.xxx/yyy", or a space-separated
        identifier list such as "omid:br/1 doi:10.xxx/yyy pmid:2").

    Anything else is returned stripped of surrounding whitespace. The letter
    case of the DOI itself is left untouched.

    Parameters:
        doi_str (str): The input DOI string.

    Returns:
        str: The cleaned DOI (e.g., "10.xxx/yyy").
    """
    doi_str = doi_str.strip()

    # Match the resolver prefix case-insensitively and slice by the match
    # length, so the prefix test and the prefix removal can never disagree.
    url_prefix = re.match(
        r"(?:https?://)?(?:dx\.)?doi\.org/", doi_str, re.IGNORECASE
    )
    if url_prefix:
        return doi_str[url_prefix.end():]

    # "\s*" tolerates the common "doi: 10.xxx/yyy" spelling.
    match = re.search(r"doi:\s*(\S+)", doi_str, re.IGNORECASE)
    if match:
        return match.group(1)
    return doi_str


def get_references(doi: str) -> list:
    """
    Fetch the outgoing reference edges of a publication from OpenCitations.

    The queried DOI is the origin of every edge; the DOI extracted from each
    record's "cited" field is the target. Records without a "cited" value
    are skipped. Request failures, non-200 responses and unparsable JSON are
    reported with print() and produce an empty list.

    Parameters:
        doi (str): The DOI to query.

    Returns:
        list: One dictionary per reference, with the keys
              "origin_doi" and "target_doi".
    """
    endpoint = f"https://opencitations.net/index/api/v2/references/doi:{doi}"
    try:
        reply = session.get(endpoint, timeout=10)
    except Exception as err:
        print(f"Request error (references) for {doi}: {err}")
        return []

    # Anything other than 200 is reported and treated as "no edges".
    if reply.status_code != 200:
        print(f"Error {reply.status_code} when fetching references for {doi}")
        return []

    try:
        payload = reply.json()
    except Exception as err:
        print(f"JSON parse error for references of {doi}: {err}")
        return []

    cited_values = (entry.get("cited") for entry in payload)
    return [
        {"origin_doi": doi, "target_doi": extract_doi(raw_cited)}
        for raw_cited in cited_values
        if raw_cited
    ]


def get_citations(doi: str) -> list:
    """
    Fetch the incoming citation edges of a publication from OpenCitations.

    The DOI extracted from each record's "citing" field is the origin of an
    edge; the queried DOI is the target. Records without a "citing" value
    are skipped. Request failures, non-200 responses and unparsable JSON are
    reported with print() and produce an empty list.

    Parameters:
        doi (str): The DOI to query.

    Returns:
        list: One dictionary per citation, with the keys
              "origin_doi" and "target_doi".
    """
    endpoint = f"https://opencitations.net/index/api/v2/citations/doi:{doi}"
    try:
        reply = session.get(endpoint, timeout=10)
    except Exception as err:
        print(f"Request error (citations) for {doi}: {err}")
        return []

    # Anything other than 200 is reported and treated as "no edges".
    if reply.status_code != 200:
        print(f"Error {reply.status_code} when fetching citations for {doi}")
        return []

    try:
        payload = reply.json()
    except Exception as err:
        print(f"JSON parse error for citations of {doi}: {err}")
        return []

    citing_values = (entry.get("citing") for entry in payload)
    return [
        {"origin_doi": extract_doi(raw_citing), "target_doi": doi}
        for raw_citing in citing_values
        if raw_citing
    ]


def process_doi(doi: str, sub_area: str) -> list:
    """
    Collect every edge touching one publication and tag the known sub-area.

    References are fetched first, then citations. On each edge the side that
    is the given publication receives `sub_area`; the other side is set to
    None so the caller can fill it in later:
      - reference edge (publication is origin):
          origin_sub_area = sub_area, target_sub_area = None
      - citation edge (publication is target):
          origin_sub_area = None, target_sub_area = sub_area

    Parameters:
        doi (str): The publication's DOI.
        sub_area (str): The publication's known sub-area.

    Returns:
        list: Reference edges followed by citation edges.
    """
    outgoing = get_references(doi)
    for ref_edge in outgoing:
        ref_edge.update(origin_sub_area=sub_area, target_sub_area=None)

    incoming = get_citations(doi)
    for cit_edge in incoming:
        cit_edge.update(origin_sub_area=None, target_sub_area=sub_area)

    return [*outgoing, *incoming]


def load_publications_for_area(
    file_url: str, sub_area: str, doi_column: int = 5
) -> list:
    """
    Load publication data from a CSV file at the given URL and tag each publication
    with the provided sub-area. The CSV file is read without a header row and
    the DOI is taken from column index `doi_column`.

    Missing cells, cells equal to "null" (any case) and cells that clean to an
    empty string are skipped. Duplicates are removed while keeping the order
    in which DOIs first appear in the file, so the result is reproducible
    between runs. Load failures, including a file that has no column at
    `doi_column`, are reported with print() and produce an empty list.

    Parameters:
        file_url (str): URL to the CSV file.
        sub_area (str): The sub-area for these publications.
        doi_column (int): Zero-based index of the DOI column (default 5).

    Returns:
        list: List of unique tuples (doi, sub_area) in file order.
    """
    try:
        df = pd.read_csv(file_url, header=None)
    except Exception as e:
        print(f"Error loading CSV from {file_url}: {e}")
        return []

    # Without this check a file with too few columns raises IndexError below.
    if df.shape[1] <= doi_column:
        print(
            f"Error loading CSV from {file_url}: "
            f"no column at index {doi_column}"
        )
        return []

    doi_series = df.iloc[:, doi_column].dropna().astype(str)
    publications = []
    for doi in doi_series:
        if doi.strip().lower() == "null":
            continue
        cleaned = extract_doi(doi)
        if cleaned:
            publications.append((cleaned, sub_area))

    # dict.fromkeys de-duplicates while preserving first-seen order; a set
    # would return the tuples in an arbitrary, run-dependent order.
    return list(dict.fromkeys(publications))


def main():
    """
    Build the repository-only citation edge list and save it as CSV.

    Steps:
      1. Load the DOIs listed in each area's "<area>-out-papers.csv" file.
      2. Fetch references and citations for every unique DOI in a thread pool.
      3. Keep only edges whose origin and target are both repository DOIs.
      4. Write the de-duplicated edges to "citations_edge_list_repo_only.csv".

    DOI names are case-insensitive, so the spelling in the CSV files and the
    spelling returned by the API may differ in letter case. All matching is
    therefore done on lowercased DOIs and the output uses the lowercased form;
    this also lets the same edge found via both endpoints collapse into one
    row. Sub-areas in the output always come from the repository mapping, so
    a DOI has a single sub-area everywhere (the last area that listed it).
    """
    base_url = (
        "https://raw.githubusercontent.com/aserg-ufmg/CSIndex/refs/heads/master/data/"
    )
    areas = [
        "ai",
        "arch",
        "bio",
        "chi",
        "cse",
        "data",
        "dbis",
        "ds",
        "formal",
        "graphics",
        "hardware",
        "ir",
        "net",
        "or",
        "pl",
        "robotics",
        "se",
        "security",
        "theory",
        "vision",
    ]

    all_publications = []
    print("Loading publication data for each area...")
    for area in areas:
        file_url = f"{base_url}{area}-out-papers.csv"
        pubs = load_publications_for_area(file_url, area)
        print(f"Area {area}: found {len(pubs)} publications.")
        all_publications.extend(pubs)

    # repo_dict: lowercased DOI -> sub-area (last listing wins).
    # request_doi: lowercased DOI -> first spelling seen, used for the query.
    repo_dict = {}
    request_doi = {}
    for doi, sub_area in all_publications:
        key = doi.lower()
        repo_dict[key] = sub_area
        request_doi.setdefault(key, doi)
    print(f"Total unique repository publications: {len(repo_dict)}")

    all_edges = []
    start_time = time()

    # Process each unique publication concurrently. Submitting one task per
    # DOI (not per CSV row) avoids repeating requests for a DOI that is
    # listed in several areas.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_pub = {
            executor.submit(process_doi, request_doi[key], sub_area): (
                request_doi[key],
                sub_area,
            )
            for key, sub_area in repo_dict.items()
        }
        for future in concurrent.futures.as_completed(future_to_pub):
            doi, sub_area = future_to_pub[future]
            try:
                all_edges.extend(future.result())
            except Exception as e:
                print(f"Error processing {doi} ({sub_area}): {e}")

    elapsed = time() - start_time
    print(f"Processed {len(repo_dict)} publications in {elapsed:.2f} seconds.")

    # Filter edges: keep only those where both origin and target are in our
    # repository, comparing on the lowercased DOI.
    edge_columns = ["origin_doi", "origin_sub_area", "target_doi", "target_sub_area"]
    filtered_edges = []
    for edge in all_edges:
        origin_key = str(edge.get("origin_doi") or "").lower()
        target_key = str(edge.get("target_doi") or "").lower()
        if origin_key in repo_dict and target_key in repo_dict:
            filtered_edges.append(
                {
                    "origin_doi": origin_key,
                    "origin_sub_area": repo_dict[origin_key],
                    "target_doi": target_key,
                    "target_sub_area": repo_dict[target_key],
                }
            )

    print(f"Total citation edges after filtering: {len(filtered_edges)}")

    # Passing `columns` fixes the column order and still yields the four
    # columns when no edge survived the filter.
    df_edges = pd.DataFrame(filtered_edges, columns=edge_columns).drop_duplicates()

    print(f"Unique citation edges in the repository-only network: {len(df_edges)}")
    df_edges.to_csv("citations_edge_list_repo_only.csv", index=False)
    print("Filtered citation edge list saved to 'citations_edge_list_repo_only.csv'.")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading publication data for each area...
Area ai: found 602 publications.
Area arch: found 227 publications.
Area bio: found 59 publications.
Area chi: found 86 publications.
Area cse: found 25 publications.
Area data: found 145 publications.
Area dbis: found 106 publications.
Area ds: found 109 publications.
Area formal: found 33 publications.
Area graphics: found 219 publications.
Area hardware: found 69 publications.
Area ir: found 55 publications.
Area net: found 339 publications.
Area or: found 270 publications.
Area pl: found 20 publications.
Area robotics: found 121 publications.
Area se: found 441 publications.
Area security: found 50 publications.
Area theory: found 170 publications.
Area vision: found 143 publications.
Total unique repository publications: 3289
Error 500 when fetching citations for 10.24963/ijcai.2020/679
Error 500 when fetching references for 10.1016/j.eswa.2021.115589
Error 500 when fetching citations for 10.1016/j.eswa.2021.115064
Error 500 when fetching 