In [None]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import datetime
import re
from time import sleep  # optional: to be nice to the API

In [None]:
# ----- Run for a specific author -----
# Author identifier to analyse; fetch_author_recids() interpolates it into the
# INSPIRE literature search query as "a <author_id>".
author_id = "R.Suzuki.2"

In [None]:
# --- GET AUTHOR'S PAPERS ---
def fetch_author_recids(author_id, timeout=30, session=None):
    """Return the INSPIRE record IDs of every paper by ``author_id``.

    The literature search is paginated: each response carries at most
    ``size`` hits plus a ``links.next`` URL when more are available. All
    pages are followed, so authors with more records than fit on one page
    are not truncated.

    Parameters
    ----------
    author_id : str
        Author identifier, interpolated into the query as ``a <author_id>``.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.
    session : object, optional
        Anything exposing a ``requests``-compatible ``get(url, params=...,
        timeout=...)``, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module itself.

    Returns
    -------
    list
        The ``control_number`` of each hit, in the order the API returned
        them. Hits without a control number are skipped.

    Raises
    ------
    requests.HTTPError
        If any page comes back with an error status.
    """
    http = requests if session is None else session
    url = "https://inspirehep.net/api/literature"
    params = {
        "q": f"a {author_id}",
        "size": 1000,
        "fields": "control_number"
    }

    recids = []
    while url:
        response = http.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()

        for hit in data.get("hits", {}).get("hits", []):
            recid = hit.get("metadata", {}).get("control_number")
            if recid:
                recids.append(recid)

        # The "next" link already embeds the query, so send no params with it.
        url = data.get("links", {}).get("next")
        params = None

    return recids

In [None]:
# --- fetch paper IDs ---
# Look up the record IDs of all papers by `author_id`; the main loop further
# down fetches the citing records for each of these IDs.
recids = fetch_author_recids(author_id)
print(f"Found {len(recids)} papers. Getting citations...")

In [None]:
# --- GET CITING DATES FOR A SINGLE PAPER ---
def _parse_earliest_date(value):
    """Convert an ``earliest_date`` string into a ``datetime``.

    Three precisions are recognised:
      * ``YYYY-MM-DD`` is used as is,
      * ``YYYY-MM`` is placed on the 1st of that month,
      * ``YYYY`` is placed on July 1st (mid-year).

    Returns ``None`` when the value is missing or has any other shape.
    Raises ``ValueError`` if the digits do not form a real calendar date.
    """
    if not isinstance(value, str):
        return None
    if re.fullmatch(r"\d{4}-\d{2}-\d{2}", value):
        padded = value
    elif re.fullmatch(r"\d{4}-\d{2}", value):
        padded = f"{value}-01"
    elif re.fullmatch(r"\d{4}", value):
        padded = f"{value}-07-01"
    else:
        return None
    return datetime.strptime(padded, "%Y-%m-%d")


def get_citing_dates(recid, timeout=30, session=None):
    """Return the sorted dates of all records that cite paper ``recid``.

    Runs the INSPIRE literature search ``refersto:recid:<recid>`` and follows
    the ``links.next`` pagination URL until no further page is offered. Each
    hit's ``metadata.earliest_date`` is read from the parsed JSON (not from
    the raw response text, so the result does not depend on how the server
    formats its JSON) and converted with ``_parse_earliest_date``.

    Parameters
    ----------
    recid : int or str
        Record ID of the cited paper.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.
    session : object, optional
        Anything exposing a ``requests``-compatible ``get(url, params=...,
        timeout=...)``, e.g. a ``requests.Session`` for connection reuse.
        Defaults to the ``requests`` module itself.

    Returns
    -------
    list of datetime
        One entry per citing record that has a recognisable date, ascending.

    Raises
    ------
    requests.HTTPError
        If any page comes back with an error status.
    ValueError
        If a date has a recognised shape but is not a valid calendar date.
    """
    http = requests if session is None else session
    all_dates = []
    url = "https://inspirehep.net/api/literature"
    params = {
        "q": f"refersto:recid:{recid}",
        "size": 250,
        "sort": "mostrecent",
        # Only the date is needed; asking for it alone keeps responses small.
        "fields": "earliest_date",
    }

    while url:
        resp = http.get(url, params=params, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()

        for hit in data.get("hits", {}).get("hits", []):
            parsed = _parse_earliest_date(
                hit.get("metadata", {}).get("earliest_date")
            )
            if parsed is not None:
                all_dates.append(parsed)

        # follow pagination
        url = data.get("links", {}).get("next")
        params = {}  # next links already include all query parameters

    return sorted(all_dates)

In [None]:
get_citing_dates(836695)

In [None]:
print(len(_))

In [None]:
# --- ACCUMULATE AND PLOT CITATIONS ---
def build_citation_df(dates):
    """Tally citations per date and attach a running total.

    Parameters
    ----------
    dates : sequence of datetime
        One entry per citation; repeated dates are allowed.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` in ascending order, with ``count`` (citations on
        that date) and ``cumulative`` (running sum of ``count``).
    """
    events = pd.DataFrame(dates, columns=["date"]).assign(count=1)
    per_date = events.groupby("date").sum().sort_index()
    return per_date.assign(cumulative=per_date["count"].cumsum())

def plot_citations(df):
    """Plot the cumulative citation curve and return the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the dates and which has a ``cumulative``
        column, as produced by ``build_citation_df``.

    Returns
    -------
    matplotlib.figure.Figure
        A new 10x6 inch figure; the caller may display or save it.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(df.index, df["cumulative"], marker="o")
    ax.set_title("Total Cumulative Citations Over Time")
    ax.set_xlabel("Year")
    ax.set_ylabel("Citations")
    ax.grid(True)

    # One labelled tick every second year, shown as a bare four-digit year.
    # `mdates` comes from the notebook's top-level import cell.
    ax.xaxis.set_major_locator(mdates.YearLocator(2))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")

    # Lay out this figure explicitly rather than pyplot's "current" figure.
    fig.tight_layout()
    return fig

In [None]:
# --- MAIN LOGIC ---

# Pool the citing dates of every paper into one flat list, one API
# round-trip sequence per paper.
all_dates = []
for i, recid in enumerate(recids, start=1):
    print(f"[{i}/{len(recids)}] Processing recid {recid}")
    dates = get_citing_dates(recid)
    all_dates += dates
    sleep(0.5)  # short pause between papers to go easy on the API

# Nothing to aggregate or plot when no paper has any dated citation.
if not all_dates:
    print("No citation dates found for any paper.")
else:
    df = build_citation_df(all_dates)
    plot_citations(df)

In [None]:
# Draw the cumulative-citation figure again, this time keeping the returned
# Figure in `fig` so that it can be saved afterwards.
# Requires `df` from the main-logic cell, which only defines it when at least
# one citation date was found.
# NOTE(review): `display` is imported here but not used in this cell.
from IPython.display import display
fig = plot_citations(df)

In [None]:
# Save to PDF: write `fig` to "cumulative_citations.pdf" (a relative path, so
# it lands in the kernel's working directory) with a tight bounding box.
fig.savefig("cumulative_citations.pdf", format="pdf", bbox_inches="tight")