# In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path


# Path of the first input CSV; loaded into `o2` and labelled "Original" in the plots.
ORIG = "mentions_original_fill_cleaned.csv"
# Path of the second input CSV; loaded into `r2` and labelled "Retraction" in the plots.
RETR = "mentions_retraction_fill_cleaned.csv"
# Directory that receives every CSV and PNG written by this script.
OUT  = Path("overall_time_analysis_full")
# Create the output directory (and any missing parents); an existing one is reused.
OUT.mkdir(parents=True, exist_ok=True)


def load_df(path):
    """Read a mentions CSV and attach a ``delay_days`` column.

    Every column is read as a string (no automatic NA detection) and header
    names are stripped of surrounding whitespace.  ``"Mention Date"`` and
    ``"Publication Date"`` are then parsed as UTC timestamps; values that
    cannot be parsed become ``NaT``.

    Rows without a valid ``"Mention Date"`` are dropped.  ``delay_days`` is
    the whole-day component of ``Mention Date - Publication Date`` and is
    missing wherever ``"Publication Date"`` could not be parsed.

    Returns a new DataFrame that keeps the original row labels of the
    surviving rows.
    """
    frame = pd.read_csv(path, dtype=str, keep_default_na=False, na_values=[])
    frame = frame.rename(columns=str.strip)

    for date_col in ("Mention Date", "Publication Date"):
        frame[date_col] = pd.to_datetime(frame[date_col], errors="coerce", utc=True)

    # Keep only rows whose mention timestamp parsed successfully.
    kept = frame.loc[frame["Mention Date"].notna()].copy()

    elapsed = kept["Mention Date"] - kept["Publication Date"]
    kept["delay_days"] = elapsed.dt.days
    return kept

# Load both mention tables (first ORIG, then RETR).
o2, r2 = load_df(ORIG), load_df(RETR)


# Integer arrays of the known delays; rows with a missing delay are skipped.
o_delays, r_delays = (
    frame["delay_days"].dropna().astype(int).to_numpy()
    for frame in (o2, r2)
)


def delay_stats(arr):
    """Summarise an array of delays (in days).

    Returns an empty dict for an empty array.  Otherwise returns a dict with
    the element count, min/max, 1st/50th/99th percentiles, and the fraction
    of values that are negative, within [0, 30], within [0, 365] and within
    [-30, 30].  Key order is fixed because it determines the column order of
    the CSV written from this dict.
    """
    if arr.size == 0:
        return {}

    def share(mask):
        # Fraction of elements for which the boolean mask is True.
        return float(mask.mean())

    non_negative = arr >= 0

    summary = {"count": int(arr.size), "min": int(arr.min())}
    summary["p01"] = float(np.percentile(arr, 1))
    summary["median"] = float(np.median(arr))
    summary["p99"] = float(np.percentile(arr, 99))
    summary["max"] = int(arr.max())
    summary["neg_share"] = share(arr < 0)
    summary["share_0_30d"] = share(non_negative & (arr <= 30))
    summary["share_0_365d"] = share(non_negative & (arr <= 365))
    summary["share_-30_+30d"] = share((arr >= -30) & (arr <= 30))
    return summary

# Summary statistics for each delay array, each written as a one-row CSV.
stats_o, stats_r = delay_stats(o_delays), delay_stats(r_delays)

for summary, csv_name in (
    (stats_o, "original_delay_stats.csv"),
    (stats_r, "retraction_delay_stats.csv"),
):
    pd.DataFrame([summary]).to_csv(OUT / csv_name, index=False)


# Full-range overlay histogram of both delay arrays.
#
# Both series are binned over one shared range.  Binning each series over its
# own min/max gives the two histograms different bin widths, so their per-bin
# counts cannot be compared bar against bar.
all_delays = np.concatenate([o_delays, r_delays])
# With no data at all, fall back to matplotlib's default range.
full_range = (all_delays.min(), all_delays.max()) if all_delays.size else None

plt.figure(figsize=(10,6))
plt.hist(o_delays, bins=120, range=full_range, alpha=0.5, label="Original")
plt.hist(r_delays, bins=120, range=full_range, alpha=0.5, label="Retraction")
plt.xlabel("Delay (days)")
plt.ylabel("Mentions count")
plt.title("Delay Distribution (Original vs Retraction, full range)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig(OUT/"delay_hist_full_overlay.png")
plt.close()


# Zoom window: from the smaller of the two 1st percentiles to the larger of
# the two 99th percentiles.  Only defined when both arrays have data; later
# sections test `lo is not None` before using the window.
if o_delays.size and r_delays.size:
    lo = min(np.percentile(o_delays,1), np.percentile(r_delays,1))
    hi = max(np.percentile(o_delays,99), np.percentile(r_delays,99))
else:
    lo, hi = None, None

if lo is not None:
    o_clip = o_delays[(o_delays>=lo)&(o_delays<=hi)]
    r_clip = r_delays[(r_delays>=lo)&(r_delays<=hi)]

    # Bin both series over the same [lo, hi] window so the overlaid bars
    # share identical edges; per-series binning gives mismatched bin widths.
    zoom_range = (lo, hi)

    plt.figure(figsize=(10,6))
    plt.hist(o_clip, bins=100, range=zoom_range, alpha=0.5, label="Original")
    plt.hist(r_clip, bins=100, range=zoom_range, alpha=0.5, label="Retraction")
    plt.xlabel("Delay (days)")
    plt.ylabel("Mentions count")
    plt.title(f"Delay Distribution (zoomed: {int(lo)} to {int(hi)} days)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(OUT/"delay_hist_zoom_overlay.png")
    plt.close()


def cumulative_curve(arr):
    """Tabulate how many values fall on each delay and their running total.

    Returns a DataFrame with columns ``delay_days`` (the distinct values of
    *arr* in ascending order), ``count`` (occurrences of each value) and
    ``cum_mentions`` (cumulative sum of ``count``).  An empty *arr* yields an
    empty frame with the same three columns.
    """
    if arr.size == 0:
        return pd.DataFrame(columns=["delay_days","count","cum_mentions"])

    per_delay = pd.Series(arr).value_counts(sort=False).sort_index()
    table = per_delay.rename_axis("delay_days").reset_index(name="count")
    return table.assign(cum_mentions=table["count"].cumsum())

# Cumulative mention counts by delay for each set, exported as CSV.
o_cum, r_cum = cumulative_curve(o_delays), cumulative_curve(r_delays)
o_cum.to_csv(OUT / "original_cumulative_by_delay.csv", index=False)
r_cum.to_csv(OUT / "retraction_cumulative_by_delay.csv", index=False)

# Full-range cumulative plot; a set without data is simply not drawn.
fig, ax = plt.subplots(figsize=(10, 6))
for curve, series_label in ((o_cum, "Original"), (r_cum, "Retraction")):
    if curve.empty:
        continue
    ax.plot(curve["delay_days"], curve["cum_mentions"], label=series_label)
ax.set_xlabel("Delay (days)")
ax.set_ylabel("Cumulative mentions")
ax.set_title("Cumulative Mentions by Delay (full range)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(OUT / "cumulative_by_delay_full.png")
plt.close(fig)


# Same cumulative curves restricted to the [lo, hi] delay window; skipped
# entirely when no window was computed.
if lo is not None:
    o_in_window = o_cum["delay_days"].between(lo, hi)
    r_in_window = r_cum["delay_days"].between(lo, hi)
    o_cz = o_cum[o_in_window]
    r_cz = r_cum[r_in_window]

    fig, ax = plt.subplots(figsize=(10, 6))
    for curve, series_label in ((o_cz, "Original"), (r_cz, "Retraction")):
        if curve.empty:
            continue
        ax.plot(curve["delay_days"], curve["cum_mentions"], label=series_label)
    ax.set_xlabel("Delay (days)")
    ax.set_ylabel("Cumulative mentions")
    ax.set_title("Cumulative Mentions by Delay (zoomed to 1–99th pct)")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    fig.savefig(OUT / "cumulative_by_delay_zoom.png")
    plt.close(fig)

def monthly_counts(df):
    """Count rows of *df* per calendar month of ``"Mention Date"``.

    Parameters
    ----------
    df : DataFrame with a datetime-typed ``"Mention Date"`` column, either
        tz-aware or tz-naive.  The frame is not modified.

    Returns
    -------
    DataFrame with columns ``month`` (tz-naive timestamp of the first day of
    the month) and ``mentions`` (number of rows in that month), sorted by
    month.  Rows with a missing mention date are not counted.
    """
    # Drop the timezone explicitly (keeping wall-clock time) before the
    # period conversion: calling to_period on a tz-aware column makes pandas
    # emit a "will drop timezone information" UserWarning on every call.
    # tz_localize(None) is a no-op for a column that is already tz-naive.
    naive_dates = df["Mention Date"].dt.tz_localize(None)
    months = naive_dates.dt.to_period("M").dt.to_timestamp()

    # Group a one-column frame rather than a full copy of df.
    tagged = pd.DataFrame({"month": months})
    return tagged.groupby("month").size().reset_index(name="mentions")

# Mentions per calendar month for each set, exported as CSV.
o_month, r_month = monthly_counts(o2), monthly_counts(r2)
o_month.to_csv(OUT / "original_monthly_mentions.csv", index=False)
r_month.to_csv(OUT / "retraction_monthly_mentions.csv", index=False)

# Monthly trend lines for both sets on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))
for monthly, series_label in ((o_month, "Original"), (r_month, "Retraction")):
    ax.plot(monthly["month"], monthly["mentions"], marker="o", label=series_label)
ax.set_xlabel("Month")
ax.set_ylabel("Mentions")
ax.set_title("Monthly Mentions Trend (Original vs Retraction)")
ax.legend()
ax.grid(True)
fig.tight_layout()
fig.savefig(OUT / "monthly_trend_original_vs_retraction.png")
plt.close(fig)

# Grouped bar chart comparing the key delay shares of the two mention sets.
metrics = ["0-30 days","0-365 days","±30 days","Negative delays"]
# Keys of the delay_stats() result, in the same order as `metrics`.
share_keys = ["share_0_30d", "share_0_365d", "share_-30_+30d", "neg_share"]

# delay_stats() returns an empty dict when a set has no delays; direct
# indexing would then raise KeyError here even though the rest of the script
# tolerates empty data.  A missing share becomes NaN, which draws no bar.
orig_vals = [stats_o.get(key, np.nan) for key in share_keys]
retr_vals = [stats_r.get(key, np.nan) for key in share_keys]

x = np.arange(len(metrics))
w = 0.35  # width of one bar; the two bars of a group sit side by side
plt.figure(figsize=(10,6))
plt.bar(x-w/2, orig_vals, width=w, label="Original")
plt.bar(x+w/2, retr_vals, width=w, label="Retraction")
plt.xticks(x, metrics)
plt.ylabel("Proportion")
plt.title("Key Delay Metrics Comparison")
plt.legend()
plt.grid(True, axis="y")
plt.tight_layout()
plt.savefig(OUT/"key_delay_metrics_bar.png")
plt.close()

print("All figures saved to:", OUT)

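# Captured output of the cell above:
#   All figures saved to: overall_time_analysis_full
#
# Source line echoed twice in the captured output (presumably by a warning
# raised once per monthly_counts call):
#   d["month"] = d["Mention Date"].dt.to_period("M").dt.to_timestamp()
#   d["month"] = d["Mention Date"].dt.to_period("M").dt.to_timestamp()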
