# In [None]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Input CSV files.
PATH_MASTER = "dataset_14_08_2025_completion.csv"
PATH_ORIG = "mentions_original_fill_cleaned.csv"
PATH_RETR = "mentions_retraction_fill_cleaned.csv"

# Output CSV files: the two mention tables with a "label" column added.
OUT_ORIG = "altmetric_mentions_original_labeled.csv"
OUT_RETR = "altmetric_mentions_retraction_labeled.csv"

# Placeholder tokens treated as "no value"; compared case-insensitively.
_MISSING_TOKENS = frozenset({"", "0", "nan", "none", "null"})


def norm_str(x):
    """Return *x* as a stripped string, or NaN when it carries no value.

    Null inputs (as judged by ``pd.isna``) and placeholder tokens -- the
    empty string, ``"0"``, ``"nan"``, ``"none"`` and ``"null"`` in any
    letter case -- are mapped to ``np.nan``.  Any other value is converted
    with ``str()`` and returned with surrounding whitespace removed.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).strip()
    # casefold() so upper-case spellings such as "NULL" or "NONE" are
    # recognised as placeholders as well.
    return np.nan if text.casefold() in _MISSING_TOKENS else text

# Prefixes that may precede the bare DOI; matched after lower-casing.
_DOI_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
    "doi:",
)


def norm_doi(x):
    """Normalise a DOI so equal DOIs written differently compare equal.

    The value is cleaned with ``norm_str``, lower-cased, and stripped of one
    leading ``doi:`` label or ``doi.org`` / ``dx.doi.org`` resolver URL
    prefix.  Returns ``np.nan`` when the input is missing or nothing remains
    after the prefix is removed.
    """
    x = norm_str(x)
    if pd.isna(x):
        return np.nan
    x = x.lower()
    for prefix in _DOI_PREFIXES:
        if x.startswith(prefix):
            x = x[len(prefix):].strip()
            break
    # A bare "doi:" (or resolver URL with no DOI) leaves an empty string,
    # which must count as missing rather than as a joinable key.
    return x if x else np.nan

def canonical_url(x):
    """Clean a URL with ``norm_str`` and drop its ``#fragment`` part.

    Returns ``np.nan`` when the cleaned value is missing; otherwise the
    stripped URL up to (not including) the first ``#``.
    """
    cleaned = norm_str(x)
    if pd.isna(cleaned):
        return np.nan
    # partition() yields the whole string as the head when no "#" is present.
    head, _, _ = cleaned.strip().partition("#")
    return head

# Load the master table as text (no automatic NaN inference), then parse the
# two date columns; unparseable dates become NaT, everything is UTC-aware.
master = pd.read_csv(PATH_MASTER, dtype=str, keep_default_na=False)
for c in ["OriginalPaperDate", "RetractionDate"]:
    master[c] = pd.to_datetime(master[c], errors="coerce", utc=True)

# Index the DOI columns directly: DataFrame.get() would return None for an
# absent column and fail later with an opaque AttributeError on .map, whereas
# direct indexing raises a KeyError that names the missing column.
master["orig_doi_norm"] = master["OriginalPaperDOI"].map(norm_doi)
master["retr_doi_norm"] = master["RetractionDOI"].map(norm_doi)

# Lookup tables: normalised DOI -> earliest RetractionDate recorded for it.
# Rows lacking either the DOI or the date are left out.
map_origdoi_to_retrdate = (
    master.dropna(subset=["orig_doi_norm", "RetractionDate"])
          .groupby("orig_doi_norm")["RetractionDate"].min()
)
map_retrdoi_to_retrdate = (
    master.dropna(subset=["retr_doi_norm", "RetractionDate"])
          .groupby("retr_doi_norm")["RetractionDate"].min()
)

# Mention tables, also loaded as raw text.
orig = pd.read_csv(PATH_ORIG, dtype=str, keep_default_na=False)
retr = pd.read_csv(PATH_RETR, dtype=str, keep_default_na=False)

# Column lists of the input files as loaded.
orig_cols = orig.columns.tolist()
retr_cols = retr.columns.tolist()

def prep(df, side="orig"):
    """Return a working copy of a mentions table with derived columns.

    Added / converted columns:
      * ``Mention Date``           -- parsed to UTC datetimes (NaT on failure)
      * ``DOI_norm``               -- ``norm_doi`` applied to ``DOI``
      * ``URL_norm``               -- ``canonical_url`` applied to ``Mention URL``
      * ``RetractionDate_mapped``  -- retraction date looked up by ``DOI_norm``
      * ``delay_days``             -- whole days from the mapped retraction
                                      date to the mention date (negative when
                                      the mention comes first)

    Parameters
    ----------
    df : DataFrame with a ``Mention Date`` column; ``DOI`` and ``Mention URL``
        are optional.
    side : ``"orig"`` looks dates up in ``map_origdoi_to_retrdate``; any other
        value uses ``map_retrdoi_to_retrdate``.

    The input frame is not modified.
    """
    dfw = df.copy()
    dfw["Mention Date"] = pd.to_datetime(dfw["Mention Date"], errors="coerce", utc=True)

    # DataFrame.get(col, "") returns the plain string "" when the column is
    # absent, and str has no .map -- so fall back to an all-missing Series.
    missing = pd.Series(np.nan, index=dfw.index, dtype="object")
    dfw["DOI_norm"] = dfw.get("DOI", missing).map(norm_doi)
    dfw["URL_norm"] = dfw.get("Mention URL", missing).map(canonical_url)

    retr_dates = map_origdoi_to_retrdate if side == "orig" else map_retrdoi_to_retrdate
    dfw["RetractionDate_mapped"] = dfw["DOI_norm"].map(retr_dates)

    # NaT on either side yields a missing delay.
    dfw["delay_days"] = (dfw["Mention Date"] - dfw["RetractionDate_mapped"]).dt.days
    return dfw

# Working copies with normalised keys and delay_days.
orig_w, retr_w = prep(orig, "orig"), prep(retr, "retr")

# Normalised mention URLs seen on each side, restricted to rows whose
# delay_days could be computed.
urls_orig_valid, urls_retr_valid = (
    set(frame.loc[frame["delay_days"].notna(), "URL_norm"].dropna())
    for frame in (orig_w, retr_w)
)

# Keywords whose presence in a title marks a mention as correction-related.
kw_list = [
    # retraction / withdrawal
    "retraction",
    "retracted",
    "withdrawn",
    # corrections
    "correction",
    "corrected",
    "erratum",
    # integrity problems
    "fraud",
    "scandal",
    "misconduct",
    "fake",
    "error",
    # verification / clarification
    "verify",
    "verified",
    "confirm",
    "confirmed",
    "clarify",
    "clarified",
]

# Case-insensitive alternation of all keywords.
# NOTE(review): there are no word boundaries, so keywords also match inside
# longer words (e.g. "error" in "terror") -- confirm this is intended.
KW_PATTERN = re.compile("|".join(kw_list), re.IGNORECASE)

def has_correction_keywords(df, pattern=None):
    """Flag rows whose titles contain a correction-related keyword.

    The ``Mention Title`` and ``Research Output Title`` columns are converted
    to strings, joined with a single space and searched with *pattern*.

    Parameters
    ----------
    df : DataFrame; either title column may be absent, in which case it is
        treated as empty text.
    pattern : regular expression (string or compiled) to search for.
        Defaults to the module-level ``KW_PATTERN``.

    Returns
    -------
    Boolean Series indexed like *df*: True where the pattern is found.
    """
    if pattern is None:
        pattern = KW_PATTERN

    # DataFrame.get(col, "") returns the plain string "" for an absent column,
    # and str has no .astype -- use an empty-text Series as the fallback.
    blank = pd.Series("", index=df.index, dtype="object")
    mtitle = df.get("Mention Title", blank)
    rtitle = df.get("Research Output Title", blank)

    combined = mtitle.astype(str) + " " + rtitle.astype(str)
    return combined.str.contains(pattern, na=False)


# --- Label mentions of the original papers ---------------------------------
# Every row starts as UNCLASSIFIED; rows without a computable delay stay so.
orig_label = pd.Series("UNCLASSIFIED", index=orig_w.index, dtype="object")

# Timing of the mention relative to the mapped retraction date.
mask_o_valid = orig_w["delay_days"].notna()
mask_o_before = mask_o_valid & orig_w["delay_days"].lt(0)
mask_o_after = mask_o_valid & orig_w["delay_days"].ge(0)

# On/after retraction: does the same URL also appear among the retraction
# mentions (co-mention), or only on the original side (exclusive)?
o_url_in_retr = orig_w["URL_norm"].isin(urls_retr_valid)
mask_o_after_comention = mask_o_after & o_url_in_retr
mask_o_after_exclusive = mask_o_after & ~o_url_in_retr

# Split the exclusive rows by whether their titles carry a correction keyword.
o_exclusive_rows = orig.loc[mask_o_after_exclusive, :]
mask_o_kw = has_correction_keywords(o_exclusive_rows)
mask_o_after_excl_true_idx = o_exclusive_rows.index[mask_o_kw]
mask_o_after_excl_false_idx = o_exclusive_rows.index[~mask_o_kw]

orig_label.loc[mask_o_before] = "O_BEFORE"
orig_label.loc[mask_o_after_comention] = "O_AFTER_COMENTION"
orig_label.loc[mask_o_after_excl_true_idx] = "O_AFTER_EXCL_NORM"
orig_label.loc[mask_o_after_excl_false_idx] = "O_AFTER_EXCL_ABNORM"

# Attach the labels to a copy of the raw input so the source frame is untouched.
orig_labeled = orig.assign(label=orig_label)

# --- Label mentions of the retraction notices ------------------------------
# Every row starts as UNCLASSIFIED; rows without a computable delay stay so.
retr_label = pd.Series("UNCLASSIFIED", index=retr_w.index, dtype="object")

# Timing of the mention relative to the mapped retraction date.
mask_r_valid = retr_w["delay_days"].notna()
mask_r_before = mask_r_valid & retr_w["delay_days"].lt(0)
mask_r_after = mask_r_valid & retr_w["delay_days"].ge(0)

# Before retraction: does the same URL also appear among the original-paper
# mentions (co-mention), or only on the retraction side (exclusive)?
r_url_in_orig = retr_w["URL_norm"].isin(urls_orig_valid)
mask_r_before_comention = mask_r_before & r_url_in_orig
mask_r_before_exclusive = mask_r_before & ~r_url_in_orig

# Split the exclusive rows by whether their titles carry a correction keyword.
r_exclusive_rows = retr.loc[mask_r_before_exclusive, :]
mask_r_kw = has_correction_keywords(r_exclusive_rows)
r_excl_true_idx = r_exclusive_rows.index[mask_r_kw]
r_excl_false_idx = r_exclusive_rows.index[~mask_r_kw]

retr_label.loc[mask_r_after] = "R_AFTER"
retr_label.loc[mask_r_before_comention] = "R_BEFORE_COMENTION"
retr_label.loc[r_excl_true_idx] = "R_BEFORE_EXCL_NORM"
retr_label.loc[r_excl_false_idx] = "R_BEFORE_EXCL_ABNORM"

# Attach the labels to a copy of the raw input so the source frame is untouched.
retr_labeled = retr.assign(label=retr_label)

# Write both labeled tables (without the index column), then report the
# output paths.
for labeled_frame, out_path in ((orig_labeled, OUT_ORIG), (retr_labeled, OUT_RETR)):
    labeled_frame.to_csv(out_path, index=False)

print("Saved:")
for out_path in (OUT_ORIG, OUT_RETR):
    print(" -", out_path)

def label_stats(df, label_col="label", unclassified="UNCLASSIFIED"):
    """Summarise how often each label occurs, overall and among classified rows.

    Parameters
    ----------
    df : DataFrame containing *label_col*.
    label_col : name of the column holding the labels.
    unclassified : label value that marks a row as not classified.

    Returns
    -------
    DataFrame sorted by ``label`` with the columns
    ``label``, ``count``, ``share_of_total_%``, ``count_classified`` and
    ``share_of_classified_%``.  For the *unclassified* label the last two
    columns are 0; when no row is classified every classified share is 0.0.
    """
    total = len(df)
    by_label = (
        df[label_col]
        .value_counts(dropna=False)
        .rename_axis("label")
        .reset_index(name="count")
    )
    by_label["share_of_total_%"] = by_label["count"] / total * 100

    # Derive the classified counts from the same tally, so the column is
    # always an integer (0 for the unclassified label, never NaN) and the
    # per-label counts always add up to the classified total.
    is_classified = by_label["label"] != unclassified
    by_label["count_classified"] = by_label["count"].where(is_classified, 0)

    total_class = int(by_label["count_classified"].sum())
    if total_class > 0:
        by_label["share_of_classified_%"] = (
            by_label["count_classified"] / total_class * 100
        )
    else:
        # Nothing classified: avoid dividing by zero.
        by_label["share_of_classified_%"] = 0.0
    return by_label.sort_values("label")

# Per-label summary tables for both outputs, printed without the index.
stats_orig = label_stats(orig_labeled)
stats_retr = label_stats(retr_labeled)

for heading, table in (
    ("\n=== Original labels ===", stats_orig),
    ("\n=== Retraction labels ===", stats_retr),
):
    print(heading)
    print(table.to_string(index=False))