In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Input CSV files, given as paths relative to the notebook's working directory.
# PATH_MASTER is read into `master` (paper/retraction DOIs and dates);
# PATH_ORIG and PATH_RETR are read into the two mention tables.
PATH_MASTER = "dataset_14_08_2025_completion.csv"
PATH_ORIG   = "mentions_original_fill_cleaned.csv"
PATH_RETR   = "mentions_retraction_fill_cleaned.csv"
# Directory that receives the classified CSV subsets written by this cell.
OUT = Path("timeline_classified_outputs")
# exist_ok=True makes re-running the cell safe when the directory is already there.
OUT.mkdir(exist_ok=True)

def norm_str(x):
    """Return ``x`` as a stripped string, or ``np.nan`` for missing-value placeholders.

    Parameters
    ----------
    x : object
        A scalar cell value (typically a string read from CSV).

    Returns
    -------
    str or float
        The whitespace-stripped text of ``x``; ``np.nan`` when ``x`` is already
        missing or its stripped text is one of the placeholders
        ``""``, ``"0"``, ``"nan"``, ``"none"``, ``"null"`` in any letter case.
    """
    if pd.isna(x):
        return np.nan
    text = str(x).strip()
    # Compare case-insensitively so spellings such as "NULL", "NONE" or "NAN"
    # are treated the same as "null", "None" and "NaN".
    if text.lower() in {"", "0", "nan", "none", "null"}:
        return np.nan
    return text

def norm_doi(x):
    """Normalise a DOI cell to a bare, lower-case DOI string.

    The value is cleaned with ``norm_str``, lower-cased, and any leading
    ``doi:`` label or doi.org resolver prefix (``https://doi.org/``,
    ``http://dx.doi.org/``, ``doi.org/`` ...) is removed, so that the same DOI
    written in different forms produces the same key.

    Parameters
    ----------
    x : object
        A scalar cell value holding a DOI in any of the forms above.

    Returns
    -------
    str or float
        The normalised DOI, or ``np.nan`` when the cell is missing or nothing
        is left once the prefix has been removed.
    """
    x = norm_str(x)
    if pd.isna(x):
        return np.nan
    x = x.lower()
    # Checked in order: the "doi:" label first, so "doi:https://doi.org/..." is
    # fully unwrapped; scheme-qualified hosts before the bare host names.
    for prefix in (
        "doi:",
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
        "dx.doi.org/",
        "doi.org/",
    ):
        if x.startswith(prefix):
            x = x[len(prefix):].strip()
    # A cell that held only a prefix must not become the key "".
    return x if x else np.nan

def canonical_url(x):
    """Clean a URL cell and cut off its ``#fragment`` part.

    The value is first passed through ``norm_str``; a missing result is
    returned as ``np.nan``. Otherwise everything from the first ``#`` onwards
    is discarded and the remaining text is returned.
    """
    url = norm_str(x)
    if pd.isna(url):
        return np.nan
    # partition() yields the text before the first "#", or the whole string
    # when there is no "#" at all.
    before_fragment, _, _ = url.strip().partition("#")
    return before_fragment

# Master table: every column is read as text and empty cells stay "" (no NaN inference).
master = pd.read_csv(PATH_MASTER, dtype=str, keep_default_na=False)

# Parse both date columns as UTC-aware timestamps; unparseable values become NaT.
master["OriginalPaperDate"] = pd.to_datetime(
    master["OriginalPaperDate"], errors="coerce", utc=True
)
master["RetractionDate"] = pd.to_datetime(
    master["RetractionDate"], errors="coerce", utc=True
)

# Normalised DOI keys used to join the mention tables against the master table.
master["orig_doi_norm"] = master.get("OriginalPaperDOI").map(norm_doi)
master["retr_doi_norm"] = master.get("RetractionDOI").map(norm_doi)


def _earliest_retraction_by(doi_column):
    """Series indexed by ``doi_column`` holding the minimum RetractionDate per DOI.

    Rows lacking either the DOI key or the retraction date are dropped first.
    """
    usable_rows = master.dropna(subset=[doi_column, "RetractionDate"])
    return usable_rows.groupby(doi_column)["RetractionDate"].min()


map_origdoi_to_retrdate = _earliest_retraction_by("orig_doi_norm")
map_retrdoi_to_retrdate = _earliest_retraction_by("retr_doi_norm")

# Both mention tables are read as plain text, keeping empty cells as "".
_text_csv_options = dict(dtype=str, keep_default_na=False)
orig_full = pd.read_csv(PATH_ORIG, **_text_csv_options)
retr_full = pd.read_csv(PATH_RETR, **_text_csv_options)

# Remember the columns as they were in the files, so exports can exclude helper columns.
orig_cols = list(orig_full.columns)
retr_cols = list(retr_full.columns)

# Work on copies; the *_full frames keep the raw text values.
orig = orig_full.copy()
retr = retr_full.copy()


def _normalise_mentions_inplace(mentions):
    """Parse the mention date (UTC, bad values -> NaT) and normalise DOI and URL columns."""
    mentions["Mention Date"] = pd.to_datetime(
        mentions["Mention Date"], errors="coerce", utc=True
    )
    mentions["DOI"] = mentions["DOI"].map(norm_doi)
    mentions["Mention URL"] = mentions["Mention URL"].map(canonical_url)


_normalise_mentions_inplace(orig)
_normalise_mentions_inplace(retr)

# Look up the retraction date for each mention through its normalised DOI:
# original-paper mentions use the original-DOI map, retraction mentions the retraction-DOI map.
orig["RetractionDate_mapped"] = orig["DOI"].map(map_origdoi_to_retrdate)
retr["RetractionDate_mapped"] = retr["DOI"].map(map_retrdoi_to_retrdate)

# Days component of (mention date - retraction date); negative means the
# mention predates the retraction.
orig["delay_days"] = orig["Mention Date"].sub(orig["RetractionDate_mapped"]).dt.days
retr["delay_days"] = retr["Mention Date"].sub(retr["RetractionDate_mapped"]).dt.days

# Keep only the rows for which a delay could be computed.
orig_valid = orig.dropna(subset=["delay_days"]).copy()
retr_valid = retr.dropna(subset=["delay_days"]).copy()

# Distinct non-missing mention URLs in each valid table, used for co-mention checks.
urls_orig_all = set(orig_valid["Mention URL"].dropna().tolist())
urls_retr_all = set(retr_valid["Mention URL"].dropna().tolist())

# --- Mentions of the original paper -----------------------------------------
# Split by timing relative to the mapped retraction date (day 0 counts as "after").
mask_o_after = orig_valid["delay_days"].ge(0)
mask_o_before = orig_valid["delay_days"].lt(0)

# A mention is a "co-mention" when its URL also occurs among the retraction mentions.
_orig_url_also_in_retr = orig_valid["Mention URL"].isin(urls_retr_all)
mask_o_after_comention = mask_o_after & _orig_url_also_in_retr
mask_o_after_exclusive = mask_o_after & ~_orig_url_also_in_retr

# --- Mentions of the retraction notice ----------------------------------------
mask_r_after = retr_valid["delay_days"].ge(0)
mask_r_before = retr_valid["delay_days"].lt(0)

# Same co-mention test in the other direction: URL also occurs among original-paper mentions.
_retr_url_also_in_orig = retr_valid["Mention URL"].isin(urls_orig_all)
mask_r_before_comention = mask_r_before & _retr_url_also_in_orig
mask_r_before_exclusive = mask_r_before & ~_retr_url_also_in_orig

# Materialise the six timeline subsets from the masks.
original_before = orig_valid.loc[mask_o_before]
original_after_exclusive = orig_valid.loc[mask_o_after_exclusive]
original_after_comention = orig_valid.loc[mask_o_after_comention]
retraction_before_exclusive = retr_valid.loc[mask_r_before_exclusive]
retraction_before_comention = retr_valid.loc[mask_r_before_comention]
retraction_after = retr_valid.loc[mask_r_after]

# (name, frame, columns to export); each subset is written to OUT/<name>.csv
# restricted to the columns of its source file, so helper columns are left out.
_subset_exports = [
    ("original_before", original_before, orig_cols),
    ("original_after_exclusive", original_after_exclusive, orig_cols),
    ("original_after_comention", original_after_comention, orig_cols),
    ("retraction_before_exclusive", retraction_before_exclusive, retr_cols),
    ("retraction_before_comention", retraction_before_comention, retr_cols),
    ("retraction_after", retraction_after, retr_cols),
]
for _subset_name, _subset_frame, _export_cols in _subset_exports:
    _subset_frame[_export_cols].to_csv(OUT / (_subset_name + ".csv"), index=False)

print("Saved to:", OUT.resolve())
# Shapes are those of the in-memory subsets (helper columns included).
print({_entry[0]: _entry[1].shape for _entry in _subset_exports})


Saved to: /Users/miaoyixuan/CS58/timeline_classified_outputs
{'original_before': (14532, 46), 'original_after_exclusive': (6378, 46), 'original_after_comention': (8078, 46), 'retraction_before_exclusive': (361, 46), 'retraction_before_comention': (1902, 46), 'retraction_after': (12729, 46)}


In [4]:
import pandas as pd
import re
from pathlib import Path

# Directory holding the classified subsets that this cell reads back in.
BASE = Path("timeline_classified_outputs")
PATH_ORIG_AFTER_EXCL = BASE.joinpath("original_after_exclusive.csv")
PATH_RETR_BEFORE_EXCL = BASE.joinpath("retraction_before_exclusive.csv")

# Full mention tables, used as denominators in the summary.
PATH_ORIG_FULL = "mentions_original_fill_cleaned.csv"
PATH_RETR_FULL = "mentions_retraction_fill_cleaned.csv"

# All four files are loaded as plain text, keeping empty cells as "".
_as_text = {"dtype": str, "keep_default_na": False}
orig_after_excl = pd.read_csv(PATH_ORIG_AFTER_EXCL, **_as_text)
retr_before_excl = pd.read_csv(PATH_RETR_BEFORE_EXCL, **_as_text)
orig_full = pd.read_csv(PATH_ORIG_FULL, **_as_text)
retr_full = pd.read_csv(PATH_RETR_FULL, **_as_text)

# Terms whose presence in a title marks a mention as retraction-aware.
keywords = [
    "retraction",
    "retracted",
    "withdrawn",
    "scandal",
    "controversial",
    "fake",
    "fraud",
    "error",
    "misconduct",
    "correction",
    "corrupted",
]
# One alternation over all keywords, matched as substrings and ignoring case.
_keyword_alternation = "|".join(keywords)
pattern = re.compile(_keyword_alternation, re.IGNORECASE)

def filter_no_keywords(df, kw_pattern=None):
    """Return the rows of ``df`` whose titles contain none of the keywords.

    The "Mention Title" and "Research Output Title" columns are converted to
    strings, joined with a space, and searched with the keyword regex. Rows
    where the regex matches are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Mention table. Either title column may be absent; an absent column is
        treated as an empty title for every row.
    kw_pattern : str or compiled regex, optional
        Pattern to search for. Defaults to the module-level ``pattern``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` (original index preserved) without keyword hits.
    """
    regex = pattern if kw_pattern is None else kw_pattern

    def _title_text(column):
        # df.get(column, "") would return a plain str for a missing column and
        # break on .astype; fall back to an index-aligned Series of "" instead.
        if column in df.columns:
            return df[column].astype(str)
        return pd.Series("", index=df.index, dtype=str)

    combined = _title_text("Mention Title") + " " + _title_text("Research Output Title")
    mask_no_kw = ~combined.str.contains(regex, na=False)
    return df[mask_no_kw]

# Keep only the mentions whose titles contain none of the keywords.
orig_no_kw = filter_no_keywords(orig_after_excl)
retr_no_kw = filter_no_keywords(retr_before_excl)

# Output directory for the keyword-filtered subsets; created if it does not exist.
OUT = Path("keyword_filtered_outputs")
OUT.mkdir(exist_ok=True)

orig_no_kw.to_csv(OUT / "original_after_exclusive_no_keywords.csv", index=False)
retr_no_kw.to_csv(OUT / "retraction_before_exclusive_no_keywords.csv", index=False)

orig_total = len(orig_after_excl)
retr_total = len(retr_before_excl)
orig_no_kw_n = len(orig_no_kw)
retr_no_kw_n = len(retr_no_kw)


def _percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0


# Share of each exclusive group that has no keyword hit.
orig_ratio = _percent_of(orig_no_kw_n, orig_total)
retr_ratio = _percent_of(retr_no_kw_n, retr_total)

# Same counts relative to the full mention CSVs; guarded like the group ratios
# so an empty full table reports 0 instead of raising ZeroDivisionError.
orig_ratio_vs_full = _percent_of(orig_no_kw_n, len(orig_full))
retr_ratio_vs_full = _percent_of(retr_no_kw_n, len(retr_full))

print("=== Potential Misinformation Keyword Filtering Summary ===")
print(f"Original After-Retraction (exclusive): {orig_total:,} rows")
print(f" → Without keywords: {orig_no_kw_n:,} ({orig_ratio:.2f}% of group, {orig_ratio_vs_full:.2f}% of full original CSV)")

print(f"Retraction Before-Retraction (exclusive): {retr_total:,} rows")
print(f" → Without keywords: {retr_no_kw_n:,} ({retr_ratio:.2f}% of group, {retr_ratio_vs_full:.2f}% of full retraction CSV)")

print(f"Outputs saved to: {OUT.resolve()}")


=== Potential Misinformation Keyword Filtering Summary ===
Original After-Retraction (exclusive): 6,378 rows
 → Without keywords: 3,718 (58.29% of group, 12.68% of full original CSV)
Retraction Before-Retraction (exclusive): 361 rows
 → Without keywords: 70 (19.39% of group, 0.45% of full retraction CSV)
Outputs saved to: /Users/miaoyixuan/CS58/keyword_filtered_outputs
