In [12]:
import pandas as pd, numpy as np, json
from pathlib import Path

DATASET_PATH = "dataset_14_08_2025_completion.csv"
MENTIONS_RETRACTION_PATH = "mentions_retraction_label.csv"
MENTIONS_ORIGINAL_PATH = "mentions_original_label.csv"

out_html = "NewsDiff_Temporal_Interactive.html"

df_base = pd.read_csv(DATASET_PATH)
df_r = pd.read_csv(MENTIONS_RETRACTION_PATH)
df_o = pd.read_csv(MENTIONS_ORIGINAL_PATH)

orig_dt = pd.to_datetime(df_base['OriginalPaperDate'], errors='coerce', infer_datetime_format=True)
retr_dt = pd.to_datetime(df_base['RetractionDate'], errors='coerce', infer_datetime_format=True)
df_base['OriginalPaperDate_iso'] = orig_dt.dt.strftime('%Y-%m-%d')
df_base['RetractionDate_iso']    = retr_dt.dt.strftime('%Y-%m-%d')

def _safe_str(x):
    return None if pd.isna(x) else str(x)

base_records = []
for rec in df_base.to_dict('records'):
    base_records.append({
        "record_id": _safe_str(rec.get("Record ID")),
        "title": _safe_str(rec.get("Title")),
        "original_doi": _safe_str(rec.get("OriginalPaperDOI")),
        "original_pmid": _safe_str(rec.get("OriginalPaperPubMedID")),
        "retraction_doi": _safe_str(rec.get("RetractionDOI")),
        "retraction_pmid": _safe_str(rec.get("RetractionPubMedID")),
        "original_date": _safe_str(rec.get("OriginalPaperDate_iso")),
        "retraction_date": _safe_str(rec.get("RetractionDate_iso")),
    })

map_retract_doi  = { b["retraction_doi"]: b for b in base_records if b["retraction_doi"]  }
map_retract_pmid = { b["retraction_pmid"]: b for b in base_records if b["retraction_pmid"] }
map_orig_doi     = { b["original_doi"]: b   for b in base_records if b["original_doi"]     }
map_orig_pmid    = { b["original_pmid"]: b  for b in base_records if b["original_pmid"]    }

need_cols = [
    "label", "Mention Date", "Mention Title", "Research Output Title", "Mention URL",
    "DOI", "PubMed ID", "External Mention ID"
]

def to_iso_series(s):
    dt = pd.to_datetime(s, errors='coerce', infer_datetime_format=True)
    return dt.dt.strftime('%Y-%m-%d')

def prep_mentions(df):
    sub = df[need_cols].copy()
    sub['Mention Date_iso'] = to_iso_series(sub['Mention Date'])
    return sub

m_r = prep_mentions(df_r)
m_o = prep_mentions(df_o)

df_m = pd.concat([m_r, m_o], ignore_index=True)

def row_to_js(rec):
    return {
        "label": _safe_str(rec.get("label")),
        "mention_date": _safe_str(rec.get("Mention Date_iso")),
        "mention_title": _safe_str(rec.get("Mention Title")),

        "research_output_title": _safe_str(rec.get("Research Output Title")),
        "mention_url": _safe_str(rec.get("Mention URL")),
        "doi": _safe_str(rec.get("DOI")),
        "pmid": _safe_str(rec.get("PubMed ID")),
        "external_id": _safe_str(rec.get("External Mention ID")),
    }

TRUE_LABELS = {
    "O_BEFORE","O_AFTER_COMENTION","O_AFTER_EXCL_NORM",
    "R_AFTER","R_BEFORE_COMENTION","R_BEFORE_EXCL_NORM"
}
FALSE_LABELS = {"O_AFTER_EXCL_ABNORM","R_BEFORE_EXCL_ABNORM"}

arr_true  = [row_to_js(r) for r in df_m[df_m["label"].isin(TRUE_LABELS)].to_dict('records')]
arr_false = [row_to_js(r) for r in df_m[df_m["label"].isin(FALSE_LABELS)].to_dict('records')]

label_map = {
    "O_BEFORE": {"Time Category":"Original Mentions: BEFORE retraction", "Typical Meaning":"News or blogs reporting the original paper before it was officially retracted", "Judgment":"True"},
    "O_AFTER_COMENTION": {"Time Category":"Original Mentions: AFTER retraction + Retraction co-mention", "Typical Meaning":"The same article mentions both the original and the retraction", "Judgment":"True (corrective / explanatory)"},
    "O_AFTER_EXCL_NORM": {"Time Category":"Original Mentions: AFTER retraction EXCLUSIVE - with correction signals", "Typical Meaning":"News continues to cite the original paper after retraction but includes a notice", "Judgment":"True (corrective / updated)"},
    "O_AFTER_EXCL_ABNORM": {"Time Category":"Original Mentions: AFTER retraction EXCLUSIVE - no correction signals", "Typical Meaning":"Continues citing the original paper after retraction, without any correction note", "Judgment":"False (outdated / misleading)"},
    "R_AFTER": {"Time Category":"Retraction Mentions: AFTER retraction", "Typical Meaning":"Reports on the retraction after it officially occurred", "Judgment":"True"},
    "R_BEFORE_COMENTION": {"Time Category":"Retraction Mentions: BEFORE retraction + Original co-mention", "Typical Meaning":"A retraction is mentioned together with the original paper before the formal retraction date", "Judgment":"True (tracking / clarification)"},
    "R_BEFORE_EXCL_ABNORM": {"Time Category":"Retraction Mentions: BEFORE retraction EXCLUSIVE - with validation signals", "Typical Meaning":"Reports a retraction before the official date, but the information is later verified", "Judgment":"True (verified / clarified)"},
    "R_BEFORE_EXCL_NORM": {"Time Category":"Retraction Mentions: BEFORE retraction EXCLUSIVE - no validation signals", "Typical Meaning":"Claims a retraction before it actually happens, without subsequent verification", "Judgment":"False (premature / speculative)"}
}

payload = {
    "true_items": arr_true,
    "false_items": arr_false,
    "label_map": label_map,
    "maps": {
        "retract_doi": map_retract_doi,
        "retract_pmid": map_retract_pmid,
        "orig_doi": map_orig_doi,
        "orig_pmid": map_orig_pmid,
    }
}

js_payload = json.dumps(payload, ensure_ascii=False)

parts = []
parts.append("""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>Temporal Information Diff — Interactive </title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
  :root{
    --bg:#ffffff;
    --card:#fafafa;
    --border:#CCCCCC;
    --text:#111827;
    --muted:#6b7280;
    --accent:#2563eb; /* calm Plotly/Streamlit-ish blue */
  }
  *{box-sizing:border-box}
  body { font-family: system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; color:var(--text); background:var(--bg); }
  .container { max-width: 1200px; margin: 20px auto; padding: 0 16px; }
  .row { display: flex; gap: 16px; flex-wrap: wrap; }
  .card { background:var(--card); border: 1px solid var(--border); border-radius: 12px; padding: 12px; margin: 6px; box-shadow:0 1px 2px rgba(0,0,0,.04); }
  .k { font-size:12px; color:var(--muted); }
  .v { font-size:14px; font-weight:600; word-break: break-word; }
  .panel { flex: 1; min-width: 320px; }
  .controls .card input, .controls .card select { width: 100%; padding: 8px; border:1px solid var(--border); border-radius:8px; outline:none }
  .controls .card select:focus, .controls .card input:focus{border-color:var(--accent)}
  .btn { display:inline-block; padding:8px 12px; border:1px solid var(--accent); color:#fff; background:var(--accent); border-radius:10px; cursor:pointer; margin-right:8px; user-select:none; }
  .btn:active { transform: translateY(1px); }
  h1 { margin: 4px 0 10px; font-size: 22px; }
  h2 { margin: 12px 0 6px; font-size: 18px; }
  hr { border:0; border-top:1px solid var(--border); margin: 16px 0; }
</style>
</head>
<body>
<div class="container">
  <h1>Temporal Information Diff — Interactive </h1>
  <div class="row controls">
    <div class="card" style="flex:2">
      <div class="k">TRUE Label Filter</div>
      <select id="trueLabel">
        <option value="ANY">Any of TRUE set</option>
        <option value="O_BEFORE">O_BEFORE</option>
        <option value="O_AFTER_COMENTION">O_AFTER_COMENTION</option>
        <option value="O_AFTER_EXCL_NORM">O_AFTER_EXCL_NORM</option>
        <option value="R_AFTER">R_AFTER</option>
        <option value="R_BEFORE_COMENTION">R_BEFORE_COMENTION</option>
        <option value="R_BEFORE_EXCL_NORM">R_BEFORE_EXCL_NORM</option>
      </select>
      <div class="k" style="margin-top:8px">Seed (integer)</div>
      <input id="seedInput" type="number" placeholder="e.g., 2025" />
      <div style="margin-top:10px">
        <span class="btn" id="btnSeeded">Sample with Seed</span>
        <span class="btn" id="btnRandom">Sample Random</span>
      </div>
      <div class="k" style="margin-top:8px">Tip: Use the same seed to reproduce the same picks.</div>
    </div>
    <div class="card" style="flex:2">
      <div class="k">FALSE Label Filter</div>
      <select id="falseLabel">
        <option value="ANY">Any of FALSE set</option>
        <option value="O_AFTER_EXCL_ABNORM">O_AFTER_EXCL_ABNORM</option>
        <option value="R_BEFORE_EXCL_ABNORM">R_BEFORE_EXCL_ABNORM</option>
      </select>
      <div class="k" style="margin-top:8px">FALSE = O_AFTER_EXCL_ABNORM / R_BEFORE_EXCL_ABNORM</div>
    </div>
  </div>

  <hr/>

  <div class="row">
    <div class="panel">
      <h2>TRUE Example</h2>
      <div id="trueCards"></div>
      <div id="trueTimeline"></div>
    </div>
    <div class="panel">
      <h2>FALSE Example</h2>
      <div id="falseCards"></div>
      <div id="falseTimeline"></div>
    </div>
  </div>
</div>
<script>
""")

parts.append("const PAYLOAD = ")
parts.append(js_payload)
parts.append(";\n")

parts.append(r"""
const TRUE_SET   = new Set(["O_BEFORE","O_AFTER_COMENTION","O_AFTER_EXCL_NORM","R_AFTER","R_BEFORE_COMENTION","R_BEFORE_EXCL_NORM"]);
const FALSE_SET  = new Set(["O_AFTER_EXCL_ABNORM","R_BEFORE_EXCL_ABNORM"]);

function mulberry32(a) { // seedable RNG
  return function() {
    let t = a += 0x6D2B79F5;
    t = Math.imul(t ^ t >>> 15, t | 1);
    t ^= t + Math.imul(t ^ t >>> 7, t | 61);
    return ((t ^ t >>> 14) >>> 0) / 4294967296;
  }
}

function infoCard(k, v) {
  const vv = (v===null || v===undefined) ? "" : String(v);
  return `
    <div class="card">
      <div class="k">${k}</div>
      <div class="v">${vv}</div>
    </div>`;
}

function bestJoin(mention) {
  const maps = PAYLOAD.maps;
  const tryKeys = [
    ["retract_doi", mention.doi],
    ["retract_pmid", mention.pmid],
    ["orig_doi", mention.doi],
    ["orig_pmid", mention.pmid],
  ];
  for (const [which, key] of tryKeys) {
    if (!key) continue;
    const hit = maps[which][key];
    if (hit) return [hit, which];
  }
  return [null, null];
}

function toDate(o) {
  if (!o) return null;
  const d = new Date(o);
  if (isNaN(d)) return null;
  return d;
}

function renderTimeline(divId, original_date, retraction_date, mention_date, title) {
  const xs = [];
  const labels = [];
  if (original_date) { xs.push(original_date); labels.push("O"); }
  if (retraction_date) { xs.push(retraction_date); labels.push("R"); }
  if (mention_date) { xs.push(mention_date); labels.push("M"); }
  const traceLine = (xs.length>=2) ? [{
    x: xs, y: xs.map(_=>0), mode: "lines", line: {width:2}, name: "Timeline"
  }] : [];
  const tracePts  = [{
    x: xs, y: xs.map(_=>0), mode: "markers+text", text: labels, textposition:"top center",
    marker: {size:12}, name:"Events"
  }];
  const data = [...traceLine, ...tracePts];
  const layout = { title, xaxis: {}, yaxis: {visible:false},
                   plot_bgcolor:"white", margin:{l:40,r:20,t:60,b:40}, height:320 };
  Plotly.newPlot(divId, data, layout, {displayModeBar:false});
}

function renderPanel(targetCardsId, targetTimelineId, mention, tag) {
  const [base] = bestJoin(mention);
  const labelMeta = PAYLOAD.label_map[mention.label] || {};

  const cardsLeft = [
    infoCard("Title", base ? base.title : ""),
    infoCard("Paper Record ID", base ? base.record_id : ""),
    infoCard("Mention Title", mention.mention_title),
    infoCard("Mention Date", mention.mention_date),
    infoCard("Mention URL", mention.mention_url),
  ].join("");

  const cardsRight = [
    infoCard("Original DOI", base ? base.original_doi : ""),
    infoCard("Original Date", base ? base.original_date : ""),
    infoCard("Retraction DOI", base ? base.retraction_doi : ""),
    infoCard("Retraction Date", base ? base.retraction_date : ""),
  ].join("");

  const cardsMeta = [
    infoCard("Time Category", labelMeta["Time Category"] || ""),
    infoCard("Typical Meaning", labelMeta["Typical Meaning"] || ""),
    infoCard("Label", mention.label || ""),
    infoCard("Judgment", labelMeta["Judgment"] || ""),
  ].join("");

  document.getElementById(targetCardsId).innerHTML = `
    <div class="row">
      <div class="panel">${cardsLeft}</div>
      <div class="panel">${cardsRight}</div>
      <div class="panel">${cardsMeta}</div>
    </div>`;

  const od = toDate(base ? base.original_date : null);
  const rd = toDate(base ? base.retraction_date : null);
  const md = toDate(mention.mention_date);
  renderTimeline(targetTimelineId, od, rd, md, "Timeline");

}

function sampleAndRender(seed=null) {
  const trueLabelSel  = document.getElementById("trueLabel").value;
  const falseLabelSel = document.getElementById("falseLabel").value;

  let truePool = PAYLOAD.true_items.filter(x => TRUE_SET.has(x.label));
  if (trueLabelSel !== "ANY") truePool = truePool.filter(x => x.label === trueLabelSel);

  let falsePool = PAYLOAD.false_items.filter(x => FALSE_SET.has(x.label));
  if (falseLabelSel !== "ANY") falsePool = falsePool.filter(x => x.label === falseLabelSel);

  if (truePool.length===0 || falsePool.length===0) {
    alert("No data found for selected filters.");
    return;
  }

  let rng = Math.random;
  if (seed !== null && !isNaN(seed)) { rng = mulberry32(Number(seed)); }

  const iTrue = Math.floor(rng()*truePool.length);
  const iFalse = Math.floor(rng()*falsePool.length);
  renderPanel("trueCards", "trueTimeline", truePool[iTrue], "TRUE");
  renderPanel("falseCards", "falseTimeline", falsePool[iFalse], "FALSE");
}

document.getElementById("btnSeeded").addEventListener("click", () => {
  const val = document.getElementById("seedInput").value;
  const seed = (val==="" ? 0 : Number(val));
  sampleAndRender(seed);
});
document.getElementById("btnRandom").addEventListener("click", () => sampleAndRender(null));

// Initial render
sampleAndRender(2025);
</script>
</body>
</html>
""")

Path(out_html).write_text("".join(parts), encoding="utf-8")
out_html



Columns (28) have mixed types. Specify dtype option on import or set low_memory=False.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. You can safely remove this argument.


The argument 'infer_datetime_format' is deprecated and will be removed in a future version. A strict version of it is now the default, see https://pandas.

'NewsDiff_Temporal_Interactive.html'