In [None]:
import pandas as pd

# Load the labeled mentions of the original papers.
df = pd.read_csv('mentions_original_labeled.csv')

# Columns to remove from the export (edit this list as needed).
columns_to_drop = ['text', 'images_json']

# Drop the listed columns; the result is rebound to `df` rather than
# mutating the frame in place.
df = df.drop(columns=columns_to_drop)

# Write the slimmed-down table without the row index.
df.to_csv('mentions_original_without_content.csv', index=False)

In [None]:
import pandas as pd

# Load the labeled mentions of the retraction notices.
df = pd.read_csv('mentions_retraction_labeled.csv')

# Columns to remove from the export.
columns_to_drop = ['text', 'images_json']

# Drop the listed columns; the result is rebound to `df` rather than
# mutating the frame in place.
df = df.drop(columns=columns_to_drop)

# Write the slimmed-down table without the row index.
df.to_csv('mentions_retraction_without_content.csv', index=False)

In [None]:
import pandas as pd
from pathlib import Path

# Input tables and the output location, all relative to the working directory.
base_path = Path("dataset_14_08_2025_completion.csv")
orig_path = Path("mentions_original_without_content.csv")
retr_path = Path("mentions_retraction_without_content.csv")
out_path  = Path("temporal_mentions_joined1.csv")

# Read the three inputs in a fixed order: base records, then the mentions of
# the original papers, then the mentions of the retraction notices.
base, m_o, m_r = (pd.read_csv(p) for p in (base_path, orig_path, retr_path))

# Add a YYYY-MM-DD string version of every date column used later.
# errors="coerce" turns unparseable values into missing values instead of raising.
for frame, raw_col, iso_col in (
    (base, "OriginalPaperDate", "OriginalDate_iso"),
    (base, "RetractionDate",    "RetractionDate_iso"),
    (m_o,  "Mention Date",      "MentionDate_iso"),
    (m_r,  "Mention Date",      "MentionDate_iso"),
):
    parsed = pd.to_datetime(frame[raw_col], errors="coerce")
    frame[iso_col] = parsed.dt.strftime("%Y-%m-%d")

def _fillna_str(s):
    return s.fillna("").astype(str)

# Copy of the base table with every missing value replaced by "".
b = base.fillna("")

# Lookup-table name -> identifier column of the base table it is built from.
_id_columns = {
    "retract_doi":  "RetractionDOI",
    "retract_pmid": "RetractionPubMedID",
    "orig_doi":     "OriginalPaperDOI",
    "orig_pmid":    "OriginalPaperPubMedID",
}

# One dict per identifier: stringified identifier value -> row label in the
# base table. When a value occurs on several rows the last row wins, and rows
# with a missing identifier all collapse onto the "" key.
maps = {
    key: dict(zip(_fillna_str(b[column]), b.index))
    for key, column in _id_columns.items()
}

def _pmid_forms(pmid):
    """List the string spellings under which a PubMed ID may be keyed in ``maps``.

    A numeric ID can be stringified as "12345" (integer column) or "12345.0"
    (float column, e.g. when the CSV column contained blanks). The raw spelling
    comes first so an exact match is always preferred; the alternate
    integer/float spellings follow. Missing or empty IDs yield an empty list.
    """
    if pd.isna(pmid):
        return []
    text = str(pmid)
    if not text:
        return []
    forms = [text]
    whole, dot, frac = text.partition(".")
    # Only integer-like values ("123", "123.", "123.0", "123.00") get alternates.
    if whole.isdigit() and (not dot or frac.strip("0") == ""):
        for alternate in (whole, whole + ".0"):
            if alternate not in forms:
                forms.append(alternate)
    return forms


def best_match_row(doi, pmid):
    """Find the base-table row that a mention's DOI / PubMed ID refers to.

    Lookup tables in the module-level ``maps`` are tried in priority order:
    retraction DOI, retraction PubMed ID, original-paper DOI, original-paper
    PubMed ID. The first hit wins.

    Parameters
    ----------
    doi, pmid : scalar
        Identifiers taken from a mention row; missing values (NaN/None) and
        empty strings are ignored.

    Returns
    -------
    tuple
        ``(row_label, table_name)`` for the first match, or ``(None, None)``
        when nothing matches.
    """
    doi = "" if pd.isna(doi) else str(doi)
    doi_keys = [doi] if doi else []
    # PubMed IDs are tried in both integer and float spellings so that a dtype
    # difference between the two CSV files does not prevent a match.
    pmid_keys = _pmid_forms(pmid)

    search_order = (
        ("retract_doi", doi_keys),
        ("retract_pmid", pmid_keys),
        ("orig_doi", doi_keys),
        ("orig_pmid", pmid_keys),
    )
    for table, keys in search_order:
        for key in keys:
            if key in maps[table]:
                return maps[table][key], table
    return None, None

# Tag every mention with the file it came from, then stack both sets into one
# table with a fresh 0..n-1 index.
m_o["mention_source"] = "original"
m_r["mention_source"] = "retraction"
mentions = pd.concat([m_o, m_r], ignore_index=True)

# Resolve each mention to a (base row label, name of the matching lookup) pair;
# unmatched mentions yield (None, None).
matches = [
    best_match_row(doi, pmid)
    for doi, pmid in zip(mentions["DOI"], mentions["PubMed ID"])
]
match_idx = [row_label for row_label, _table in matches]
match_key = [table for _row_label, table in matches]

# Store the row labels as float64 (None -> NaN) regardless of content. A plain
# list of only None values (no mention matched, or no mentions at all) would
# otherwise produce an object-dtype column, which pandas may refuse to merge
# against the integer index of the base table.
mentions["_base_idx"] = pd.Series(match_idx, index=mentions.index, dtype="float64")
mentions["_match_key"] = match_key

# Mention-side columns wanted in the output; any that are absent from the
# mention tables are skipped rather than raising.
keep_mention_cols = [
    "mention_source","label",
    "Mention Type","MentionDate_iso","Mention Title","Mention URL",
    "Outlet or Author","Country","DOI","PubMed ID",
    "Altmetric Attention Score","Publication Date",
    "http_status","domain","page_title","article_title","text_len_full","images_count",
    "Details Page URL","norm_url","final_url",
]
keep_mention_cols = [c for c in keep_mention_cols if c in mentions.columns]
out = mentions[keep_mention_cols + ["_base_idx","_match_key"]].copy()

# Base-table columns attached to every matched mention (all must exist).
base_cols = [
    "Record ID","Title","Subject","Institution","Journal","Publisher","Country",
    "Author","URLS","ArticleType",
    "OriginalPaperDOI","OriginalPaperPubMedID",
    "RetractionDOI","RetractionPubMedID",
    "OriginalDate_iso","RetractionDate_iso",
    "RetractionNature","Reason","Paywalled","Notes"
]
base_subset = base[base_cols].copy()

# Left join on the matched row label so unmatched mentions are kept with empty
# base columns. "Country" can exist on both sides: with the default suffixes
# pandas would rename the pair to Country_x / Country_y and the column filter
# below would then silently drop both. Keeping the mention-side name untouched
# and suffixing only the base-side duplicate ("Country_base") preserves both.
out = out.merge(
    base_subset,
    left_on="_base_idx",
    right_index=True,
    how="left",
    suffixes=("", "_base"),
)

# Final column order: mention fields, then base-record fields, then the name of
# the lookup that produced the match. The helper column _base_idx is omitted.
ordered_cols = [
    "mention_source","label","Mention Type","MentionDate_iso","Mention Title","Mention URL",
    "Outlet or Author","Country","DOI","PubMed ID","Altmetric Attention Score","Publication Date",
    "http_status","domain","page_title","article_title","text_len_full","images_count",
    "Details Page URL","norm_url","final_url",
    "Record ID","Title","Subject","Institution","Journal","Publisher","Country_base",
    "Author","URLS","ArticleType",
    "OriginalPaperDOI","OriginalPaperPubMedID","OriginalDate_iso",
    "RetractionDOI","RetractionPubMedID","RetractionDate_iso",
    "RetractionNature","Reason","Paywalled","Notes",
    "_match_key"
]
ordered_cols = [c for c in ordered_cols if c in out.columns]
out = out[ordered_cols]

out.to_csv(out_path, index=False, encoding="utf-8")
print(str(out_path), out.shape)
