In [None]:
import pandas as pd
import os

# CSV file to clean. The cleaned copy is written to the same path with
# "_cleaned" inserted before the extension.
INPUT_CSV = "mentions_retraction_fill.csv"

# Columns to remove from the cleaned output. Each name is compared with the
# file's headers after normalization (trimmed, lower-cased, spaces replaced by
# underscores), with a fallback that also ignores "." and "-", so the exact
# case/spacing used here does not have to match the file.
COLUMNS_TO_DELETE = [
    "External Mention ID",
    "Authors at my Institution",
    "Departments",
    "ISBN",
    "National Clinical Trial ID",
    "URI",
    "Handle.net IDs",
    "ADS Bibcode",
    "arXiv ID",
    "RePEc ID",
    "SSRN",
    "URN",
    "error",
    "text_len",
    "extraction_method",
    "text_truncated",
]

# Read every column as plain text; blank cells stay "" instead of becoming NaN.
df = pd.read_csv(INPUT_CSV, dtype=str, keep_default_na=False, na_values=[])

# Step 1: drop rows that are exact copies of an earlier row.
orig_rows = len(df.index)
deduped_df = df.drop_duplicates(keep="first")
dup_removed = orig_rows - len(deduped_df.index)

work = deduped_df.copy()
if "text_len_full" not in work.columns:
    raise KeyError("Missing column 'text_len_full'. Please ensure it exists in the file.")

# Step 2: parse the lengths; anything that is not a number (including "") is
# coerced to NaN and then counted as length 0.
num = pd.to_numeric(work["text_len_full"], errors="coerce").fillna(0)

# Split the rows that will be removed into "exactly zero" and "non-zero but
# below 200" so that each group can be reported separately.
zero_mask = num.eq(0)
small_mask = num.lt(200) & ~zero_mask
zero_len_count = int(zero_mask.sum())
small_len_count = int(small_mask.sum())

# Keep only the rows whose length is not below 200.
work = work.loc[~num.lt(200)].copy()
final_rows = len(work.index)

def find_col(cols, target):
    """Locate a column by name, ignoring case and surrounding whitespace.

    Args:
        cols: Iterable of column names (strings) to search, in order.
        target: The name to look for.

    Returns:
        The first entry of ``cols`` that equals ``target`` once both are
        stripped and lower-cased, or ``None`` when no entry matches.
    """
    wanted = target.strip().lower()
    return next((name for name in cols if name.strip().lower() == wanted), None)

# Resolve the URL column tolerantly: the header's case and surrounding
# whitespace may differ from "norm_url".
norm_url_col = find_col(work.columns, "norm_url")
if norm_url_col is None:
    raise KeyError("Missing column 'norm_url' (case or space may differ). Please verify the file.")

# The CSV is loaded with keep_default_na=False, so a missing URL is an empty
# string rather than NaN; nunique(dropna=True) on its own would therefore count
# "" as one more URL. Exclude blank / whitespace-only cells before counting.
norm_urls = work[norm_url_col]
unique_norm_url_count = norm_urls[norm_urls.str.strip() != ""].nunique(dropna=True)

def normalize(name: str) -> str:
    """Return a canonical form of a column name for tolerant comparison.

    Leading/trailing whitespace is removed, the text is lower-cased, and every
    remaining space character becomes an underscore.
    """
    trimmed = name.strip().lower()
    # Splitting on a single space and re-joining swaps each space for "_".
    return "_".join(trimmed.split(" "))

# Normalized header -> actual header. If two headers normalize to the same key,
# the later column's name is the one kept.
norm_map = dict(zip(map(normalize, work.columns), work.columns))
to_delete_norm = list(map(normalize, COLUMNS_TO_DELETE))

# Translation table that removes "." and "-" for the looser fallback comparison.
_strip_punct = str.maketrans("", "", ".-")

existing_to_drop, missing_to_drop = [], []
for wanted in to_delete_norm:
    # First choice: the normalized names agree exactly.
    if wanted in norm_map:
        existing_to_drop.append(norm_map[wanted])
        continue

    # Fallback: compare again with dots and hyphens removed and take the first
    # header that agrees.
    loose_wanted = wanted.translate(_strip_punct)
    fallback = next(
        (actual for key, actual in norm_map.items()
         if key.translate(_strip_punct) == loose_wanted),
        None,
    )
    if fallback:
        existing_to_drop.append(fallback)
    else:
        missing_to_drop.append(wanted)

# Remove repeats, then list the columns in the order they appear in the frame.
existing_to_drop = sorted(set(existing_to_drop), key=work.columns.get_loc)
work = work.drop(columns=existing_to_drop, errors="ignore")

# Row-level summary: one line per cleaning step, emitted in a single write.
stats_lines = [
    "===== Cleaning Statistics =====",
    f"Original total rows: {orig_rows}",
    f"Removed identical duplicate rows: {dup_removed}",
    f"Removed rows where text_len_full == 0: {zero_len_count}",
    f"Removed rows where 0 < text_len_full < 200: {small_len_count}",
    f"Final total rows: {final_rows}",
    f"Unique norm_url count: {unique_norm_url_count}",
]
print("\n".join(stats_lines))

# Column-level summary: what was dropped and which requested names had no match.
print("\n===== Column Removal Results =====")
deleted_display = existing_to_drop or "None"
print("Deleted columns (actual names):", deleted_display)
if len(missing_to_drop) > 0:
    print("Not found (normalized names, may differ due to naming variations):", missing_to_drop)

# Write the result to the input's path with "_cleaned" inserted before the
# extension; an input without an extension gets ".csv".
base, ext = os.path.splitext(INPUT_CSV)
output_csv = base + "_cleaned" + (ext if ext else ".csv")
work.to_csv(output_csv, index=False, encoding="utf-8")
print(f"\nCleaned file saved as: {output_csv}")

In [None]:
import pandas as pd
import os

# NOTE(review): this cell repeats the preceding cell line for line — confirm
# whether both copies are needed or one can be deleted.

# CSV file to clean. The cleaned copy is written to the same path with
# "_cleaned" inserted before the extension.
INPUT_CSV = "mentions_retraction_fill.csv"

# Columns to remove from the cleaned output. Each name is compared with the
# file's headers after normalization (trimmed, lower-cased, spaces replaced by
# underscores), with a fallback that also ignores "." and "-", so the exact
# case/spacing used here does not have to match the file.
COLUMNS_TO_DELETE = [
    "External Mention ID",
    "Authors at my Institution",
    "Departments",
    "ISBN",
    "National Clinical Trial ID",
    "URI",
    "Handle.net IDs",
    "ADS Bibcode",
    "arXiv ID",
    "RePEc ID",
    "SSRN",
    "URN",
    "error",
    "text_len",
    "extraction_method",
    "text_truncated",
]

# Read every column as plain text; blank cells stay "" instead of becoming NaN.
df = pd.read_csv(INPUT_CSV, dtype=str, keep_default_na=False, na_values=[])

# Step 1: drop rows that are exact copies of an earlier row.
orig_rows = len(df.index)
deduped_df = df.drop_duplicates(keep="first")
dup_removed = orig_rows - len(deduped_df.index)

work = deduped_df.copy()
if "text_len_full" not in work.columns:
    raise KeyError("Missing column 'text_len_full'. Please ensure it exists in the file.")

# Step 2: parse the lengths; anything that is not a number (including "") is
# coerced to NaN and then counted as length 0.
num = pd.to_numeric(work["text_len_full"], errors="coerce").fillna(0)

# Split the rows that will be removed into "exactly zero" and "non-zero but
# below 200" so that each group can be reported separately.
zero_mask = num.eq(0)
small_mask = num.lt(200) & ~zero_mask
zero_len_count = int(zero_mask.sum())
small_len_count = int(small_mask.sum())

# Keep only the rows whose length is not below 200.
work = work.loc[~num.lt(200)].copy()
final_rows = len(work.index)

def find_col(cols, target):
    """Locate a column by name, ignoring case and surrounding whitespace.

    Args:
        cols: Iterable of column names (strings) to search, in order.
        target: The name to look for.

    Returns:
        The first entry of ``cols`` that equals ``target`` once both are
        stripped and lower-cased, or ``None`` when no entry matches.
    """
    wanted = target.strip().lower()
    return next((name for name in cols if name.strip().lower() == wanted), None)

# Resolve the URL column tolerantly: the header's case and surrounding
# whitespace may differ from "norm_url".
norm_url_col = find_col(work.columns, "norm_url")
if norm_url_col is None:
    raise KeyError("Missing column 'norm_url' (case or space may differ). Please verify the file.")

# The CSV is loaded with keep_default_na=False, so a missing URL is an empty
# string rather than NaN; nunique(dropna=True) on its own would therefore count
# "" as one more URL. Exclude blank / whitespace-only cells before counting.
norm_urls = work[norm_url_col]
unique_norm_url_count = norm_urls[norm_urls.str.strip() != ""].nunique(dropna=True)

def normalize(name: str) -> str:
    """Return a canonical form of a column name for tolerant comparison.

    Leading/trailing whitespace is removed, the text is lower-cased, and every
    remaining space character becomes an underscore.
    """
    trimmed = name.strip().lower()
    # Splitting on a single space and re-joining swaps each space for "_".
    return "_".join(trimmed.split(" "))

# Normalized header -> actual header. If two headers normalize to the same key,
# the later column's name is the one kept.
norm_map = dict(zip(map(normalize, work.columns), work.columns))
to_delete_norm = list(map(normalize, COLUMNS_TO_DELETE))

# Translation table that removes "." and "-" for the looser fallback comparison.
_strip_punct = str.maketrans("", "", ".-")

existing_to_drop, missing_to_drop = [], []
for wanted in to_delete_norm:
    # First choice: the normalized names agree exactly.
    if wanted in norm_map:
        existing_to_drop.append(norm_map[wanted])
        continue

    # Fallback: compare again with dots and hyphens removed and take the first
    # header that agrees.
    loose_wanted = wanted.translate(_strip_punct)
    fallback = next(
        (actual for key, actual in norm_map.items()
         if key.translate(_strip_punct) == loose_wanted),
        None,
    )
    if fallback:
        existing_to_drop.append(fallback)
    else:
        missing_to_drop.append(wanted)

# Remove repeats, then list the columns in the order they appear in the frame.
existing_to_drop = sorted(set(existing_to_drop), key=work.columns.get_loc)
work = work.drop(columns=existing_to_drop, errors="ignore")

# Row-level summary: one line per cleaning step, emitted in a single write.
stats_lines = [
    "===== Cleaning Statistics =====",
    f"Original total rows: {orig_rows}",
    f"Removed identical duplicate rows: {dup_removed}",
    f"Removed rows where text_len_full == 0: {zero_len_count}",
    f"Removed rows where 0 < text_len_full < 200: {small_len_count}",
    f"Final total rows: {final_rows}",
    f"Unique norm_url count: {unique_norm_url_count}",
]
print("\n".join(stats_lines))

# Column-level summary: what was dropped and which requested names had no match.
print("\n===== Column Removal Results =====")
deleted_display = existing_to_drop or "None"
print("Deleted columns (actual names):", deleted_display)
if len(missing_to_drop) > 0:
    print("Not found (normalized names, may differ due to naming variations):", missing_to_drop)

# Write the result to the input's path with "_cleaned" inserted before the
# extension; an input without an extension gets ".csv".
base, ext = os.path.splitext(INPUT_CSV)
output_csv = base + "_cleaned" + (ext if ext else ".csv")
work.to_csv(output_csv, index=False, encoding="utf-8")
print(f"\nCleaned file saved as: {output_csv}")