In [2]:
import json
import pandas as pd

# ----- Step 1: input location -----
# Point this at the JSON export you want to index.
file_name = r"E:\judita's project\全部数据\judita_统一ICLR7.7更新后.json"

# ----- Step 2: parse the JSON payload -----
with open(file_name, "r", encoding="utf-8") as json_handle:
    data = json.load(json_handle)

# ----- Step 3: locate the list of records -----
# A dict payload is expected to wrap the records in one of its values: the first
# list-valued entry (in the dict's own order) is used. Any other payload is taken as-is.
if not isinstance(data, dict):
    records = data
else:
    records = next((value for value in data.values() if isinstance(value, list)), None)
    if records is None:
        raise ValueError("List not found in the JSON file, please check the structure")

print("✅ Total records:", len(records))

# ----- Step 4: collect the "id" of every record (None when the key is absent) -----
paper_ids = [rec.get("id") for rec in records]

# ----- Step 5: write the ids out as a single-column CSV -----
output_file = r"E:\judita's project\new data 2\paper_ids.csv"
df = pd.DataFrame({"paper_id": paper_ids})
df.to_csv(output_file, index=False)

print(f"✅ Saved {len(df)} paper_ids to {output_file}")

✅ Total records: 5886
✅ Saved 5886 paper_ids to E:\judita's project\new data 2\paper_ids.csv


In [3]:
import pandas as pd

# ===== 1) Specify file paths =====
# CSV with one column of paper_ids (the header name is auto-detected below)
paper_ids_file = r"E:\judita's project\new data 2\paper_ids.csv"
# The section-level dataset (CSV, must contain a 'paper_id' column)
records_file = r"E:\judita's project\new data\matched_sections_cleaned_final.csv"

# ===== 2) Read paper_ids (preserving original order) =====
df_ids_raw = pd.read_csv(paper_ids_file)

# Fault tolerance: accept any header equal to 'paper_id' after trimming and
# lower-casing; if none matches, fall back to the first column.
id_col = next(
    (c for c in df_ids_raw.columns if c.strip().lower() == "paper_id"),
    df_ids_raw.columns[0],
)

# Blank out missing ids BEFORE the string cast. astype(str) turns NaN into the
# literal "nan", after which fillna("") has nothing left to fill and the missing
# id could match records whose own missing paper_id was stringified to "nan".
paper_ids = df_ids_raw[id_col].fillna("").astype(str).tolist()

# ===== 3) Read the records CSV (with error handling) =====
df_rec = pd.read_csv(
    records_file,
    on_bad_lines="skip",  # Malformed lines are dropped silently
    engine="python",      # Required for the lenient bad-line handling above
)

# Ensure the 'paper_id' column exists
if "paper_id" not in df_rec.columns:
    raise ValueError("The 'paper_id' column was not found in the records dataset. Please check the file.")

# Unify types and clean (strip whitespace). Records with a missing paper_id end up
# under the key "nan"; blank lookup ids are "" (see above), so the two never collide.
df_rec["paper_id"] = df_rec["paper_id"].astype(str).str.strip()
paper_ids_norm = [pid.strip() for pid in paper_ids]

# ===== 4) Pre-calculate the occurrence count for each paper_id =====
counts = df_rec["paper_id"].value_counts(dropna=False).to_dict()

# ===== 5) Look the counts up one by one, following the order in paper_ids.csv =====
section_count_list = [int(counts.get(pid, 0)) for pid in paper_ids_norm]

# ===== 6) Assemble the results (preserving original order) =====
result = pd.DataFrame({
    "paper_id": paper_ids,          # Values as read from the id file (blank when missing)
    "section_count": section_count_list,
    "matched": [c > 0 for c in section_count_list]
})

# ===== 7) Tally and print the output =====
total_ids = len(result)
unmatched_ids = int((result["section_count"] == 0).sum())
matched_ids = total_ids - unmatched_ids

print("✅ Iteration and counting complete")
print("Total number of paper_ids:", total_ids)
print("Number of matched IDs:", matched_ids)
print("Number of unmatched IDs:", unmatched_ids)

# ===== 8) Save the results =====
out_all = r"E:\judita's project\new data 2\paper_id_section_counts.csv"
out_unmatched = r"E:\judita's project\new data 2\unmatched_paper_ids.csv"
result.to_csv(out_all, index=False)
result.loc[result["section_count"] == 0, ["paper_id"]].to_csv(out_unmatched, index=False)

print("Saved:", out_all)
print("Unmatched IDs saved:", out_unmatched)

✅ Iteration and counting complete
Total number of paper_ids: 5886
Number of matched IDs: 5817
Number of unmatched IDs: 69
Saved: E:\judita's project\new data 2\paper_id_section_counts.csv
Unmatched IDs saved: E:\judita's project\new data 2\unmatched_paper_ids.csv


In [6]:
import pandas as pd

# Load the per-paper section counts
counts_path = r"E:\judita's project\new data 2\paper_id_section_counts.csv"
df = pd.read_csv(counts_path)

# Keep only the rows whose 'matched' flag is True
df_filtered = df.loc[df["matched"] == True].copy()

# Report how many rows survived the filter
print(f"✅ Number of records with matched=True: {len(df_filtered)}")

# The flag is constant after filtering, so drop it (no in-place mutation, so the
# cell can be re-run safely)
df_filtered = df_filtered.drop(columns=["matched"])

# Write next to the input file. The step that consumes this CSV loads it from this
# directory by absolute path, so a path relative to the notebook's working directory
# only worked when the two happened to coincide.
filtered_path = r"E:\judita's project\new data 2\paper_id_section_counts_only-true.csv"
df_filtered.to_csv(filtered_path, index=False, encoding="utf-8")


✅ Number of records with matched=True: 5817


In [7]:
import pandas as pd
import numpy as np

# ===== 1) Specify file paths =====
# File containing the paper_id column
ids_file = r"E:\judita's project\new data 2\paper_id_section_counts_only-true.csv"
# Each row is a section record, containing paper_id and section_content_vila
records_file = r"E:\judita's project\new data\matched_sections_cleaned_final.csv"

# ===== 2) Read the paper_id column (preserving original order) =====
df_ids_raw = pd.read_csv(ids_file)
# Accept any header equal to 'paper_id' after trimming/lower-casing, else the first column
id_col = next((c for c in df_ids_raw.columns if c.strip().lower() == "paper_id"), df_ids_raw.columns[0])
df_ids = df_ids_raw[[id_col]].rename(columns={id_col: "paper_id"})
df_ids["paper_id"] = df_ids["paper_id"].astype(str).str.strip()

# ===== 3) Read the records CSV (preserving original file order) =====
df_rec = pd.read_csv(
    records_file,
    on_bad_lines="skip",  # Malformed lines are dropped silently
    engine="python"
)

if "paper_id" not in df_rec.columns or "section_content_vila" not in df_rec.columns:
    raise ValueError("The CSV must contain both 'paper_id' and 'section_content_vila' columns!")

# Ensure data types
df_rec["paper_id"] = df_rec["paper_id"].astype(str).str.strip()
# Fill missing section text BEFORE casting to str. astype(str) would turn NaN into the
# literal "nan", which would then be reported as a paper's introduction/conclusion.
df_rec["section_content_vila"] = df_rec["section_content_vila"].fillna("").astype(str).str.strip()

# ===== 4) Preserve original file order =====
df_rec["_row"] = np.arange(len(df_rec))  # Position of each section row in the file
# Stable sort: sections of one paper stay in file order
df_rec_sorted = df_rec.sort_values(by=["paper_id", "_row"], kind="mergesort")

# ===== 5) Get the intro/conclusion/count for each paper_id =====
# The first section of a paper is taken as its introduction and the last one as its
# conclusion; a paper with a single section gets the same text in both columns.
grp = df_rec_sorted.groupby("paper_id")["section_content_vila"]
first_map = grp.first()   # First entry (based on file order)
last_map  = grp.last()    # Last entry (based on file order)
count_map = df_rec_sorted.groupby("paper_id").size()

# ===== 6) Assemble the results (preserving the order of the id file) =====
result = df_ids.copy()
result["section_count"] = result["paper_id"].map(count_map).fillna(0).astype(int)
result["introduction"]  = result["paper_id"].map(first_map).fillna("")
result["conclusion"]    = result["paper_id"].map(last_map).fillna("")

# ===== 7) Save the output =====
out_file = r"E:\judita's project\new data 2\paper_id_intro_conclusion.csv"
result.to_csv(out_file, index=False, encoding="utf-8-sig")

print("✅ Processing complete")
print("Total number of paper_ids:", len(result))
print("Number of matched paper_ids:", (result['section_count'] > 0).sum())
print("Number of unmatched paper_ids:", (result['section_count'] == 0).sum())
print("Output file:", out_file)

✅ Processing complete
Total number of paper_ids: 5817
Number of matched paper_ids: 5817
Number of unmatched paper_ids: 0
Output file: E:\judita's project\new data 2\paper_id_intro_conclusion.csv


In [11]:
import pandas as pd
import numpy as np
from IPython.display import display # Import the display function for better notebook output

# ---------- Inputs ----------
# Both spreadsheets sit in the project root; the merged CSV is written to "new data 2".
_project_root = r"E:\judita's project"
file1 = _project_root + r"\2017sheet5.xlsx"
file2 = _project_root + r"\2017sheet6.xlsx"
out_path = _project_root + r"\new data 2\merged_papers_withlink.csv"

# ---------- Helper Functions ----------

def normalize_columns(df):
    """Map the lower-cased text of every column label in ``df`` to the original label.

    Labels are passed through ``str()`` first, so non-string headers (for example
    numeric headers read from a spreadsheet) no longer raise ``AttributeError``.
    When two labels collapse to the same key, the later column wins.

    Returns:
        dict: ``{lowercase_label_text: original_label}``.
    """
    return {str(c).lower(): c for c in df.columns}

def pick_col(colmap, *candidates):
    """Return the original column name for the first candidate present in ``colmap``.

    ``colmap`` maps lower-cased names to original names. Candidates are tried in the
    order given, compared case-insensitively; ``None`` candidates are ignored.
    Returns ``None`` when nothing matches.
    """
    lowered = (name.lower() for name in candidates if name is not None)
    return next((colmap[key] for key in lowered if key in colmap), None)

def extract_needed(df):
    """Extract and standardize the columns needed for the merged paper table.

    Source columns are located case-insensitively through a list of aliases
    (e.g. 'paper_id' / 'id' / 'paperid'). Ratings are coerced to numbers, with
    unparseable values becoming NaN. A column that cannot be found is emitted as
    all-NaN.

    Args:
        df: Raw DataFrame as read from one input spreadsheet.

    Returns:
        DataFrame indexed like ``df`` with the fixed column order
        paper_id, rate0, rate1, rate2, avg_score, decison, title, abstract, pdf_link.
        ``avg_score`` is the mean of the available ratings of a row (NaN when the row
        has none). The output column keeps the spelling 'decison'.
    """
    colmap = normalize_columns(df)

    # Find the actual column names using various possible candidates
    c_paper_id = pick_col(colmap, "paper_id", "id", "paperid")
    c_rate0    = pick_col(colmap, "rate0", "rate_0", "r0")
    c_rate1    = pick_col(colmap, "rate1", "rate_1", "r1")
    c_rate2    = pick_col(colmap, "rate2", "rate_2", "r2")
    c_decison  = pick_col(colmap, "decison", "decision")
    c_title    = pick_col(colmap, "title")
    c_abstract = pick_col(colmap, "abstract", "abs")
    c_pdf      = pick_col(colmap, "pdf_link", "pdflink", "pdf", "pdf_url", "url", "link1")

    def source_or_nan(name):
        """Return ``df[name]``, or a scalar NaN (broadcast on assignment) when not found."""
        return df[name] if name is not None else np.nan

    # Pin the output to df's index up front. Previously the index was whatever the first
    # assignment produced: with no id column the frame fell back to a positional index,
    # and Series taken from a df with a non-default index were then misaligned.
    out = pd.DataFrame(index=df.index)
    out["paper_id"] = source_or_nan(c_paper_id)

    # Ratings: numeric where the column exists, NaN otherwise
    rate_names = ["rate0", "rate1", "rate2"]
    for standard_name, source_name in zip(rate_names, (c_rate0, c_rate1, c_rate2)):
        if source_name is not None:
            out[standard_name] = pd.to_numeric(df[source_name], errors="coerce")
        else:
            out[standard_name] = np.nan

    # Row-wise mean that skips NaN. Unlike np.nanmean it does not emit a RuntimeWarning
    # for rows that have no rating at all; those rows simply get NaN.
    out["avg_score"] = out[rate_names].mean(axis=1)

    # Remaining standardized columns
    out["decison"]   = source_or_nan(c_decison)
    out["title"]     = source_or_nan(c_title)
    out["abstract"]  = source_or_nan(c_abstract)
    out["pdf_link"]  = source_or_nan(c_pdf)

    # Return the DataFrame with a fixed column order
    final_cols = ["paper_id", "rate0", "rate1", "rate2", "avg_score", "decison", "title", "abstract", "pdf_link"]
    return out[final_cols]

# ---------- Main Logic ----------

# 1. Read Excel files
print("Reading input files...")
df1_raw = pd.read_excel(file1)
df2_raw = pd.read_excel(file2)

# 2. Process each file to standardize columns
print("Standardizing data from each file...")
df1 = extract_needed(df1_raw)
df2 = extract_needed(df2_raw)

# 3. Combine, clean, and remove duplicates
print("Merging files and removing duplicates...")
combined = pd.concat([df1, df2], ignore_index=True)
# Turn empty / whitespace-only strings into NaN BEFORE de-duplicating. Done afterwards
# (as before), ids such as "" and "  " counted as distinct keys, survived the dedupe
# and only then collapsed into several rows without an id.
combined_clean = combined.replace(r"^\s*$", np.nan, regex=True)
# Keep the first occurrence of each paper_id (file1 rows come before file2 rows)
combined_dedup = combined_clean.drop_duplicates(subset=["paper_id"], keep="first").copy()

# 4. Identify rows that are missing a title or an abstract
missing_title = combined_dedup[combined_dedup["title"].isna()]
missing_abstract = combined_dedup[combined_dedup["abstract"].isna()]
# Union of both sets, each row once, in table order
missing_records = combined_dedup[combined_dedup["title"].isna() | combined_dedup["abstract"].isna()]

# 5. Save the final merged file
print(f"Saving merged data to {out_path}...")
combined_dedup.to_csv(out_path, index=False, encoding="utf-8-sig")

# 6. Display the records with missing data directly in the notebook
print("\n--- Records with Missing Title or Abstract ---")
if missing_records.empty:
    print("No records are missing a title or abstract. Great!")
else:
    # 'display()' renders the DataFrame as a formatted table in Jupyter
    display(missing_records)

# 7. Print a final summary
print("\n--- Summary ---")
print("✅ Processing complete.")
print(f"Total unique records saved: {len(combined_dedup)}")
print(f"Records missing a title: {len(missing_title)}")
print(f"Records missing an abstract: {len(missing_abstract)}")
print(f"Merged file saved to: {out_path}")

Reading input files...
Standardizing data from each file...
Merging files and removing duplicates...
Saving merged data to E:\judita's project\new data 2\merged_papers_withlink.csv...

--- Records with Missing Title or Abstract ---


Unnamed: 0,paper_id,rate0,rate1,rate2,avg_score,decison,title,abstract,pdf_link
3474,2,3,4,2,3.0,0,,,
3499,1,3,2,4,3.0,0,,,



--- Summary ---
✅ Processing complete.
Total unique records saved: 6356
Records missing a title: 2
Records missing an abstract: 2
Merged file saved to: E:\judita's project\new data 2\merged_papers_withlink.csv


In [18]:
import pandas as pd
import numpy as np
import io, csv
import json

# ========= Paths (modify as needed) =========
# Every input and output of this step lives in the same working folder.
_data_dir = r"E:\judita's project\new data 2"
base_file = _data_dir + r"\paper_id_intro_conclusion.csv"
# Contains paper_id + various metadata
meta_file = _data_dir + r"\merged_papers_withlink.csv"
out_file  = _data_dir + r"\paper_id_intro_conclusion_enriched.csv"
unmatched_out = _data_dir + r"\unmatched_paper_id.csv"

# ========= Robust CSV Reader: Handles multiple encodings/delimiters, skips bad lines/quotes =========
def sniff_sep(sample: str, default=','):
    """Guess the field delimiter of a CSV text sample.

    Only comma, semicolon, tab and pipe are considered. If the sniffer cannot decide
    (or fails for any other reason), ``default`` is returned instead.
    """
    candidates = [',', ';', '\t', '|']
    try:
        detected = csv.Sniffer().sniff(sample, delimiters=candidates).delimiter
    except Exception:
        detected = default
    return detected

def try_read_bytes(raw: bytes, enc: str):
    """Strict attempt: decode ``raw`` with ``enc`` and parse it with pandas' defaults.

    The delimiter is sniffed from the first 20000 characters (comma as fallback).
    Decoding errors and parser errors propagate to the caller.
    """
    decoded = raw.decode(enc, errors="strict")
    delimiter = sniff_sep(decoded[:20000], default=',')
    buffer = io.StringIO(decoded)
    return pd.read_csv(buffer, sep=delimiter)

def try_read_bytes_relaxed(raw: bytes, enc: str):
    """Lenient attempt: decode ``raw`` with ``enc`` and parse it as forgivingly as possible.

    Undecodable bytes are replaced rather than rejected, malformed lines are skipped,
    and quote characters are not treated specially (backslash is the escape character).
    The delimiter is sniffed from the first 20000 characters (comma as fallback).
    """
    decoded = raw.decode(enc, errors="replace")
    lenient_options = dict(
        sep=sniff_sep(decoded[:20000], default=','),
        engine="python",
        on_bad_lines="skip",
        quoting=csv.QUOTE_NONE,
        escapechar="\\",
    )
    return pd.read_csv(io.StringIO(decoded), **lenient_options)

def safe_read_csv(path: str) -> pd.DataFrame:
    """Read a CSV file by trying several encodings and progressively looser parsers.

    Order of attempts:
      1. strict decode + default pandas parser, once per encoding;
      2. relaxed decode/parser (may drop malformed lines), once per encoding;
      3. a last latin1 attempt that ignores decode errors.

    The reasons for failed attempts are no longer swallowed silently: whenever the
    function has to leave strict mode it prints them, so a lossy read is visible.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first attempt that succeeds.

    Raises:
        OSError: if the file cannot be opened.
        Exception: whatever the final fallback parse raises, if it fails too.
    """
    # NOTE: latin1 can decode any byte sequence, so the encodings listed after it are
    # only reached when parsing (not decoding) the latin1 text fails.
    encodings_try = ["utf-8", "utf-8-sig", "cp1252", "latin1", "iso-8859-1", "gb18030", "big5"]
    with open(path, "rb") as f:
        raw = f.read()

    failures = []  # human-readable reasons for attempts that did not work

    # Try strict parsing first
    for enc in encodings_try:
        try:
            df = try_read_bytes(raw, enc)
        except Exception as exc:
            failures.append(f"strict/{enc}: {type(exc).__name__}: {exc}")
            continue
        print(f"[ok] Successfully parsed: {path} (encoding={enc})")
        return df

    # Strict mode failed everywhere: say so before degrading to the lossy parser
    print(f"[warn] Strict parsing failed for all encodings, retrying in relaxed mode: {path}")
    for reason in failures:
        print(f"       {reason}")

    # If strict fails, try relaxed parsing
    relaxed_failures = []
    for enc in encodings_try:
        try:
            df = try_read_bytes_relaxed(raw, enc)
        except Exception as exc:
            relaxed_failures.append(f"relaxed/{enc}: {type(exc).__name__}: {exc}")
            continue
        print(f"[ok] Successfully parsed in relaxed mode: {path} (encoding={enc})")
        return df

    print(f"[warn] Relaxed parsing failed for all encodings, using last-resort fallback: {path}")
    for reason in relaxed_failures:
        print(f"       {reason}")

    # As a final fallback, use latin1 and ignore all errors
    text = raw.decode("latin1", errors="ignore")
    sep = sniff_sep(text[:20000], default=',')
    df = pd.read_csv(io.StringIO(text), sep=sep, engine="python", on_bad_lines="skip",
                     quoting=csv.QUOTE_NONE, escapechar="\\")
    print(f"[ok] Successfully parsed with fallback: {path} (encoding=latin1, errors=ignore)")
    return df

# ========= Case-insensitive Column Selector =========
def colmap(df):
    """Map each lower-cased column name of ``df`` to its original spelling."""
    lookup = {}
    for original in df.columns:
        lookup[original.lower()] = original
    return lookup

def pick(df, *cands):
    """Return the first candidate that names a column of ``df``, compared case-insensitively.

    The result is the column's original spelling. Empty or ``None`` candidates are
    skipped; ``None`` is returned when no candidate matches.
    """
    lookup = colmap(df)
    for cand in cands:
        if not cand:
            continue
        folded = cand.lower()
        if folded in lookup:
            return lookup[folded]
    return None

# ========= Read Data (fault-tolerant + multi-encoding) =========
base = safe_read_csv(base_file)
meta = safe_read_csv(meta_file)

# Standardize the paper_id primary key
# (falls back to the first column when no 'paper_id' header is found)
pid_base_col = pick(base, "paper_id") or base.columns[0]
pid_meta_col = pick(meta, "paper_id") or meta.columns[0]

# A metadata row without an id cannot belong to any paper. Drop such rows before the
# string cast below: astype(str) turns a missing value into the literal "nan", which
# would otherwise equal a stringified missing id in the base table and be joined to it
# by the left join on paper_id.
meta = meta[meta[pid_meta_col].notna()].copy()

base[pid_base_col] = base[pid_base_col].astype(str).str.strip()
meta[pid_meta_col] = meta[pid_meta_col].astype(str).str.strip()
base = base.rename(columns={pid_base_col: "paper_id"})
meta = meta.rename(columns={pid_meta_col: "paper_id"})

# ========= Extract Required Fields from Meta (case-insensitive with aliases) =========
# Target attributes:
# 'paper_id','rate0','rate1','rate2','avg_score','decision','title','abstract','pdf_link'
rate0_col    = pick(meta, "rate0", "rate_0", "r0")
rate1_col    = pick(meta, "rate1", "rate_1", "r1")
rate2_col    = pick(meta, "rate2", "rate_2", "r2")
avg_col      = pick(meta, "avg_score", "average_score", "avg")
decision_col = pick(meta, "decision", "decison")  # Also accepts the misspelling 'decison'
title_col    = pick(meta, "title")
abstract_col = pick(meta, "abstract", "abs")
pdf_col      = pick(meta, "pdf_link", "link1", "pdf_url", "url", "pdf")  # Also accepts 'link1'

need_cols = ["paper_id"]
rename_map = {}

def add_if_exists(real_col_name, standard_col_name):
    """Register a resolved meta column under its standard name; no-op if it was not found."""
    if real_col_name is None:
        return
    need_cols.append(real_col_name)
    rename_map[real_col_name] = standard_col_name

# Registration order defines the column order of the reduced metadata table
for _real_name, _standard_name in (
    (rate0_col, "rate0"),
    (rate1_col, "rate1"),
    (rate2_col, "rate2"),
    (avg_col, "avg_score"),
    (decision_col, "decision"),
    (title_col, "title"),
    (abstract_col, "abstract"),
    (pdf_col, "pdf_link"),
):
    add_if_exists(_real_name, _standard_name)

# Drop repeated column names (keeping first occurrence), then select and rename
_unique_cols = list(dict.fromkeys(need_cols))
meta_reduced = meta[_unique_cols].copy().rename(columns=rename_map)

# If avg_score is missing, calculate it from whichever of rate0/1/2 exist (ignoring NaN)
if "avg_score" not in meta_reduced.columns:
    present_rates = [c for c in ["rate0", "rate1", "rate2"] if c in meta_reduced.columns]
    for c in present_rates:
        # Unparseable ratings become NaN and are skipped by the mean below
        meta_reduced[c] = pd.to_numeric(meta_reduced[c], errors="coerce")
    if present_rates:
        # Average the rating columns that are available. Previously the score was left
        # as NaN unless all three columns existed, which discarded usable ratings.
        # DataFrame.mean skips NaN and yields NaN for rows without any rating, without
        # the RuntimeWarning np.nanmean emits for such rows.
        meta_reduced["avg_score"] = meta_reduced[present_rates].mean(axis=1)
    else:
        meta_reduced["avg_score"] = np.nan

# If meta has duplicate paper_ids, keep the first occurrence (as per requirement)
meta_reduced = meta_reduced.drop_duplicates(subset=["paper_id"], keep="first")

# ========= Left Join: every row of the base table is kept =========
enriched = pd.merge(base, meta_reduced, how="left", on="paper_id")

# ========= Match Statistics =========
fields = ["title", "abstract", "rate0", "rate1", "rate2", "avg_score", "decision", "pdf_link"]
present_cols = [c for c in fields if c in enriched.columns]

def _nonempty(s):
    """Boolean mask: True where the value is neither null nor a blank string."""
    has_value = s.notna()
    has_text = s.astype(str).str.strip() != ""
    return has_value & has_text

# A row counts as matched when at least one metadata field carries a value.
# With no metadata columns at all, the mask simply stays all-False.
matched_any = np.zeros(len(enriched), dtype=bool)
for c in present_cols:
    matched_any |= _nonempty(enriched[c])

total_base = len(enriched)
matched_count = int(matched_any.sum())
unmatched_count = total_base - matched_count

print("✅ Merge complete")
print("Total rows in base table:", total_base)
print("Rows with at least one matched field:", matched_count)
print("Rows with no matched fields:", unmatched_count)

# Export list of unmatched items (paper_id only); nothing is written when all matched
if unmatched_count <= 0:
    print("Unmatched items: None")
else:
    unmatched_ids = enriched.loc[~matched_any, ["paper_id"]].drop_duplicates()
    unmatched_ids.to_csv(unmatched_out, index=False, encoding="utf-8-sig")
    print("List of unmatched IDs saved to:", unmatched_out)

# ========= Save Final Result =========
# Preferred column order of the exported table
final_columns_order = [
    "paper_id",
    "section_count",
    "introduction",
    "conclusion",
    "rate0",
    "rate1",
    "rate2",
    "avg_score",
    "decision",
    "title",
    "abstract",
    "pdf_link",
]
# Keep only the columns that are actually present; any other column is dropped
final_columns_existing = []
for col in final_columns_order:
    if col in enriched.columns:
        final_columns_existing.append(col)
enriched = enriched[final_columns_existing]

enriched.to_csv(out_file, index=False, encoding="utf-8-sig")
print("Output file:", out_file)


# ========= Optional: Null value statistics =========
# For each field of interest, list the paper_ids whose value is null or blank
check_cols = [
    "paper_id", "section_count", "introduction", "conclusion",
    "rate0", "rate1", "rate2", "avg_score",
    "decision", "title", "abstract", "pdf_link",
]

empty_records = {}

for col in (name for name in check_cols if name in enriched.columns):
    # A value counts as empty when it is NaN or consists of whitespace only
    is_null = enriched[col].isna()
    is_blank = enriched[col].astype(str).str.strip() == ""
    mask = is_null | is_blank
    if not mask.any():
        print(f"✅ Column '{col}' has no null values")
        continue
    empty_records[col] = enriched.loc[mask, "paper_id"].tolist()
    print(f"⚠️ Column '{col}' has {mask.sum()} null or empty values")
    # Only the first 10 ids are shown; the JSON file below holds the full list
    print("Examples of corresponding paper_ids:", empty_records[col][:10])

# Persist the complete per-field lists of affected paper_ids
out_missing = r"E:\judita's project\new data 2\missing_paper_ids_by_field.json"
with open(out_missing, "w", encoding="utf-8") as report_handle:
    json.dump(empty_records, report_handle, indent=2, ensure_ascii=False)

print("Full list of empty/null records has been exported to:", out_missing)

[ok] Successfully parsed: E:\judita's project\new data 2\paper_id_intro_conclusion.csv (encoding=utf-8)
[ok] Successfully parsed: E:\judita's project\new data 2\merged_papers_withlink.csv (encoding=utf-8)
✅ Merge complete
Total rows in base table: 5817
Rows with at least one matched field: 5817
Rows with no matched fields: 0
Unmatched items: None
Output file: E:\judita's project\new data 2\paper_id_intro_conclusion_enriched.csv
✅ Column 'paper_id' has no null values
✅ Column 'section_count' has no null values
⚠️ Column 'introduction' has 5 null or empty values
Examples of corresponding paper_ids: ['7R7fAoUygoa', 'D9I3drBz4UC', 'Fj1Tpym9KxH', 'FvfV64rovnY', 'RVhzamxHBjP']
⚠️ Column 'conclusion' has 1 null or empty values
Examples of corresponding paper_ids: ['VNJUTmR-CaZ']
✅ Column 'rate0' has no null values
✅ Column 'rate1' has no null values
✅ Column 'rate2' has no null values
✅ Column 'avg_score' has no null values
✅ Column 'decision' has no null values
✅ Column 'title' has no null v

In [28]:
import pandas as pd

# ===== 1. 读取原始数据文件 =====
file_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_enriched.csv"
df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip")

# ===== 2. 定义需要手动填补的内容 =====
intro_fill = {
    "7R7fAoUygoa": """Recent works have demonstrated a ubiquitous “double descent” phenomenon present in a range
of machine learning models, including decision trees, random features, linear regression, and deep
neural networks (Opper, 1995; 2001; Advani & Saxe, 2017; Spigler et al., 2018; Belkin et al., 2018;
Geiger et al., 2019b; Nakkiran et al., 2020; Belkin et al., 2019; Hastie et al., 2019; Bartlett et al.,
2019; Muthukumar et al., 2019; Bibas et al., 2019; Mitra, 2019; Mei & Montanari, 2019; Liang &
Rakhlin, 2018; Liang et al., 2019; Xu & Hsu, 2019; Derezinski et al., 2019; Lampinen & Ganguli, ´
2018; Deng et al., 2019; Nakkiran, 2019). The phenomenon is that models exhibit a peak of high
test risk when they are just barely able to fit the train set, that is, to interpolate. For example, as we
increase the size of models, test risk first decreases, then increases to a peak around when effective
model size is close to the training data size, and then decreases again in the overparameterized
regime. Also surprising is that Nakkiran et al. (2020) observe a double descent as we increase
sample size, i.e. for a fixed model, training the model with more data can hurt test performance.
These striking observations highlight a potential gap in our understanding of generalization and
an opportunity for improved methods. Ideally, we seek to use learning algorithms which robustly
improve performance as the data or model size grow and do not exhibit such unexpected non monotonic behaviors. In other words, we aim to improve the test performance in situations which
would otherwise exhibit high test risk due to double descent. Here, a natural strategy would be to
use a regularizer and tune its strength on a validation set. This motivates the central question of this
work:
When does optimally tuned regularization mitigate or remove the double-descent phenomenon?
Another motivation is the fact that double descent is largely observed for unregularized or under regularized models in practice. As an example, Figure 1 shows a simple linear ridge regression
setting in which the unregularized estimator exhibits double descent, but an optimally-tuned regu larizer has monotonic test performance.
Our Contributions: We study this question from both a theoretical and empirical perspective.
Theoretically, we start with the setting of high-dimensional linear regression. Linear regression is
a sensible starting point to study these questions, since it already exhibits many of the qualitative
features of double descent in more complex models (e.g. Belkin et al. (2019); Hastie et al. (2019)
and further related works in Section 1.1). Our work shows that optimally-tuned ridge regression can
achieve both sample-wise monotonicity and model-size-wise monotonicity under certain assump tions. Concretely, we show
1. Sample-wise monotonicity: In the setting of well-specified linear regression with isotropic
features/covariates (Figure 1), we prove that optimally-tuned ridge regression yields monotonic test
performance with increasing samples. That is, more data never hurts for optimally-tuned ridge
regression. (See Theorem 1).
2. Model-wise monotonicity: We consider a setting where the input/covariate lives in a high dimensional ambient space with isotropic covariance. Given a fixed model size d (which might be
much smaller than ambient dimension), we consider the family of models which first project the
input to a random d-dimensional subspace, and then compute a linear function in this projected
“feature space.” (This is nearly identical to models of double-descent considered in Hastie et al.
(2019, Section 5.1)). We prove that in this setting, as we grow the model-size, optimally-tuned
ridge regression over the projected features has monotone test performance. That is, with optimal
regularization, bigger models are always better or the same. (See Theorem 3).
3. Monotonicity in the real-world: We also demonstrate several richer empirical settings where
optimal ` 2 regularization induces monotonicity, including random feature classifiers and convolu tional neural networks. This suggests that the mitigating effect of optimal regularization may hold
more generally in broad machine learning contexts. (See Section 5).
A few remarks are in order:
Problem-specific vs Minimax and Bayesian. It is worth noting that our results hold for all linear
ground-truths, rather than holding for only the worst-case ground-truth or a random ground-truth.
Indeed, the minimax optimal estimator or the Bayes optimal estimator are both trivially sample-wise
and model-wise monotonic with respect to the minimax risk or the Bayes risk. However, they do not
guarantee monotonicity of the risk itself for a given fixed problem. In particular, there exist minimax
optimal estimators which are not sample-monotonic in the sense we desire.
Universal vs Asymptotic. We also remark that our analysis is not only non-asymptotic but also
works for all possible input dimensions, model sizes, and sample sizes. To our knowledge, the
results herein are the first non-asymptotic sample-wise and model-wise monotonicity results for
linear regression. (See discussion of related works Hastie et al. (2019); Mei & Montanari (2019)
for related results in the asymptotic setting). Our work reveals aspects of the problem that were not
present in prior asymptotic works. For example, we empirically show that optimal regularization
can eliminate even “triple descent” in ridge regression (Figure 2). Moreover, we show that for non Gaussian covariates, optimally-tuned ridge regression is not always sample-monotonic: we give a
counterexample in Section 4.
Towards a more general characterization. Our theoretical results crucially rely on the covariance
of the data being isotropic. A natural next question is if and when the same results can hold more
generally. A full answer to this question is beyond the scope of this paper, though we give the
following results:
1. Optimally-tuned ridge regression is not always sample-monotonic: we show a counterex ample for a certain non-Gaussian data distribution and heteroscedastic noise. We are not
aware of prior work pointing out this fact. (See Section 4 for the counterexample and
intuitions.)
2. For non-isotropic Gaussian covariates, we can achieve sample-wise monotonicity with a
regularizer that depends on the population covariance matrix of data. This suggests unla beled data might also help mitigate double descent in some settings, because the population
covariance can be estimated from unlabeled data. (See Appendix B).
3. For non-isotropic Gaussian covariates, we conjecture that optimally-tuned ridge regression
is sample-monotonic even with a standard ` 2 regularizer (as in Figure 2). We derive a
sufficient condition for this conjecture. Due to that current random matrix theory may be
insufficient to verify this conjecture, we verify it numerically on a wide variety of cases.
(See Appendix B for details).
The last two results above highlight the importance of the form of the regularizer, which leads to the
open question: “How do we design good regularizers which mitigate or remove double descent?”
We hope that our results can motivate future work on mitigating the double descent phenomenon,
and allow us to train high performance models which do not exhibit nonmonotonic behaviors.
1.1 RELATED WORKS
The study of nonmonotonicity in learning algorithms existed prior to double descent and has a long
history going back to (at least) Trunk (1979) and LeCun et al. (1991); Le Cun et al. (1991), where the
former was largely empirical observations and the latter studied the sample non-nonmonotonicity of
unregularized linear regression in terms of the eigenspectrum of the covariance matrix; the difference
to our works is that we study this in the context of optimal regularization. In fact, Duin (1995;
2000); Opper (2001); Loog & Duin (2012). Loog et al. (2019) introduces the same notion of risk
monotonicity which we consider, and studies several examples of monotonic and non-monotonic
procedures.
Double descent of test risk as a function of model size was considered recently in more generality
by Belkin et al. (2018). Similar behavior was observed empirically in earlier work in somewhat
more restricted settings Trunk (1979); Opper (1995; 2001); Skurichina & Duin (2002); Le Cun et al.
(1991); LeCun et al. (1991) and more recently in Advani & Saxe (2017); Geiger et al. (2019a);
Spigler et al. (2018); Neal et al. (2018). Recently Nakkiran et al. (2020) demonstrated a generalized
double descent phenomenon on modern deep networks, and highlighted “sample non-monotonicity”
as an aspect of double descent.
A recent stream of theoretical works consider model-wise double descent in simplified settings—
often via linear models for regression or classification. This also connects to works on high dimentional regression in the statistics literature. A partial list of works in these areas include
Belkin et al. (2019); Hastie et al. (2019); Bartlett et al. (2019); Muthukumar et al. (2019); Bibas
et al. (2019); Mitra (2019); Mei & Montanari (2019); Liang & Rakhlin (2018); Liang et al. (2019);
Xu & Hsu (2019); Derezinski et al. (2019); Lampinen & Ganguli (2018); Deng et al. (2019); Nakki- ´
ran (2019); Mahdaviyeh & Naulet (2019); Dobriban et al. (2018); Dobriban & Sheng (2019); Kobak
et al. (2018). Of these, most closely related to our work are Hastie et al. (2019); Dobriban et al.
(2018); Mei & Montanari (2019). Specifically, Hastie et al. (2019) considers the risk of unregular ized and regularized linear regression in an asymptotic regime, where dimension d and number of
samples n scale to infinity together, at a constant ratio d/n. In contrast, we show non-asymptotic
results, and are able to consider increasing the number of samples for a fixed model, without scaling
both together. Mei & Montanari (2019) derive similar results for unregularized and regularized random features, also in an asymptotic limit. The non-asymptotic versions of the settings considered in
Hastie et al. (2019) are almost identical to ours— for example, our projection model in Section 3 is
nearly identical to the model in Hastie et al. (2019, Section 5.1). Finally, subsequent to our work,
d’Ascoli et al. (2020) identified triple descent in an asymptotic setting.""",
    "D9I3drBz4UC": """Real-world data are often long-tail distributed over semantic classes: A few classes contain many
instances, whereas most classes contain only a few instances. Long-tailed recognition is challenging,
as it needs to handle not only a multitude of small-data learning problems on the tail classes, but
also extreme imbalanced classification over all the classes.
There are two ways to prevent the many head instances from overwhelming the few tail instances in
the classifier training objective: 1) class re-balancing/re-weighting which gives more importance
to tail instances (Cao et al., 2019; Kang et al., 2020; Liu et al., 2019), 2) ensembling over different
data distributions which re-organizes long-tailed data into groups, trains a model per group, and
then combines individual models in a multi-expert framework (Zhou et al., 2020; Xiang et al., 2020).
We compare three state-of-the-art (SOTA) long-tail classifiers against the standard cross-entropy
(CE) classifier: cRT and τ -norm (Kang et al., 2020) which adopt a two-stage optimization, first rep resentation learning and then classification learning, and LDAM (Cao et al., 2019), which is trained
end-to-end with a marginal loss. In terms of the classification accuracy, a common metric for model
selection on a fixed training set, Fig. 1a shows that, all these existing long-tail methods increase the
overall, medium- and few-shot accuracies over CE, but decrease the many-shot accuracy.
These intuitive solutions and their experimental results seem to suggest that there is a head-tail per formance trade-off in long-tailed recognition. We need a principled performance analysis approach
that could shed light on such a limitation if it exists and provide guidance on how to overcome it.
Our insight comes from a dynamic view of the training set: It is merely a sample set of some underly ing data distribution. Instead of evaluating how a long-tailed classifier performs on the fixed training
set, we evaluate how it performs as the training set fluctuates according to the data distribution.

Consider the training data D as a random variable. The prediction error of model h on instance x
with output Y varies with the realization of D. The expected variance with respect to variable D
has a well-known bias-variance decomposition:
Error(x; h) = E[(h(x; D) − Y )2] = Bias(x; h) + Variance(x; h) + irreducible error(x). (1)
For the above L2 loss on regression h(x) → Y , the model bias measures the accuracy of the pre diction with respect to the true value, the variance measures the stability of the prediction, and the
irreducible error measures the precision of the prediction and is irrelevant to the model h.
As shown on the above right, these concepts can be expressed entirely in terms of L2 loss L. We can
thus extended them to classification (Domingos, 2000) by replacing L with L0-1 for classification:
We apply such bias and variance analysis to the CE and long-tail classifiers. We sample CIFAR100
(Krizhevsky, 2009) according to a long-tail distribution multiple times. For each method, we train
a model per long-tail sampled dataset and then estimate the per-class bias and variance over these
multiple models on the balanced test set of CIFAR100-LT Liu et al. (2019). Fig. 1a shows that:
1. On the model bias: The head bias is significantly smaller than the tail bias, at 0.3 vs. 0.9 for
CE. All the existing long-tail methods reduce the overall bias by primarily reducing the tail bias.
However, the head-tail bias gap remains large at 0.3 vs. 0.8.
2. On the model variance: All the existing long-tail methods increase the model variance across
all class splits, with a slight reduction in the medium-shot variance for cRT.
That is, existing long-tail methods reduce the model bias for the tail at the cost of increased model
variance for all the classes, and the head-tail model bias gap remains large.
We conduct further statistical analysis to understand the head-tail model bias gap. We examine the
largest softmax score in the other classes of {c : c = t}, where t is the ground-truth class of an
instance. The smaller this hardest negative score is, the less the confusion, and the lower the model
bias. Fig. 1b shows that there is increasingly more and larger confusion from the head to the tail.
Guided by our model bias/variance and confusion pattern analysis, we propose a new long-tail classifier with four distinctive features: 1) It reduces the model variance for all the classes with multiple
experts. 2) It reduces the model bias for the tail with an additional distribution-aware diversity loss.
3) It reduces the computational complexity that comes with multiple experts with a dynamic expert
routing module which deploys another trained distinctive expert for a second (or third, ...) opinion
only when it is called for. 4) The routing module and a shared architecture for experts of reduced
complexity effectively cut down the computational cost of our multi-expert model, to a level that
could be even lower than the commonly adopted baseline with the same backbone.
Our so-called RoutIng Diverse Experts (RIDE) not only reduces the model variance for all the
classes, but also significantly reduces the model bias for the tail classes and increases the mean
accuracies for all class splits, all of which existing long-tail methods fail to accomplish.
RIDE delivers 5%∼7% higher accuracies than the current SOTA methods on CIFAR100-LT,
ImageNet-LT (Liu et al., 2019) and iNaturalist (Van Horn et al., 2018). RIDE is also a universal
framework that can be applied to different backbone networks for improving existing long-tail algorithms such as focal loss (Lin et al., 2017), LDAM (Cao et al., 2019), τ -norm (Kang et al., 2020)""",
    "Fj1Tpym9KxH": """Domain Adversarial Training (Ganin & Lempitsky, 2015) (DAT) refers to adversarial learning of
neural network based feature representations that are invariant to the domain. For example, car
images from the clipart domain have similar feature representations as car images from the web
domain. DAT has been widely useful in diverse areas (cited 3540 times) such as fairness (Adel
et al., 2019), object detection (Saito et al., 2019), domain generalization (Li et al., 2018), image to-image translation (Liu et al., 2017) etc. The prime driver of research on DAT is its application
in unsupervised Domain Adaptation (DA), which aims to learn a classifier using labeled source
data and unlabeled target data, such that it generalizes well on target data. Various enhancements
like superior objectives (Acuna et al., 2021; Zhang et al., 2019), architectures (Long et al., 2018)
etc. have been proposed to improve its effectiveness. However, as DAT objective is combination of
Generative Adversarial Network (GAN) (Goodfellow et al., 2014) and Empirical Risk Minimization
(ERM) (Vapnik, 2013) objectives, there has not been much focus on explicitly analyzing the nature
of optimization in DAT. One direction of work aiming to improve generalization of ERM on unseen
data focuses on developing algorithms that converge to a smooth (or a flat) minima (Foret et al.,
2021; Keskar & Socher, 2017). However, we find that these techniques, when directly applied for
DAT, do not significantly improve the generalization on the target domain (Sec. 4 and 7).
In this work, we analyze the loss landscape near the optimal point obtained by DAT, to gain insights
into curvature. We first focus on the eigen-spectrum of Hessian of the task loss (ERM term for
classification) where we find that using Stochastic Gradient Descent (SGD) as optimizer converges
to a smoother minima in comparison to Adam (Kingma & Ba, 2014). Further we find that smoother
minima w.r.t.task loss leads to better generalization on the target domain. Contrary to task loss,
we find that smoothness enhancing formulation for adversarial components worsen performance,
rendering ERM-based techniques which enhance smoothness for all loss components ineffective.
Hence we introduce Smooth Domain Adversarial Training (SDAT), which aims only to reach a
smooth minima w.r.t. task loss, and helps in generalizing better on the target domain. SDAT requires
an additional gradient computation step and can be combined with existing methods with a few lines
of code. We show the soundness of the SDAT method theoretically by proving a generalization
bound (Sec. 4) on target error. We extensively verify the empirical efficacy of SDAT across various
datasets for classification (i.e., DomainNet, VisDA-2017 and Office-Home), along with showing
a prototypical application in DA for object detection, demonstrating it’s diverse applicability. In
summary, we make the following contributions:
• We analyze the optimization procedure of DAT, establishing the correlation between the
smoothness near optima w.r.t. task loss and generalization on the target domain.
• Contrary to ERM, we show through our theoretical and empirical analysis that smoothness
enhancing adversarial formulation leads to sub-optimal performance.
• For enhancing the smoothness w.r.t. task loss near optima in DAT, we propose a novel and
theoretically motivated SDAT that improves the generalization on the target domain. SDAT
effectively increases the average performance of even state-of-the-art adversarial adaptation
methods.""",
    "FvfV64rovnY": """For a large variety of models and datasets, neural network performance has been empirically observed
to scale as a power-law with model size and dataset size (Hestness et al., 2017; Kaplan et al., 2020;
Rosenfeld et al., 2020b; Henighan et al., 2020). These exponents determine how quickly performance
improves with more data and larger models. We would like to understand why these power-laws
emerge, and what features of the data and models determine the values of the power law exponents.
In this work, we present a theoretical framework for understanding scaling laws in trained neural
networks. We identify four related scaling regimes with respect to the number of model parameters
P and the dataset size D. With respect to each of D, P, there is both a variance-limited regime and a
resolution-limited regime.
Variance-Limited Regime In the limit of infinite data or an arbitrarily wide model, some aspects
of neural network training simplify. Specifically, if we fix one of D, P and study scaling with respect
to the other parameter as it becomes arbitrarily large, then the difference between the finite test loss
and its limiting value scales as 1/x, i.e. as a power-law with exponent 1, with x = D or √
P ∝ width
in deep networks and x = D or P in linear models.
Resolution-Limited Regime In this regime, one of D or P is effectively infinite, and we study
scaling as the other parameter increases. In this case, a variety of works have empirically observed
power-law scalings 1/xα, typically with 0 < α < 1 for both x = P or D. We derive exponents in
this regime precisely in the setting of random feature models (c.f. next section). Empirically, we find
that our theoretical predictions for exponents hold in pretrained, fine-tuned models even though these
lie outside our theoretical setting.
For more general nonlinear models, we propose a refinement of naive bounds into estimates via
expansions that hold asymptotically. These rely on the idea that additional data (in the infinite
model-size limit) or added model parameters (in the infinite data limit) are used by the model to
carve up the data manifold into smaller components. For smooth manifolds, loss, and network, the
test loss will depend on the linear size of a sub-region, while it is the d-dimensional sub-region
volume that scales inversely with P or D, giving rise to α ∝ 1/d.
1 To test this empirically, we make
measurements of the resolution-limited exponents in neural networks and intrinsic dimension of the
data manifold, shown in Figure 1b.
Explicit Derivation We derive the scaling laws for these four regimes explicitly in the setting of
random feature teacher-student models, which also applies to neural networks in the large width limit.
This setting allows us to solve for the test error directly in terms of the feature covariance (kernel).
The scaling of the test loss then follows from the asymptotic decay of the spectrum of the covariance
matrix. For generic continuous kernels on a d-dimensional manifold, we can further relate this to the
dimension of the data manifold.
Summary of Contributions:
1. We propose four scaling regimes for neural networks. The variance-limited and resolution limited regimes originate from different mechanisms, which we identify. To our knowledge,
this categorization has not been previously exhibited. We provide empirical support for all four
regimes in deep networks on standard datasets.
2. We derive the variance-limited regime under simple yet general assumptions (Theorem 1).
3. We present a hypothesis for resolution-limited scaling through refinement of naive bounds (Theorems 2, 3), for general nonlinear models. We empirically test the dependence of the estimates on
intrinsic dimension of the data manifold for deep networks on standard datasets (Figure 1b).
4. In the setting of random feature teacher-student networks, we derive both variance-limited and
resolution-limited scaling exponents exactly. In the latter case, we relate this to the spectral decay
of kernels. We identify a novel duality that exists between model and dataset size scaling.
5. We empirically investigate predictions from the random features setting in pretrained, fine-tuned
models on standard datasets and find they give excellent agreement.
6. We study the dependence of the scaling exponent on changes in architecture and data, finding that
(i) changing the input distribution via switching datasets and (ii) the addition of noise have strong
effects on the exponent, while (iii) changing the target task via superclassing does not.
Related Works: There have been a number of recent works demonstrating empirical scaling laws
(Hestness et al., 2017; Kaplan et al., 2020; Rosenfeld et al., 2020b; Henighan et al., 2020; Rosenfeld
et al., 2020a) in deep neural networks, including scaling laws with model size, dataset size, compute,
and other observables such as mutual information and pruning. Some precursors (Ahmad & Tesauro,
1989; Cohn & Tesauro, 1991) can be found in earlier literature. Recently, scaling laws have also
played a significant role in motivating work on the largest models that have yet been developed
(Brown et al., 2020; Fedus et al., 2021).
There has been comparatively little work on theoretical ideas (Sharma & Kaplan, 2020; Bisla et al.,
2021) that match and explain empirical findings in generic deep neural networks. In the particular
case of large width, deep neural networks behave as random feature models (Neal, 1994; Lee et al.,
2018; Matthews et al., 2018; Jacot et al., 2018; Lee et al., 2019; Dyer & Gur-Ari, 2020), and known
results on the loss scaling of kernel methods can be applied (Spigler et al., 2020; Bordelon et al.,
2020). Though not in the original, Bordelon et al. (2020) analyze resolution-limited dataset size
scaling for power-law spectra in later versions.
During the completion of this work, Hutter (2021) presented a specific solvable model of learning
exhibiting non-trivial power-law scaling for power-law (Zipf) distributed features. This does not
directly relate to the setups studied in this work, or present bounds that supersede our results.
Concurrent to our work, Bisla et al. (2021) presented a derivation of the resolution-limited scaling
with dataset size, also stemming from nearest neighbor distance scaling on data manifolds. However,
they do not discuss requirements on model versus dataset size or how this scaling behavior fits into
other asymptotic scaling regimes.
In the variance-limited regime, scaling laws in the context of random feature models (Rahimi &
Recht, 2008; Hastie et al., 2019; d’Ascoli et al., 2020), deep linear models (Advani & Saxe, 2017;
Advani et al., 2020), one-hidden-layer networks (Mei & Montanari, 2019; Adlam & Pennington,
2020a;b), and wide neural networks treated as Gaussian processes or trained in the NTK regime
(Lee et al., 2019; Dyer & Gur-Ari, 2020; Andreassen & Dyer, 2020; Geiger et al., 2020) have been
studied. In particular, this behavior was used in (Kaplan et al., 2020) to motivate a particular ansatz
for simultaneous scaling with data and model size. The resolution-limited analysis can perhaps be
viewed as an attempt to quantify the ideal-world generalization error of Nakkiran et al. (2021).
This work makes use of classic results connecting the spectrum of a smooth kernel to the geometry
it is defined over (Weyl, 1912; Reade, 1983; Kuhn, 1987; Ferreira & Menegatto, 2009) and on the ¨
scaling of iteratively refined approximations to smooth manifolds (Stein, 1999; Bickel et al., 2007;
de Laat, 2011)."""
,
    "RVhzamxHBjP": """1.1 INVERSE PROBLEMS AND DEEP LEARNING
For many physical systems, we observe only the output and strive to infer the input. The inference
task is often captured by the generic term “inverse problem”. Formally, the underlying system
is modeled by a forward mapping f, and solving the inverse problem amounts to identifying the
inverse mapping f−1. Inverse problems abound in numerous fields and take diverse forms, see,
e.g., (Hartley & Zisserman, 2003; Gonzalez & Woods, 2017; Comon, 2010; Colton & Kress, 2013;
Herman, 2009; Entekhabi et al., 1994; Ge, 2013). Let y denote the observed output. Traditionally,
inverse problems are mostly formulated as regularized optimization problems of the form
minx(y, f(x)) + λΩ(x), (1.1)
where x represents the input to be estimated, (y, f(x)) ensures y ≈ f(x) (` means loss), Ω(x)
encodes prior knowledge about x—often added to make the problem well-posed, and λ is a tradeoff
parameter. To solve Eq. (1.1) , iterative numerical algorithms are often developed (Kirsch, 2011).
Deep learning has enabled learning data-driven loss ` or Ω, or replacing mappings in iterative meth ods for solving Eq. (1.1) by data-adaptive ones. These ideas can capture structures in practical data
not expressible before and tend to lead to faster and/or more effective algorithms. Most radical
is perhaps the end-to-end approach: a deep neural network (DNN) is directly set up and trained
to approximate the inverse mapping f
−1—backed by the famous universal approximation theo rem (Poggio et al., 2017) and based on a sufficiently large set of (x, y) pairs. Instead of citing the
abundance of individual papers, we refer the reader to the excellent review articles (McCann et al.,
2017; Lucas et al., 2018; Arridge et al., 2019; Ongie et al., 2020b) on these developments.
1.2 DIFFICULTY WITH SYMMETRIES
In this paper, we focus on the end-to-end learning approach. This approach has recently been widely
acclaimed for its remarkable performance on several tasks such as image denoising (Xie et al.,
2012), image super-resolution (Dong et al., 2014), image deblurring (Xu et al., 2014), and sparse
recovery (Mousavi & Baraniuk, 2017). In these examples, f is linear.
When f is nonlinear, intrinsic symmetries appear in many problems. A couple of quick examples:
• Fourier phase retrieval (PR) The forward model is Y = |F(X)|2, where X ∈ Cn×n andY ∈ Rm×m are matrices and F is the 2D (oversampled) Fourier transform. The operation
|·| takes elementwise complex magnitudes. It is well known that translations to the nonzero part of X (if feasible), conjugate flipping of X, and global phase transfer e
iθX forany θ ∈ [0, 2π) all lead to the same Y (Bendory et al., 2017).
• Blind deconvolution The forward model is y = a ~ x, where a is the convolution kernel,
x is the signal (e.g., image) of interest, and ~ denotes the circular convolution. Both a and
x are inputs. Here, a ~ x = (λa) ~ (x/λ) for any λ = 0, and circularly shifting a to the
left and shifting x to the right by the same amount does not change y (Lam & Goodman,
2000; Tonellot & Broadhead, 2010)
Solving these inverse problems means recovering the input up to the intrinsic system symmetries, as
evidently this is the best one can hope for.
Symmetries can cause significant difficulty for
the end-to-end approach. To see this, suppose
we randomly sample real values xi’s and form a
training set  xi, x2iand try to learn the squareroot function, allowing both positive and negative outputs, using the end-to-end approach.
Now if we think of the function determined by
the training set, which the neural network is trying to approximate, it is highly oscillatory (see
Fig. 1)1
: the sign symmetry dictates that in the
training set, there are frequent cases where x2I and x2j are close but xi and xj have different
signs and are far apart. Although in theory neural networks with adequate capacity are universal function approximators, in practice they will
struggle to learn such irregular functions. For general inverse problems, so long as the forward symmetries can relate remote inputs to the same output, similar problems can surface.
1.3 OUR CONTRIBUTION: SYMMETRY BREAKING
An easy fix to the above issue is fixing all signs of xi’s to be positive (or negative), which we call
“symmetry breaking”. We generalize this and
• Take phase retrieval (PR) as an example to show how symmetry breaking can be performed
and how this can lead to substantial gain in performance. For PR, our algorithm solves the
problem in a regime not accessible by previous methods.
• Identify the basic principle of effective symmetry breaking, which can be readily applied
to other inverse problems with symmetries."""
}

# Manually supplied conclusion text, keyed by paper_id. The update loop that
# follows writes each value into the 'conclusion' column of the matching row.
# NOTE(review): the text keeps the line breaks and typos of its source
# (presumably pasted from the paper's PDF) — confirm this is intended before
# normalising it.
conclusion_fill = {
    "VNJUTmR-CaZ": 
"""In this paper, we proposed a new graph neural network architecture, called CAM, for a multirobot task allocation problem with a set of complexities, including tasks with time deadline and robots with ferry range and payload capacity constraints. This new architecture incorporates an encoder based on covariant node-based embedding and a decoder based on attention mechanism. To learn the features of the encoder and decoder, the problem has been imposed as a reinforcement learning problem and it has been solved using a policy simple gradient algorithm, REINFORCE. In addition, to compare the performance of the proposed CAM method, an attention-based approach (aka AM) has been extended to be able to handle a multi-agent combinatorial optimization problem (i.e., a multi-robot task allocation problem).To evaluate the performance of the proposed architecture, and the extended version of AM are
trained for 200 epochs and tested on 100 unseen case studies. Performance was analyzed in terms of
cost function and completion rate. The new proposed architecture showed a better sample efficiency
than AM by reaching a better cost value only after 7 epochs versus 73 epochs of AM. Our primary
method, CAM, outperformed the AM approach by achieving (up to) 84% better cost function value
(in term of the mean value). The computational cost analysis showed that the proposed CAM model
takes a few milliseconds to take a decision; hence, it is an excellent choice for online decision making
(here, task allocation). A further study on the performance of both AM and CAM approaches on
case studies with different number of tasks demonstrated that our proposed CAM method is superior
to AM in all case studies. While the current method is operational over varying task size (the upper
bound is the number of tasks that model has been trained on), it is not invariant to the size of swarm.
One immediate future work is to expend the architecture to have a swarm size invariant model."""
}

# ===== 3. Update the DataFrame fields =====
# Assigning through a boolean mask is a silent no-op when no row matches, so a
# mistyped or missing paper_id would otherwise go unnoticed. Record every ID
# that is absent from df and report it after the updates.
known_ids = set(df['paper_id'])
unmatched = []

for column, fill_map in (('introduction', intro_fill), ('conclusion', conclusion_fill)):
    for pid, text in fill_map.items():
        if pid not in known_ids:
            unmatched.append((column, pid))
        # Performed unconditionally so the resulting df is exactly what the
        # plain assignment would produce.
        df.loc[df['paper_id'] == pid, column] = text

if unmatched:
    print("⚠️ paper_ids not found in df (nothing was filled):", unmatched)

# ===== 4. Save to a new file so the original is not overwritten =====
output_file = r"E:\judita's project\new data 2\paper_id_intro_conclusion_enriched_filled.csv"
df.to_csv(output_file, index=False, encoding="utf-8-sig")

print("✅ Filling complete, saved to:", output_file)


✅ Filling complete, saved to: E:\judita's project\new data 2\paper_id_intro_conclusion_enriched_filled.csv


In [29]:
import pandas as pd

# ✅ Step 1: Modify your file path
# Change to your own file path
csv_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_enriched_filled.csv"

# ✅ Step 2: Read the CSV (handle bad lines / compatible with common encodings)
# This cell audits row and null counts, so malformed lines must not disappear
# silently: 'warn' still skips them but reports each one, whereas 'skip' would
# quietly shrink the row count printed below.
try:
    df = pd.read_csv(csv_path, encoding='utf-8', on_bad_lines='warn')
except UnicodeDecodeError:
    # latin1 can decode any byte sequence, so non-ASCII text may come back
    # garbled; announce the fallback instead of continuing silently.
    print("⚠️ File is not valid UTF-8; falling back to latin1 (non-ASCII text may be garbled)")
    df = pd.read_csv(csv_path, encoding='latin1', on_bad_lines='warn')

# ✅ Step 3: Output the total record count and column names
print(f"📊 Total records (rows): {len(df)}")
print(f"🧩 Number of fields (columns): {df.shape[1]}")
print("📋 List of field names:")
print(df.columns.tolist())

# ✅ Step 4: Count null values / empty strings for each field
print("\n🔍 Null/Empty String Count per Column:")
for col in df.columns:
    # A value counts as missing when it is NaN or, once converted to text and
    # stripped, an empty string (i.e. empty or whitespace-only).
    mask = df[col].isna() | (df[col].astype(str).str.strip() == "")
    count = mask.sum()
    if count > 0:
        print(f"⚠️ {col}: {count} null/empty values")
    else:
        print(f"✅ {col}: No null values")

📊 Total records (rows): 5817
🧩 Number of fields (columns): 12
📋 List of field names:
['paper_id', 'section_count', 'introduction', 'conclusion', 'rate0', 'rate1', 'rate2', 'avg_score', 'decision', 'title', 'abstract', 'pdf_link']

🔍 Null/Empty String Count per Column:
✅ paper_id: No null values
✅ section_count: No null values
✅ introduction: No null values
✅ conclusion: No null values
✅ rate0: No null values
✅ rate1: No null values
✅ rate2: No null values
✅ avg_score: No null values
✅ decision: No null values
✅ title: No null values
✅ abstract: No null values
✅ pdf_link: No null values


In [33]:
import pandas as pd

# ✅ Step 1: Read the cleaned dataset (change the path to your own)
# This is the same path the fill step writes its output to.
file_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_enriched_filled.csv"
df = pd.read_csv(file_path, encoding="utf-8")

# ✅ Step 2: Define the concatenation helper (guards against missing values)
def safe_concat(prefix, text):
    """Return ``prefix`` followed by the stripped ``text``, or "" when there is no text.

    Args:
        prefix: Label placed in front of the text (e.g. "title: ").
        text: Cell value; converted with ``str`` and stripped of surrounding
            whitespace. ``None`` and float NaN are treated as missing.

    Returns:
        ``prefix`` + stripped text when the text is non-empty, otherwise "".
    """
    # None and float NaN (what pandas yields for an empty CSV cell) would
    # otherwise be stringified to "None"/"nan" and emitted as real content.
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).strip()
    return f"{prefix}{text}" if text else ""

# ✅ Step 3: Build the combined text fields
# Every output column is described by the ordered (prefix, source column)
# pairs it is made of; each pair is rendered with safe_concat and the pieces
# are joined with " // ".
title_part = ("title: ", "title")
abstract_part = ("abstract: ", "abstract")
intro_part = ("intro: ", "introduction")
concl_part = ("concl: ", "conclusion")

combined_fields = {
    "title+abstract": [title_part, abstract_part],
    "title+abstract+introduction": [title_part, abstract_part, intro_part],
    "title+abstract+introduction+conclusion": [title_part, abstract_part, intro_part, concl_part],
    "title+abstract+conclusion": [title_part, abstract_part, concl_part],
    "title+introduction+conclusion": [title_part, intro_part, concl_part],
}

for field_name, parts in combined_fields.items():
    # `parts=parts` binds the current spec to the lambda for this iteration.
    df[field_name] = df.apply(
        lambda row, parts=parts: " // ".join(
            [safe_concat(prefix, row[source]) for prefix, source in parts]
        ),
        axis=1,
    )

# ✅ Step 4: Save the dataset together with the new combined fields
output_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_combined.csv"
df.to_csv(output_path, index=False, encoding="utf-8-sig")

print("✅ Concatenation complete and saved to:", output_path)
print("📦 Fields added:", list(combined_fields))

✅ Concatenation complete and saved to: E:\judita's project\new data 2\paper_id_intro_conclusion_combined.csv
📦 Fields added: ['title+abstract', 'title+abstract+introduction', 'title+abstract+introduction+conclusion', 'title+abstract+conclusion', 'title+introduction+conclusion']


In [34]:
import pandas as pd
import random

# ===== 1. Load the combined CSV =====
file_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_combined.csv"
df = pd.read_csv(file_path, encoding="utf-8")

# ===== 2. Draw one paper at random =====
sampled = df.sample(n=1)
random_row = sampled.iloc[0]

# ===== 3. Print every relevant field of the sampled paper =====
print(f"🆔 paper_id: {random_row['paper_id']}\n")

# (label shown to the reader, column to read), in display order.
labelled_columns = [
    ("📄 Title", "title"),
    ("📜 Abstract", "abstract"),
    ("📘 Introduction", "introduction"),
    ("📕 Conclusion", "conclusion"),
    ("📎 title+abstract", "title+abstract"),
    ("📎 title+abstract+introduction", "title+abstract+introduction"),
    ("📎 title+abstract+introduction+conclusion", "title+abstract+introduction+conclusion"),
    ("📎 title+abstract+conclusion", "title+abstract+conclusion"),
    ("📎 title+introduction+conclusion", "title+introduction+conclusion"),
]
for label, column in labelled_columns:
    print(f"{label}:\n{random_row[column]}\n")


🆔 paper_id: ATp1nW2FuZL

📄 Title:
neural learning of one-of-many solutions for combinatorial problems in structured output spaces

📜 Abstract:
Recent research has proposed neural architectures for solving combinatorial problems in structured output spaces. In many such problems, there may exist multiple solutions for a given input, e.g. a partially filled Sudoku puzzle may have many completions satisfying all constraints. Further, we are often interested in finding any "one" of the possible solutions, without any preference between them. Existing approaches completely ignore this solution multiplicity. In this paper, we argue that being oblivious to the presence of multiple solutions can severely hamper their training ability. Our contribution is two-fold. First, we formally define the task of learning one-of-many solutions for combinatorial problems in structured output spaces, which is applicable for solving several problems of interest such as N-Queens, and Sudoku. Second, we presen

In [35]:
import pandas as pd
import json

# === 1. File paths (adjust to your actual locations) ===
csv_file_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_combined.csv"  # ✅ CSV file
json_file_path = r"E:\judita's project\全部数据\judita_统一ICLR7.7更新后.json"  # ✅ JSON file
output_path = r"E:\judita's project\new data 2\paper_id_intro_conclusion_combined_final.csv"  # ✅ output file

# === 2. Load the CSV ===
df_csv = pd.read_csv(csv_file_path)
# Normalise the join key to stripped strings on both sides so the merge
# compares like with like.
df_csv["paper_id"] = df_csv["paper_id"].astype(str).str.strip()

# === 3. Load the JSON ===
with open(json_file_path, "r", encoding="utf-8") as f:
    json_data = json.load(f)

# If the top level is a dict, use its first list-valued entry as the records,
# mirroring how the paper_id extraction cell reads this same file; otherwise
# the top level is already the list of records.
if isinstance(json_data, dict):
    for value in json_data.values():
        if isinstance(value, list):
            json_records = value
            break
    else:
        raise ValueError("List not found in the JSON file, please check the structure")
else:
    json_records = json_data

# === 4. Convert the JSON records to a DataFrame and rename id -> paper_id ===
df_json = pd.DataFrame(json_records)
df_json = df_json.rename(columns={"id": "paper_id"})
df_json["paper_id"] = df_json["paper_id"].astype(str).str.strip()

# Keep only the fields that are needed (edit this list as required)
columns_needed = [
    "paper_id", "page_count", "citation_count", "figure_count", "table_count", "reference_count", "publication", "year"
]
df_json = df_json[columns_needed]

# A left merge emits one output row per matching right-hand row, so repeated
# paper_ids on the JSON side would duplicate CSV rows. Keep the first
# occurrence of each ID and report how many rows were dropped.
json_rows_before = len(df_json)
df_json = df_json.drop_duplicates(subset="paper_id", keep="first")
dropped = json_rows_before - len(df_json)
if dropped:
    print(f"⚠️ Dropped {dropped} duplicate paper_id rows from the JSON metadata")

# === 5. Merge the two datasets (keeping the CSV's original row order) ===
df_merged = df_csv.merge(df_json, on="paper_id", how="left")

# === 6. Save the result ===
df_merged.to_csv(output_path, index=False, encoding="utf-8-sig")

print("✅ Merge complete, output file path:", output_path)
# Every merged-in column (all of columns_needed except the join key); NaN here
# means the CSV row had no matching JSON record or the JSON value was missing.
metadata_columns = [name for name in columns_needed if name != "paper_id"]
missing = df_merged[metadata_columns].isna().sum()
print("\n📊 Missing value statistics (after merge):")
print(missing)


✅ Merge complete, output file path: E:\judita's project\new data 2\paper_id_intro_conclusion_combined_final.csv

📊 Missing value statistics (after merge):
page_count         0
citation_count     0
figure_count       0
table_count        0
reference_count    0
publication        0
year               0
dtype: int64
