In [None]:
import numpy as np
import pandas as pd
import os, time, re, httpx, json, arxiv, tarfile, pathlib
from tqdm import tqdm
from google import genai
from google.genai import types
from openai import OpenAI
import anthropic

In [None]:
# API keys are read from environment variables so that credentials are never stored in the
# notebook itself; when a variable is unset the client receives '' exactly as before.
Gemini_client = genai.Client(api_key=os.environ.get('GEMINI_API_KEY', ''))
GPT_client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', ''))
Claude_client = anthropic.Anthropic(api_key=os.environ.get('ANTHROPIC_API_KEY', ''))

In [None]:
# Read the train and test splits from the WithdrarXiv-Check CSV files in the working directory.
train_csv, test_csv = 'WithdrarXiv-Check_train.csv', 'WithdrarXiv-Check_test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
# Cell output: (rows, columns) of each split.
(df_train.shape, df_test.shape)

In [None]:
# Preview the first rows of the test split.
df_test.head()

In [None]:
# Download paper source files from arXiv
os.makedirs('arxiv_src', exist_ok=True)  # make sure the download directory exists
arxiv_client = arxiv.Client()  # one client for the whole loop instead of a new one per paper
for paper_id in tqdm(df_test['paper_id']):
    paper = next(arxiv_client.results(arxiv.Search(id_list=[paper_id])))
    paper_src_path = paper.download_source(dirpath='arxiv_src')
    with tarfile.open(paper_src_path, "r:gz") as tar:
        # The archives are untrusted downloads: when this Python version supports extraction
        # filters, use 'data' so that members cannot be written outside the target directory.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall('arxiv_src/' + paper_id, filter='data')
        else:
            tar.extractall('arxiv_src/' + paper_id)
    time.sleep(4) # arXiv recommends no more than 1 request every 3 seconds

## Finding Problems

In [None]:
# Instruction prompt sent to every checker model together with the paper (PDF or TeX source).
# It requests at most 5 critical problems as a JSON list of
# {"Problem", "Location", "Explanation"} entries, which the checker cells parse with json.loads.
prompt_checker = '''Please check the attached paper for critical errors and unsoundness problems that would invalidate the conclusions. You can ignore minor issues (e.g, typos and formatting errors) and limitations that have been properly acknowledged.
In your final output, give me up to 5 most critical problems as a JSON object using the following schema: Entry = {"Problem": str, "Location": str, "Explanation": str}, Return: list[Entry]. For location, give page number, section number, equation number, or whatever applicable. You can end the list early if there are fewer problems. No need to provide references.'''

#### Gemini

In [None]:
# PDF
# Run the Gemini checker on every test paper's PDF and collect the parsed problems plus token usage.
checker_problems_GeminiPro = []
checker_model = 'Gemini 2.5 Pro'
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX such as
# \alpha). A "\u" only counts as valid when four hex digits follow, so \underline gets
# escaped while a genuine \uXXXX sequence is left untouched.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    paper_pdf_url = 'https://arxiv.org/pdf/' + entry['paper_id']
    paper_pdf_data = httpx.get(paper_pdf_url).content

    Gemini_checker_response = Gemini_client.models.generate_content(
        model="gemini-2.5-pro-preview-05-06",
        contents=[prompt_checker,
                  types.Part.from_bytes(data=paper_pdf_data, mime_type='application/pdf')],
        config=types.GenerateContentConfig(tools=[], response_mime_type='application/json',
                                           temperature=0, seed=42,
                                           thinking_config=types.ThinkingConfig(include_thoughts=True)),
    )
    # `.text` is None rather than '' when the response contains no text part, so test
    # truthiness; comparing with '' would let None through to json.loads().
    response_text = Gemini_checker_response.text
    if response_text:
        try:
            Gemini_checker_response_json = json.loads(response_text)
        except json.JSONDecodeError:
            # Retry once after doubling the stray backslashes.
            Gemini_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', response_text))
    else:
        Gemini_checker_response_json = []
    token_usage = Gemini_checker_response.usage_metadata

    checker_problem_entry = {'entry_id': i,
                             'retraction_id': entry['retraction_id'],
                             'paper_id': entry['paper_id'],
                             'retraction_comment': entry['retraction_comment'],
                             'checker_model': checker_model,
                             'attempt_id': 0,
                             'problems': Gemini_checker_response_json,
                             'token_usage': {'input': token_usage.prompt_token_count,
                                             'thinking': token_usage.thoughts_token_count,
                                             'output': token_usage.candidates_token_count}
    }
    checker_problems_GeminiPro.append(checker_problem_entry)

In [None]:
# Persist the Gemini PDF-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_GeminiPro.json', 'w') as out:
    out.write(json.dumps(checker_problems_GeminiPro, indent=2, ensure_ascii=False))

In [None]:
# TeX
# Run the Gemini checker on the concatenated TeX source of each paper; papers without TeX
# reuse the entry produced by the PDF run.
checker_problems_GeminiPro_tex = []
# The request below uses the gemini-2.5-pro model and the results go to *_GeminiPro_tex, so the
# label must say Pro (it previously said 'Gemini 2.5 Flash').
checker_model = 'Gemini 2.5 Pro'
check_problems_pdf = checker_problems_GeminiPro
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX); "\u" is
# only treated as valid when four hex digits follow.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    paper_id = entry['paper_id']
    latex = ''
    if entry['tex_available']:
        for tex_file in pathlib.Path('arxiv_src/'+paper_id).rglob("*.tex"):
            latex += tex_file.read_text(encoding="utf-8", errors="ignore")
        if latex == '':
            # No *.tex file found: fall back to the file named after the paper id.
            with open('arxiv_src/'+paper_id+'/'+paper_id, 'r', encoding='utf-8', errors='ignore') as f:
                latex = f.read()
        Gemini_checker_response = Gemini_client.models.generate_content(
            model="gemini-2.5-pro-preview-05-06",
            contents=[prompt_checker+'\n\n'+latex],
            config=types.GenerateContentConfig(tools=[], response_mime_type='application/json',
                                               temperature=0, seed=42,
                                               thinking_config=types.ThinkingConfig(include_thoughts=True)),
        )
        # `.text` is None rather than '' when the response contains no text part.
        response_text = Gemini_checker_response.text
        if response_text:
            try:
                Gemini_checker_response_json = json.loads(response_text)
            except json.JSONDecodeError:
                Gemini_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', response_text))
        else:
            Gemini_checker_response_json = []
        token_usage = Gemini_checker_response.usage_metadata

        checker_problem_entry = {'entry_id': i,
                                'retraction_id': entry['retraction_id'],
                                'paper_id': entry['paper_id'],
                                'retraction_comment': entry['retraction_comment'],
                                'checker_model': checker_model,
                                'attempt_id': 0,
                                'problems': Gemini_checker_response_json,
                                'token_usage': {'input': token_usage.prompt_token_count,
                                                'thinking': token_usage.thoughts_token_count,
                                                'output': token_usage.candidates_token_count}
        }
    else: # If no TeX available, use the corresponding PDF checker problems
        checker_problem_entry = next((e for e in check_problems_pdf if e['paper_id'] == paper_id), None)
        if checker_problem_entry is None:
            # Without this check the previous paper's entry would be appended again.
            raise LookupError('No PDF checker result found for paper ' + str(paper_id))
    checker_problems_GeminiPro_tex.append(checker_problem_entry)

In [None]:
# Persist the Gemini TeX-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_GeminiPro_tex.json', 'w') as out:
    out.write(json.dumps(checker_problems_GeminiPro_tex, indent=2, ensure_ascii=False))

#### OpenAI

In [None]:
# Get uploaded file list
# The checker cell below looks up "<paper_id>.pdf" in this list to avoid uploading a PDF twice.
file_list = GPT_client.files.list(purpose="user_data", order="asc")

In [None]:
# PDF
# Run the o3 checker on every test paper's PDF (uploaded once per paper) and collect the
# parsed problems, reasoning summary and token usage.
checker_problems_o3 = []
checker_model = 'o3'
# Removes a leading ```json / ``` fence and a trailing ``` fence. str.lstrip('```json') is
# avoided because it strips any run of the characters `, j, s, o, n rather than that prefix.
code_fence = re.compile(r'^\s*```(?:json)?\s*|\s*```\s*$')
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX); "\u" is
# only treated as valid when four hex digits follow.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    file_id = None
    for f in file_list.data: # if already uploaded, use the existing file
        if f.filename == entry['paper_id']+".pdf":
            file_id = f.id
            break
    if file_id is None: # if not, upload it
        paper_pdf_url = 'https://arxiv.org/pdf/' + entry['paper_id']
        paper_pdf_data = httpx.get(paper_pdf_url).content
        paper_pdf_upload = GPT_client.files.create(
            file=(entry['paper_id']+".pdf", paper_pdf_data, "application/pdf"),
            purpose="user_data")
        file_id = paper_pdf_upload.id

    GPT_checker_response = GPT_client.responses.create(
        model="o3-2025-04-16",
        input=[{"role": "user", "content": [
            {"type": "input_text", "text": prompt_checker},
            {"type": "input_file", "file_id": file_id},
        ]}],
        reasoning={'summary': 'auto'},
        tools=[], #temperature=0, seed=42
    )
    # The last output item holds the answer message; the first one holds the reasoning summary.
    GPT_checker_response_raw = code_fence.sub('', GPT_checker_response.output[-1].content[0].text)
    try:
        GPT_checker_response_json = json.loads(GPT_checker_response_raw)
    except json.JSONDecodeError:
        # try: GPT_checker_response_raw = GPT_checker_response_raw.split(':\n\n')[1].lstrip('```json').rstrip('```') # for o4-mini
        # except IndexError: GPT_checker_response_raw = GPT_checker_response_raw.split('.\n\n')[1].lstrip('```json').rstrip('```')
        GPT_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', GPT_checker_response_raw))
    GPT_checker_response_summary = [s.text for s in GPT_checker_response.output[0].summary] if GPT_checker_response.output[0].summary else []
    token_usage = GPT_checker_response.usage

    checker_problem_entry = {'entry_id': i,
                            'retraction_id': entry['retraction_id'],
                            'paper_id': entry['paper_id'],
                            'retraction_comment': entry['retraction_comment'],
                            'checker_model': checker_model,
                            'attempt_id': 0,
                            'problems': GPT_checker_response_json,
                            'think_summary': GPT_checker_response_summary,
                            'token_usage': {'input': token_usage.input_tokens,
                                            'thinking': token_usage.output_tokens_details.reasoning_tokens,
                                            # output_tokens includes the reasoning tokens, so subtract them
                                            'output': token_usage.output_tokens - token_usage.output_tokens_details.reasoning_tokens}
    }
    checker_problems_o3.append(checker_problem_entry)

In [None]:
# Persist the o3 PDF-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_o3.json', 'w') as out:
    out.write(json.dumps(checker_problems_o3, indent=2, ensure_ascii=False))

In [None]:
# TeX
# Run the o3 checker on the concatenated TeX source of each paper; papers without TeX reuse
# the entry produced by the PDF run.
checker_problems_o3_tex = []
checker_model = 'o3'
check_problems_pdf = checker_problems_o3
# Removes a leading ```json / ``` fence and a trailing ``` fence. str.lstrip('```json') is
# avoided because it strips any run of the characters `, j, s, o, n rather than that prefix.
code_fence = re.compile(r'^\s*```(?:json)?\s*|\s*```\s*$')
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX); "\u" is
# only treated as valid when four hex digits follow.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    paper_id = entry['paper_id']
    latex = ''
    if entry['tex_available']:
        for tex_file in pathlib.Path('arxiv_src/'+paper_id).rglob("*.tex"):
            latex += tex_file.read_text(encoding="utf-8", errors="ignore")
        if latex == '':
            # No *.tex file found: fall back to the file named after the paper id.
            with open('arxiv_src/'+paper_id+'/'+paper_id, 'r', encoding='utf-8', errors='ignore') as f:
                latex = f.read()

        GPT_checker_response = GPT_client.responses.create(
            model="o3-2025-04-16",
            input=[{"role": "user", "content": prompt_checker+'\n\n'+latex}],
            reasoning={'summary': 'detailed'},
            tools=[], #temperature=0, seed=42
        )
        # The last output item holds the answer message; the first one holds the reasoning summary.
        GPT_checker_response_raw = code_fence.sub('', GPT_checker_response.output[-1].content[0].text)
        try:
            GPT_checker_response_json = json.loads(GPT_checker_response_raw)
        except json.JSONDecodeError:
            # try: GPT_checker_response_raw = GPT_checker_response_raw.split(':\n\n')[1].lstrip('```json').rstrip('```') # for o4-mini
            # except IndexError: GPT_checker_response_raw = GPT_checker_response_raw.split('.\n\n')[1].lstrip('```json').rstrip('```')
            GPT_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', GPT_checker_response_raw))
        GPT_checker_response_summary = [s.text for s in GPT_checker_response.output[0].summary] if GPT_checker_response.output[0].summary else []
        token_usage = GPT_checker_response.usage

        checker_problem_entry = {'entry_id': i,
                                'retraction_id': entry['retraction_id'],
                                'paper_id': entry['paper_id'],
                                'retraction_comment': entry['retraction_comment'],
                                'checker_model': checker_model,
                                'attempt_id': 0,
                                'problems': GPT_checker_response_json,
                                'think_summary': GPT_checker_response_summary,
                                'token_usage': {'input': token_usage.input_tokens,
                                                'thinking': token_usage.output_tokens_details.reasoning_tokens,
                                                # output_tokens includes the reasoning tokens, so subtract them
                                                'output': token_usage.output_tokens - token_usage.output_tokens_details.reasoning_tokens}
        }
    else: # If no TeX available, use the corresponding PDF checker problems
        checker_problem_entry = next((e for e in check_problems_pdf if e['paper_id'] == paper_id), None)
        if checker_problem_entry is None:
            # Without this check the previous paper's entry would be appended again.
            raise LookupError('No PDF checker result found for paper ' + str(paper_id))
    checker_problems_o3_tex.append(checker_problem_entry)

In [None]:
# Persist the o3 TeX-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_o3_tex.json', 'w') as out:
    out.write(json.dumps(checker_problems_o3_tex, indent=2, ensure_ascii=False))

#### Claude

In [None]:
# PDF
# Run the Claude checker on every test paper's PDF (passed by URL) and collect the parsed
# problems, thinking text and token usage.
checker_problems_Claude = []
checker_model = 'Claude 3.7 Sonnet'
# Removes a leading ```json / ``` fence and a trailing ``` fence. str.lstrip('```json') is
# avoided because it strips any run of the characters `, j, s, o, n rather than that prefix.
code_fence = re.compile(r'^\s*```(?:json)?\s*|\s*```\s*$')
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX); "\u" is
# only treated as valid when four hex digits follow.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    paper_pdf_url = 'https://arxiv.org/pdf/' + entry['paper_id']

    Claude_checker_response = Claude_client.messages.create(
        model="claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": [
            {"type": "document", "source": {"type": "url", "url": paper_pdf_url}}, # Anthropic recommends putting the document first
            {"type": "text", "text": prompt_checker},
        ]}],
        max_tokens=16000,
        thinking={"type": "enabled", "budget_tokens": 14000},
        tools=[], temperature=1, #seed=42,
    )
    # The final content block is the answer text; the blocks before it are thinking blocks.
    Claude_checker_response_raw = code_fence.sub('', Claude_checker_response.content[-1].text)
    try:
        Claude_checker_response_json = json.loads(Claude_checker_response_raw)
    except json.JSONDecodeError:
        if re.search('\"Problem\":', Claude_checker_response_raw) is None:
            # The reply contains no problem entries at all.
            Claude_checker_response_json = []
        else:
            # The JSON is embedded in prose: take the paragraph following the introduction,
            # which ends with ':' or, failing that, '.' (same fallback as the TeX cell).
            try: json_segment = Claude_checker_response_raw.split(':\n\n')[1]
            except IndexError: json_segment = Claude_checker_response_raw.split('.\n\n')[1]
            Claude_checker_response_raw = code_fence.sub('', json_segment.split('\n\n')[0])
            Claude_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', Claude_checker_response_raw))
    # Only blocks of type 'thinking' expose `.thinking`; redacted_thinking blocks do not.
    Claude_checker_response_think = [block.thinking for block in Claude_checker_response.content[:-1]
                                     if block.type == 'thinking']
    token_usage = Claude_checker_response.usage
    # usage.output_tokens is a single total, so the answer text is re-counted here and the
    # remainder is attributed to thinking.
    token_usage_output = Claude_client.messages.count_tokens(
        model="claude-3-7-sonnet-20250219",
        messages=[{"role": "user", "content": Claude_checker_response.content[-1].text}],
    ).input_tokens

    checker_problem_entry = {'entry_id': i,
                            'retraction_id': entry['retraction_id'],
                            'paper_id': entry['paper_id'],
                            'retraction_comment': entry['retraction_comment'],
                            'checker_model': checker_model,
                            'attempt_id': 0,
                            'problems': Claude_checker_response_json,
                            'think_process': Claude_checker_response_think,
                            'token_usage': {'input': token_usage.input_tokens,
                                            'thinking': token_usage.output_tokens - token_usage_output,
                                            'output': token_usage_output}
    }
    checker_problems_Claude.append(checker_problem_entry)
    if token_usage.input_tokens > 80000: # for rate limit
        time.sleep(90)
    elif token_usage.input_tokens > 40000:
        time.sleep(45)

In [None]:
# Persist the Claude PDF-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_Claude.json', 'w') as out:
    out.write(json.dumps(checker_problems_Claude, indent=2, ensure_ascii=False))

In [None]:
# TeX
# Run the Claude checker on the concatenated TeX source of each paper; papers without TeX
# reuse the entry produced by the PDF run.
checker_problems_Claude_tex = []
checker_model = 'Claude 3.7 Sonnet'
check_problems_pdf = checker_problems_Claude
# Removes a leading ```json / ``` fence and a trailing ``` fence. str.lstrip('```json') is
# avoided because it strips any run of the characters `, j, s, o, n rather than that prefix.
code_fence = re.compile(r'^\s*```(?:json)?\s*|\s*```\s*$')
# Matches a lone backslash that does not start a valid JSON escape (typically LaTeX); "\u" is
# only treated as valid when four hex digits follow.
invalid_json_escape = re.compile(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})')
for i in tqdm(range(0, len(df_test))):
    entry = df_test.iloc[i]
    paper_id = entry['paper_id']
    latex = ''
    # Input tokens of the request made in THIS iteration; stays 0 when the PDF result is
    # reused, so the rate-limit pause below never relies on a previous iteration's usage.
    request_input_tokens = 0
    if entry['tex_available']:
        for tex_file in pathlib.Path('arxiv_src/'+paper_id).rglob("*.tex"):
            latex += tex_file.read_text(encoding="utf-8", errors="ignore")
        if latex == '':
            # No *.tex file found: fall back to the file named after the paper id.
            with open('arxiv_src/'+paper_id+'/'+paper_id, 'r', encoding='utf-8', errors='ignore') as f:
                latex = f.read()

        Claude_checker_response = Claude_client.messages.create(
            model="claude-3-7-sonnet-20250219",
            messages=[{"role": "user", "content": prompt_checker+'\n\n'+latex}],
            max_tokens=16000,
            thinking={"type": "enabled", "budget_tokens": 14000},
            tools=[], temperature=1, #seed=42,
        )
        # The final content block is the answer text; the blocks before it are thinking blocks.
        Claude_checker_response_raw = code_fence.sub('', Claude_checker_response.content[-1].text)
        try:
            Claude_checker_response_json = json.loads(Claude_checker_response_raw)
        except json.JSONDecodeError:
            if re.search('\"Problem\":', Claude_checker_response_raw) is None:
                # The reply contains no problem entries at all.
                Claude_checker_response_json = []
            else:
                # The JSON is embedded in prose: take the paragraph following the
                # introduction, which ends with ':' or, failing that, '.'.
                try: json_segment = Claude_checker_response_raw.split(':\n\n')[1]
                except IndexError: json_segment = Claude_checker_response_raw.split('.\n\n')[1]
                Claude_checker_response_raw = code_fence.sub('', json_segment.split('\n\n')[0])
                Claude_checker_response_json = json.loads(invalid_json_escape.sub(r'\\\\', Claude_checker_response_raw))
        # Only blocks of type 'thinking' expose `.thinking`; redacted_thinking blocks do not.
        Claude_checker_response_think = [block.thinking for block in Claude_checker_response.content[:-1]
                                         if block.type == 'thinking']
        token_usage = Claude_checker_response.usage
        request_input_tokens = token_usage.input_tokens
        # usage.output_tokens is a single total, so the answer text is re-counted here and
        # the remainder is attributed to thinking.
        token_usage_output = Claude_client.messages.count_tokens(
            model="claude-3-7-sonnet-20250219",
            messages=[{"role": "user", "content": Claude_checker_response.content[-1].text}],
        ).input_tokens

        checker_problem_entry = {'entry_id': i,
                                'retraction_id': entry['retraction_id'],
                                'paper_id': entry['paper_id'],
                                'retraction_comment': entry['retraction_comment'],
                                'checker_model': checker_model,
                                'attempt_id': 0,
                                'problems': Claude_checker_response_json,
                                'think_process': Claude_checker_response_think,
                                'token_usage': {'input': token_usage.input_tokens,
                                                'thinking': token_usage.output_tokens - token_usage_output,
                                                'output': token_usage_output}
        }
    else: # If no TeX available, use the corresponding PDF checker problems
        checker_problem_entry = next((e for e in check_problems_pdf if e['paper_id'] == paper_id), None)
        if checker_problem_entry is None:
            # Without this check the previous paper's entry would be appended again.
            raise LookupError('No PDF checker result found for paper ' + str(paper_id))
    checker_problems_Claude_tex.append(checker_problem_entry)
    if request_input_tokens > 80000: # for rate limit
        time.sleep(90)
    elif request_input_tokens > 40000:
        time.sleep(45)

In [None]:
# Persist the Claude TeX-checker results as indented JSON, keeping non-ASCII characters as-is.
with open('checker_problems_Claude_tex.json', 'w') as out:
    out.write(json.dumps(checker_problems_Claude_tex, indent=2, ensure_ascii=False))

## Cost Analysis

In [None]:
# Number of identified problems
# One count per paper, then the mean and the min / quartiles / max of those counts.
nProb = list(map(len, (entry['problems'] for entry in checker_problems_GeminiPro)))
quantile_levels = [0, 0.25, 0.5, 0.75, 1]
(np.mean(nProb), np.quantile(nProb, quantile_levels))

In [None]:
def estimate_cost(checker_problems, input_price, output_price):
    """Print the average token usage and the average cost per counted entry.

    Args:
        checker_problems: list of result entries, each holding a 'token_usage' dict with the
            keys 'input', 'thinking' and 'output' (any of the values may be None).
        input_price: price per one million input tokens.
        output_price: price per one million output tokens; thinking tokens are charged at
            this rate as well.

    Entries whose 'input' count is None are left out of the averages. A None 'thinking' or
    'output' count is treated as 0. The function prints the number of counted entries followed
    by the rounded average input / thinking / output tokens, then the average cost rounded to
    3 decimals. It returns None.
    """
    input_total, think_total, output_total = 0, 0, 0
    n = 0
    for entry in checker_problems:
        token_usage = entry['token_usage']
        if token_usage['input'] is None:
            continue  # no request was made for this entry
        n += 1
        input_total += token_usage['input']
        think_total += token_usage['thinking'] or 0
        output_total += token_usage['output'] or 0
    if n == 0:
        # Nothing to average; report zeros instead of dividing by zero.
        input_avg = think_avg = output_avg = 0.0
    else:
        input_avg = input_total / n
        think_avg = think_total / n
        output_avg = output_total / n
    print(n, round(input_avg), round(think_avg), round(output_avg))
    cost_avg = input_avg/1e6*input_price + (think_avg + output_avg)/1e6*output_price
    print(round(cost_avg, 3))

In [None]:
# Arguments after the result list: input price, output price (estimate_cost divides token counts by 1e6).
estimate_cost(checker_problems_GeminiPro, 1.25, 10) # Gemini 2.5 Pro

In [None]:
# NOTE(review): checker_problems_GeminiFlash is not created by any cell visible above; it is only
# loaded from checker_problems_GeminiFlash.json in the "Load checker problems" cell further down,
# so that cell has to run first.
estimate_cost(checker_problems_GeminiFlash, 0.15, 3.50) # Gemini 2.5 Flash

In [None]:
# Arguments after the result list: input price, output price (estimate_cost divides token counts by 1e6).
estimate_cost(checker_problems_o3, 10, 40) # o3

In [None]:
# NOTE(review): checker_problems_o4mini is not created by any cell visible above; it is only
# loaded from checker_problems_o4mini.json in the "Load checker problems" cell further down,
# so that cell has to run first.
estimate_cost(checker_problems_o4mini, 1.1, 4.4) # o4mini

In [None]:
# Arguments after the result list: input price, output price (estimate_cost divides token counts by 1e6).
estimate_cost(checker_problems_Claude, 3, 15) # Claude 3.7 Sonnet

## Eval

In [None]:
# Load checker problems
def _load_json(path):
    """Parse the JSON file at `path` and return the result; the file is closed on exit."""
    with open(path) as json_file:
        return json.load(json_file)

checker_problems_GeminiPro = _load_json('checker_problems_GeminiPro.json')
checker_problems_GeminiFlash = _load_json('checker_problems_GeminiFlash.json')
checker_problems_GeminiPro_tex = _load_json('checker_problems_GeminiPro_tex.json')
checker_problems_GeminiFlash_tex = _load_json('checker_problems_GeminiFlash_tex.json')

checker_problems_o3 = _load_json('checker_problems_o3.json')
checker_problems_o4mini = _load_json('checker_problems_o4mini.json')
checker_problems_o3_tex = _load_json('checker_problems_o3_tex.json')
checker_problems_o4mini_tex = _load_json('checker_problems_o4mini_tex.json')

checker_problems_Claude = _load_json('checker_problems_Claude.json')
checker_problems_Claude_tex = _load_json('checker_problems_Claude_tex.json')

### Hit Rate

In [None]:
# Judge prompt for the hit-rate evaluation. The placeholders {problem}, {location}, {explanation}
# and {retraction_comment} are filled with str.format() for one reported problem at a time; the
# judge is asked whether that problem is the one named in the retraction comment.
prompt_judge_hit = '''My colleague was reading a paper and said there is a problem in it, as described below:
Problem: {problem}
Location: {location}
Explanation: {explanation}

I checked the paper and noticed that the authors have the following retraction comment:
{retraction_comment}

Is my colleague referring to exactly the same problem mentioned in the retraction comment? Your final answer should be "Yes" or "No". Default your answer to "No" and only give "Yes" if you are certain. You may explain your decision but please be concise.
'''

In [None]:
# Gemini 2.5 Pro judge
# For every paper, ask the judge about each reported problem in turn and stop at the first hit.
eval_hit_o4mini_Gemini = []
checker_problems = checker_problems_o4mini
judge_model = 'Gemini 2.5 Pro'
# Whole-word match: a bare r'[Yy]es' also fires inside words such as "Bayesian" or "eyes",
# which would turn a "No" verdict into a hit.
yes_pattern = re.compile(r'\b[Yy]es\b')
for i in tqdm(range(0, len(df_test))):
    eval_entry = checker_problems[i]
    problems = eval_entry['problems']
    if len(problems) == 0:
        # Nothing to judge: record a miss with empty judge fields.
        hit_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': None,
                    'judge_model': judge_model,
                    'judge_response': None,
                    'hit': False,
                    'judge_think_summary': None,
                    'token_usage': {'input': None,
                                    'thinking': None,
                                    'output': None}
        }
        eval_hit_o4mini_Gemini.append(hit_entry)
        continue
    for j in range(0, len(problems)):
        problem = problems[j]
        Gemini_hit_response = Gemini_client.models.generate_content(
            model="gemini-2.5-pro-preview-06-05",
            contents=[prompt_judge_hit.format(
                problem=problem['Problem'],
                location=problem['Location'],
                explanation=problem['Explanation'],
                retraction_comment=eval_entry['retraction_comment'])],
            config=types.GenerateContentConfig(tools=[], temperature=0, seed=42,
                                               thinking_config=types.ThinkingConfig(include_thoughts=True))
        )
        # All parts except the last one are stored as the thinking summary.
        Gemini_hit_response_summary = [part.text for part in Gemini_hit_response.candidates[0].content.parts[:-1]]
        token_usage = Gemini_hit_response.usage_metadata
        # `.text` is None when the response has no text part; treat that as "no verdict".
        judge_text = Gemini_hit_response.text or ''
        hit_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': j,
                    'judge_model': judge_model,
                    'judge_response': Gemini_hit_response.text,
                    'hit': yes_pattern.search(judge_text) is not None,
                    'judge_think_summary': Gemini_hit_response_summary,
                    'token_usage': {'input': token_usage.prompt_token_count,
                                    'thinking': token_usage.thoughts_token_count,
                                    'output': token_usage.candidates_token_count}
        }
        eval_hit_o4mini_Gemini.append(hit_entry)
        if hit_entry['hit']:
            break

In [None]:
# Persist the Gemini judge's hit verdicts as indented JSON, keeping non-ASCII characters as-is.
with open('eval_hit_o4mini_Gemini.json', 'w') as out:
    out.write(json.dumps(eval_hit_o4mini_Gemini, indent=2, ensure_ascii=False))

In [None]:
# o3 judge
# For every paper, ask the judge about each reported problem in turn and stop at the first hit.
eval_hit_o4mini_o3 = []
checker_problems = checker_problems_o4mini
judge_model = 'o3'
# Whole-word match: a bare r'[Yy]es' also fires inside words such as "Bayesian" or "eyes",
# which would turn a "No" verdict into a hit.
yes_pattern = re.compile(r'\b[Yy]es\b')
for i in tqdm(range(0, len(df_test))):
    eval_entry = checker_problems[i]
    problems = eval_entry['problems']
    if len(problems) == 0:
        # Nothing to judge: record a miss with empty judge fields.
        hit_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': None,
                    'judge_model': judge_model,
                    'judge_response': None,
                    'hit': False,
                    'judge_think_summary': None,
                    'token_usage': {'input': None,
                                    'thinking': None,
                                    'output': None}
        }
        eval_hit_o4mini_o3.append(hit_entry)
        continue
    for j in range(0, len(problems)):
        problem = problems[j]
        GPT_hit_response = GPT_client.responses.create(
            model="o3-2025-04-16",
            input=[{"role": "user", "content": prompt_judge_hit.format(
                problem=problem['Problem'],
                location=problem['Location'],
                explanation=problem['Explanation'],
                retraction_comment=eval_entry['retraction_comment'])}],
            reasoning={'summary': 'auto'},
            tools=[], #temperature=0, seed=42
            service_tier="flex",
        )
        # The last output item holds the answer message; the first one holds the reasoning summary.
        GPT_hit_response_text = GPT_hit_response.output[-1].content[0].text
        GPT_hit_response_summary = [s.text for s in GPT_hit_response.output[0].summary] if GPT_hit_response.output[0].summary else []
        token_usage = GPT_hit_response.usage

        hit_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': j,
                    'judge_model': judge_model,
                    'judge_response': GPT_hit_response_text,
                    'hit': yes_pattern.search(GPT_hit_response_text) is not None,
                    'judge_think_summary': GPT_hit_response_summary,
                    'token_usage': {'input': token_usage.input_tokens,
                                    'thinking': token_usage.output_tokens_details.reasoning_tokens,
                                    # output_tokens includes the reasoning tokens, so subtract them
                                    'output': token_usage.output_tokens - token_usage.output_tokens_details.reasoning_tokens}
        }
        eval_hit_o4mini_o3.append(hit_entry)
        if hit_entry['hit']:
            break

In [None]:
# Persist the o3 judge's hit verdicts as indented JSON, keeping non-ASCII characters as-is.
with open('eval_hit_o4mini_o3.json', 'w') as out:
    out.write(json.dumps(eval_hit_o4mini_o3, indent=2, ensure_ascii=False))

In [None]:
# Hit rate by a single judge
eval_hit_df = pd.DataFrame(eval_hit_o4mini_o3)
# Select the 'hit' column explicitly. GroupBy.sum()'s first positional parameter is
# `numeric_only`, so .sum('hit') only passed a truthy flag and summed every numeric column.
eval_hit_df_groups = eval_hit_df.groupby('entry_id')[['hit']].sum()
# Cell output: number of papers, and the share of papers with at least one hit.
eval_hit_df_groups.shape[0], (eval_hit_df_groups['hit'] > 0).mean()

In [None]:
# Combine votes from both judges to get the final hit rate
# Context managers close the result files instead of leaving the handles open.
with open('eval_hit_o4mini_Gemini.json') as json_file:
    eval_hit_o4mini_Gemini = json.load(json_file)
with open('eval_hit_o4mini_o3.json') as json_file:
    eval_hit_o4mini_o3 = json.load(json_file)
# Select 'hit' explicitly: the first positional parameter of GroupBy.sum() is `numeric_only`,
# not a column name.
hr1 = pd.DataFrame(eval_hit_o4mini_Gemini).groupby('entry_id')['hit'].sum()
hr2 = pd.DataFrame(eval_hit_o4mini_o3).groupby('entry_id')['hit'].sum()
# Each judge loop stops at a paper's first hit, so a per-paper sum is 0 or 1 and a combined
# value of 2 means both judges reported a hit.
len(hr1), len(hr2), (hr1 + hr2 == 2).sum()/len(hr2)

### Precision

In [None]:
# Judge prompt for the precision evaluation. The placeholders {problem}, {location} and
# {explanation} are filled with str.format() for one reported problem at a time; the paper
# itself is attached to the request separately by the judge cells.
prompt_judge_tp = '''My colleague was reading this paper and said there is a critical problem in it, as described below:
Problem: {problem}
Location: {location}
Explanation: {explanation}

Is this problem a true problem or a false alarm? Please be careful because I don't want to get the authors into trouble by mistake. In your final answer, clearly indicate "Yes, it is a true problem" or "No, it is a false alarm". Make your best decision if you are unsure. You may explain your decision but please be concise.
'''

In [None]:
# Gemini 2.5 Pro judge
# For every reported problem, ask the judge (with the paper PDF attached) whether it is a
# true problem or a false alarm.
eval_tp_Claude_Gemini = []
checker_problems = checker_problems_Claude
judge_model = 'Gemini 2.5 Pro'
# Whole-word match: a bare r'[Yy]es' also fires inside words such as "Bayesian" or "eyes",
# which would turn a "No, it is a false alarm" verdict into a true positive.
yes_pattern = re.compile(r'\b[Yy]es\b')
for i in tqdm(range(0, len(df_test))):
    eval_entry = checker_problems[i]
    problems = eval_entry['problems']
    if len(problems) == 0:
        continue  # checked before the download so no PDF is fetched for nothing

    paper_pdf_url = 'https://arxiv.org/pdf/' + eval_entry['paper_id']
    paper_pdf_data = httpx.get(paper_pdf_url).content

    for j in range(0, len(problems)):
        problem = problems[j]
        Gemini_tp_response = Gemini_client.models.generate_content(
            model="gemini-2.5-pro-preview-06-05",
            contents=[
                types.Part.from_bytes(data=paper_pdf_data, mime_type='application/pdf'), # put document first to enable prompt caching
                prompt_judge_tp.format(
                    problem=problem['Problem'],
                    location=problem['Location'],
                    explanation=problem['Explanation'])
            ],
            config=types.GenerateContentConfig(tools=[], temperature=0, seed=42,
                thinking_config=types.ThinkingConfig(include_thoughts=True))
        )
        # All parts except the last one are stored as the thinking summary.
        Gemini_tp_response_summary = [part.text for part in Gemini_tp_response.candidates[0].content.parts[:-1]]
        token_usage = Gemini_tp_response.usage_metadata
        # `.text` is None when the response has no text part; treat that as "no verdict".
        judge_text = Gemini_tp_response.text or ''

        tp_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': j,
                    'judge_model': judge_model,
                    'judge_response': Gemini_tp_response.text,
                    'true_positive': yes_pattern.search(judge_text) is not None,
                    'judge_think_summary': Gemini_tp_response_summary,
                    'token_usage': {'input': token_usage.prompt_token_count,
                                    'thinking': token_usage.thoughts_token_count,
                                    'output': token_usage.candidates_token_count}
        }
        eval_tp_Claude_Gemini.append(tp_entry)

In [None]:
# Persist the Gemini judge's true-positive verdicts as indented JSON, keeping non-ASCII characters as-is.
with open('eval_tp_Claude_Gemini.json', 'w') as out:
    out.write(json.dumps(eval_tp_Claude_Gemini, indent=2, ensure_ascii=False))

In [None]:
# Get uploaded file list
# The judge cell below looks up "<paper_id>.pdf" in this list to avoid uploading a PDF twice.
file_list = GPT_client.files.list(purpose="user_data", order="asc")

In [None]:
# o3 judge
# For every reported problem, ask the judge (with the paper PDF attached) whether it is a
# true problem or a false alarm.
eval_tp_Claude_o3 = []
checker_problems = checker_problems_Claude
judge_model = 'o3'
# Whole-word match: a bare r'[Yy]es' also fires inside words such as "Bayesian" or "eyes",
# which would turn a "No, it is a false alarm" verdict into a true positive.
yes_pattern = re.compile(r'\b[Yy]es\b')
for i in tqdm(range(0, len(df_test))):
    eval_entry = checker_problems[i]
    problems = eval_entry['problems']
    if len(problems) == 0:
        continue

    file_id = None
    for f in file_list.data: # if already uploaded, use the existing file
        if f.filename == eval_entry['paper_id']+".pdf":
            file_id = f.id
            break
    if file_id is None: # if not, upload it
        paper_pdf_url = 'https://arxiv.org/pdf/' + eval_entry['paper_id']
        paper_pdf_data = httpx.get(paper_pdf_url).content
        paper_pdf_upload = GPT_client.files.create(
            file=(eval_entry['paper_id']+".pdf", paper_pdf_data, "application/pdf"),
            purpose="user_data")
        file_id = paper_pdf_upload.id

    for j in range(0, len(problems)):
        problem = problems[j]
        GPT_tp_response = GPT_client.responses.create(
            model="o3-2025-04-16",
            input=[{"role": "user", "content": [
                {"type": "input_file", "file_id": file_id}, # put document first to enable prompt caching
                {"type": "input_text", "text": prompt_judge_tp.format(
                    problem=problem['Problem'],
                    location=problem['Location'],
                    explanation=problem['Explanation'])},
            ]}],
            reasoning={'summary': 'auto'},
            tools=[], #temperature=0, seed=42
            service_tier="flex",
        )
        # The last output item holds the answer message; the first one holds the reasoning summary.
        GPT_tp_response_text = GPT_tp_response.output[-1].content[0].text
        GPT_tp_response_summary = [s.text for s in GPT_tp_response.output[0].summary] if GPT_tp_response.output[0].summary else []
        token_usage = GPT_tp_response.usage

        tp_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': j,
                    'judge_model': judge_model,
                    'judge_response': GPT_tp_response_text,
                    'true_positive': yes_pattern.search(GPT_tp_response_text) is not None,
                    'judge_think_summary': GPT_tp_response_summary,
                    'token_usage': {'input': token_usage.input_tokens,
                                    'thinking': token_usage.output_tokens_details.reasoning_tokens,
                                    # output_tokens includes the reasoning tokens, so subtract them
                                    'output': token_usage.output_tokens - token_usage.output_tokens_details.reasoning_tokens}
        }
        eval_tp_Claude_o3.append(tp_entry)

In [None]:
# Persist the judgement records collected in eval_tp_Claude_o3 as pretty-printed JSON.
# The encoding is pinned to UTF-8 because ensure_ascii=False writes non-ASCII characters
# verbatim, and the platform default encoding may be unable to represent them.
with open('eval_tp_Claude_o3.json', 'w', encoding='utf-8') as out:
    json.dump(eval_tp_Claude_o3, out, indent=2, ensure_ascii=False)

In [None]:
# Average precision by a single judge: per-paper fraction of problems judged true positive,
# then averaged over the papers that have at least one judged problem.
eval_tp_df = pd.DataFrame(eval_tp_Claude_o3)
# numeric_only is passed by keyword: the first positional parameter of GroupBy.mean is
# numeric_only (not a column name), so mean('true_positive') only worked because a
# non-empty string is truthy. The resulting frame is the same.
eval_tp_df_groups = eval_tp_df.groupby('entry_id').mean(numeric_only=True)
eval_tp_df_groups.shape[0], eval_tp_df_groups['true_positive'].mean()

In [None]:
# Combine votes from both judges to get the final average precision.
# A problem counts as a true positive only when both judges marked it as one.
# Files are read as UTF-8 (the JSON interchange encoding) inside `with` blocks so the
# handles are closed deterministically.
with open('eval_tp_Claude_Gemini.json', encoding='utf-8') as f_gemini:
    tp1 = pd.DataFrame(json.load(f_gemini))
with open('eval_tp_Claude_o3.json', encoding='utf-8') as f_o3:
    tp2 = pd.DataFrame(json.load(f_o3))
# Inner join keeps only problems that were judged by both models; _x is Gemini, _y is o3.
tp = pd.merge(tp1, tp2, on=['entry_id', 'problem_id'], how='inner')[['entry_id', 'problem_id', 'true_positive_x', 'true_positive_y']]
tp['true_positive'] = (tp['true_positive_x'] & tp['true_positive_y'])
# numeric_only must be passed by keyword; the first positional parameter of GroupBy.mean
# is numeric_only, not a column name.
tp_groups = tp.groupby('entry_id').mean(numeric_only=True)
# (number of papers, mean per-paper precision, total number of agreed true positives)
tp_groups.shape[0], tp_groups['true_positive'].mean(), tp['true_positive'].sum()

In [None]:
# An edge case: test paper 103 has more than 100 pages, which exceeds the capacity of OpenAI models, so we use LaTeX instead
# NOTE: this cell appends to the already existing eval_tp_GeminiFlash_o3 list, so running it
# more than once adds duplicate records for this paper.
checker_problems = checker_problems_GeminiFlash
judge_model = 'o3'
for i in tqdm(range(103, 104)):
    eval_entry = checker_problems[i]
    problems = eval_entry['problems']
    paper_id = eval_entry['paper_id']
    if len(problems) == 0:
        continue

    # Concatenate every .tex file of the extracted source. Paths are sorted because rglob
    # yields them in filesystem order, which would make the prompt differ between machines;
    # a newline separator keeps the last line of one file from fusing with the next file.
    tex_paths = sorted(pathlib.Path('arxiv_src/'+paper_id).rglob("*.tex"))
    tex_texts = [tex_file.read_text(encoding="utf-8", errors="ignore") for tex_file in tex_paths]
    latex = '\n'.join(text for text in tex_texts if text)
    if latex == '':
        # No usable .tex file: fall back to the file named after the paper id in the source dir.
        with open('arxiv_src/'+paper_id+'/'+paper_id, 'r', encoding='utf-8', errors='ignore') as f:
            latex = f.read()

    for j in range(0, len(problems)):
        problem = problems[j]
        # Here the question is sent first and the paper text second (the PDF-based judge
        # cell uses the opposite order).
        GPT_tp_response = GPT_client.responses.create(
            model="o3-2025-04-16",
            input=[{"role": "user", "content": [
                {"type": "input_text", "text": prompt_judge_tp.format(
                    problem=problem['Problem'],
                    location=problem['Location'],
                    explanation=problem['Explanation'])},
                {"type": "input_text", "text": latex},
            ]}],
            reasoning={'summary': 'auto'},
            tools=[], #temperature=0, seed=42
            service_tier="flex",
        )
        # Assumes the last output item is the answer message and the first one is the
        # reasoning item carrying the (possibly empty) summary.
        GPT_tp_response_text = GPT_tp_response.output[-1].content[0].text
        GPT_tp_response_summary = [s.text for s in GPT_tp_response.output[0].summary] if GPT_tp_response.output[0].summary else []
        token_usage = GPT_tp_response.usage

        tp_entry = {'entry_id': eval_entry['entry_id'],
                    'retraction_id': eval_entry['retraction_id'],
                    'paper_id': eval_entry['paper_id'],
                    'checker_model': eval_entry['checker_model'],
                    'attempt_id': 0,
                    'problem_id': j,
                    'judge_model': judge_model,
                    'judge_response': GPT_tp_response_text,
                    # NOTE(review): substring match - "yes"/"Yes" anywhere in the reply,
                    # including inside a longer word, is counted as a positive verdict.
                    'true_positive': re.search(r'[Yy]es', GPT_tp_response_text) is not None,
                    'judge_think_summary': GPT_tp_response_summary,
                    # 'output' is the reported output token count minus the reasoning tokens.
                    'token_usage': {'input': token_usage.input_tokens,
                                    'thinking': token_usage.output_tokens_details.reasoning_tokens,
                                    'output': token_usage.output_tokens - token_usage.output_tokens_details.reasoning_tokens}
        }
        eval_tp_GeminiFlash_o3.append(tp_entry)