## Retrieve PDF based on openreview_id

In [None]:
import requests

def get_pdf(id, pdf_name, timeout=30):
    """Download the PDF attachment of an OpenReview edit and save it to disk.

    Args:
        id: Value sent as the ``id`` query parameter of the attachment URL.
            (The name shadows the builtin but is kept so keyword callers
            keep working.)
        pdf_name: Path of the file the response body is written to.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    # Let requests build the query string so the id is URL-encoded.
    pdf_url = "https://openreview.net/notes/edits/attachment"
    query = {"id": id, "name": "pdf"}

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(pdf_url, params=query, headers=headers, timeout=timeout)
    if response.status_code == 200:
        with open(pdf_name, "wb") as f:
            f.write(response.content)
        # f-string so that pathlib.Path file names can be reported as well
        print(f"✅ PDF is downloaded as {pdf_name}")
    else:
        print("❌ Failure, Status Code: ", response.status_code)

## Compare the Differences between two PDFs

In [None]:
from pdfminer.high_level import extract_text
import difflib
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` yields for *pdf_path*."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def compare_texts(text1, text2):
    """Return the unified diff between two texts as a single string.

    Both inputs are split into lines; the diff is labelled ``Original`` /
    ``Modified`` and its lines are joined with ``"\\n"``. Identical inputs
    give an empty string.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    # lineterm='' because the split lines carry no trailing newline
    diff_lines = difflib.unified_diff(
        before_lines, after_lines, 'Original', 'Modified', lineterm=''
    )
    return '\n'.join(diff_lines)

def parse_diff(diff_text):
    """Turn a unified diff string into a list with one dict per hunk.

    Args:
        diff_text: Unified diff text. Its first two lines are skipped; they
            are assumed to be the ``---`` / ``+++`` file headers.

    Returns:
        A list of dicts with the keys ``context_before``, ``context_after``,
        ``original_lines`` and ``modified_lines``. Each value is the
        concatenation of the stripped lines of that kind, every fragment
        followed by one space. Unchanged lines seen before the first
        ``-``/``+`` line of a hunk count as ``context_before``, later ones
        as ``context_after``.
    """
    all_diff = []
    current_diff = None
    for line in tqdm(diff_text.splitlines()[2:]):
        if line.startswith('@@'):
            # A hunk header closes the previous hunk and opens a new one.
            if current_diff is not None:
                all_diff.append(current_diff)
            current_diff = {
                'context_before': "",
                'context_after': "",
                'original_lines': "",
                'modified_lines': "",
            }
            continue

        if current_diff is None:
            # Content before the first hunk header belongs to no hunk.
            continue

        if line.startswith('-'):
            current_diff['original_lines'] += line[1:].strip() + " "
        elif line.startswith('+'):
            current_diff['modified_lines'] += line[1:].strip() + " "
        else:
            text = line.strip()
            if text == "":
                continue
            seen_change = (current_diff['original_lines'] != ""
                           or current_diff['modified_lines'] != "")
            target = 'context_after' if seen_change else 'context_before'
            current_diff[target] += text + " "

    # The final hunk has no following '@@' line, so it is flushed here;
    # without this the last hunk of every diff would be lost.
    if current_diff is not None:
        all_diff.append(current_diff)

    print("successfully build all_diff")

    return all_diff

#### Usage

In [None]:
# Paths of the two PDF versions that are compared.
pdf_original = "original.pdf"
pdf_modified = "modified.pdf"

# Extract the plain text of each PDF.
original = extract_text_from_pdf(pdf_original)
modified = extract_text_from_pdf(pdf_modified)

# Build the unified diff (a single string) between the two texts.
diff_result = compare_texts(original, modified)

# Parse the diff text; despite the variable name, parse_diff returns a list of per-hunk dicts.
all_diff_dict = parse_diff(diff_result)

## Extract Figure from a PDF

In [None]:
import pikepdf
from pathlib import Path

def find_items_with_prefix(obj, prefix):
    """Recursively collect ``(key, value)`` pairs whose key starts with *prefix*.

    Args:
        obj: Object to walk. It is iterated, and every iterated item ``k``
            is used both as a key (``obj[k]``) and as the name to test.
        prefix: String the ``str()`` form of a key must start with.

    Returns:
        A list of ``(key, value)`` tuples in depth-first order: a matching
        entry comes first, followed by the matches found inside its value.
        Objects that cannot be iterated or indexed contribute nothing.
    """
    # NOTE(review): there is no protection against cyclic object graphs;
    # confirm the inputs passed here cannot reference themselves.
    results = []
    try:
        for k in obj:
            key_str = str(k)
            if key_str.startswith(prefix):
                results.append((k, obj[k]))
            results.extend(find_items_with_prefix(obj[k], prefix))
    except Exception:
        # Best effort: leaves (numbers, names, ...) are not iterable or not
        # indexable by key, which ends the walk of this branch. Only
        # ``Exception`` is caught so Ctrl-C / SystemExit still propagate.
        pass
    return results

def extract_figures_from_pdf(page: pikepdf.Page, prefix_name: str, out_pdf_prefix: str = "page_1_figure"):
    """Write every matching XObject of *page* into its own one-page PDF.

    Args:
        page: Source page whose ``/XObject`` resources are searched.
        prefix_name: Name prefix (for example ``"/Im"``) an XObject key must
            start with to be exported.
        out_pdf_prefix: Prefix of the output files; file ``i`` (1-based, in
            the order the matches are found) is ``<prefix>_<i>.pdf``.
    """
    page_xobjects = page.Resources.get("/XObject", {})
    matches = find_items_with_prefix(page_xobjects, prefix_name)

    for figure_no, (name, form) in enumerate(matches, start=1):
        single_pdf = pikepdf.Pdf.new()

        # Bring the XObject into the new document first.
        copied_form = single_pdf.copy_foreign(form)

        # Reuse the source page as the base page, then replace its resources
        # so that only the copied XObject remains.
        single_pdf.pages.append(page)
        wrapper_page = single_pdf.pages[0]
        wrapper_page.Resources = pikepdf.Dictionary({
            "/XObject": pikepdf.Dictionary({name: copied_form})
        })

        # Content stream: identity matrix, then draw the XObject by name.
        paint_ops = b"".join([b"q\n1 0 0 1 0 0 cm ", name.encode("utf-8"), b" Do\nQ"])
        wrapper_page.Contents = single_pdf.make_indirect(pikepdf.Stream(single_pdf, paint_ops))

        target = Path(f"{out_pdf_prefix}_{figure_no}.pdf")
        single_pdf.save(target)
        print("Wrapped PDF written to", target)

#### Usage

In [None]:
# PDF whose page images are exported.
src_pdf_path = Path("paper.pdf")

# NOTE: the document is opened here and is not explicitly closed in this cell.
src = pikepdf.Pdf.open(str(src_pdf_path))

# For every page, export each XObject whose name starts with "/Im" into its own PDF.
# cnt is the 1-based page number and becomes part of the output file prefix.
cnt = 0
for page in src.pages:
    cnt = cnt + 1
    out_pdf_prefix = "original_page_"+str(cnt)+"_figure"
    extract_figures_from_pdf(
        page=page,
        prefix_name="/Im",
        out_pdf_prefix=out_pdf_prefix
    )

## Extract Paragraphs from a PDF

In [None]:
import re
import json
from pathlib import Path
from pdfminer.high_level import extract_text

def check_str_regex(s: str) -> bool:
    """Tell whether *s* has at least 3 digit/operator characters and fewer than 10 letters.

    Digit/operator characters are ASCII digits and ``+ - * /``; letters are
    ASCII ``A-Z`` / ``a-z``.
    """
    symbol_hits = re.findall(r'[0-9+\-*/]', s)
    if len(symbol_hits) < 3:
        return False
    ascii_letters = re.findall(r'[A-Za-z]', s)
    return len(ascii_letters) < 10

def extract_paragraphs_from_pdf(pdf_path: Path, up_form: str, output_json_file: Path):
    """Split a paper PDF into per-section paragraph lists and write them as JSON.

    The text returned by ``extract_text`` is cut into paragraphs at blank
    lines (a trailing ``-`` on a line is removed and the lines of a paragraph
    are concatenated without separator). Only the paragraphs from "Abstract"
    up to, but excluding, "Appendix" are processed. They are walked with a
    one-paragraph delay: while looking at ``line``, the previous paragraph
    (``before_context``) is filed under the current section.

    Args:
        pdf_path: PDF file to read.
        up_form: Paragraphs containing this text are dropped (the usage in
            this file passes the running page header).
        output_json_file: File the result is written to with ``json.dump``.

    The JSON object maps ``"Title"`` to the second paragraph of the document
    (empty string when there is none) and every detected section heading to
    a list of paragraph / caption strings. Text met before any section
    heading has been detected is skipped.
    """
    # extract all the text from pdf and split it into lines
    lines = extract_text(pdf_path).splitlines()

    # construct paragraphs based on the empty lines
    formatted_lines = []
    buffer = []
    for line in lines:
        if line.strip():
            # non-empty -> same paragraph; a trailing '-' is removed
            buffer.append(line[:-1] if line[-1] == '-' else line)
        elif buffer:
            # empty line -> close the paragraph collected so far
            formatted_lines.append("".join(buffer))
            buffer = []
    if buffer:
        formatted_lines.append("".join(buffer))

    # only extract paragraphs between abstract and appendix
    start = 0
    for marker in ("Abstract", "ABSTRACT"):
        if marker in formatted_lines:
            start = formatted_lines.index(marker)
            break
    else:
        print("can not find abstract")
    end = len(formatted_lines)
    for marker in ("Appendix", "APPENDIX"):
        if marker in formatted_lines:
            end = formatted_lines.index(marker)
            break
    else:
        print("can not find appendix")

    # the structured content and insert the title
    structured_content = {
        "Title": formatted_lines[1] if len(formatted_lines) > 1 else "",
    }

    # start constructing the structured content
    before_context = ""
    current_chapter = 0
    current_sub_chapter = 0
    current_section = ""
    is_across_page = False
    current_image_table = []
    is_chapter = False
    # NOTE(review): because of the one-paragraph delay, the last paragraph of
    # the range (and captions still queued at that point) is never stored.
    for line in formatted_lines[start:end]:
        if check_str_regex(line):  # digit/operator-heavy paragraph: ignored entirely
            continue

        # ---- file the previous paragraph (before_context) ----
        if up_form in before_context:
            pass  # filtered text
        elif is_chapter:
            # before_context is the heading registered in the previous step
            is_chapter = False
        elif before_context.isdigit() and len(line) > 20:
            # pure-digit paragraph followed by a long one: glued together at
            # the bottom of the loop
            is_across_page = True
        elif len(before_context) > 1 and before_context[0].isdigit() and before_context[1] != ".":
            pass  # starts with a digit that is not followed by '.': not stored
        elif before_context.startswith("Figure") or before_context.startswith("Table"):
            # captions are queued and inserted later
            current_image_table.append(before_context)
        elif before_context != "" and current_section in structured_content:
            paragraphs = structured_content[current_section]
            # last character of the latest entry; "." for an empty section
            char_end = paragraphs[-1][-1] if paragraphs else "."
            if not before_context.isdigit():  # get rid of pure digit
                starts_list_item = (
                    (before_context[0].isdigit() and before_context[1] == ".")
                    or before_context[0] in "•()"
                )
                if not paragraphs or (char_end == "." and not starts_list_item):
                    # also taken for an empty section, where there is nothing
                    # to merge a list item into
                    paragraphs.append(before_context)
                else:
                    # combine itemize and enumerate into one paragraph
                    paragraphs[-1] = paragraphs[-1] + " " + before_context
            if char_end == "." and len(current_image_table) != 0:
                # insert the queued figure/table captions
                paragraphs.extend(current_image_table)
                current_image_table = []

        # ---- detect headings on the current paragraph ----
        # Slices (line[:1], line[1:2], line[2:3]) are used so that short or
        # empty paragraphs cannot raise IndexError.
        # abstract
        if line in ("Abstract", "ABSTRACT"):
            is_chapter = True
            current_section = "Abstract"
            structured_content[current_section] = []
        # reference
        if line in ("REFERENCES", "References"):
            is_chapter = True
            current_section = "References"
            structured_content[current_section] = []
        # chapter: number and title in two separate paragraphs
        if before_context.isdigit() and len(line) <= 20:
            is_chapter = True
            current_section = before_context+" "+line
            structured_content[current_section] = []
            current_chapter = current_chapter + 1
            current_sub_chapter = 0
        # chapter: "<digit> <title>" in one paragraph
        if not line.isdigit() and line[:1].isdigit() and line[1:2] == " ":
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_chapter = current_chapter + 1
            current_sub_chapter = 0
        # sub-chapter: "<chapter>.<next sub-chapter>..."; single characters are
        # compared, so only one-digit numbers can match
        if (not line.isdigit() and line[:1] == str(current_chapter) and line[1:2] == "."
                and line[2:3] == str(current_sub_chapter+1)):
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_sub_chapter = current_sub_chapter + 1

        # combine the paragraph across two pages
        if is_across_page:
            before_context = before_context + " " + line
            is_across_page = False
        else:
            before_context = line

    # output the structured content as json
    with open(output_json_file, 'w') as json_file:
        json.dump(structured_content, json_file, indent=4)

#### Usage

In [None]:
# Input PDF to structure.
pdf_path = "original.pdf"
# Paragraphs containing this string are dropped by extract_paragraphs_from_pdf.
up_form = "Under review as a conference paper at ICLR 2025"
# File the structured content is dumped to as JSON.
output_json_file = 'data_original.json'

extract_paragraphs_from_pdf(pdf_path, up_form, output_json_file)

## Extract Tables from a PDF

In [None]:
import tabula
from pathlib import Path
from PyPDF2 import PdfReader

def extract_table_from_pdf(pdf_path: Path, output_csv_name: str):
    """Save every table that tabula reads from the PDF as its own CSV file.

    Pages are processed one at a time (1-based). Files are named
    ``<output_csv_name>_<page>_<table index>.csv``; the table index starts
    at 0 and keeps counting across pages.

    Args:
        pdf_path: PDF file to scan.
        output_csv_name: Prefix of the CSV files that are written.
    """
    total_pages = len(PdfReader(pdf_path).pages)

    table_idx = 0
    for page_no in range(1, total_pages + 1):
        # all tables tabula reports for this single page
        page_tables = tabula.read_pdf(pdf_path, pages=str(page_no), multiple_tables=True)
        for table in page_tables:
            print(f"Page {page_no} Table {table_idx}")
            table.to_csv(f"{output_csv_name}_{page_no}_{table_idx}.csv", index=False)
            table_idx += 1

#### Usage

In [None]:
# Input PDF and the filename prefix of the CSV files that are written.
pdf_path = "paper.pdf"
output_csv_name = "paper_table"
# Writes one CSV per table, named <prefix>_<page>_<table index>.csv.
extract_table_from_pdf(pdf_path, output_csv_name)

## Locate the differences in paragraphs

In [None]:
import re
import difflib
import requests
from tqdm import tqdm
from typing import List, Optional
from pathlib import Path
from pdfminer.high_level import extract_text

def get_pdf(id, pdf_name, timeout=30):
    """Download the PDF attachment of an OpenReview edit and save it to disk.

    Args:
        id: Value sent as the ``id`` query parameter of the attachment URL.
            (The name shadows the builtin but is kept so keyword callers
            keep working.)
        pdf_name: Path of the file the response body is written to.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` may block indefinitely.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the timeout expires.
    """
    # Let requests build the query string so the id is URL-encoded.
    pdf_url = "https://openreview.net/notes/edits/attachment"
    query = {"id": id, "name": "pdf"}

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(pdf_url, params=query, headers=headers, timeout=timeout)
    if response.status_code == 200:
        with open(pdf_name, "wb") as f:
            f.write(response.content)
        # f-string so that pathlib.Path file names can be reported as well
        print(f"✅ PDF is downloaded as {pdf_name}")
    else:
        print("❌ Failure, Status Code: ", response.status_code)

def extract_text_from_pdf(pdf_path):
    """Return the text that pdfminer's ``extract_text`` yields for *pdf_path*."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def compare_texts(text1, text2):
    """Return the unified diff between two texts as a single string.

    Both inputs are split into lines; the diff is labelled ``Original`` /
    ``Modified`` and its lines are joined with ``"\\n"``. Identical inputs
    give an empty string.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    # lineterm='' because the split lines carry no trailing newline
    diff_lines = difflib.unified_diff(
        before_lines, after_lines, 'Original', 'Modified', lineterm=''
    )
    return '\n'.join(diff_lines)

def parse_diff(diff_text):
    """Turn a unified diff string into a list with one dict per hunk.

    Args:
        diff_text: Unified diff text. Its first two lines are skipped; they
            are assumed to be the ``---`` / ``+++`` file headers.

    Returns:
        A list of dicts with the keys ``context_before``, ``context_after``,
        ``original_lines`` and ``modified_lines``. Each value is the
        concatenation of the stripped lines of that kind, every fragment
        followed by one space. Unchanged lines seen before the first
        ``-``/``+`` line of a hunk count as ``context_before``, later ones
        as ``context_after``.
    """
    all_diff = []
    current_diff = None
    for line in tqdm(diff_text.splitlines()[2:]):
        if line.startswith('@@'):
            # A hunk header closes the previous hunk and opens a new one.
            if current_diff is not None:
                all_diff.append(current_diff)
            current_diff = {
                'context_before': "",
                'context_after': "",
                'original_lines': "",
                'modified_lines': "",
            }
            continue

        if current_diff is None:
            # Content before the first hunk header belongs to no hunk.
            continue

        if line.startswith('-'):
            current_diff['original_lines'] += line[1:].strip() + " "
        elif line.startswith('+'):
            current_diff['modified_lines'] += line[1:].strip() + " "
        else:
            text = line.strip()
            if text == "":
                continue
            seen_change = (current_diff['original_lines'] != ""
                           or current_diff['modified_lines'] != "")
            target = 'context_after' if seen_change else 'context_before'
            current_diff[target] += text + " "

    # The final hunk has no following '@@' line, so it is flushed here;
    # without this the last hunk of every diff would be lost.
    if current_diff is not None:
        all_diff.append(current_diff)

    print("successfully build current_diff")

    return all_diff

def check_str_regex(s: str) -> bool:
    """Tell whether *s* has fewer than 10 ASCII letters and a line with at least 3 digits.

    The digits are counted per ``"\\n"``-separated line, which gives the same
    answers as searching for ``(?:.*\\d){3,}`` (``.`` does not cross a
    newline). Counting avoids that pattern's quadratic backtracking on long
    strings that contain only a few digits.
    """
    letter_count = len(re.findall(r'[A-Za-z]', s))
    if letter_count >= 10:
        return False
    return any(len(re.findall(r'\d', segment)) >= 3 for segment in s.split('\n'))

def preprocess_lines_in_paragraphs(lines: list) -> list:
    """Merge consecutive non-blank lines into paragraph strings.

    Blank (empty or whitespace-only) lines separate paragraphs. Within a
    paragraph, a line's trailing ``-`` is removed and the lines are
    concatenated without any separator.
    """
    paragraphs = []
    pieces = []
    for raw in lines:
        if not raw.strip():
            # blank separator: close the paragraph collected so far, if any
            if pieces:
                paragraphs.append("".join(pieces))
                pieces = []
            continue
        pieces.append(raw[:-1] if raw.endswith('-') else raw)

    # the text may end without a closing blank line
    if pieces:
        paragraphs.append("".join(pieces))

    return paragraphs

def extract_paragraphs_from_pdf_new(pdf_path: Path, filter_list: Optional[List[str]] = None):
    """Split a paper PDF into per-section lists of paragraphs and captions.

    The text returned by ``extract_text`` is turned into paragraphs with
    ``preprocess_lines_in_paragraphs``. Only the paragraphs from "Abstract"
    up to, but excluding, "Appendix" are processed. They are walked with a
    one-paragraph delay: while looking at ``line``, the previous paragraph
    (``before_context``) is filed under the current section.

    Args:
        pdf_path: PDF file to read.
        filter_list: Paragraphs containing any of these strings are dropped;
            ``None`` disables the filter.

    Returns:
        A dict mapping ``"Title"`` to a list holding the second paragraph of
        the document (empty list when there is none) and every detected
        section heading to a list of paragraph / caption strings. Text met
        before any section heading has been detected is skipped.
    """
    # extract all the text from pdf and split it into lines
    lines = extract_text(pdf_path).splitlines()

    # construct paragraphs based on the empty lines
    formatted_lines = preprocess_lines_in_paragraphs(lines)

    # only extract paragraphs between abstract and appendix
    start = 0
    for marker in ("Abstract", "ABSTRACT"):
        if marker in formatted_lines:
            start = formatted_lines.index(marker)
            break
    else:
        print("can not find abstract")
    end = len(formatted_lines)
    for marker in ("Appendix", "APPENDIX"):
        if marker in formatted_lines:
            end = formatted_lines.index(marker)
            break
    else:
        print("can not find appendix")

    # the structured content and insert the title (slice: no IndexError on
    # documents with fewer than two paragraphs)
    structured_content = {
        "Title": formatted_lines[1:2],
    }

    # start constructing the structured content
    before_context = ""
    current_section_idx = 0
    current_subsection_idx = 0
    current_section = ""
    current_image_table = []
    is_chapter = False
    # Index, inside the current section's list, of the most recently added
    # paragraph. Captions inserted after a paragraph do not move it, so the
    # end-of-sentence check and the merging of continuation text always
    # address that paragraph and never a caption.
    last_paragraph_idx = 0
    # NOTE(review): because of the one-paragraph delay, the last paragraph of
    # the range (and captions still queued at that point) is never stored.
    for line in formatted_lines[start:end]:
        if check_str_regex(line):  # digit-heavy paragraph: ignored entirely
            continue

        # check if before_context contains invalid content
        is_filter = filter_list is not None and any(
            text in before_context for text in filter_list
        )

        # ---- file the previous paragraph (before_context) ----
        if is_filter:
            pass
        elif is_chapter:
            # before_context is the heading registered in the previous step
            is_chapter = False
        elif before_context.startswith("Figure") or before_context.startswith("Table"):
            # captions are queued and inserted after the next new paragraph
            current_image_table.append(before_context)
        elif (before_context != "" and not before_context.isdigit()  # get rid of pure digit
                and current_section in structured_content):
            paragraphs = structured_content[current_section]
            if not paragraphs:
                paragraphs.append(before_context)
                last_paragraph_idx = 0
            else:
                char_end = paragraphs[last_paragraph_idx][-1]
                # "1." / "1)" / "•" / "(" openers continue the previous paragraph
                starts_list_item = (
                    (before_context[0].isdigit() and before_context[1] in ".)")
                    or before_context[0] in "•("
                )
                if char_end == "." and not starts_list_item:
                    paragraphs.append(before_context)
                    last_paragraph_idx = len(paragraphs) - 1
                    if current_image_table:
                        paragraphs.extend(current_image_table)
                        current_image_table = []
                else:
                    paragraphs[last_paragraph_idx] = (
                        paragraphs[last_paragraph_idx] + " " + before_context
                    )

        # ---- detect headings on the current paragraph ----
        # Slices (line[:1], line[1:2], line[2:3]) are used so that short or
        # empty paragraphs cannot raise IndexError.
        # abstract
        if line in ("Abstract", "ABSTRACT"):
            is_chapter = True
            current_section = "Abstract"
            structured_content[current_section] = []
        # reference
        if line in ("REFERENCES", "References"):
            is_chapter = True
            current_section = "References"
            structured_content[current_section] = []
        # chapter: number and title in two separate paragraphs
        if before_context.isdigit() and len(line) <= 20:
            is_chapter = True
            current_section = before_context+" "+line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # chapter: "<digit> <title>" in one paragraph
        if not line.isdigit() and line[:1].isdigit() and line[1:2] == " ":
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_section_idx = current_section_idx + 1
            current_subsection_idx = 0
        # sub-chapter: "<section>.<next subsection>..."; single characters are
        # compared, so only one-digit numbers can match
        if (not line.isdigit() and line[:1] == str(current_section_idx) and line[1:2] == "."
                and line[2:3] == str(current_subsection_idx+1)):
            is_chapter = True
            current_section = line
            structured_content[current_section] = []
            current_subsection_idx = current_subsection_idx + 1

        before_context = line

    return structured_content

def connect_diffs_and_paragraphs(original_pdf_path: Path, modified_pdf_path: Path, filter_list: Optional[List[str]] = None):
    """Locate every diff hunk between two PDFs in the paragraphs of the modified PDF.

    Args:
        original_pdf_path: PDF of the original version.
        modified_pdf_path: PDF of the modified version; its structured
            paragraphs are the ones searched.
        filter_list: Forwarded to ``extract_paragraphs_from_pdf_new``.

    Returns:
        A list of dicts. Each dict repeats the hunk's ``context_before``,
        ``context_after``, ``original_lines`` and ``modified_lines`` and adds
        ``section`` (the section key) and ``paragraph_idx`` (1-based position
        of the paragraph, counted across all sections in dict order). A hunk
        yields one entry per paragraph that contains the first 15 characters
        of its ``context_before``. Hunks without any preceding context cannot
        be located and are left out.
    """
    # extract text
    original_text = extract_text_from_pdf(original_pdf_path)
    print("Successfully extract text from the original pdf")
    modified_text = extract_text_from_pdf(modified_pdf_path)
    print("Successfully extract text from the modified pdf")

    # get the differences
    all_diff_result = compare_texts(original_text, modified_text)
    print("Successfully extract differences between original pdf and modified pdf")

    # get formatted differences
    formatted_diff_result = parse_diff(all_diff_result)
    print("Successfully get formatted differences")

    # get the structured paragraphs from modified paper
    structured_paragraphs_from_modified = extract_paragraphs_from_pdf_new(modified_pdf_path, filter_list)
    print("Successfully extract paragraphs from the modified pdf")

    # connect differences with paragraphs
    all_diff_loc = []
    for diff in formatted_diff_result:
        # the first 15 characters of the preceding context are the search key
        anchor = diff["context_before"][:15]
        if not anchor:
            # "" is a substring of every string; matching on it would attach
            # this hunk to every single paragraph
            continue
        idx = 0
        for key, val in structured_paragraphs_from_modified.items():
            for paragraph in val:
                idx = idx + 1
                if anchor in paragraph:
                    all_diff_loc.append({
                        "context_before": diff["context_before"],
                        "context_after": diff["context_after"],
                        "original_lines": diff["original_lines"],
                        "modified_lines": diff["modified_lines"],
                        "section": key,
                        "paragraph_idx": idx,
                    })
    print("Successfully connect differences with paragraphs")

    return all_diff_loc

#### Usage

In [None]:
# The two PDF versions that are compared.
original_pdf_path = "original.pdf"
modified_pdf_path = "modified.pdf"
# Paragraphs containing any of these strings are ignored when the modified PDF is structured.
filter_list = ["Under review as a conference paper at ICLR 2025", "Published as a conference paper at ICLR 2025"]

# List of diff records, each tagged with the section and paragraph index where it was found.
all_diff_loc = connect_diffs_and_paragraphs(original_pdf_path, modified_pdf_path, filter_list)