## Retrieve PDF based on openreview_id

In [None]:
import requests

def get_pdf(id, pdf_name, timeout=30):
    """Download the PDF attachment of an OpenReview edit and save it to disk.

    Args:
        id: OpenReview id sent as the ``id`` query parameter.
        pdf_name: Path of the local file the PDF bytes are written to.
        timeout: Seconds ``requests`` waits on the server before raising.
            Without it a stalled connection would block indefinitely.

    Prints a confirmation after writing the file when the server answers
    with status 200; otherwise prints the status code and writes nothing.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            ``timeout`` elapses.
    """
    # pdf url
    pdf_url = "https://openreview.net/notes/edits/attachment"
    # Let requests build the query string so the id is URL-encoded correctly.
    query = {"id": id, "name": "pdf"}

    headers = {
        "User-Agent": "Mozilla/5.0"
    }
    response = requests.get(pdf_url, params=query, headers=headers, timeout=timeout)
    if response.status_code == 200:
        with open(pdf_name, "wb") as f:
            f.write(response.content)
        print(f"✅ PDF is downloaded as {pdf_name}")
    else:
        print("❌ Failure, Status Code: ", response.status_code)

## Compare the Differences between two PDFs

In [None]:
from pdfminer.high_level import extract_text
import difflib
from tqdm import tqdm

def extract_text_from_pdf(pdf_path):
    """Return the text pdfminer's ``extract_text`` produces for ``pdf_path``."""
    pdf_text = extract_text(pdf_path)
    return pdf_text

def compare_texts(text1, text2):
    """Return the unified diff between two texts as a single string.

    Both texts are split into lines and compared with
    ``difflib.unified_diff``. The two sides are labelled ``Original`` and
    ``Modified``, and the diff lines are joined with newlines. The result
    is an empty string when the texts have no differing lines.
    """
    before_lines = text1.splitlines()
    after_lines = text2.splitlines()
    # lineterm='' keeps the header lines free of trailing newlines, since
    # the pieces are joined with '\n' below.
    diff_lines = difflib.unified_diff(
        before_lines,
        after_lines,
        fromfile='Original',
        tofile='Modified',
        lineterm='',
    )
    return '\n'.join(diff_lines)

def parse_diff(diff_text):
    """Split a unified diff into one dict per ``@@`` hunk.

    Args:
        diff_text: Unified diff as a single string. Its first two lines are
            skipped as the ``---`` / ``+++`` file headers.

    Returns:
        A list of dicts, one per hunk in order of appearance, with keys:
            ``context_before``: unchanged lines seen before the first change,
            ``original_lines``: text of all ``-`` lines in the hunk,
            ``modified_lines``: text of all ``+`` lines in the hunk,
            ``context_after``: unchanged lines seen after the first change.
        Each value is the stripped lines joined together, every piece
        followed by a single space. Blank context lines are ignored.
    """
    lines = diff_text.splitlines()

    all_diff = []
    current_diff = None
    for line in tqdm(lines[2:]):
        # Check for diff change markers
        if line.startswith('@@'):
            if current_diff is not None:
                # Add the previous diff to the list
                all_diff.append(current_diff)
            # Start a new diff block
            current_diff = {
                'context_before': "",
                'context_after': "",
                'original_lines': "",
                'modified_lines': "",
            }
            continue

        if current_diff is None:
            # Content before the first hunk header belongs to no hunk;
            # skip it instead of failing on the missing dict.
            continue

        if line.startswith('-'):
            current_diff['original_lines'] += line[1:].strip() + " "
        elif line.startswith('+'):
            current_diff['modified_lines'] += line[1:].strip() + " "
        elif line.strip() != "":
            # Unchanged line: it is leading context until the hunk has
            # recorded any removed or added text, trailing context afterwards.
            change_seen = (
                current_diff['original_lines'] != ""
                or current_diff['modified_lines'] != ""
            )
            key = 'context_after' if change_seen else 'context_before'
            current_diff[key] += line.strip() + " "

    # The loop only stores a hunk when the next '@@' header appears, so the
    # final hunk must be stored here or it would be dropped.
    if current_diff is not None:
        all_diff.append(current_diff)

    print("successfully build all_diff")

    return all_diff

#### Usage

In [None]:
# Paths of the two PDF versions to compare.
pdf_original = "original.pdf"
pdf_modified = "modified.pdf"

# Extract the text of each PDF (original first, then modified).
original, modified = (
    extract_text_from_pdf(path) for path in (pdf_original, pdf_modified)
)

# Unified diff of the two texts, as one string.
diff_result = compare_texts(original, modified)

# Parse the diff into a list of per-hunk dicts.
all_diff_dict = parse_diff(diff_result)

## Extract Images from a PDF

In [None]:
import pikepdf
from pathlib import Path

def find_items_with_prefix(obj, prefix):
    """Recursively collect ``(key, value)`` pairs whose key starts with ``prefix``.

    ``obj`` is iterated for its keys; every key whose ``str()`` form starts
    with ``prefix`` contributes ``(key, obj[key])``, and every value is then
    searched the same way, so nested matches follow their parent in the
    result.

    Args:
        obj: Mapping-like object to search. Anything that cannot be iterated
            or indexed by its own items yields no matches.
        prefix: String the key names are tested against.

    Returns:
        A list of ``(key, value)`` tuples in traversal order.
    """
    results = []
    try:
        for k in obj:
            key_str = str(k)
            if key_str.startswith(prefix):
                results.append((k, obj[k]))
            results.extend(find_items_with_prefix(obj[k], prefix))
    except Exception:
        # Best-effort traversal: leaves (numbers, strings, non-mapping
        # objects) fail on iteration or lookup, which ends the search at
        # this level while keeping what was found so far. Only Exception is
        # caught so KeyboardInterrupt / SystemExit still propagate.
        pass
    return results

def wrap_form_xobject(page: pikepdf.Page, prefix_name: str, out_pdf_prefix: str = "page_1_figure"):
    """Write each matching XObject of ``page`` to its own one-page PDF.

    The page's ``/XObject`` resources are searched (including nested
    entries) with ``find_items_with_prefix``. For every hit a new PDF is
    built whose single page draws only that object, and it is saved as
    ``<out_pdf_prefix>_<n>.pdf`` with ``n`` counting from 1.

    Args:
        page: Source page whose resources are searched.
        prefix_name: Name prefix an XObject key must start with, e.g. "/Im".
        out_pdf_prefix: Filename prefix for the generated PDFs.
    """
    # Gather every (name, object) pair to export.
    page_xobjects = page.Resources.get("/XObject", {})
    matches = find_items_with_prefix(page_xobjects, prefix_name)

    for index, (xobj_name, xobj_form) in enumerate(matches, start=1):
        # Fresh document that owns a copy of the object.
        new_pdf = pikepdf.Pdf.new()
        form_copy = new_pdf.copy_foreign(xobj_form)

        # Add the source page to the new document, then replace its
        # Resources and Contents so only the copied object remains.
        new_pdf.pages.append(page)
        new_page = new_pdf.pages[0]
        only_this_xobject = pikepdf.Dictionary({xobj_name: form_copy})
        new_page.Resources = pikepdf.Dictionary({"/XObject": only_this_xobject})

        # Content stream: save graphics state, identity transform, paint the
        # object by name, restore graphics state.
        name_bytes = xobj_name.encode("utf-8")
        draw_cmd = b"q\n1 0 0 1 0 0 cm " + name_bytes + b" Do\nQ"
        new_page.Contents = new_pdf.make_indirect(pikepdf.Stream(new_pdf, draw_cmd))

        # Save under a numbered filename.
        output_pdf_path = Path(f"{out_pdf_prefix}_{index}.pdf")
        new_pdf.save(output_pdf_path)
        print("Wrapped PDF written to", output_pdf_path)

#### Usage

In [None]:
src_pdf_path = Path("paper.pdf")
src = pikepdf.Pdf.open(str(src_pdf_path))

# Export the "/Im"-prefixed XObjects of every page; pages are numbered from 1.
cnt = 0  # stays 0 when the document has no pages
for cnt, page in enumerate(src.pages, start=1):
    out_pdf_prefix = f"original_page_{cnt}_figure"
    wrap_form_xobject(page=page, prefix_name="/Im", out_pdf_prefix=out_pdf_prefix)