In [13]:
import os
import fitz  # PyMuPDF
import pandas as pd
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import spacy

# Load spaCy model
# Loads the spaCy pipeline named "en_core_web_sm" and binds it to `nlp`.
# NOTE(review): `nlp` is not referenced by any code visible in this excerpt —
# confirm a later cell uses it; otherwise this (slow) load can be dropped.
nlp = spacy.load("en_core_web_sm")

def extract_images_from_pdf(
    pdf_path,
    min_width=1200,
    save_folder="/Users/tanishqsingh/Desktop/XR_Lab/Extracted_images",
    metadata_csv_path="/Users/tanishqsingh/Desktop/XR_Lab/image_metadata.csv",
):
    """Extract the embedded images of a PDF, save them as PNG files, and write a metadata CSV.

    Each image listed by ``page.get_images(full=True)`` is decoded with Pillow
    and written to ``save_folder`` as ``page<P>_img<I>.png`` (both 1-based).

    Args:
        pdf_path: Path of the PDF to read.
        min_width: Pixel width at or above which an image is flagged
            ``is_high_res`` in the metadata (only the width is checked).
        save_folder: Directory that receives the PNG files; created if missing.
        metadata_csv_path: Where the metadata CSV is written. The default is
            the location the previous implementation always wrote to (it
            joined ``save_folder`` with an absolute path, which discards
            ``save_folder``).

    Returns:
        A list with one dict per saved image, with keys ``pdf_name``,
        ``image_path``, ``page_number``, ``image_width``, ``image_height``,
        ``image_resolution`` and ``is_high_res``. An empty list is returned
        when ``pdf_path`` does not exist or no image could be extracted.
    """
    if not os.path.exists(pdf_path):
        print(f"‚ùå PDF file not found: {pdf_path}")
        return []

    print(f"üìÑ Extracting images from: {pdf_path}")
    doc = fitz.open(pdf_path)
    image_data = []

    # try/finally so the document handle is released even if a page fails.
    try:
        os.makedirs(save_folder, exist_ok=True)
        pdf_name = os.path.basename(pdf_path)

        for page_index in tqdm(range(len(doc)), desc="üîç Processing Pages"):
            page = doc[page_index]
            images = page.get_images(full=True)

            if not images:
                print(f"‚ö†Ô∏è No images found on page {page_index + 1}")
                continue

            for img_index, img in enumerate(images):
                xref = img[0]  # first field of each get_images() entry is the xref
                try:
                    # Extraction is kept inside the try so a single bad xref
                    # skips that image instead of aborting the whole run.
                    base_image = doc.extract_image(xref)
                    img_pil = Image.open(BytesIO(base_image["image"]))
                except Exception as e:
                    print(f"‚ùå Failed to load image on page {page_index+1}, img {img_index+1}: {e}")
                    continue

                width, height = img_pil.size

                # Pillow's PNG writer rejects these colour modes; convert them
                # so saving under a .png name does not raise.
                if img_pil.mode in ("CMYK", "YCbCr", "LAB", "HSV"):
                    img_pil = img_pil.convert("RGB")

                image_filename = f"page{page_index+1}_img{img_index+1}.png"
                image_path = os.path.join(save_folder, image_filename)
                img_pil.save(image_path)

                image_data.append({
                    "pdf_name": pdf_name,
                    "image_path": image_path,
                    "page_number": page_index + 1,
                    "image_width": width,
                    "image_height": height,
                    "image_resolution": f"{width}x{height}",
                    "is_high_res": width >= min_width
                })

                print(f"‚úÖ Saved image: {image_filename} ({width}x{height})")
    finally:
        doc.close()

    # Save metadata
    if image_data:
        metadata_df = pd.DataFrame(image_data)
        csv_path = metadata_csv_path
        csv_dir = os.path.dirname(csv_path)
        if csv_dir:
            os.makedirs(csv_dir, exist_ok=True)
        metadata_df.to_csv(csv_path, index=False)
        print(f"\n‚úÖ Extracted {len(image_data)} images. Metadata saved to:\n{csv_path}")
    else:
        print("‚ö†Ô∏è No images were extracted from the PDF.")

    return image_data

# 🔽 Example usage: point the argument below at your own PDF.
# The call is the cell's last expression, so the returned metadata list is displayed.
extract_images_from_pdf(
    "/Users/tanishqsingh/Desktop/XR_Lab/cogs160submisson1.pdf"
)



üìÑ Extracting images from: /Users/tanishqsingh/Desktop/XR_Lab/cogs160submisson1.pdf


üîç Processing Pages:   5%|‚ñå         | 3/57 [00:00<00:01, 28.93it/s]

‚ö†Ô∏è No images found on page 1
‚úÖ Saved image: page2_img1.png (895x912)
‚úÖ Saved image: page3_img1.png (600x769)
‚úÖ Saved image: page4_img1.png (1400x986)
‚úÖ Saved image: page5_img1.png (1400x933)
‚úÖ Saved image: page5_img2.png (1400x776)


üîç Processing Pages:  14%|‚ñà‚ñç        | 8/57 [00:00<00:05,  8.29it/s]

‚úÖ Saved image: page6_img1.png (1400x933)
‚úÖ Saved image: page6_img2.png (900x600)
‚úÖ Saved image: page7_img1.png (512x768)
‚úÖ Saved image: page8_img1.png (1024x768)
‚úÖ Saved image: page8_img2.png (900x599)


üîç Processing Pages:  18%|‚ñà‚ñä        | 10/57 [00:01<00:05,  9.32it/s]

‚úÖ Saved image: page9_img1.png (400x267)
‚úÖ Saved image: page9_img2.png (900x600)
‚úÖ Saved image: page10_img1.png (1182x737)
‚úÖ Saved image: page11_img1.png (1200x800)
‚úÖ Saved image: page11_img2.png (818x453)
‚úÖ Saved image: page12_img1.png (1200x800)


üîç Processing Pages:  21%|‚ñà‚ñà        | 12/57 [00:01<00:05,  7.53it/s]

‚úÖ Saved image: page12_img2.png (1200x800)
‚úÖ Saved image: page13_img1.png (1200x800)
‚úÖ Saved image: page13_img2.png (1200x800)


üîç Processing Pages:  25%|‚ñà‚ñà‚ñç       | 14/57 [00:01<00:05,  7.35it/s]

‚úÖ Saved image: page14_img1.png (1200x853)


üîç Processing Pages:  26%|‚ñà‚ñà‚ñã       | 15/57 [00:02<00:07,  5.50it/s]

‚úÖ Saved image: page15_img1.png (1819x1289)
‚úÖ Saved image: page15_img2.png (1170x779)
‚úÖ Saved image: page16_img1.png (1400x1000)


üîç Processing Pages:  28%|‚ñà‚ñà‚ñä       | 16/57 [00:02<00:08,  4.72it/s]

‚úÖ Saved image: page16_img2.png (1500x1000)
‚úÖ Saved image: page17_img1.png (1149x1000)


üîç Processing Pages:  30%|‚ñà‚ñà‚ñâ       | 17/57 [00:02<00:10,  3.66it/s]

‚úÖ Saved image: page17_img2.png (2048x1448)


üîç Processing Pages:  33%|‚ñà‚ñà‚ñà‚ñé      | 19/57 [00:03<00:09,  3.97it/s]

‚úÖ Saved image: page18_img1.png (2000x1333)
‚úÖ Saved image: page19_img1.png (790x503)
‚úÖ Saved image: page19_img2.png (790x526)
‚úÖ Saved image: page20_img1.png (790x492)


üîç Processing Pages:  37%|‚ñà‚ñà‚ñà‚ñã      | 21/57 [00:03<00:06,  5.44it/s]

‚úÖ Saved image: page20_img2.png (790x524)
‚úÖ Saved image: page21_img1.png (790x526)
‚úÖ Saved image: page21_img2.png (790x526)


üîç Processing Pages:  39%|‚ñà‚ñà‚ñà‚ñä      | 22/57 [00:03<00:05,  6.19it/s]

‚úÖ Saved image: page22_img1.png (1100x733)


üîç Processing Pages:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 24/57 [00:04<00:06,  5.37it/s]

‚úÖ Saved image: page23_img1.png (1800x1200)
‚úÖ Saved image: page24_img1.png (1100x733)
‚úÖ Saved image: page24_img2.png (735x596)


üîç Processing Pages:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 25/57 [00:04<00:09,  3.25it/s]

‚úÖ Saved image: page25_img1.png (1920x1440)
‚úÖ Saved image: page25_img2.png (1800x1200)


üîç Processing Pages:  47%|‚ñà‚ñà‚ñà‚ñà‚ñã     | 27/57 [00:05<00:07,  4.16it/s]

‚úÖ Saved image: page26_img1.png (1800x1200)
‚úÖ Saved image: page27_img1.png (800x1200)
‚úÖ Saved image: page28_img1.png (1200x702)


üîç Processing Pages:  51%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 29/57 [00:05<00:06,  4.56it/s]

‚úÖ Saved image: page28_img2.png (1500x1000)
‚úÖ Saved image: page29_img1.png (1003x1500)


üîç Processing Pages:  53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 30/57 [00:05<00:05,  5.32it/s]

‚úÖ Saved image: page30_img1.png (800x533)
‚úÖ Saved image: page30_img2.png (825x550)


üîç Processing Pages:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 31/57 [00:06<00:05,  4.51it/s]

‚úÖ Saved image: page31_img1.png (2048x1365)


üîç Processing Pages:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 32/57 [00:06<00:06,  3.98it/s]

‚úÖ Saved image: page32_img1.png (2048x1542)


üîç Processing Pages:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 33/57 [00:06<00:08,  2.77it/s]

‚úÖ Saved image: page33_img1.png (1940x1293)
‚úÖ Saved image: page33_img2.png (1680x1260)
‚úÖ Saved image: page34_img1.png (1000x647)


üîç Processing Pages:  61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 35/57 [00:07<00:05,  3.75it/s]

‚úÖ Saved image: page34_img2.png (1200x744)
‚úÖ Saved image: page35_img1.png (1024x682)
‚úÖ Saved image: page35_img2.png (993x560)


üîç Processing Pages:  65%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 37/57 [00:07<00:03,  5.18it/s]

‚úÖ Saved image: page36_img1.png (1000x666)
‚úÖ Saved image: page37_img1.png (1320x880)


üîç Processing Pages:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 38/57 [00:07<00:04,  4.62it/s]

‚úÖ Saved image: page38_img1.png (1365x2048)


üîç Processing Pages:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 39/57 [00:08<00:05,  3.51it/s]

‚úÖ Saved image: page39_img1.png (2000x1333)
‚úÖ Saved image: page39_img2.png (2048x1366)
‚úÖ Saved image: page40_img1.png (1000x666)


üîç Processing Pages:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 40/57 [00:08<00:04,  3.53it/s]

‚úÖ Saved image: page40_img2.png (2000x1333)


üîç Processing Pages:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 41/57 [00:09<00:05,  2.94it/s]

‚úÖ Saved image: page41_img1.png (2048x1365)
‚úÖ Saved image: page41_img2.png (2048x1365)


üîç Processing Pages:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé  | 42/57 [00:09<00:04,  3.22it/s]

‚úÖ Saved image: page42_img1.png (1704x1217)
‚úÖ Saved image: page43_img1.png (1200x800)


üîç Processing Pages:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 43/57 [00:09<00:03,  3.53it/s]

‚úÖ Saved image: page43_img2.png (1200x800)
‚úÖ Saved image: page44_img1.png (1200x518)


üîç Processing Pages:  77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 44/57 [00:09<00:03,  3.85it/s]

‚úÖ Saved image: page44_img2.png (1200x850)
‚úÖ Saved image: page45_img1.png (1200x881)


üîç Processing Pages:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 46/57 [00:10<00:02,  4.62it/s]

‚úÖ Saved image: page45_img2.png (1200x800)
‚úÖ Saved image: page46_img1.png (1200x800)


üîç Processing Pages:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 47/57 [00:10<00:03,  2.96it/s]

‚úÖ Saved image: page47_img1.png (2048x1366)
‚úÖ Saved image: page47_img2.png (2048x1366)


üîç Processing Pages:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 48/57 [00:11<00:03,  2.92it/s]

‚úÖ Saved image: page48_img1.png (2048x1353)


üîç Processing Pages:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 49/57 [00:11<00:02,  2.85it/s]

‚úÖ Saved image: page49_img1.png (1328x2048)


üîç Processing Pages:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 50/57 [00:11<00:02,  2.47it/s]

‚úÖ Saved image: page50_img1.png (1800x1201)
‚úÖ Saved image: page50_img2.png (1800x1201)


üîç Processing Pages: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 57/57 [00:12<00:00,  4.53it/s]

‚úÖ Saved image: page51_img1.png (2048x1271)
‚úÖ Saved image: page51_img2.png (1800x1172)
‚ö†Ô∏è No images found on page 52
‚ö†Ô∏è No images found on page 53
‚ö†Ô∏è No images found on page 54
‚ö†Ô∏è No images found on page 55
‚ö†Ô∏è No images found on page 56
‚ö†Ô∏è No images found on page 57

‚úÖ Extracted 79 images. Metadata saved to:
/Users/tanishqsingh/Desktop/XR_Lab/image_metadata.csv





[{'pdf_name': 'cogs160submisson1.pdf',
  'image_path': '/Users/tanishqsingh/Desktop/XR_Lab/Extracted_images/page2_img1.png',
  'page_number': 2,
  'image_width': 895,
  'image_height': 912,
  'image_resolution': '895x912',
  'is_high_res': False},
 {'pdf_name': 'cogs160submisson1.pdf',
  'image_path': '/Users/tanishqsingh/Desktop/XR_Lab/Extracted_images/page3_img1.png',
  'page_number': 3,
  'image_width': 600,
  'image_height': 769,
  'image_resolution': '600x769',
  'is_high_res': False},
 {'pdf_name': 'cogs160submisson1.pdf',
  'image_path': '/Users/tanishqsingh/Desktop/XR_Lab/Extracted_images/page4_img1.png',
  'page_number': 4,
  'image_width': 1400,
  'image_height': 986,
  'image_resolution': '1400x986',
  'is_high_res': True},
 {'pdf_name': 'cogs160submisson1.pdf',
  'image_path': '/Users/tanishqsingh/Desktop/XR_Lab/Extracted_images/page5_img1.png',
  'page_number': 5,
  'image_width': 1400,
  'image_height': 933,
  'image_resolution': '1400x933',
  'is_high_res': True},
 {'pdf

In [9]:
import fitz  # PyMuPDF
from PIL import Image
import io

# Re-create the PDF as one downscaled raster image per page.
# Both paths are plain strings; the source document is opened exactly once below
# (previously `input_path` was itself an opened Document that was then re-opened).
input_path = "/Users/tanishqsingh/Desktop/XR_Lab/cogs160submisson1.pdf"
output_path = "/Users/tanishqsingh/Desktop/XR_Lab/compressed_output.pdf"

RENDER_DPI = 100  # resolution at which each page is rasterized
DOWNSCALE = 2     # integer factor by which the rendered bitmap is shrunk

doc = fitz.open(input_path)
new_pdf = fitz.open()

# try/finally so both documents are closed even if a page fails to convert.
try:
    for page in doc:
        # Render the PDF page as an image; alpha=False keeps the samples
        # 3 bytes per pixel, which is what the "RGB" frombytes call expects.
        pix = page.get_pixmap(dpi=RENDER_DPI, alpha=False)
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

        # Resize or compress image (never below 1 px per side)
        img = img.resize((
            max(1, pix.width // DOWNSCALE),
            max(1, pix.height // DOWNSCALE),
        ))

        # Save as a single-page PDF (not JPEG!). The resolution is scaled with
        # the bitmap so the output page keeps the source page's physical size;
        # keeping it at RENDER_DPI would halve every page.
        img_buffer = io.BytesIO()
        img.save(img_buffer, format="PDF", resolution=RENDER_DPI / DOWNSCALE)

        # Now this buffer is a valid PDF — open it from memory and append it
        img_pdf = fitz.open(stream=img_buffer.getvalue(), filetype="pdf")
        try:
            new_pdf.insert_pdf(img_pdf)
        finally:
            img_pdf.close()

    # Save final compressed output
    new_pdf.save(output_path)
finally:
    new_pdf.close()
    doc.close()