In [None]:
# Cell 1: Install dependencies (WITHOUT flash-attn)
!pip install torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/cu128
!pip install transformers pillow requests pdf2image poppler-utils tqdm accelerate pypdf
!apt-get update && apt-get install -y poppler-utils

In [None]:
# Cell 2: Verify GPU
import torch

# Only query device details when CUDA is usable: get_device_name() and
# get_device_properties() raise on a CPU-only runtime, which would turn this
# simple status check into a traceback.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("No CUDA device detected - the embedding cell loads the model on 'cuda' and needs a GPU runtime.")

In [None]:
# Cell 2.5: Install system dependencies
!sudo apt-get update -qq
!sudo apt-get install -y poppler-utils

# Verify installation
!which pdfinfo
!pdfinfo -v

In [None]:
# Run this first to clear any lingering memory
import gc

import torch

# Collect Python garbage BEFORE emptying the CUDA cache: tensors kept alive only
# by reference cycles must be destroyed first, otherwise their blocks are still
# in use and empty_cache() cannot release them.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # mem_get_info() returns (free, total) bytes as seen by the driver, so it
    # also accounts for memory still reserved by PyTorch's caching allocator and
    # for other processes; total_memory - memory_allocated() overstates what is
    # actually available.
    free_bytes, _total_bytes = torch.cuda.mem_get_info()
    print(f"üßπ Cleared. Free memory: {free_bytes / 1e9:.2f} GB")
else:
    # Without a CUDA device there is nothing to report; avoid raising here.
    print("No CUDA device available - nothing to clear on the GPU.")

In [None]:
# Check model cache
# Shows whether the model weights are already on disk under ~/.cache/huggingface/hub.
# NOTE(review): the directory name duplicates the model id used by the
# model-loading cell (TomoroAI/tomoro-colqwen3-embed-8b); keep the two in sync.
# Total size of the cached model directory (summarised, human-readable units).
!du -sh ~/.cache/huggingface/hub/models--TomoroAI--tomoro-colqwen3-embed-8b/
# Long listing of every directory matched by snapshots/*/ (the glob expands to each snapshot).
!ls -lh ~/.cache/huggingface/hub/models--TomoroAI--tomoro-colqwen3-embed-8b/snapshots/*/

In [None]:
import torch
from transformers import AutoModel, AutoProcessor
from PIL import Image
from pdf2image import convert_from_path
from pathlib import Path
import json
from tqdm import tqdm
import gc

# --- Run configuration -------------------------------------------------------
MODEL_ID = "TomoroAI/tomoro-colqwen3-embed-8b"
DEVICE = "cuda"
DTYPE = torch.bfloat16
DPI = 150        # rasterisation resolution; kept high so flowcharts stay legible
BATCH_SIZE = 2   # pages per forward pass (lowered from 8)

# --- Processor and model -----------------------------------------------------
print("üì¶ Loading model...")
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    max_num_visual_tokens=1280,
    trust_remote_code=True,
)

model = AutoModel.from_pretrained(
    MODEL_ID,
    device_map=DEVICE,
    dtype=DTYPE,
    trust_remote_code=True,
    attn_implementation="sdpa",
)
model = model.eval()  # switch to evaluation mode before embedding
print("‚úÖ Model loaded")

# Report the device capacity next to what is allocated right after loading.
total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"üîß GPU Memory - Total: {total_gb:.1f} GB")
print(f"üîß After model load - Allocated: {allocated_gb:.2f} GB\n")

# Process PDFs
# For every source PDF: rasterise all pages, embed them in batches of
# BATCH_SIZE on DEVICE, and collect the per-page embeddings (moved to CPU)
# together with one metadata record per page. all_embeddings[i] and
# all_metadata[i] are appended in lockstep, so index i refers to the same page.
docs_dir = Path("./")
pdf_files = ["swm_2016.pdf", "urdpfi_vol1.pdf", "urdpfi_vol2.pdf"]

all_embeddings = []
all_metadata = []

for pdf_name in pdf_files:
    pdf_path = docs_dir / pdf_name

    # Missing inputs are skipped rather than treated as fatal.
    if not pdf_path.exists():
        print(f"‚ö†Ô∏è  Skipping {pdf_name} (not found)")
        continue

    print(f"\n{'='*60}")
    print(f"üìÑ Processing: {pdf_name}")
    print(f"{'='*60}")

    # Convert PDF to images (every page of the document is held in memory at once)
    print("  üñºÔ∏è  Converting to images...")
    images = convert_from_path(str(pdf_path), dpi=DPI)
    page_count = len(images)
    print(f"  ‚úÖ {page_count} pages converted")

    # Clear memory before embedding
    gc.collect()
    torch.cuda.empty_cache()

    # Embed in batches
    print(f"  üîÆ Embedding (batch_size={BATCH_SIZE})...")
    outputs = []

    for start in tqdm(range(0, page_count, BATCH_SIZE), desc="  Progress"):
        batch_imgs = images[start : start + BATCH_SIZE]

        # Process batch: tensors go to DEVICE, non-tensor entries pass through untouched
        features = processor.process_images(images=batch_imgs)
        features = {k: v.to(DEVICE) if isinstance(v, torch.Tensor) else v
                    for k, v in features.items()}

        # Generate embeddings
        with torch.inference_mode():
            out = model(**features)
            # Cast with the configured DTYPE instead of a hard-coded bfloat16 so
            # the stored embeddings follow the setting the model was loaded with.
            vecs = out.embeddings.to(DTYPE).cpu()

        # Iterating the batch tensor yields one entry per leading index
        # (presumably one per page in the batch).
        outputs.extend(vecs)

        # CRITICAL: Clear memory after EACH batch
        del features, out, batch_imgs
        torch.cuda.empty_cache()

    # Store with metadata (page numbers are 1-based, in document order)
    all_embeddings.extend(outputs)
    all_metadata.extend(
        {"source": pdf_name, "page": page_no, "total_pages": page_count}
        for page_no in range(1, len(outputs) + 1)
    )

    # Clear images from memory
    del images, outputs
    gc.collect()

    print(f"  ‚úÖ Embedded {len(all_embeddings)} total pages so far")
    print(f"  üîß GPU Memory - Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB\n")

# Save results
# Writes embeddings.pt (one tensor holding every page) and metadata.json (one
# record per page, same order) into ./embeddings_output.
print(f"\n{'='*60}")
print("üíæ Saving embeddings...")
print(f"{'='*60}")

output_dir = Path("./embeddings_output")
output_dir.mkdir(exist_ok=True)

if not all_embeddings:
    # Every PDF was skipped (or contained no pages). torch.stack([]) would raise
    # here, so report the situation clearly instead of failing with a traceback.
    print("No pages were embedded - nothing to save. Check that the files listed in pdf_files exist.")
else:
    page_shapes = {tuple(emb.shape) for emb in all_embeddings}
    if len(page_shapes) == 1:
        # All pages share one shape: a plain stack, exactly as before.
        embeddings_tensor = torch.stack(all_embeddings)
    else:
        # Pages produced different numbers of vectors. torch.stack would raise
        # and discard the whole embedding run, so zero-pad along the first
        # dimension instead.
        # NOTE(review): assumes per-page tensors are (num_vectors, dim) and
        # differ only in num_vectors - confirm against the model's output.
        embeddings_tensor = torch.nn.utils.rnn.pad_sequence(
            all_embeddings, batch_first=True
        )
    torch.save(embeddings_tensor, output_dir / "embeddings.pt")

    with open(output_dir / "metadata.json", "w") as f:
        json.dump(all_metadata, f, indent=2)

    print(f"\n‚úÖ COMPLETE!")
    print(f"üìä Total pages embedded: {len(all_embeddings)}")
    print(f"üíæ Files saved to: {output_dir}")
    print(f"   - embeddings.pt ({embeddings_tensor.element_size() * embeddings_tensor.nelement() / 1e6:.1f} MB)")
    print(f"   - metadata.json")

In [None]:
# Quick cell on Lightning before shutting down
# Renders every page of each source PDF to a PNG named
# <pdf stem>__page_<4-digit page number>.png inside ./page_images.
from pdf2image import convert_from_path
from pathlib import Path

output_dir = Path("./page_images")
output_dir.mkdir(exist_ok=True)

docs_dir = Path("./")
for pdf_name in ["swm_2016.pdf", "urdpfi_vol1.pdf", "urdpfi_vol2.pdf"]:
    pdf_path = docs_dir / pdf_name

    # Skip missing files instead of aborting the export of the remaining PDFs;
    # this mirrors how the embedding cell treats missing inputs.
    if not pdf_path.exists():
        print(f"Skipping {pdf_name} (not found)")
        continue

    # Path.stem drops only the final suffix, whereas str.replace('.pdf', '')
    # would also remove a '.pdf' occurring in the middle of a name.
    stem = pdf_path.stem
    images = convert_from_path(str(pdf_path), dpi=150)
    for page_no, img in enumerate(images, start=1):
        img.save(output_dir / f"{stem}__page_{page_no:04d}.png")

# Then download the entire page_images/ folder