In [4]:
# Cell 1: Import and setup
import os, time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob

# Reaching this point means every import in this cell resolved without error.
print(
    "🎉 NV-Ingest successfully imported in Jupyter!",
    "✅ All packages loaded successfully!",
    sep="\n",
)

🎉 NV-Ingest successfully imported in Jupyter!
✅ All packages loaded successfully!


In [5]:
# Build the NV-Ingest client: it talks to localhost:7671 and allocates its
# message-broker connection through SimpleClient.
client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=7671,
    message_client_allocator=SimpleClient,
)
print(
    "✅ NV-Ingest client created successfully!",
    "🎯 Ready to process documents!",
    sep="\n",
)

✅ NV-Ingest client created successfully!
🎯 Ready to process documents!


In [15]:
# Check for the sample PDF, then confirm an Ingestor can be built for it.
# The path is defined once so that the existence check, the size report and
# the Ingestor all refer to the same file.
sample_file = "data/multimodal_test.pdf"
data_dir = os.path.dirname(sample_file)
# isfile() rather than exists(): a directory at this path is not a usable PDF.
sample_found = os.path.isfile(sample_file)

if sample_found:
    print(f"✅ Sample PDF found: {sample_file}")
    print(f"   File size: {os.path.getsize(sample_file):,} bytes")
else:
    print(f"❌ Sample file not found: {sample_file}")
    # Help the user spot a misnamed file by listing what is actually there.
    if os.path.isdir(data_dir):
        print(f"Available files in {data_dir}/:")
        # Sorted so the listing is stable from run to run.
        for entry in sorted(os.listdir(data_dir)):
            print(f"  - {entry}")

if sample_found:
    try:
        ingestor = Ingestor(client=client).files(sample_file)
    except Exception as e:
        # Deliberately broad: this cell is a best-effort smoke test, so a
        # failure is reported instead of aborting the notebook.
        print(f"⚠️  Ingestor creation: {e}")
        print("✅ Client and imports are still working correctly!")
    else:
        print("✅ Ingestor created successfully!")
        print("🎉 NV-Ingest setup is complete and working!")
        print("📝 Note: Full processing requires the Docker service to be stable")
else:
    print("⚠️  Sample file not found, but client setup is successful!")

✅ Sample PDF found: data/multimodal_test.pdf
   File size: 133,446 bytes
✅ Ingestor created successfully!
🎉 NV-Ingest setup is complete and working!
📝 Note: Full processing requires the Docker service to be stable


In [None]:
# Optional: report details about the sample PDF and list the files in data/
print("📋 Sample Document Information:")
print("=" * 40)

sample_file = "data/multimodal_test.pdf"
if not os.path.exists(sample_file):
    print(f"❌ File not found: {sample_file}")
else:
    info = os.stat(sample_file)
    size_bytes = info.st_size
    print(f"📄 File: {sample_file}")
    print(f"📊 Size: {size_bytes:,} bytes ({size_bytes/1024:.1f} KB)")
    print(f"📅 Modified: {time.ctime(info.st_mtime)}")

    # A PDF begins with the magic bytes "%PDF"; read a short prefix to check.
    with open(sample_file, 'rb') as pdf:
        header = pdf.read(10)
    if header[:4] == b'%PDF':
        print("✅ Confirmed: Valid PDF file")
        print(f"🔍 PDF Header: {header}")
    else:
        print("⚠️  Unexpected file format")

# List the regular files in data/ together with their sizes
print("\n📁 Available files in data directory:")
if os.path.exists("data"):
    for name in os.listdir("data"):
        candidate = os.path.join("data", name)
        if not os.path.isfile(candidate):
            continue  # skip sub-directories and other non-file entries
        print(f"   📄 {name} ({os.path.getsize(candidate):,} bytes)")

In [14]:
# Test the connection to NV-Ingest service
import time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient

print("🔄 Testing connection to NV-Ingest services...")

# Client pointed at localhost:7671, using SimpleClient for the broker connection
client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=7671,
    message_client_allocator=SimpleClient,
)
print("✅ Client created successfully")

# Smallest useful job: text-only extraction from the sample PDF,
# configured one builder step at a time.
ingestor = Ingestor(client=client)
ingestor = ingestor.files("data/multimodal_test.pdf")
ingestor = ingestor.extract(extract_text=True)
print("✅ Ingestor configured successfully")
print("🚀 Starting ingestion test...")

# The ingest() call below is the test line from Alex's requirements;
# its return value is the collection of results that gets counted afterwards.
print("🚀 Starting test...")
results = ingestor.ingest(show_progress=True)

print("🎉 SUCCESS! The test line worked perfectly!")
processed_count = len(results)
print(f"📊 Processed {processed_count} documents")

🔄 Testing connection to NV-Ingest services...
✅ Client created successfully
✅ Ingestor configured successfully
🚀 Starting ingestion test...
🚀 Starting test...


Processing Documents: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.55s/doc]

🎉 SUCCESS! The test line worked perfectly!
📊 Processed 1 documents





In [16]:
# Full NV-Ingest pipeline with LOCAL Milvus
# Which services run locally and which are NVIDIA-hosted endpoints:
# RUNNING LOCALLY (Docker containers):
#   Redis - Message broker on port 6379
#   etcd - Metadata storage on port 2379
#   MinIO - Object storage on ports 9000-9001
#   Milvus - Vector database on port 19530
#   NV-Ingest Runtime - Main orchestration service on ports 7670-7671
# NVIDIA-HOSTED ENDPOINTS:
#   PaddleOCR: https://ai.api.nvidia.com/v1/cv/baidu/paddleocr
#   Page Elements Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nv-yolox-page-elements-v1
#   Graphic Elements Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
#   Table Structure Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
#   NemoRetriever Parse: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-parse
#   Embeddings: https://integrate.api.nvidia.com/v1 (nvidia/llama-3.2-nv-embedqa-1b-v2)
#   Vision-Language Model: https://integrate.api.nvidia.com/v1 (meta/llama-3.2-11b-vision-instruct)
#   Speech-to-Text: https://ai.api.nvidia.com/v1/audio/nvidia/speechtotext

# Import required libraries for timing, client connections, and result processing
import time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob

print("🚀 Testing FULL NV-Ingest pipeline with LOCAL Milvus...")

# Client for the NV-Ingest service running in Docker: localhost:7671,
# reached through the simple message broker (SimpleClient). The client
# itself runs locally in this kernel.
client = NvIngestClient(
    message_client_hostname="localhost",
    message_client_port=7671,
    message_client_allocator=SimpleClient,
)

# Target for the upload stage: the local Milvus instance on port 19530
collection_name = "nv_ingest_test"
milvus_uri = "http://localhost:19530"

# EXTRACTION options: which content types to pull out of the PDF
extract_options = dict(
    extract_text=True,
    extract_tables=True,
    extract_charts=True,
    extract_images=True,
    paddle_output_format="markdown",
    extract_infographics=True,
    text_depth="page",
)

# STORAGE options: where and how the embeddings are written
# NOTE(review): dense_dim presumably has to match the embedding model's
# output size, and recreate=True presumably rebuilds the collection - confirm.
vdb_options = dict(
    collection_name=collection_name,
    milvus_uri=milvus_uri,
    sparse=False,
    dense_dim=2048,
    recreate=True,
)

# Assemble the pipeline: files -> extract -> embed -> vector DB upload
ingestor = (
    Ingestor(client=client)
    .files("data/multimodal_test.pdf")
    .extract(**extract_options)
    .embed()
    .vdb_upload(**vdb_options)
)

print("Starting full ingestion with vector database upload...")
# perf_counter() is a monotonic clock, so the elapsed time below cannot be
# skewed by system clock adjustments the way time.time() differences can.
t0 = time.perf_counter()

# EXECUTE THE NV-INGEST PIPELINE
# The ingest() call below (from Alex's requirements) orchestrates the workflow.
# As described by the notebook author:
# 1. Sends the PDF to the NV-Ingest service through the client configured
#    above (localhost:7671)
# 2. NV-Ingest calls NVIDIA endpoints for AI processing:
#    - PaddleOCR for table extraction
#    - Page/Graphic elements detection for layout analysis
#    - Vision-language model for image understanding
#    - Embedding model for vector generation
# 3. Results are aggregated and returned
# 4. Embeddings are uploaded to the local Milvus database
# 5. Progress bar shows real-time status
results = ingestor.ingest(show_progress=True)

t1 = time.perf_counter()
print(f"⏱️  Time taken: {t1-t0:.2f} seconds")
print(f"📊 Processed {len(results)} documents successfully!")

if results:
    # Let the user know that processing completed successfully
    print("\n🎉 COMPLETE SUCCESS!")
    print("✅ Document processed and uploaded to vector database")
    print(f"✅ Vector database collection '{collection_name}' created in Milvus")

    # Show the flattened content of the first document's results
    print("\n📄 Results summary:")
    print(ingest_json_results_to_blob(results[0]))
else:
    # Previously an empty result set produced no message at all.
    print("⚠️  Ingestion returned no results; nothing to summarize.")

🚀 Testing FULL NV-Ingest pipeline with LOCAL Milvus...
Starting full ingestion with vector database upload...


Processing Documents: 100%|██████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.76s/doc]


⏱️  Time taken: 8.25 seconds
📊 Processed 1 documents successfully!

🎉 COMPLETE SUCCESS!
✅ Document processed and uploaded to vector database
✅ Vector database collection 'nv_ingest_test' created in Milvus

📄 Results summary:
This chart shows some gadgets, and some very fictitious costs.   Hammer - Powerdrill - Bluetooth speaker - Minifridge - Premium desk fan Dollars $- - $20.00 - $40.00 - $60.00 - $80.00 - $100.00 - $120.00 - $140.00 - $160.00 Cost    Chart 1
| locations, |  |  |
| Animal | Activity | Place |
| Giraffe | Driving a car | At the beach |
| Lion | Putting on sunscreen | At the park |
| Cat | Jumping onto a laptop | In a home office |
| Dog | Chasing a squirrel | In the front yard |
TestingDocument
A sample document with headings and placeholder text
Introduction
This is a placeholder document that can be used for any purpose. It contains some 
headings and some placeholder text to fill the space. The text is not important and contains 
no real value, but it is useful for 