In [4]:
# Cell 1: Import and setup
import os, time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob

# Reaching this point means every import above resolved in this kernel;
# emit the two confirmation banners in order.
for banner in (
    "üéâ NV-Ingest successfully imported in Jupyter!",
    "‚úÖ All packages loaded successfully!",
):
    print(banner)

üéâ NV-Ingest successfully imported in Jupyter!
‚úÖ All packages loaded successfully!


In [5]:
# Create the NV-Ingest client.
# The connection settings are gathered in one mapping so the broker class,
# host and port can be read (and changed) in a single place.
broker_settings = {
    "message_client_allocator": SimpleClient,
    "message_client_port": 7671,
    "message_client_hostname": "localhost",
}
client = NvIngestClient(**broker_settings)

# Both status lines are written by a single print call, one per line.
print(
    "‚úÖ NV-Ingest client created successfully!",
    "üéØ Ready to process documents!",
    sep="\n",
)

‚úÖ NV-Ingest client created successfully!
üéØ Ready to process documents!


In [22]:
# Check that the sample PDF exists and, if it does, build an Ingestor for it.
#
# The path is defined once and reused everywhere below, so the file that is
# reported as "found" is the same file the Ingestor is pointed at.
sample_file = "data/pharmacopia-2014.pdf"
sample_found = os.path.exists(sample_file)

if sample_found:
    print(f"‚úÖ Sample PDF found: {sample_file}")
    print(f"   File size: {os.path.getsize(sample_file):,} bytes")
else:
    print(f"‚ùå Sample file not found: {sample_file}")
    # List what is available so the path above can be corrected.
    if os.path.isdir("data/"):
        print("Available files in data/:")
        for file in os.listdir("data/"):
            print(f"  - {file}")

# Creating the Ingestor is best-effort: a failure here is reported but must
# not abort the notebook, because the client and imports are still usable.
try:
    if sample_found:
        # Use the file that was just verified, not a different hard-coded path.
        ingestor = Ingestor(client=client).files(sample_file)
        print("‚úÖ Ingestor created successfully!")
        print("üéâ NV-Ingest setup is complete and working!")
        print("üìù Note: Full processing requires the Docker service to be stable")
    else:
        print("‚ö†Ô∏è  Sample file not found, but client setup is successful!")
except Exception as exc:
    print(f"‚ö†Ô∏è  Ingestor creation: {exc}")
    print("‚úÖ Client and imports are still working correctly!")

‚úÖ Sample PDF found: data/pharmacopia-2014.pdf
   File size: 3,355,718 bytes
‚úÖ Ingestor created successfully!
üéâ NV-Ingest setup is complete and working!
üìù Note: Full processing requires the Docker service to be stable


In [None]:
# Optional: describe the sample document, then list the files under data/.
print("üìã Sample Document Information:")
print("=" * 40)

sample_file = "data/multimodal_test.pdf"
if not os.path.exists(sample_file):
    print(f"‚ùå File not found: {sample_file}")
else:
    file_stat = os.stat(sample_file)
    size_bytes = file_stat.st_size
    print(f"üìÑ File: {sample_file}")
    print(f"üìä Size: {size_bytes:,} bytes ({size_bytes/1024:.1f} KB)")
    print(f"üìÖ Modified: {time.ctime(file_stat.st_mtime)}")

    # Read only the first 10 bytes; the check below looks for the b'%PDF'
    # marker at the very start of the file.
    with open(sample_file, 'rb') as pdf_handle:
        header = pdf_handle.read(10)

    if header.startswith(b'%PDF'):
        print("‚úÖ Confirmed: Valid PDF file")
        print(f"üîç PDF Header: {header}")
    else:
        print("‚ö†Ô∏è  Unexpected file format")

# Directory listing: regular files only, in the order os.listdir returns them.
print("\nüìÅ Available files in data directory:")
if os.path.exists("data"):
    for entry in os.listdir("data"):
        entry_path = os.path.join("data", entry)
        if not os.path.isfile(entry_path):
            continue
        print(f"   üìÑ {entry} ({os.path.getsize(entry_path):,} bytes)")

In [14]:
# Test the connection to NV-Ingest service
import time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient

print("üîÑ Testing connection to NV-Ingest services...")

# Client for the NV-Ingest service: SimpleClient broker on localhost:7671.
connection_kwargs = dict(
    message_client_allocator=SimpleClient,
    message_client_port=7671,
    message_client_hostname="localhost",
)
client = NvIngestClient(**connection_kwargs)
print("‚úÖ Client created successfully")

# Smallest useful job: text-only extraction from the multimodal test PDF.
# The job is assembled step by step instead of in one chained expression.
ingestor = Ingestor(client=client)
ingestor = ingestor.files("data/multimodal_test.pdf")
ingestor = ingestor.extract(extract_text=True)
print("‚úÖ Ingestor configured successfully")

print("üöÄ Starting ingestion test...")

# This is the test line from Alex's requirements
print("üöÄ Starting test...")
results = ingestor.ingest(show_progress=True)

print("üéâ SUCCESS! The test line worked perfectly!")
print(f"üìä Processed {len(results)} documents")

üîÑ Testing connection to NV-Ingest services...
‚úÖ Client created successfully
‚úÖ Ingestor configured successfully
üöÄ Starting ingestion test...
üöÄ Starting test...


Processing Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:02<00:00,  2.55s/doc]

üéâ SUCCESS! The test line worked perfectly!
üìä Processed 1 documents





In [29]:
# Full NV-INGEST pipeline with LOCAL Milvus
# Services used by this cell, split by where they run:
# RUNNING LOCALLY (Docker containers):
#	Redis - Message broker on port 6379
#	etcd - Metadata storage on port 2379
#	MinIO - Object storage on ports 9000-9001
#	Milvus - Vector database on port 19530
#	NV-Ingest Runtime - Main orchestration service on ports 7670-7671
# NVIDIA hosted endpoints:
#	PaddleOCR: https://ai.api.nvidia.com/v1/cv/baidu/paddleocr
#	Page Elements Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nv-yolox-page-elements-v1
#	Graphic Elements Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
#	Table Structure Detection: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
#	NemoRetriever Parse: https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-parse
#	Embeddings: https://integrate.api.nvidia.com/v1 (nvidia/llama-3.2-nv-embedqa-1b-v2)
#	Vision-Language Model: https://integrate.api.nvidia.com/v1 (meta/llama-3.2-11b-vision-instruct)
#	Speech-to-Text: https://ai.api.nvidia.com/v1/audio/nvidia/speechtotext

# Import required libraries for timing, client connections, and result processing
import time
from nv_ingest_client.client import Ingestor, NvIngestClient
from nv_ingest_api.util.message_brokers.simple_message_broker import SimpleClient
from nv_ingest_client.util.process_json_files import ingest_json_results_to_blob

print("üöÄ Testing FULL NV-Ingest pipeline with LOCAL Milvus...")

# Connect to the NV-Ingest service through the simple message broker on
# localhost:7671. The client itself runs locally in this kernel.
client = NvIngestClient(
    message_client_allocator=SimpleClient,
    message_client_port=7671,
    message_client_hostname="localhost",
)

# Local Milvus vector database (port 19530) and the target collection.
milvus_uri = "http://localhost:19530"
collection_name = "nv_ingest_test"

# Build the processing pipeline using method chaining.
ingestor = (
    Ingestor(client=client)
    .files("data/pharmacopia-2014.pdf")
    # EXTRACTION PHASE: only page-level text is requested here; tables,
    # charts, images and infographics are all switched off.
    .extract(
        extract_text=True,
        extract_tables=False,
        extract_charts=False,
        extract_images=False,
        paddle_output_format="markdown",
        extract_infographics=False,
        text_depth="page",
    )
    # EMBEDDING PHASE: generate vector embeddings for semantic search.
    .embed()
    # STORAGE PHASE: upload to the vector database for retrieval.
    # NOTE(review): dense_dim=2048 presumably matches the embedding model's
    # output size, and recreate=True presumably rebuilds the collection on
    # every run -- confirm both against the client documentation.
    .vdb_upload(
        collection_name=collection_name,
        milvus_uri=milvus_uri,
        sparse=False,
        dense_dim=2048,
        recreate=True,
    )
)

print("Starting full ingestion with vector database upload...")
t0 = time.time()

# EXECUTE THE NV-INGEST PIPELINE
# The ingest() call (from Alex's requirements) runs the whole workflow
# configured above:
# 1. The PDF is submitted to the NV-Ingest service (client port 7671).
# 2. The stages enabled on the ingestor run: text extraction and embedding.
# 3. Results are returned to this notebook.
# 4. Embeddings are uploaded to the local Milvus database.
# 5. show_progress=True displays a progress bar while the job runs.
#
# `results` is initialised before the call so that a failed run can neither
# raise NameError below nor silently report results left over from an
# earlier cell.
results = []
ingest_succeeded = False
try:
    results = ingestor.ingest(show_progress=True)
    ingest_succeeded = True
except Exception as exc:
    print(f"‚ùå FAILED: {str(exc)}")

t1 = time.time()
print(f"‚è±Ô∏è  Time taken: {t1-t0:.2f} seconds")

# Only claim success when ingest() actually returned.
if ingest_succeeded:
    print(f"üìä Processed {len(results)} documents successfully!")

# Let the user know when processing completed successfully, then show the
# text extracted from the first document.
if results:
    print("\nüéâ COMPLETE SUCCESS!")
    print("‚úÖ Document processed and uploaded to vector database")
    print(f"‚úÖ Vector database collection '{collection_name}' created in Milvus")

    print("\nüìÑ Results summary:")
    print(ingest_json_results_to_blob(results[0]))

üöÄ Testing FULL NV-Ingest pipeline with LOCAL Milvus...
Starting full ingestion with vector database upload...


Processing Documents: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:14<00:00, 14.08s/doc]


‚è±Ô∏è  Time taken: 20.41 seconds
üìä Processed 1 documents successfully!

üéâ COMPLETE SUCCESS!
‚úÖ Document processed and uploaded to vector database
‚úÖ Vector database collection 'nv_ingest_test' created in Milvus

üìÑ Results summary:

ÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩ
ÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩÔøΩ
‚ÄúDesire to take medicines ... distinguishes man from animals.‚Äù ‚ÄîSir William Osler
Editor-in-Chief
Richard J. Hamilton, MD, FAAEM, FACMT, FACEP
Professor and Chair, Department of Emergency Medicine
Drexel University College of Medicine
Philadelphia, PA
15TH EDITION
2014 Deluxe Lab-Coat Edition
World Headquarters
Jones & Bartlett Learning
5 Wall Street
Burlington, MA 01803
978-443-5000
info@jblearning.com
www.jblearning.com
Jones & Bartlett Learning books and products are available through most bookstores and online booksellers. 
To contact Jones & Bartlett Learning directly, call 800-832-0034, fax 978-443-8000, or visit our website www.
jblearning.com