In [1]:
import torch
import torch_neuronx

# Report which PyTorch and Neuron SDK builds this kernel resolved,
# one version per line.
print(
    f"‚úÖ PyTorch: {torch.__version__}",
    f"‚úÖ Neuron SDK: {torch_neuronx.__version__}",
    sep="\n",
)

‚úÖ PyTorch: 2.9.0+cu128
‚úÖ Neuron SDK: 2.9.0.2.11.19912+e48cd891


In [2]:
import os

# The lifecycle script copies artifacts here so they are visible inside the Docker kernel
artifact_path = '/home/ec2-user/SageMaker/neuron-compiled-models'

if os.path.exists(artifact_path):
    print(f"\n‚úÖ Found compilation artifacts at: {artifact_path}")
    
    print("\nüìÅ Compilation output directory:")
    !ls -lh {artifact_path}/output/
    
    print("\nüìÑ Original model files:")
    !ls -lh {artifact_path}/model/
else:
    print(f"\n‚ö†Ô∏è Artifacts not found at {artifact_path}")
    print("If the lifecycle script is still running, wait a few minutes and try again.")


‚úÖ Found compilation artifacts at: /home/ec2-user/SageMaker/neuron-compiled-models

üìÅ Compilation output directory:
total 81M
-rw-r--r-- 1 ubuntu ubuntu 81M Jan 20 02:39 compiled_model.pt
-rw-r--r-- 1 ubuntu ubuntu 213 Jan 20 02:39 result.json

üìÑ Original model files:
total 88M
-rw-r--r-- 1 ubuntu ubuntu  978 Jan 20 02:39 config.json
-rw-r--r-- 1 ubuntu ubuntu  87M Jan 20 02:39 model.safetensors
-rw-r--r-- 1 ubuntu ubuntu  695 Jan 20 02:39 special_tokens_map.json
-rw-r--r-- 1 ubuntu ubuntu 695K Jan 20 02:39 tokenizer.json
-rw-r--r-- 1 ubuntu ubuntu 1.5K Jan 20 02:39 tokenizer_config.json
-rw-r--r-- 1 ubuntu ubuntu 227K Jan 20 02:39 vocab.txt


In [3]:
import json
import os

# Compilation summary stored inside the persistent artifact directory.
result_path = '/home/ec2-user/SageMaker/neuron-compiled-models/output/result.json'

try:
    if not os.path.exists(result_path):
        # No summary on disk: explain the two likely reasons.
        print(f"‚ö†Ô∏è Result file not found at: {result_path}")
        print("The lifecycle script may still be running, or compilation failed.")
    else:
        with open(result_path, 'r') as result_file:
            result = json.load(result_file)

        # Echo the parsed summary back as indented JSON.
        print("üìä Compilation Result:")
        print(json.dumps(result, indent=2))

except Exception as read_error:
    # Report unreadable or malformed files instead of stopping the notebook.
    print(f"‚ùå Error reading result: {read_error}")

üìä Compilation Result:
{
  "status": "COMPATIBLE",
  "message": "Model compiled and loaded successfully",
  "torch_neuronx_version": "2.9.0.2.11.19912+e48cd891",
  "pytorch_version": "2.9.0+cu128",
  "environment": "Docker Neuron DLC"
}


In [4]:
import json
import os

result_path = '/home/ec2-user/SageMaker/neuron-compiled-models/output/result.json'

# The output path in result.json might refer to the compile-time path,
# so we report the path accessible to this notebook instead.
model_path = '/home/ec2-user/SageMaker/neuron-compiled-models/output/compiled_model.pt'


def report_compilation_result(result_path, model_path):
    """Print a human-readable verdict for the compilation summary file.

    Args:
        result_path: Path of the JSON summary to read.
        model_path: Path of the compiled model to show on success.

    Returns:
        The parsed JSON content when the file was read and reported, or
        ``None`` when the file is missing or could not be read/interpreted.

    Errors are reported on stdout rather than raised, so the cell never
    aborts the notebook; a failure is visible in the printed message.
    """
    try:
        if not os.path.exists(result_path):
            print(f"‚ö†Ô∏è Result file not found at: {result_path}")
            print("Compilation may still be running. Wait a few minutes and re-run this cell.")
            return None

        with open(result_path, 'r') as f:
            result = json.load(f)

        if result.get('status') == 'COMPATIBLE':
            print("\n‚úÖ Your model compiled successfully!")

            # These fields are optional; only show the ones that are present.
            if 'input_shape' in result:
                print(f"üìê Input Shape: {result['input_shape']}")
            if 'model_type' in result:
                print(f"ü§ñ Model Type: {result['model_type']}")
            if 'detected_architecture' in result:
                print(f"üèóÔ∏è  Architecture: {result['detected_architecture']}")

            print(f"üíæ Compiled Model: {model_path}")

            print(f"\n{result.get('message', 'Compilation completed')}")
        else:
            print(f"\n‚ùå Compilation status: {result.get('status')}")
            # The failure detail may live under any of several keys.
            error = result.get('error') or result.get('error_message') or result.get('message', 'N/A')
            print(f"Error: {error}")
            print("\n‚ö†Ô∏è  Stop here - compilation failed.")

        return result

    except Exception as e:
        # Previously this handler was a bare `print` (a no-op expression), which
        # hid every read/parse failure. Surface the error to the user instead.
        print(f"‚ùå Error reading result: {e}")
        return None


result = report_compilation_result(result_path, model_path)


‚úÖ Your model compiled successfully!
üíæ Compiled Model: /home/ec2-user/SageMaker/neuron-compiled-models/output/compiled_model.pt

Model compiled and loaded successfully


In [6]:
import torch
import torch_neuronx
from transformers import AutoTokenizer
import os

# Optional: Silence tqdm warnings
# NOTE(review): this is set after the imports above; if tqdm reads TQDM_*
# variables when it is first imported, the setting may have no effect - confirm.
os.environ["TQDM_DISABLE"] = "1"

# The standard path where the lifecycle script persists artifacts
# This path is visible both in the File Browser and inside this Docker kernel
base_path = '/home/ec2-user/SageMaker/neuron-compiled-models'
model_path = os.path.join(base_path, 'output', 'compiled_model.pt')
tokenizer_path = os.path.join(base_path, 'model')

# Reset both handles before loading. Without this, objects left in the kernel
# namespace by an earlier execution of this cell would make the final status
# check report "Ready" even though a load failed on this run.
compiled_model = None
tokenizer = None

print(f"üìÇ Artifacts directory: {base_path}")

# 1. Load the Compiled Model
print(f"\nüîÑ Loading compiled model from: {model_path}...")
try:
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at {model_path}")

    compiled_model = torch.jit.load(model_path)
    print("‚úÖ Success! Neuron model loaded.")
except Exception as e:
    print(f"‚ùå Failed to load model: {e}")
    # We continue to try loading the tokenizer for debugging purposes

# 2. Load the Tokenizer
print(f"\nüîÑ Loading tokenizer from: {tokenizer_path}...")
try:
    if not os.path.exists(tokenizer_path):
        raise FileNotFoundError(f"Tokenizer directory not found at {tokenizer_path}")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    print("‚úÖ Success! Tokenizer loaded.")
except Exception as e:
    print(f"‚ùå Failed to load tokenizer: {e}")

# 3. Final Status
# Both names always exist now, so test the values rather than probing locals().
if compiled_model is not None and tokenizer is not None:
    print("\nüöÄ Ready for inference! Run the next cell to test.")
else:
    print("\n‚ö†Ô∏è  Setup incomplete. Check errors above.")

üìÇ Artifacts directory: /home/ec2-user/SageMaker/neuron-compiled-models

üîÑ Loading compiled model from: /home/ec2-user/SageMaker/neuron-compiled-models/output/compiled_model.pt...
‚úÖ Success! Neuron model loaded.

üîÑ Loading tokenizer from: /home/ec2-user/SageMaker/neuron-compiled-models/model...
‚úÖ Success! Tokenizer loaded.

üöÄ Ready for inference! Run the next cell to test.


In [7]:
# Sample query/document pair used to exercise the tokenizer
query = "What is machine learning?"
document = "Machine learning is a subset of artificial intelligence that uses algorithms to learn patterns from data."

# Encode the pair as PyTorch tensors, truncating anything beyond 512 tokens
inputs = tokenizer(
    query,
    document,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

# Show the shapes of the tensors the tokenizer produced
print(f"Input IDs shape: {inputs['input_ids'].shape}")
print(f"Attention mask shape: {inputs['attention_mask'].shape}")

Input IDs shape: torch.Size([1, 25])
Attention mask shape: torch.Size([1, 25])


In [None]:
import torch
import time
import numpy as np

# Benchmark sizing, named here so the loops and the banner stay in sync.
WARMUP_RUNS = 5
BENCHMARK_RUNS = 100

# =============================================================================
# 1. Prepare Input (CRITICAL: Must match compiled shape)
# =============================================================================
query = "What is machine learning?"
document = "Machine learning is a subset of artificial intelligence that uses algorithms to learn patterns from data."

# Force padding to 512 to match the compiled model
inputs = tokenizer(
    query,
    document,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding="max_length"  # <--- THIS IS THE FIX
)

print(f"Input Shape: {inputs['input_ids'].shape}")
# Prepare tuple calling convention
# Depending on how compile_script.py traced it, it might expect
# (input_ids,) OR (input_ids, attention_mask).
# Try passing both first, which is standard for BERT-like models.
example_inputs = (inputs['input_ids'], inputs['attention_mask'])

# =============================================================================
# 2. Functional Test
# =============================================================================
print("\nüîç Functional Test...")
try:
    with torch.no_grad():
        # Try passing both inputs
        outputs = compiled_model(*example_inputs)
    print("‚úÖ Inference successful with (input_ids, attention_mask)!")
except RuntimeError as e:
    if "expected" in str(e):
        print(f"‚ö†Ô∏è  Shape/Arg mismatch: {e}")
        print("Retrying with just input_ids...")
        try:
            # Fallback: maybe it was compiled with only input_ids?
            # Run under no_grad as well, consistent with the first attempt.
            with torch.no_grad():
                outputs = compiled_model(inputs['input_ids'])
            example_inputs = (inputs['input_ids'],) # Update for benchmark
            print("‚úÖ Inference successful with (input_ids) only!")
        except Exception as e2:
            print(f"‚ùå Failed again: {e2}")
            # Surface the original two-argument failure; e2 is kept as context.
            raise e
    else:
        print(f"‚ùå Inference failed: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise

# =============================================================================
# 3. Latency Benchmark
# =============================================================================
print(f"\n‚è±Ô∏è  Running Latency Benchmark ({BENCHMARK_RUNS} runs)...")

latencies = []
# Enter no_grad once so the context manager's own enter/exit cost is not part
# of each timed iteration.
with torch.no_grad():
    # Warmup
    for _ in range(WARMUP_RUNS):
        _ = compiled_model(*example_inputs)

    # Measure
    for _ in range(BENCHMARK_RUNS):
        # perf_counter is monotonic and high-resolution; time.time() follows the
        # wall clock, which can be adjusted while the benchmark is running.
        start = time.perf_counter()
        _ = compiled_model(*example_inputs)
        latencies.append((time.perf_counter() - start) * 1000)  # seconds -> ms

print(f"  Avg Latency: {np.mean(latencies):.2f} ms")
print(f"  P99 Latency: {np.percentile(latencies, 99):.2f} ms")

Input Shape: torch.Size([1, 512])

üîç Functional Test...
‚úÖ Inference successful with (input_ids, attention_mask)!

‚è±Ô∏è  Running Latency Benchmark (100 runs)...
  Avg Latency: 2.19 ms
  P99 Latency: 2.24 ms
