In [19]:
# Download a sample photo and save it as car_image.jpg, the file TEST_IMAGE_PATH points to.
!wget "https://images.unsplash.com/photo-1533473359331-0135ef1b58bf?q=80&w=2670&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" -O car_image.jpg

--2025-04-29 01:10:51--  https://images.unsplash.com/photo-1533473359331-0135ef1b58bf?q=80&w=2670&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D
Resolving images.unsplash.com (images.unsplash.com)... 151.101.2.208, 151.101.130.208, 151.101.66.208, ...
Connecting to images.unsplash.com (images.unsplash.com)|151.101.2.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 794302 (776K) [image/jpeg]
Saving to: ‘car_image.jpg’


2025-04-29 01:10:51 (50.9 MB/s) - ‘car_image.jpg’ saved [794302/794302]



In [38]:
# Download the sample input.json from the serve-system-chi repository into the working directory.
# NOTE(review): no cell visible in this part of the notebook reads input.json — confirm it is still needed.
!wget "https://raw.githubusercontent.com/teaching-on-testbeds/serve-system-chi/refs/heads/main/workspace/input.json" -O input.json

--2025-04-29 01:44:28--  https://raw.githubusercontent.com/teaching-on-testbeds/serve-system-chi/refs/heads/main/workspace/input.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 80313 (78K) [text/plain]
Saving to: ‘input.json’


2025-04-29 01:44:28 (17.7 MB/s) - ‘input.json’ saved [80313/80313]



In [20]:
"""
Test script for connecting to Triton server and running inference with YOLOV11
"""

import time
import numpy as np
from PIL import Image
import requests
from ultralytics import YOLO

# Constants
# host:port of Triton's HTTP endpoint; the helper functions prepend "http://" themselves.
TRITON_SERVER_URL = "triton_server:8000"
# Model name as registered in Triton; used in the /v2/models/<name> URL and the Ultralytics URL.
MODEL_NAME = "chest_xray_detector"
# Image sent for inference (same file name the first cell downloads).
TEST_IMAGE_PATH = "./car_image.jpg"  # Update with your test image path

In [28]:
def check_server_status(timeout=5.0):
    """Check if Triton server is running and the model is available.

    Issues two HTTP GET requests against TRITON_SERVER_URL:
    ``/v2/health/ready`` for the server and ``/v2/models/<MODEL_NAME>`` for
    the model. A status line is printed for each step.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout, ``requests`` can block indefinitely when the
            server is unreachable or stalls mid-response.

    Returns:
        bool: True only when both requests answer with HTTP 200; False when
        either answers with another status or the request itself fails
        (connection error, timeout, invalid URL, ...).
    """
    base_url = f"http://{TRITON_SERVER_URL}"
    try:
        # Check server health
        health_response = requests.get(f"{base_url}/v2/health/ready", timeout=timeout)
        if health_response.status_code != 200:
            print("❌ Triton server is not ready")
            return False
        print("✅ Triton server is ready")

        # Check model status
        model_response = requests.get(f"{base_url}/v2/models/{MODEL_NAME}", timeout=timeout)
        if model_response.status_code != 200:
            print(f"❌ Model '{MODEL_NAME}' not found or not ready")
            return False
        print(f"✅ Model '{MODEL_NAME}' is available")
        return True
    except requests.RequestException as e:
        # Only transport-level failures are reported as connection errors;
        # programming errors (e.g. an undefined constant) surface normally.
        print(f"Error connecting to server: {e}")
        return False

def test_ultralytics_triton():
    """Run one detection request through Ultralytics YOLO backed by Triton.

    Creates a YOLO object pointing at ``http://<TRITON_SERVER_URL>/<MODEL_NAME>``,
    runs it on TEST_IMAGE_PATH and prints the elapsed wall-clock time followed
    by the class label and confidence of every detection. Any exception is
    caught and printed together with troubleshooting hints. Returns None.
    """
    print("\nTesting Ultralytics YOLO with Triton...")

    try:
        # Ultralytics accepts a Triton endpoint URL in place of a weights file
        triton_model_url = f"http://{TRITON_SERVER_URL}/{MODEL_NAME}"
        model = YOLO(triton_model_url, task="detect")
        print("Model loaded successfully")

        # Skip inference when the path is empty or still the placeholder value
        if not TEST_IMAGE_PATH or TEST_IMAGE_PATH == "/path/to/test_image.jpg":
            print("No test image path provided. Skipping inference test.")
            return

        started_at = time.time()
        results = model(TEST_IMAGE_PATH)
        elapsed = time.time() - started_at
        print(f"Inference completed in {elapsed:.4f} seconds")

        # One result object per input image
        for result in results:
            detections = result.boxes
            print(f"Found {len(detections)} detections:")

            for number, detection in enumerate(detections, start=1):
                class_id = int(detection.cls[0])
                # Fall back to a generic label when the id has no name entry
                if class_id in result.names:
                    label = result.names[class_id]
                else:
                    label = f"Class {class_id}"
                confidence = float(detection.conf[0])
                print(f"  Detection {number}: {label} (Confidence: {confidence:.3f})")

    except Exception as err:
        print(f"Error during testing: {err}")
        print("\nTroubleshooting tips:")
        for tip in (
            "1. Check that Triton server is running (try the check_server_status function)",
            "2. Verify that the model name is correct",
            "3. Make sure the model is properly loaded in Triton (check server logs)",
            "4. Ensure the test image path is correct",
        ):
            print(tip)


In [61]:
# Standalone connectivity check; the returned bool is shown as the cell's output.
check_server_status()

✅ Triton server is ready
✅ Model 'chest_xray_detector' is available


True

In [30]:
# Only attempt inference once the server and model checks pass
if not check_server_status():
    print("Server check failed. Please ensure Triton server is running.")
else:
    # Run test with Ultralytics
    test_ultralytics_triton()

✅ Triton server is ready
✅ Model 'chest_xray_detector' is available

Testing Ultralytics YOLO with Triton...
Model loaded successfully

image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 66.7ms
Speed: 2.5ms preprocess, 66.7ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 640)
Inference completed in 15.8551 seconds
Found 1 detections:
  Detection 1: class7 (Confidence: 0.873)


In [31]:
import concurrent.futures
import time

def run_inference(image_path):
    """Send one detection request for ``image_path`` to the Triton-hosted model.

    A new YOLO object is constructed on every call. Returns True when the
    call completes without raising; otherwise prints the error and returns
    False.
    """
    try:
        detector = YOLO(f"http://{TRITON_SERVER_URL}/{MODEL_NAME}", task="detect")
        detector(image_path)
    except Exception as err:
        print(f"Inference error: {err}")
        return False
    else:
        return True

def stress_test(num_requests=100, concurrency=10, image_path=None, inference_fn=None):
    """Fire ``num_requests`` inference calls from a thread pool and print a summary.

    Args:
        num_requests: Total number of inference calls to submit.
        concurrency: Number of worker threads (``max_workers`` of the pool).
            ThreadPoolExecutor raises ValueError when this is <= 0.
        image_path: Image handed to every call. Defaults to TEST_IMAGE_PATH
            (looked up at call time).
        inference_fn: Callable taking the image path and returning a truthy
            value on success. Defaults to run_inference (looked up at call
            time). Exceptions it raises propagate out of this function.

    Returns:
        None. Results are printed: request count, number of successful
        responses, total wall-clock duration and average requests per second.
    """
    if image_path is None:
        image_path = TEST_IMAGE_PATH
    if inference_fn is None:
        inference_fn = run_inference

    print(f"Starting stress test: {num_requests} requests with concurrency {concurrency}")
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        futures = [executor.submit(inference_fn, image_path) for _ in range(num_requests)]
        success_count = sum(
            bool(future.result())
            for future in concurrent.futures.as_completed(futures)
        )

    total_duration = time.perf_counter() - start_time
    # Guard against a zero reading on coarse clocks instead of raising ZeroDivisionError
    requests_per_sec = num_requests / total_duration if total_duration > 0 else float("inf")

    print("\n=== Stress Test Results ===")
    print(f"Total requests: {num_requests}")
    print(f"Successful responses: {success_count}")
    print(f"Total duration: {total_duration:.2f} seconds")
    print(f"Average requests/sec: {requests_per_sec:.2f}")



In [76]:
# 50 requests spread over 8 worker threads
stress_test(num_requests=50, concurrency=8)

Starting stress test: 50 requests with concurrency 8








image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 38.2ms
Speed: 2.5ms preprocess, 38.2ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 61.1ms
image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 68.4ms
Speed: 5.2ms preprocess, 61.1ms inference, 1.3ms postprocess per image at shape (1, 3, 640, 640)
Speed: 5.2ms preprocess, 68.4ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 640)


image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 73.5ms
Speed: 4.7ms preprocess, 73.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 640)

image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 100.3ms
image 1/1 /home/jovyan/work/car_image.jpg: 640x640 1 class7, 99.9ms
Speed: 4.7ms preprocess, 100.3ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 640)
Speed: 5.2ms preprocess, 99.9ms inferenc

In [43]:
# Fetch the model's metadata from Triton (name, versions, platform, input/output tensor names, datatypes and shapes).
!curl -X GET http://triton_server:8000/v2/models/chest_xray_detector

{"name":"chest_xray_detector","versions":["1"],"platform":"onnxruntime_onnx","inputs":[{"name":"images","datatype":"FP32","shape":[-1,3,-1,-1]}],"outputs":[{"name":"output0","datatype":"FP32","shape":[-1,84,-1]}]}

In [73]:
#!/usr/bin/env python3
import json
import numpy as np
import os
from PIL import Image

def _write_perf_input(img, height, width, output_file):
    """Resize ``img`` to ``width`` x ``height`` and dump it as a perf_analyzer JSON input file."""
    resized = img.resize((width, height), Image.LANCZOS)
    # uint8 HWC pixels -> float32 scaled to the 0-1 range -> CHW layout
    chw = np.transpose(np.array(resized).astype(np.float32) / 255.0, (2, 0, 1))
    # The tensor is flattened for JSON; its shape is supplied separately via --shape
    payload = {"data": [{"images": chw.flatten().tolist()}]}
    with open(output_file, 'w') as f:
        json.dump(payload, f)


def _perf_analyzer_command(input_file, height, width, server_url, model_name):
    """Build the perf_analyzer command line matching a generated input file."""
    return (
        f"perf_analyzer -u {server_url} -m {model_name} --input-data {input_file} "
        f"-b 1 --shape images:1,3,{height},{width} --concurrency-range 8"
    )


def resize_and_convert(image_path='car_image.jpg', output_file='input_fixed.json',
                       size=(336, 504),
                       alt_sizes=((344, 512), (328, 496), (224, 336)),
                       server_url='triton_server:8000',
                       model_name='chest_xray_detector'):
    """
    Resize an image and write it as JSON input files for perf_analyzer.

    The image is converted to RGB, resized, scaled to the 0-1 range, transposed
    to CHW and written under the "images" key of ``output_file``. The same is
    done for every entry of ``alt_sizes`` (files ``input_alt1.json``,
    ``input_alt2.json``, ... in the current directory). For each file the
    matching perf_analyzer command is printed.

    Args:
        image_path: Path of the source image.
        output_file: Path of the JSON file written for the primary size.
        size: ``(height, width)`` of the primary tensor.
        alt_sizes: Iterable of ``(height, width)`` fallbacks; empty or None
            skips the alternatives.
        server_url: ``host:port`` placed after ``-u`` in the printed commands.
        model_name: Model name placed after ``-m`` in the printed commands.

    Returns:
        None. Errors are caught and printed rather than raised.

    NOTE(review): the default sizes are the original author's guesses made from
    an error message that is not shown in this notebook. The Ultralytics run
    earlier in the notebook reported shape (1, 3, 640, 640), so
    ``size=(640, 640)`` may be the safer choice. Detectors of this family
    commonly need height and width divisible by the largest stride (often 32),
    which none of the default pairs satisfy on both sides — TODO confirm
    against the model.
    """
    try:
        # The context manager releases the file handle; convert() returns an
        # independent, fully loaded RGB copy that stays valid afterwards.
        with Image.open(image_path) as source:
            img = source.convert('RGB')

        new_height, new_width = size
        _write_perf_input(img, new_height, new_width, output_file)

        print(f"Successfully created {output_file}")
        print(f"Resized image dimensions: {new_width}x{new_height}")
        print(f"Tensor shape: [1, 3, {new_height}, {new_width}]")

        # Print the command to use with perf_analyzer
        print("\nRun perf_analyzer with the following command:")
        print(_perf_analyzer_command(output_file, new_height, new_width, server_url, model_name))

        # Fallback sizes in case the primary one is rejected by the model
        alternatives = list(alt_sizes or ())
        if alternatives:
            print("\nIf the above dimensions don't work, try these alternatives:")
        for i, (h, w) in enumerate(alternatives, 1):
            alt_file = f"input_alt{i}.json"
            _write_perf_input(img, h, w, alt_file)

            print(f"\nAlternative {i}: {w}x{h}")
            print(f"Created {alt_file}")
            print(_perf_analyzer_command(alt_file, h, w, server_url, model_name))

    except FileNotFoundError:
        print(f"Error: The file '{image_path}' was not found.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

# Generate input_fixed.json and the input_alt*.json fallbacks from car_image.jpg using the defaults.
resize_and_convert()


Successfully created input_fixed.json
Resized image dimensions: 504x336
Tensor shape: [1, 3, 336, 504]

Run perf_analyzer with the following command:
perf_analyzer -u triton_server:8000 -m chest_xray_detector --input-data input_fixed.json -b 1 --shape images:1,3,336,504 --concurrency-range 8

If the above dimensions don't work, try these alternatives:

Alternative 1: 512x344
Created input_alt1.json
perf_analyzer -u triton_server:8000 -m chest_xray_detector --input-data input_alt1.json -b 1 --shape images:1,3,344,512 --concurrency-range 8

Alternative 2: 496x328
Created input_alt2.json
perf_analyzer -u triton_server:8000 -m chest_xray_detector --input-data input_alt2.json -b 1 --shape images:1,3,328,496 --concurrency-range 8

Alternative 3: 336x224
Created input_alt3.json
perf_analyzer -u triton_server:8000 -m chest_xray_detector --input-data input_alt3.json -b 1 --shape images:1,3,224,336 --concurrency-range 8


In [77]:
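# Disabled by default: uncomment to benchmark the model with perf_analyzer using the input_fixed.json generated above.
# !perf_analyzer -u triton_server:8000 -m chest_xray_detector --input-data input_fixed.json -b 1 --shape images:1,3,336,504 --concurrency-range 8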