# One time Setup
- uv sync --group dev
- uv add onnxruntime transformers torch
- uv pip list
- uv run python -c "import torch; import onnx; import onnxruntime; import transformers; print('All packages installed successfully with uv!')"

In [1]:
!uv run python -c "import torch; import onnx; import onnxruntime; import transformers; print('All packages installed successfully with uv!')"

All packages installed successfully with uv!


# 1: Create/Load/Infer Onnx Model

### 1.1 Load Sentiment Analysis Model

In [2]:
# Command 1: Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import onnxruntime as ort
from pathlib import Path
import numpy as np

# Command 2: Set model name
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
print(f"Model: {model_name}")


Model: cardiffnlp/twitter-roberta-base-sentiment-latest


In [2]:

# Command 3: Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer loaded")

# Command 4: Load PyTorch model
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()
print("PyTorch model loaded")

# Command 5: Create dummy input
dummy_input = tokenizer(
    "This is a test sentence.",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512
)
print("Dummy input created")


Tokenizer loaded


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


PyTorch model loaded
Dummy input created


### 1.2 Convert to ONNX

In [3]:
# Command 6: Create output directory
output_dir = Path.home() / ".cache" / "huggingface" / "hub" / "onnx_models" / model_name.replace("/", "_")
output_dir.mkdir(parents=True, exist_ok=True)
onnx_path = output_dir / "model.onnx"
print(f"Output directory: {output_dir}")


Output directory: /Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest


In [4]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [None]:

# Command 7: Convert to ONNX
torch.onnx.export(
    model,
    (dummy_input['input_ids'], dummy_input['attention_mask']),
    str(onnx_path),
    export_params=True,
    opset_version=14,
    do_constant_folding=True,
    input_names=['input_ids', 'attention_mask'],
    output_names=['logits'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        'logits': {0: 'batch_size'}
    }
)
print(f"ONNX model saved to: {onnx_path}")


### 1.3 SKIP TO HERE

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import onnxruntime as ort
from pathlib import Path
import numpy as np

tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Tokenizer loaded")

onnx_path='/Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx'
!ls -l {onnx_path}

Tokenizer loaded
-rw-r--r--@ 1 chang  staff  498864302 Aug 18 14:49 /Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx


In [4]:
# Command 8: Load ONNX model
# https://onnxruntime.ai/docs/tutorials/
onnx_session = ort.InferenceSession(str(onnx_path))

#sess_opt = SessionOptions()
#sess_opt.log_severity_level = 0 // Verbose

#onnx_session = ort.InferenceSession(
#                model_path,
#                sess_options=sess_opt, #https://onnxruntime.ai/docs/api/python/api_summary.html#sessionoptions
#                providers=['CUDAExecutionProvider', 'CPUExecutionProvider'] #https://onnxruntime.ai/docs/execution-providers/
#            )

print("ONNX model loaded")


ONNX model loaded


In [13]:

# Command 9: Test the model
test_texts = ["I love this product!", "This is terrible!", "It's okay."]

# Command 10: Tokenize test texts
tokenized = tokenizer(
    test_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="np"
)

# Command 11: Run inference
outputs = onnx_session.run(
    None,
    {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask']
    }
)

outputs

[array([[-2.037826  , -1.2209561 ,  3.316388  ],
        [ 2.1412935 , -0.4224534 , -1.9739852 ],
        [-1.728252  ,  0.5253647 ,  0.74663556]], dtype=float32)]

In [15]:

# Process results
logits = outputs[0]
probabilities = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()
print(f'probabilities={probabilities}\n')
labels = ['negative', 'neutral', 'positive']

print("Test Results:")
for i, text in enumerate(test_texts):
    pred_label = labels[np.argmax(probabilities[i])]
    print(f"Text: {text}")
    print(f"Prediction: {pred_label}")
    print(f"Probabilities: {probabilities[i]}")
    print("-" * 40)



probabilities=[[0.00465634 0.01053917 0.9848045 ]
 [0.914631   0.07044088 0.01492805]
 [0.04463791 0.42504716 0.5303149 ]]

Test Results:
Text: I love this product!
Prediction: positive
Probabilities: [0.00465634 0.01053917 0.9848045 ]
----------------------------------------
Text: This is terrible!
Prediction: negative
Probabilities: [0.914631   0.07044088 0.01492805]
----------------------------------------
Text: It's okay.
Prediction: positive
Probabilities: [0.04463791 0.42504716 0.5303149 ]
----------------------------------------


### 1.4 API Summary
- https://onnxruntime.ai/docs/api/python/api_summary.html

# 2 Use Onnx_memory_tracking module
- code is from example_usage_factored.py


In [1]:
import sys
from pathlib import Path
import onnxruntime as ort
from transformers import AutoTokenizer
import numpy as np

from onnx_memory_tracking import (
    track_with_psutil,
    track_with_tracemalloc,
    track_with_onnx_providers,
    track_disk_io,
    track_with_system_monitor,
    get_onnx_model_info,
    create_onnx_session_with_profiling,
    run_comprehensive_tracking,
    track_model_unloading,
    track_memory_leaks_during_unloading
)

  from .autonotebook import tqdm as notebook_tqdm


## 2.1 Main Usage

In [34]:
# WARNING: tracker is not thread safe, make it a singleton
# tracker = ONNXMemoryTracker() 
# tracker.start_tracking()
# .....
# metrics = tracker.end_tracking()

def example_basic_usage(onnx_path: Path):
    """Example 1: Basic usage with your own ONNX model and tokenizer."""
    print("=== Example 1: Basic Usage ===")
    
    # Load your model and tokenizer
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    
    if not onnx_path.exists():
        print(f"ONNX model not found at {onnx_path}")
        return
    
    # Create your ONNX session
    onnx_session = ort.InferenceSession(str(onnx_path))
    
    # Your test data
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    
    # Run basic memory tracking
    metrics = track_with_psutil(onnx_session, tokenizer, test_texts)
    print(f"Basic tracking completed. Memory used: {metrics['memory_used'] / 1024 / 1024:.2f} MB")


example_basic_usage(Path(onnx_path))


=== Example 1: Basic Usage ===
=== Method 1: psutil Memory Tracking ===
Execution time: 0.0214 seconds
Memory used: 0.17 MB
Peak memory: 1194.94 MB
Virtual memory: 405377.36 MB
Basic tracking completed. Memory used: 0.17 MB


In [28]:

def example_with_preprocessed_data(onnx_path):
    """Example 2: Using pre-processed input data."""
    print("\n=== Example 2: Pre-processed Data ===")
    
    # Load your model and tokenizer
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
        
    if not onnx_path.exists():
        print(f"ONNX model not found at {onnx_path}")
        return
    
    # Create your ONNX session
    onnx_session = ort.InferenceSession(str(onnx_path))
    
    # Pre-process your data
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    tokenized = tokenizer(
        test_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="np"
    )
    
    input_data = {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask']
    }
    
    # Run tracking with pre-processed data
    metrics = track_with_psutil(onnx_session, tokenizer, input_data=input_data)
    print(f"Tracking with pre-processed data completed. Memory used: {metrics['memory_used'] / 1024 / 1024:.2f} MB")

example_with_preprocessed_data(Path(onnx_path))


=== Example 2: Pre-processed Data ===
=== Method 1: psutil Memory Tracking ===
Execution time: 0.0210 seconds
Memory used: 0.50 MB
Peak memory: 1090.75 MB
Virtual memory: 405380.34 MB
Tracking with pre-processed data completed. Memory used: 0.50 MB


In [29]:

def example_with_profiling(onnx_path):
    """Example 3: Using ONNX profiling."""
    print("\n=== Example 3: ONNX Profiling ===")
    
    # Load your model and tokenizer
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Find your ONNX model
    onnx_path = Path.home() / ".cache" / "huggingface" / "hub" / "onnx_models" / model_name.replace("/", "_") / "model.onnx"
    
    if not onnx_path.exists():
        print(f"ONNX model not found at {onnx_path}")
        return
    
    # Create ONNX session with profiling enabled
    profiling_session = create_onnx_session_with_profiling(onnx_path, "my_profile")
    
    # Your test data
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    
    # Run profiling
    profile_results = track_with_onnx_providers(profiling_session, tokenizer, test_texts, num_runs=10)
    print(f"Profiling completed. Profile file: {profile_results['profile_file']}")

example_with_profiling(Path(onnx_path))



=== Example 3: ONNX Profiling ===

=== Method 3: ONNX Runtime Provider Tracking ===
Profiling completed. Profile file: my_profile_2025-08-18_16-11-37.json
Profiling completed. Profile file: my_profile_2025-08-18_16-11-37.json


In [31]:
def example_comprehensive_tracking(onnx_path):
    """Example 4: Comprehensive tracking with all methods."""
    print("\n=== Example 4: Comprehensive Tracking ===")
    
    # Load your model and tokenizer
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
        
    # Create your ONNX session
    onnx_session = ort.InferenceSession(str(onnx_path))
    
    # Your test data
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    
    # Run comprehensive tracking
    results = run_comprehensive_tracking(
        onnx_session=onnx_session,
        tokenizer=tokenizer,
        test_texts=test_texts,
        enable_debug=True,      # Enable tracemalloc
        enable_optimization=False,  # Disable ONNX profiling (would need separate session)
        enable_disk_io=True,    # Enable disk I/O tracking
        enable_real_time=False  # Disable real-time monitoring
    )
    
    print("\nComprehensive tracking results:")
    for method, result in results.items():
        print(f"  {method}: {type(result).__name__}")

example_comprehensive_tracking(Path(onnx_path))


=== Example 4: Comprehensive Tracking ===
Running basic memory tracking...
=== Method 1: psutil Memory Tracking ===
Execution time: 0.0199 seconds
Memory used: 0.11 MB
Peak memory: 1183.84 MB
Virtual memory: 405385.34 MB

Running tracemalloc analysis...

=== Method 2: tracemalloc Memory Tracking ===
Top 10 memory differences:
+3 blocks: 0.0 MB
  ['  File "/Users/chang/Documents/dev/git/ml/coachKata/.venv/lib/python3.11/site-packages/transformers/tokenization_utils_fast.py", line 553', '    encodings = self._tokenizer.encode_batch(']
+4 blocks: 0.0 MB
  ['  File "/Users/chang/Documents/dev/git/ml/coachKata/.venv/lib/python3.11/site-packages/transformers/tokenization_utils_base.py", line 756', '    return np.asarray(value, dtype=dtype)']
+6 blocks: 0.0 MB
  ['  File "/Users/chang/Documents/dev/git/ml/coachKata/.venv/lib/python3.11/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 273', '    return self._sess.run(output_names, input_feed, run_options)']
+2 blocks:

## 2.2 Model Unloading
= Model unloading: see model_example_unloading.py
- Metrics
    * Memory freed (negative memory_used)
    * Final memory footprint (memory_current)
    * Relative impact (memory_percent_change)
    * Memory leaks (run multiple cycles)
    * Unloading efficiency (compare with loading)
- Metric	|  What to Look For	|  Good Sign	|  Bad Sign
    * memory_used |	Should be negative	| -50 MB	| +10 MB
    * memory_current|	Should decrease	| 100 MB → 50 MB	| 100 MB → 110 MB
    * memory_percent_change |	Should be negative	| -0.5% |	+0.1%
    * unloading_successful |	Should be True	| True	| False


In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import onnxruntime as ort
from pathlib import Path
import numpy as np


from onnx_memory_tracking import (
    track_with_psutil,
    track_model_unloading,
    track_memory_leaks_during_unloading,
    unload_model_with_tracking,

    track_onnx_runtime_memory_release,
    demonstrate_actual_onnx_memory_release
)

onnx_path='/Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx'


  from .autonotebook import tqdm as notebook_tqdm


##### Basic  - INaccurate; only reference

In [2]:

def example_basic_unloading_v2(onnx_path=onnx_path):
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Create ONNX session
    onnx_session = ort.InferenceSession(str(onnx_path))
    
    # Test texts
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    
    print("1. Testing basic unloading tracking...")
    try:
        unloading_metrics = track_model_unloading(
            onnx_session=onnx_session,
            tokenizer=tokenizer,
            test_texts=test_texts
        )
        
        print(f"✅ Basic unloading tracking completed")
        print(f"   Success: {unloading_metrics['unloading_successful']}")
        print(f"   Memory freed: {unloading_metrics['memory_freed_mb']:.2f} MB")
        
        # Check if the function returns expected keys
        expected_keys = [
            'unloading_successful', 'memory_freed_bytes', 'memory_freed_mb',
            'memory_freed_percent', 'memory_before_unload', 'memory_after_unload',
            'memory_before_percent', 'memory_after_percent', 'unloading_time'
        ]
        
        missing_keys = [key for key in expected_keys if key not in unloading_metrics]
        if missing_keys:
            print(f"❌ Missing keys: {missing_keys}")
            return False
        else:
            print(f"✅ All expected keys present")
        
    except Exception as e:
        print(f"❌ Basic unloading tracking failed: {e}")
        return False
    
    print("\n2. Testing actual unloading tracking...")
    try:
        # Create new session and tokenizer for actual unloading test
        tokenizer2 = AutoTokenizer.from_pretrained(model_name)
        onnx_session2 = ort.InferenceSession(str(onnx_path))
        
        unloading_metrics2 = unload_model_with_tracking(
            onnx_session=onnx_session2,
            tokenizer=tokenizer2,
            test_texts=test_texts
        )
        
        print(f"✅ Actual unloading tracking completed")
        print(f"   Success: {unloading_metrics2['unloading_successful']}")
        print(f"   Memory freed: {unloading_metrics2['memory_freed_mb']:.2f} MB")
        
    except Exception as e:
        print(f"❌ Actual unloading tracking failed: {e}")
        return False
    
    print("\n✅ All tests passed!")
    return True

example_basic_unloading_v2()

1. Testing basic unloading tracking...
=== Model Unloading Memory Tracking ===
Final inference completed
Unloading model...
Unloading time: 0.0413 seconds
Memory before unload: 1329.09 MB
Memory after unload: 1329.09 MB
Memory freed: 0.00 MB
Memory freed percentage: 0.00%
✅ Model unloading successful - memory freed or stable
✅ Basic unloading tracking completed
   Success: True
   Memory freed: 0.00 MB
✅ All expected keys present

2. Testing actual unloading tracking...
=== Actual Model Unloading with Tracking ===
Final inference completed
Actually unloading model...
Unloading time: 0.0375 seconds
Memory before unload: 1716.77 MB
Memory after unload: 1716.92 MB
Memory freed: -0.16 MB
Memory freed percentage: -0.00%
✅ Model unloading successful - memory freed or stable
✅ Actual unloading tracking completed
   Success: True
   Memory freed: -0.16 MB

✅ All tests passed!


True

##### MOre accurate: System + Process memory

In [4]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
    
onnx_session = ort.InferenceSession(str(onnx_path))
test_texts = ["I love this product!", "This is terrible!", "It's okay."]

# More similear to my usecase
release_metrics = track_onnx_runtime_memory_release(
    onnx_session=onnx_session,
    tokenizer=tokenizer,
    test_texts=test_texts
)
print(release_metrics)

=== ONNX Runtime Memory Release Tracking ===
Initial system memory: 4716.66 MB available
Initial process memory: 1425.16 MB RSS
Final inference completed
Releasing ONNX Runtime memory...
  - Session providers: ['CPUExecutionProvider']
  - Session inputs: ['input_ids', 'attention_mask']
  - Session outputs: ['logits']
  - Destroying ONNX session...

--- Memory Release Results ---
Unloading time: 0.0745 seconds
Process memory freed: -0.16 MB
System memory freed: -5.58 MB
Process memory freed: -0.97 MB
Memory freed percentage: -0.00%
✅ ONNX Runtime memory release successful
{'unloading_successful': True, 'memory_freed_bytes': -163840, 'memory_freed_mb': -0.15625, 'memory_freed_percent': -0.00095367431640625, 'memory_before_unload': 1495236608, 'memory_after_unload': 1495400448, 'memory_before_percent': 8.703422546386719, 'memory_after_percent': 8.704376220703125, 'unloading_time': 0.07452392578125, 'system_memory_freed_mb': -5.578125, 'process_memory_freed_mb': -0.96875, 'session_provider

##### More accurate, but non suited my use case: Process separation

In [5]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_name)
    
onnx_session = ort.InferenceSession(str(onnx_path))
test_texts = ["I love this product!", "This is terrible!", "It's okay."]

demonstration_results = demonstrate_actual_onnx_memory_release(
    onnx_path=onnx_path,
    model_name=model_name,
    test_texts=test_texts
)

=== Demonstrating Actual ONNX Memory Release ===
Initial system memory: 4649.78 MB available
Initial process memory: 1515.89 MB RSS
Running model in separate process...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Model loaded and unloaded successfully in separate process
Process output: Inference completed in subprocess
Session ready for destruction

--- Memory Release Demonstration Results ---
System memory change: -9.06 MB
Process memory change: +458.12 MB
Final system memory: 4640.72 MB available
Final process memory: 1057.77 MB RSS
✅ Actual ONNX memory release demonstrated!
   This shows that ONNX Runtime does release memory when sessions are destroyed


##### A more releastic example

In [10]:
import os
from pathlib import Path
import onnxruntime as ort
from transformers import AutoTokenizer

def smoke_check_model_size():
    """Smoke check to verify if 41MB memory release makes sense."""
    
    # CardiffNLP model path
    model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    onnx_path = Path.home() / ".cache" / "huggingface" / "hub" / "onnx_models" / model_name.replace("/", "_") / "model.onnx"
    
    print("=== CardiffNLP Model Size Smoke Check ===")
    
    # 1. Check ONNX file size
    if onnx_path.exists():
        file_size_mb = onnx_path.stat().st_size / 1024 / 1024
        print(f"ONNX file size: {file_size_mb:.2f} MB")
    else:
        print(f"ONNX file not found at {onnx_path}")
        return
    
    # 2. Check HuggingFace model size
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoTokenizer.from_pretrained(model_name)
    
    # Get model size from HuggingFace cache
    cache_dir = Path.home() / ".cache" / "huggingface" / "hub"
    model_cache_dir = cache_dir / model_name.replace("/", "_")
    
    if model_cache_dir.exists():
        total_size = sum(f.stat().st_size for f in model_cache_dir.rglob('*') if f.is_file())
        model_size_mb = total_size / 1024 / 1024
        print(f"HuggingFace model cache size: {model_size_mb:.2f} MB")
    
    # 3. Load ONNX session and check memory usage
    session = ort.InferenceSession(str(onnx_path))
    
    # Get session info
    inputs = session.get_inputs()
    outputs = session.get_outputs()
    providers = session.get_providers()
    
    print(f"ONNX session providers: {providers}")
    print(f"Inputs: {[input.name for input in inputs]}")
    print(f"Outputs: {[output.name for output in outputs]}")
    
    # 4. Memory analysis
    print(f"\n=== Memory Analysis ===")
    print(f"ONNX file size: {file_size_mb:.2f} MB")
    print(f"Your observed memory release: 41 MB average")
    print(f"Memory release ratio: {41/file_size_mb:.2f}x")
    
    # 5. Expected ranges
    print(f"\n=== Expected Ranges ===")
    print(f"Minimum expected (file size): {file_size_mb:.2f} MB")
    print(f"Typical expected (1.5-3x file size): {file_size_mb*1.5:.2f} - {file_size_mb*3:.2f} MB")
    print(f"Maximum expected (5x file size): {file_size_mb*5:.2f} MB")
    
    # 6. Assessment
    print(f"\n=== Assessment ===")
    if 41 < file_size_mb:
        print(f"❌ 41MB is LESS than file size ({file_size_mb:.2f}MB) - This suggests incomplete unloading")
    elif 41 < file_size_mb * 1.5:
        print(f"⚠️  41MB is close to file size - May be memory-mapped or cached")
    elif 41 <= file_size_mb * 3:
        print(f"✅ 41MB is in expected range - Reasonable for ONNX Runtime")
    else:
        print(f"✅ 41MB is higher than expected - Good memory release")

# Run the smoke check
smoke_check_model_size()

=== CardiffNLP Model Size Smoke Check ===
ONNX file size: 475.75 MB
ONNX session providers: ['CPUExecutionProvider']
Inputs: ['input_ids', 'attention_mask']
Outputs: ['logits']

=== Memory Analysis ===
ONNX file size: 475.75 MB
Your observed memory release: 41 MB average
Memory release ratio: 0.09x

=== Expected Ranges ===
Minimum expected (file size): 475.75 MB
Typical expected (1.5-3x file size): 713.63 - 1427.26 MB
Maximum expected (5x file size): 2378.77 MB

=== Assessment ===
❌ 41MB is LESS than file size (475.75MB) - This suggests incomplete unloading


In [11]:
# Memory mapped = on disk; 
# we are releasing he RAM, which is about 10% of our model file size, but not the disk.
# To release memory mapped, need process separation, but the mapped memory can be resused withi the same process.
def load_and_unload_multiple_models_v2(
    model_paths: list,
    model_name: str = "cardiffnlp/twitter-roberta-base-sentiment-latest",
    pause_duration: int = 3
):
    """
    Load 3 ONNX models and track memory release for each one individually.
    Enhanced with file size comparison and memory mapping analysis.
    """
    print("=== Multi-Model Loading and Unloading with Memory Tracking ===")
    print(f"Loading {len(model_paths)} models...")
    print(f"Pause duration between unloads: {pause_duration} seconds")
    print("=" * 60)
    
    # Test texts for inference
    test_texts = ["I love this product!", "This is terrible!", "It's okay."]
    
    # Load tokenizer once (shared across models)
    print("Loading shared tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    
    # Store loaded models
    loaded_models = []
    
    # Step 1: Load all models and get file sizes
    total_file_size = 0
    for i, model_path in enumerate(model_paths, 1):
        print(f"\n--- Loading Model {i}: {Path(model_path).name} ---")
        
        if not Path(model_path).exists():
            print(f"❌ Model not found at {model_path}")
            continue
        
        # Get file size
        file_size_mb = Path(model_path).stat().st_size / 1024 / 1024
        total_file_size += file_size_mb
        print(f"   File size: {file_size_mb:.2f} MB")
        
        try:
            # Load ONNX session
            # This line does NOT load the model into RAM yet
            # What actually happens:
            #   1. ONNX Runtime opens the file
            #   2. Maps the 475MB file to virtual memory address space
            #   3. Reads the model metadata (structure, inputs, outputs)
            #   4. Sets up the inference session
            #   5. Allocates minimal memory for session management
            #   6. Model weights stay on disk (memory-mapped)
            # Memory impact after line executes:
            #   - Virtual Memory (VMS): +475MB (mapped)
            #   - Physical RAM (RSS): +~5-10MB (session overhead only)
            #   - Disk I/O: Minimal (just metadata)
            #   - Model weights: Still on disk, not in RAM
            onnx_session = ort.InferenceSession(str(model_path))
            
            # Test inference to ensure model is loaded                       
            tokenized = tokenizer(
                test_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="np"
            )
            
            input_data = {
                'input_ids': tokenized['input_ids'],
                'attention_mask': tokenized['attention_mask']
            }

            # This line triggers actual model loading from disk
            # What actually happens:
            #   1. ONNX Runtime processes the input data
            #   2. Accesses model weights for computation
            #   3. OS triggers page faults for missing pages
            #   4. Loads required model pages from disk into RAM
            #   5. Performs inference computation
            #   6. Returns results
            # After this line:
            #   - Virtual Memory (VMS): Still 475MB (mapped)
            #   - Physical RAM (RSS): +~42MB (loaded pages)
            #   - Disk I/O: Significant (loading model weights)
            #   - Model weights: Partially in RAM (actively used portions)
            outputs = onnx_session.run(None, input_data)
            print(f"✅ Model {i} loaded successfully")
            print(f"   Output shape: {outputs[0].shape}")
            
            # Store model info
            loaded_models.append({
                'index': i,
                'path': model_path,
                'name': Path(model_path).name,
                'session': onnx_session,
                'tokenizer': tokenizer,
                'file_size_mb': file_size_mb
            })
            
        except Exception as e:
            print(f"❌ Failed to load model {i}: {e}")
    
    print(f"\n�� Loaded {len(loaded_models)} models successfully")
    print(f"�� Total file size: {total_file_size:.2f} MB")
    
    # Step 2: Unload each model individually and track memory
    total_memory_freed = 0
    memory_analysis = []
    
    for model_info in loaded_models:
        print(f"\n{'='*60}")
        print(f"--- Unloading Model {model_info['index']}: {model_info['name']} ---")
        print(f"{'='*60}")
        
        # Track memory release for this specific model
        release_metrics = track_onnx_runtime_memory_release(
            onnx_session=model_info['session'],
            tokenizer=model_info['tokenizer'],
            test_texts=test_texts
        )
        
        # Calculate efficiency metrics
        file_size = model_info['file_size_mb']
        memory_freed = release_metrics['memory_freed_mb']
        efficiency_ratio = memory_freed / file_size if file_size > 0 else 0
        
        # Display results for this model
        print(f"\n📈 Memory Release Results for Model {model_info['index']}:")
        print(f"   File size: {file_size:.2f} MB")
        print(f"   Process memory freed: {memory_freed:.2f} MB")
        print(f"   System memory freed: {release_metrics['system_memory_freed_mb']:.2f} MB")
        print(f"   Memory freed percentage: {release_metrics['memory_freed_percent']:.2f}%")
        print(f"   Unloading time: {release_metrics['unloading_time']:.4f} seconds")
        print(f"   Efficiency ratio: {efficiency_ratio:.2f} ({efficiency_ratio*100:.1f}% of file size)")
        
        # Memory mapping analysis
        if efficiency_ratio < 0.5:
            print(f"   ⚠️  Low efficiency - likely memory-mapped or cached")
        elif efficiency_ratio < 1.0:
            print(f"   ⚠️  Partial release - memory-mapped model")
        elif efficiency_ratio < 2.0:
            print(f"   ✅ Good release - typical ONNX Runtime behavior")
        else:
            print(f"   ✅ Excellent release - better than expected")
        
        # Accumulate totals
        total_memory_freed += memory_freed
        memory_analysis.append({
            'model': model_info['name'],
            'file_size': file_size,
            'memory_freed': memory_freed,
            'efficiency_ratio': efficiency_ratio
        })
        
        # Pause before next unload
        if model_info['index'] < len(loaded_models):
            print(f"\n⏸️  Pausing for {pause_duration} seconds before next unload...")
            time.sleep(pause_duration)
    
    # Final summary with analysis
    print(f"\n{'='*60}")
    print("📊 FINAL SUMMARY")
    print(f"{'='*60}")
    print(f"Total models unloaded: {len(loaded_models)}")
    print(f"Total file size: {total_file_size:.2f} MB")
    print(f"Total process memory freed: {total_memory_freed:.2f} MB")
    print(f"Average memory freed per model: {total_memory_freed/len(loaded_models):.2f} MB")
    print(f"Overall efficiency: {total_memory_freed/total_file_size:.2f} ({total_memory_freed/total_file_size*100:.1f}% of total file size)")
    
    # Memory mapping explanation
    print(f"\n🔍 Memory Mapping Analysis:")
    print(f"   Your models are likely memory-mapped by ONNX Runtime")
    print(f"   This means the 475MB file stays in memory-mapped region")
    print(f"   Only 42MB (9%) is freed because:")
    print(f"     - File remains memory-mapped")
    print(f"     - Runtime keeps caches for performance")
    print(f"     - Memory pools are reused")
    print(f"   For complete memory release, consider:")
    print(f"     - Using separate processes")
    print(f"     - Restarting the application")
    print(f"     - Using memory-mapped file alternatives")
    
    print(f"Memory tracking completed!")
    
    return memory_analysis

In [12]:
model_paths = [
        "/Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx",
        "/Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx",  # Same model for demo
        "/Users/chang/.cache/huggingface/hub/onnx_models/cardiffnlp_twitter-roberta-base-sentiment-latest/model.onnx"   # Same model for demo
    ]
    
# You can replace with different models:
# model_paths = [
#     "/path/to/model1.onnx",
#     "/path/to/model2.onnx", 
#     "/path/to/model3.onnx"
# ]

#load_and_unload_multiple_models(
load_and_unload_multiple_models_v2(
    model_paths=model_paths,
    model_name="cardiffnlp/twitter-roberta-base-sentiment-latest",
    pause_duration=3  # 3 seconds pause between unloads
)

=== Multi-Model Loading and Unloading with Memory Tracking ===
Loading 3 models...
Pause duration between unloads: 3 seconds
Loading shared tokenizer...

--- Loading Model 1: model.onnx ---
   File size: 475.75 MB
✅ Model 1 loaded successfully
   Output shape: (3, 3)

--- Loading Model 2: model.onnx ---
   File size: 475.75 MB
✅ Model 2 loaded successfully
   Output shape: (3, 3)

--- Loading Model 3: model.onnx ---
   File size: 475.75 MB
✅ Model 3 loaded successfully
   Output shape: (3, 3)

�� Loaded 3 models successfully
�� Total file size: 1427.26 MB

--- Unloading Model 1: model.onnx ---
=== ONNX Runtime Memory Release Tracking ===
Initial system memory: 3450.19 MB available
Initial process memory: 2282.33 MB RSS
Final inference completed
Releasing ONNX Runtime memory...
  - Session providers: ['CPUExecutionProvider']
  - Session inputs: ['input_ids', 'attention_mask']
  - Session outputs: ['logits']
  - Destroying ONNX session...

--- Memory Release Results ---
Unloading time: 0

[{'model': 'model.onnx',
  'file_size': 475.75407218933105,
  'memory_freed': -44.59375,
  'efficiency_ratio': -0.09373277625305007},
 {'model': 'model.onnx',
  'file_size': 475.75407218933105,
  'memory_freed': -0.015625,
  'efficiency_ratio': -3.284259854696919e-05},
 {'model': 'model.onnx',
  'file_size': 475.75407218933105,
  'memory_freed': 0.0,
  'efficiency_ratio': 0.0}]