From b0e7bfc2bc140a0809e2267dd02267c6bc201f1c Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 16:51:44 -0700
Subject: [PATCH 01/22] [IR] Implement tofile for tensors

---
 onnxscript/ir/_core.py      | 26 ++++++++++++++++++++++++++
 onnxscript/ir/_protocols.py |  4 ++++
 2 files changed, 30 insertions(+)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 51c6d83502..dff56e408d 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -16,6 +16,7 @@
 import contextlib
 import dataclasses
 import heapq
+import io
 import math
 import mmap
 import os
@@ -26,6 +27,7 @@
 from typing import (
     AbstractSet,
     Any,
+    BinaryIO,
     Callable,
     Collection,
     Generic,
@@ -122,6 +124,12 @@ def nbytes(self) -> int:
         # Use math.ceil because when dtype is INT4, the itemsize is 0.5
         return math.ceil(self.dtype.itemsize * self.size)
 
+    def tofile(self, file: BinaryIO, /) -> None:
+        """Write the tensor content as bytes to a file-like object."""
+        # The naive implementation calls tobytes(), which creates a copy of the data.
+        # Advanced implementations can directly write to the file to avoid the copy.
+        file.write(self.tobytes())
+
     def display(self, *, page: bool = False) -> None:
         rich = _display.require_rich()
 
@@ -456,6 +464,24 @@ def tobytes(self) -> bytes:
             array = array.view(array.dtype.newbyteorder("<"))
         return array.tobytes()
 
+    def tofile(self, file: BinaryIO, /) -> None:
+        """Write the tensor content as bytes to a file-like object."""
+        if self.dtype in {
+            _enums.DataType.INT4,
+            _enums.DataType.UINT4,
+            _enums.DataType.FLOAT4E2M1,
+        }:
+            # Packing is required. So we call tobytes() directly
+            file.write(self.tobytes())
+            return
+
+        # Otherwise use tofile from the numpy array
+        array = self.numpy()
+        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
+        if not _IS_LITTLE_ENDIAN:
+            array = array.view(array.dtype.newbyteorder("<"))
+        return array.tofile(file)
+
     @property
     def metadata_props(self) -> dict[str, str]:
         if self._metadata_props is None:
diff --git a/onnxscript/ir/_protocols.py b/onnxscript/ir/_protocols.py
index fbc2c7c054..56c5a245a6 100644
--- a/onnxscript/ir/_protocols.py
+++ b/onnxscript/ir/_protocols.py
@@ -33,6 +33,7 @@
 import typing
 from typing import (
     Any,
+    BinaryIO,
     Collection,
     Iterable,
     Iterator,
@@ -145,6 +146,9 @@ def tobytes(self) -> bytes:
         """Return the tensor as a byte string conformed to the ONNX specification, in little endian."""
         ...
 
+    def tofile(self, file: BinaryIO, /) -> None:
+        """Write the tensor content as bytes to a file-like object."""
+
 
 @typing.runtime_checkable
 class ValueProtocol(Protocol):

From 8bb87501ea4057e427d389b727a39be1a74f2a57 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 17:57:59 -0700
Subject: [PATCH 02/22] Implement tofile

---
 onnxscript/ir/_core.py | 50 +++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index dff56e408d..d22dfea48f 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -16,7 +16,6 @@
 import contextlib
 import dataclasses
 import heapq
-import io
 import math
 import mmap
 import os
@@ -124,11 +123,23 @@ def nbytes(self) -> int:
         # Use math.ceil because when dtype is INT4, the itemsize is 0.5
         return math.ceil(self.dtype.itemsize * self.size)
 
-    def tofile(self, file: BinaryIO, /) -> None:
+    def tofile(self: _protocols.TensorProtocol, file: BinaryIO, /) -> None:
         """Write the tensor content as bytes to a file-like object."""
-        # The naive implementation calls tobytes(), which creates a copy of the data.
-        # Advanced implementations can directly write to the file to avoid the copy.
-        file.write(self.tobytes())
+        if self.dtype in {
+            _enums.DataType.INT4,
+            _enums.DataType.UINT4,
+            _enums.DataType.FLOAT4E2M1,
+        }:
+            # Packing is required. So we call tobytes() directly
+            file.write(self.tobytes())
+            return
+
+        # Otherwise use tofile from the numpy array
+        array = self.numpy()
+        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
+        if not _IS_LITTLE_ENDIAN:
+            array = array.view(array.dtype.newbyteorder("<"))
+        return array.tofile(file)
 
     def display(self, *, page: bool = False) -> None:
         rich = _display.require_rich()
@@ -464,24 +475,6 @@ def tobytes(self) -> bytes:
             array = array.view(array.dtype.newbyteorder("<"))
         return array.tobytes()
 
-    def tofile(self, file: BinaryIO, /) -> None:
-        """Write the tensor content as bytes to a file-like object."""
-        if self.dtype in {
-            _enums.DataType.INT4,
-            _enums.DataType.UINT4,
-            _enums.DataType.FLOAT4E2M1,
-        }:
-            # Packing is required. So we call tobytes() directly
-            file.write(self.tobytes())
-            return
-
-        # Otherwise use tofile from the numpy array
-        array = self.numpy()
-        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
-        if not _IS_LITTLE_ENDIAN:
-            array = array.view(array.dtype.newbyteorder("<"))
-        return array.tofile(file)
-
     @property
     def metadata_props(self) -> dict[str, str]:
         if self._metadata_props is None:
@@ -725,6 +718,17 @@ def tobytes(self) -> bytes:
         length = self._length or self.nbytes
         return self.raw[offset : offset + length]
 
+    def tofile(self, file: BinaryIO, /) -> None:
+        """Write the tensor content as bytes to a file-like object."""
+        self._check_validity()
+        if self.raw is None:
+            self._load()
+        assert self.raw is not None
+        offset = self._offset or 0
+        length = self._length or self.nbytes
+        # FIXME avoid a copy
+        file.write(self.raw[offset : offset + length])
+
     def valid(self) -> bool:
         """Check if the tensor is valid.
 

From 5115427353e6a47554e8130c4eb9adeab6cec4ea Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 18:00:43 -0700
Subject: [PATCH 03/22] use tofile in external data

---
 onnxscript/ir/external_data.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 4ca9ca5036..58d7bbf08c 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -173,14 +173,13 @@ def _write_external_data(
         for tensor, tensor_info in zip(tensors, external_data_infos, strict=True):
             current_offset = tensor_info.offset
             assert tensor is not None
-            raw_data = tensor.tobytes()
-            if isinstance(tensor, _core.ExternalTensor):
-                tensor.release()
             # Pad file to required offset if needed
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
-            data_file.write(raw_data)
+            tensor.tofile(data_file)
+            if isinstance(tensor, _core.ExternalTensor):
+                tensor.release()
 
 
 def _create_external_tensor(

From 08225924419a2f9650f984fbe86c1f9155d71ce1 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:20:27 -0700
Subject: [PATCH 04/22] Drop explicit self type annotation from tofile

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index d22dfea48f..2a6a50035b 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -123,7 +123,7 @@ def nbytes(self) -> int:
         # Use math.ceil because when dtype is INT4, the itemsize is 0.5
         return math.ceil(self.dtype.itemsize * self.size)
 
-    def tofile(self: _protocols.TensorProtocol, file: BinaryIO, /) -> None:
+    def tofile(self, file: BinaryIO, /) -> None:
         """Write the tensor content as bytes to a file-like object."""
         if self.dtype in {
             _enums.DataType.INT4,

From b2381658492510a9bcc8c0a8574db7368e33bceb Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:22:32 -0700
Subject: [PATCH 05/22] do not return

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 2a6a50035b..abc3eb65c4 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -139,7 +139,7 @@ def tofile(self, file: BinaryIO, /) -> None:
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
             array = array.view(array.dtype.newbyteorder("<"))
-        return array.tofile(file)
+        array.tofile(file)
 
     def display(self, *, page: bool = False) -> None:
         rich = _display.require_rich()

From 97973cef4c7ee1ee7038ab33a972906b42e00fc2 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:41:31 -0700
Subject: [PATCH 06/22] Implement memoryview

---
 onnxscript/ir/_core.py         | 63 ++++++++++++++++++----------------
 onnxscript/ir/_protocols.py    |  8 ++---
 onnxscript/ir/external_data.py |  2 +-
 3 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index abc3eb65c4..5fc9b43709 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -26,7 +26,6 @@
 from typing import (
     AbstractSet,
     Any,
-    BinaryIO,
     Callable,
     Collection,
     Generic,
@@ -41,7 +40,7 @@
 
 import ml_dtypes
 import numpy as np
-from typing_extensions import TypeIs
+from typing_extensions import Buffer, TypeIs
 
 import onnxscript
 from onnxscript.ir import (
@@ -96,7 +95,7 @@ def _compatible_with_dlpack(obj: Any) -> TypeGuard[_protocols.DLPackCompatible]:
     return hasattr(obj, "__dlpack__")
 
 
-class TensorBase(abc.ABC, _protocols.TensorProtocol, _display.PrettyPrintable):
+class TensorBase(abc.ABC, Buffer, _protocols.TensorProtocol, _display.PrettyPrintable):
     """Convenience Shared methods for classes implementing TensorProtocol."""
 
     __slots__ = ()
@@ -112,34 +111,36 @@ def _repr_base(self) -> str:
         """
         return f"{self.__class__.__name__}<{self._printable_type_shape()}>"
 
-    @property
-    def size(self) -> int:
-        """The number of elements in the tensor."""
-        return math.prod(self.shape.numpy())  # type: ignore[attr-defined]
-
-    @property
-    def nbytes(self) -> int:
-        """The number of bytes in the tensor."""
-        # Use math.ceil because when dtype is INT4, the itemsize is 0.5
-        return math.ceil(self.dtype.itemsize * self.size)
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Return a memoryview of the tensor.
 
-    def tofile(self, file: BinaryIO, /) -> None:
-        """Write the tensor content as bytes to a file-like object."""
+        This is used to support the buffer protocol.
+        """
         if self.dtype in {
             _enums.DataType.INT4,
             _enums.DataType.UINT4,
             _enums.DataType.FLOAT4E2M1,
         }:
             # Packing is required. So we call tobytes() directly
-            file.write(self.tobytes())
-            return
+            return memoryview(self.tobytes())
 
-        # Otherwise use tofile from the numpy array
+        # Otherwise get the memoryview from the numpy array
         array = self.numpy()
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
             array = array.view(array.dtype.newbyteorder("<"))
-        array.tofile(file)
+        return memoryview(array)
+
+    @property
+    def size(self) -> int:
+        """The number of elements in the tensor."""
+        return math.prod(self.shape.numpy())  # type: ignore[attr-defined]
+
+    @property
+    def nbytes(self) -> int:
+        """The number of bytes in the tensor."""
+        # Use math.ceil because when dtype is INT4, the itemsize is 0.5
+        return math.ceil(self.dtype.itemsize * self.size)
 
     def display(self, *, page: bool = False) -> None:
         rich = _display.require_rich()
@@ -676,6 +677,19 @@ def __array__(self, dtype: Any = None) -> np.ndarray:
         assert self._array is not None
         return self._array.__array__(dtype)
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Return a memoryview of the tensor.
+
+        This is used to support the buffer protocol.
+        """
+        self._check_validity()
+        if self.raw is None:
+            self._load()
+        assert self.raw is not None
+        offset = self._offset or 0
+        length = self._length or self.nbytes
+        return memoryview(self.raw)[offset : offset + length]
+
     def __dlpack__(self, *, stream: Any = None) -> Any:
         raise NotImplementedError(
             "ExternalTensor does not support DLPack because it uses memory mapping. "
@@ -718,17 +732,6 @@ def tobytes(self) -> bytes:
         length = self._length or self.nbytes
         return self.raw[offset : offset + length]
 
-    def tofile(self, file: BinaryIO, /) -> None:
-        """Write the tensor content as bytes to a file-like object."""
-        self._check_validity()
-        if self.raw is None:
-            self._load()
-        assert self.raw is not None
-        offset = self._offset or 0
-        length = self._length or self.nbytes
-        # FIXME avoid a copy
-        file.write(self.raw[offset : offset + length])
-
     def valid(self) -> bool:
         """Check if the tensor is valid.
 
diff --git a/onnxscript/ir/_protocols.py b/onnxscript/ir/_protocols.py
index 56c5a245a6..eaf7037635 100644
--- a/onnxscript/ir/_protocols.py
+++ b/onnxscript/ir/_protocols.py
@@ -33,7 +33,6 @@
 import typing
 from typing import (
     Any,
-    BinaryIO,
     Collection,
     Iterable,
     Iterator,
@@ -134,6 +133,10 @@ def __array__(self, dtype: Any = None) -> np.ndarray:
         """Return the tensor as a numpy array, compatible with np.array."""
         ...
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Return a view of the tensor data."""
+        ...
+
     def __dlpack__(self, *, stream: Any = ...) -> Any:
         """Return PyCapsule."""
         ...
@@ -146,9 +149,6 @@ def tobytes(self) -> bytes:
         """Return the tensor as a byte string conformed to the ONNX specification, in little endian."""
         ...
 
-    def tofile(self, file: BinaryIO, /) -> None:
-        """Write the tensor content as bytes to a file-like object."""
-
 
 @typing.runtime_checkable
 class ValueProtocol(Protocol):
diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 58d7bbf08c..086bc574dc 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -177,7 +177,7 @@ def _write_external_data(
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
-            tensor.tofile(data_file)
+            data_file.write(memoryview(tensor))
             if isinstance(tensor, _core.ExternalTensor):
                 tensor.release()
 

From 57db02e287c130a67382c358c8c8e5ff0b0703ed Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:45:52 -0700
Subject: [PATCH 07/22] ascontiguousarray

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 5fc9b43709..557a8a4bd3 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -125,7 +125,7 @@ def __buffer__(self, flags: int, /) -> memoryview:
             return memoryview(self.tobytes())
 
         # Otherwise get the memoryview from the numpy array
-        array = self.numpy()
+        array = np.ascontiguousarray(self.numpy())
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
             array = array.view(array.dtype.newbyteorder("<"))

From e20fd7a72233474603ab9fabd979a39ca6e1cb56 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:57:10 -0700
Subject: [PATCH 08/22] Buffer

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 557a8a4bd3..22fd9ad407 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -95,7 +95,7 @@ def _compatible_with_dlpack(obj: Any) -> TypeGuard[_protocols.DLPackCompatible]:
     return hasattr(obj, "__dlpack__")
 
 
-class TensorBase(abc.ABC, Buffer, _protocols.TensorProtocol, _display.PrettyPrintable):
+class TensorBase(Buffer, _protocols.TensorProtocol, _display.PrettyPrintable):
     """Convenience Shared methods for classes implementing TensorProtocol."""
 
     __slots__ = ()

From c398f40b11dc0a6a942db40d318c3733a96a6836 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 21:59:44 -0700
Subject: [PATCH 09/22] contiguous

---
 onnxscript/ir/tensor_adapters.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxscript/ir/tensor_adapters.py b/onnxscript/ir/tensor_adapters.py
index 0a74e0a74c..7a8ee87139 100644
--- a/onnxscript/ir/tensor_adapters.py
+++ b/onnxscript/ir/tensor_adapters.py
@@ -79,7 +79,9 @@ def __init__(
     def numpy(self) -> npt.NDArray:
         import torch
 
-        self.raw: torch.Tensor
+        # Calling .contiguous() is usually less costly than calling it on numpy arrays
+        # so we do it first for users assuming a contiguous array is needed for most usages
+        self.raw: torch.Tensor = self.raw.contiguous()
         if self.dtype == ir.DataType.BFLOAT16:
             return self.raw.view(torch.uint16).numpy(force=True).view(self.dtype.numpy())
         if self.dtype in {

From b26411b4f3e15f8fcfd2c956be17a7b5a8cf1a4d Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:00:12 -0700
Subject: [PATCH 10/22] lint

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 22fd9ad407..81710b8cf4 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -129,7 +129,7 @@ def __buffer__(self, flags: int, /) -> memoryview:
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
             array = array.view(array.dtype.newbyteorder("<"))
-        return memoryview(array)
+        return memoryview(array)  # type: ignore[arg-type]
 
     @property
     def size(self) -> int:

From 4420ea4a5057707d2dbc18ae46189f309832f968 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:00:21 -0700
Subject: [PATCH 11/22] lint

---
 onnxscript/ir/_core.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 81710b8cf4..17ad426798 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -12,7 +12,6 @@
 
 from __future__ import annotations
 
-import abc
 import contextlib
 import dataclasses
 import heapq

From 38e86506b88c1e7096b347ff335f73c24cc041df Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:03:00 -0700
Subject: [PATCH 12/22] Fix variable

---
 onnxscript/ir/tensor_adapters.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxscript/ir/tensor_adapters.py b/onnxscript/ir/tensor_adapters.py
index 7a8ee87139..3d94bb6ad8 100644
--- a/onnxscript/ir/tensor_adapters.py
+++ b/onnxscript/ir/tensor_adapters.py
@@ -81,18 +81,18 @@ def numpy(self) -> npt.NDArray:
 
         # Calling .contiguous() is usually less costly than calling it on numpy arrays
         # so we do it first for users assuming a contiguous array is needed for most usages
-        self.raw: torch.Tensor = self.raw.contiguous()
+        torch_tensor: torch.Tensor = self.raw.contiguous()
         if self.dtype == ir.DataType.BFLOAT16:
-            return self.raw.view(torch.uint16).numpy(force=True).view(self.dtype.numpy())
+            return torch_tensor.view(torch.uint16).numpy(force=True).view(self.dtype.numpy())
         if self.dtype in {
             ir.DataType.FLOAT8E4M3FN,
             ir.DataType.FLOAT8E4M3FNUZ,
             ir.DataType.FLOAT8E5M2,
             ir.DataType.FLOAT8E5M2FNUZ,
         }:
-            return self.raw.view(torch.uint8).numpy(force=True).view(self.dtype.numpy())
+            return torch_tensor.view(torch.uint8).numpy(force=True).view(self.dtype.numpy())
 
-        return self.raw.numpy(force=True)
+        return torch_tensor.numpy(force=True)
 
     def __array__(self, dtype: Any = None, copy: bool | None = None) -> npt.NDArray:
         del copy  # Unused, but needed for the signature

From 764d0bfa961ca33ce77715965893a8eeee005403 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:26:01 -0700
Subject: [PATCH 13/22] array

---
 onnxscript/ir/_core.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 17ad426798..0a1747ab70 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -124,7 +124,9 @@ def __buffer__(self, flags: int, /) -> memoryview:
             return memoryview(self.tobytes())
 
         # Otherwise get the memoryview from the numpy array
-        array = np.ascontiguousarray(self.numpy())
+        array = self.numpy()
+        if not array.data.contiguous:
+            array = np.ascontiguousarray(array)
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
             array = array.view(array.dtype.newbyteorder("<"))

From 7e27906c7c5a2de1178a7f6b61cada8d168c1466 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 22:31:07 -0700
Subject: [PATCH 14/22] c_contiguous

---
 onnxscript/ir/_core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 0a1747ab70..1cd5347f37 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -125,7 +125,7 @@ def __buffer__(self, flags: int, /) -> memoryview:
 
         # Otherwise get the memoryview from the numpy array
         array = self.numpy()
-        if not array.data.contiguous:
+        if not array.data.c_contiguous:
             array = np.ascontiguousarray(array)
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:

From 5ae862a28e3be713eee7c3e87ccbbb0b223df8a8 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:03:37 -0700
Subject: [PATCH 15/22] contiguous

---
 onnxscript/ir/_core.py           | 5 +++--
 onnxscript/ir/tensor_adapters.py | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index 1cd5347f37..abbc76cddd 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -129,8 +129,9 @@ def __buffer__(self, flags: int, /) -> memoryview:
             array = np.ascontiguousarray(array)
         assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
         if not _IS_LITTLE_ENDIAN:
-            array = array.view(array.dtype.newbyteorder("<"))
-        return memoryview(array)  # type: ignore[arg-type]
+            # Need to copy because we are returning the underlying data directly
+            array = array.view(array.dtype.newbyteorder("<")).copy()
+        return array.data
 
     @property
     def size(self) -> int:
diff --git a/onnxscript/ir/tensor_adapters.py b/onnxscript/ir/tensor_adapters.py
index 3d94bb6ad8..7c8bc4f79e 100644
--- a/onnxscript/ir/tensor_adapters.py
+++ b/onnxscript/ir/tensor_adapters.py
@@ -81,7 +81,9 @@ def numpy(self) -> npt.NDArray:
 
         # Calling .contiguous() is usually less costly than calling it on numpy arrays
         # so we do it first for users assuming a contiguous array is needed for most usages
-        torch_tensor: torch.Tensor = self.raw.contiguous()
+        torch_tensor: torch.Tensor = self.raw
+        if not torch_tensor.is_contiguous():
+            torch_tensor = torch_tensor.contiguous()
         if self.dtype == ir.DataType.BFLOAT16:
             return torch_tensor.view(torch.uint16).numpy(force=True).view(self.dtype.numpy())
         if self.dtype in {

From 76b540143f0576141115828a78adb0a3fcaed180 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:26:48 -0700
Subject: [PATCH 16/22] TensorProtoTensor

---
 onnxscript/ir/serde.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/onnxscript/ir/serde.py b/onnxscript/ir/serde.py
index 64703b2baa..0dcdda8112 100644
--- a/onnxscript/ir/serde.py
+++ b/onnxscript/ir/serde.py
@@ -373,6 +373,9 @@ def numpy(self) -> np.ndarray:
             # Note we cannot use view() here because the storage dtype may not be the same size as the target
             return array.astype(dtype.numpy()).reshape(self._proto.dims)
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        return memoryview(self.tobytes())
+
     def tobytes(self) -> bytes:
         """Return the tensor as a byte string conformed to the ONNX specification, in little endian.
 

From 717e16273d5f83b7a036099e4da42d565a9029ce Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:31:47 -0700
Subject: [PATCH 17/22] view

---
 onnxscript/ir/external_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 086bc574dc..3adb580fc4 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -177,7 +177,8 @@ def _write_external_data(
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
-            data_file.write(memoryview(tensor))
+            with memoryview(tensor) as view:
+                data_file.write(view)
             if isinstance(tensor, _core.ExternalTensor):
                 tensor.release()
 

From edbf5f5b59263f1cf3e7027ea4251672794a0997 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:48:00 -0700
Subject: [PATCH 18/22] fix write order

---
 onnxscript/ir/external_data.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 3adb580fc4..18c4d8dae8 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -172,13 +172,14 @@ def _write_external_data(
     with open(file_path, "wb") as data_file:
         for tensor, tensor_info in zip(tensors, external_data_infos, strict=True):
             current_offset = tensor_info.offset
+            print(tensor.name)
             assert tensor is not None
+            with memoryview(tensor) as view:
+                data_file.write(view)
             # Pad file to required offset if needed
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
-            with memoryview(tensor) as view:
-                data_file.write(view)
             if isinstance(tensor, _core.ExternalTensor):
                 tensor.release()
 

From c5c0aaac51ade5688349d92f13b4e73bd9c3b095 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:48:27 -0700
Subject: [PATCH 19/22] release

---
 onnxscript/ir/external_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 18c4d8dae8..cdce6763c9 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -176,12 +176,12 @@ def _write_external_data(
             assert tensor is not None
             with memoryview(tensor) as view:
                 data_file.write(view)
+            if isinstance(tensor, _core.ExternalTensor):
+                tensor.release()
             # Pad file to required offset if needed
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
-            if isinstance(tensor, _core.ExternalTensor):
-                tensor.release()
 
 
 def _create_external_tensor(

From 787ad007b9be4613c1f193010fbeda89fe3ce7b9 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:49:03 -0700
Subject: [PATCH 20/22] Revert write order: pad to offset before writing tensor data

---
 onnxscript/ir/external_data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index cdce6763c9..80e15324a0 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -174,14 +174,14 @@ def _write_external_data(
             current_offset = tensor_info.offset
             print(tensor.name)
             assert tensor is not None
-            with memoryview(tensor) as view:
-                data_file.write(view)
-            if isinstance(tensor, _core.ExternalTensor):
-                tensor.release()
             # Pad file to required offset if needed
             file_size = data_file.tell()
             if current_offset > file_size:
                 data_file.write(b"\0" * (current_offset - file_size))
+            with memoryview(tensor) as view:
+                data_file.write(view)
+            if isinstance(tensor, _core.ExternalTensor):
+                tensor.release()
 
 
 def _create_external_tensor(

From c1a2e1c20c3a90790f39c846ab18ff1716478025 Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Mon, 28 Apr 2025 23:49:17 -0700
Subject: [PATCH 21/22] Remove debug print of tensor name

---
 onnxscript/ir/external_data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxscript/ir/external_data.py b/onnxscript/ir/external_data.py
index 80e15324a0..3adb580fc4 100644
--- a/onnxscript/ir/external_data.py
+++ b/onnxscript/ir/external_data.py
@@ -172,7 +172,6 @@ def _write_external_data(
     with open(file_path, "wb") as data_file:
         for tensor, tensor_info in zip(tensors, external_data_infos, strict=True):
             current_offset = tensor_info.offset
-            print(tensor.name)
             assert tensor is not None
             # Pad file to required offset if needed
             file_size = data_file.tell()

From e69470b7a9b60745a5eae29678167212b64f8faa Mon Sep 17 00:00:00 2001
From: Justin Chu <justinchuby@users.noreply.github.com>
Date: Tue, 29 Apr 2025 08:08:22 -0700
Subject: [PATCH 22/22] update

---
 onnxscript/ir/_core.py           | 48 +++++++++++++++++++++-----------
 onnxscript/ir/serde.py           |  3 --
 onnxscript/ir/tensor_adapters.py | 19 +++++++++++++
 3 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/onnxscript/ir/_core.py b/onnxscript/ir/_core.py
index abbc76cddd..898855edef 100644
--- a/onnxscript/ir/_core.py
+++ b/onnxscript/ir/_core.py
@@ -115,23 +115,7 @@ def __buffer__(self, flags: int, /) -> memoryview:
 
         This is used to support the buffer protocol.
         """
-        if self.dtype in {
-            _enums.DataType.INT4,
-            _enums.DataType.UINT4,
-            _enums.DataType.FLOAT4E2M1,
-        }:
-            # Packing is required. So we call tobytes() directly
-            return memoryview(self.tobytes())
-
-        # Otherwise get the memoryview from the numpy array
-        array = self.numpy()
-        if not array.data.c_contiguous:
-            array = np.ascontiguousarray(array)
-        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
-        if not _IS_LITTLE_ENDIAN:
-            # Need to copy because we are returning the underlying data directly
-            array = array.view(array.dtype.newbyteorder("<")).copy()
-        return array.data
+        return self.tobytes().__buffer__(flags)
 
     @property
     def size(self) -> int:
@@ -430,6 +414,29 @@ def __dlpack_device__(self) -> tuple[int, int]:
     def __repr__(self) -> str:
         return f"{self._repr_base()}({self._raw!r}, name={self.name!r})"
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Return a memoryview of the tensor content in little-endian order.
+
+        This is used to support the buffer protocol.
+        """
+        if self.dtype in {
+            _enums.DataType.INT4,
+            _enums.DataType.UINT4,
+            _enums.DataType.FLOAT4E2M1,
+        }:
+            # Packing is required. So we call tobytes() directly
+            return self.tobytes().__buffer__(flags)
+
+        # Otherwise get the memoryview from the numpy array
+        array = self.numpy()
+        if not array.data.c_contiguous:
+            array = np.ascontiguousarray(array)
+        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
+        if not _IS_LITTLE_ENDIAN:
+            # astype() byte-swaps into a new array; view() would only relabel the dtype
+            array = array.astype(array.dtype.newbyteorder("<"))
+        return array.__buffer__(flags)
+
     @property
     def dtype(self) -> _enums.DataType:
         """The data type of the tensor. Immutable."""
@@ -988,6 +995,13 @@ def __dlpack_device__(self) -> tuple[int, int]:
     def __repr__(self) -> str:
         return f"{self._repr_base()}(func={self._func!r}, name={self.name!r})"
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Expose the evaluated tensor's data through the buffer protocol."""
+        # The export is delegated: ``flags`` is passed through unchanged to
+        # the ``__buffer__`` of the tensor returned by ``_evaluate()``.
+        evaluated = self._evaluate()
+        return evaluated.__buffer__(flags)
+
     @property
     def raw(self) -> Callable[[], _protocols.TensorProtocol]:
         return self._func
diff --git a/onnxscript/ir/serde.py b/onnxscript/ir/serde.py
index 0dcdda8112..64703b2baa 100644
--- a/onnxscript/ir/serde.py
+++ b/onnxscript/ir/serde.py
@@ -373,9 +373,6 @@ def numpy(self) -> np.ndarray:
             # Note we cannot use view() here because the storage dtype may not be the same size as the target
             return array.astype(dtype.numpy()).reshape(self._proto.dims)
 
-    def __buffer__(self, flags: int, /) -> memoryview:
-        return memoryview(self.tobytes())
-
     def tobytes(self) -> bytes:
         """Return the tensor as a byte string conformed to the ONNX specification, in little endian.
 
diff --git a/onnxscript/ir/tensor_adapters.py b/onnxscript/ir/tensor_adapters.py
index 7c8bc4f79e..eba04e7e96 100644
--- a/onnxscript/ir/tensor_adapters.py
+++ b/onnxscript/ir/tensor_adapters.py
@@ -102,6 +102,25 @@ def __array__(self, dtype: Any = None, copy: bool | None = None) -> npt.NDArray:
             return self.numpy()
         return self.numpy().__array__(dtype)
 
+    def __buffer__(self, flags: int, /) -> memoryview:
+        """Return a memoryview of the tensor content.
+
+        Implements the buffer protocol; sub-byte dtypes are exported packed.
+        """
+        if self.dtype in {
+            ir.DataType.INT4,
+            ir.DataType.UINT4,
+            ir.DataType.FLOAT4E2M1,
+        }:
+            # Packing is required. So we call tobytes() directly
+            return self.tobytes().__buffer__(flags)
+
+        # Otherwise export the numpy array, copying only if it is strided
+        array = self.numpy()
+        array = array if array.data.c_contiguous else array.copy(order="C")
+        assert self.dtype.itemsize == array.itemsize, "Bug: The itemsize should match"
+        return array.__buffer__(flags)
+
     def tobytes(self) -> bytes:
         # Implement tobytes to support native PyTorch types so we can use types like bloat16
         # Reading from memory directly is also more efficient because