vega · mattijn · Feb 18, 2023 · Feb 15, 2023 · Feb 15, 2023 · Feb 15, 2023
diff --git a/altair/utils/data.py b/altair/utils/data.py
@@ -76,6 +76,8 @@ def limit_rows(data, max_rows=5000):
             values = data["values"]
         else:
             return data
+    elif hasattr(data, "__dataframe__"):
+        values = data
     if max_rows is not None and len(values) > max_rows:
         raise MaxRowsError(
             "The number of rows in your dataset is greater "
@@ -98,6 +100,13 @@ def sample(data, n=None, frac=None):
             n = n if n else int(frac * len(values))
             values = random.sample(values, n)
             return {"values": values}
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        n = n if n else int(frac * len(pa_table))
+        indices = random.sample(range(len(pa_table)), n)
+        return pa_table.take(indices)
 
 
 @curried.curry
@@ -152,12 +161,17 @@ def to_values(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present.")
         return data
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        return {"values": pa_table.to_pylist()}
 
 
 def check_data_type(data):
     """Raise if the data is not a dict or DataFrame."""
-    if not isinstance(data, (dict, pd.DataFrame)) and not hasattr(
-        data, "__geo_interface__"
+    if not isinstance(data, (dict, pd.DataFrame)) and not any(
+        hasattr(data, attr) for attr in ["__geo_interface__", "__dataframe__"]
     ):
         raise TypeError(
             "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format(
@@ -190,6 +204,11 @@ def _data_to_json_string(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present.")
         return json.dumps(data["values"], sort_keys=True)
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        pa_table = pi.from_dataframe(data)
+        return json.dumps(pa_table.to_pylist())
     else:
         raise NotImplementedError(
             "to_json only works with data expressed as " "a DataFrame or as a dict"
@@ -211,6 +230,16 @@ def _data_to_csv_string(data):
         if "values" not in data:
             raise KeyError("values expected in data dict, but not present")
         return pd.DataFrame.from_dict(data["values"]).to_csv(index=False)
+    elif hasattr(data, "__dataframe__"):
+        # experimental interchange dataframe support
+        pi = import_pyarrow_interchange()
+        import pyarrow as pa
+        import pyarrow.csv as pa_csv
+
+        pa_table = pi.from_dataframe(data)
+        csv_buffer = pa.BufferOutputStream()
+        pa_csv.write_csv(pa_table, csv_buffer)
+        return csv_buffer.getvalue().to_pybytes().decode()
     else:
         raise NotImplementedError(
             "to_csv only works with data expressed as " "a DataFrame or as a dict"
@@ -242,3 +271,25 @@ def curry(*args, **kwargs):
         AltairDeprecationWarning,
     )
     return curried.curry(*args, **kwargs)
+
+
+def import_pyarrow_interchange():
+    import pkg_resources
+
+    try:
+        pkg_resources.require("pyarrow>=11.0.0")
+        # The package is installed and meets the minimum version requirement
+        import pyarrow.interchange as pi
+
+        return pi
+    except pkg_resources.DistributionNotFound:
+        # The package is not installed
+        raise ImportError(
+            "Usage of the DataFrame Interchange Protocol requires the package 'pyarrow', but it is not installed."
+        )
+    except pkg_resources.VersionConflict:
+        # The package is installed but does not meet the minimum version requirement
+        raise ImportError(
+            "The installed version of 'pyarrow' does not meet the minimum requirement of version 11.0.0. "
+            "Please update 'pyarrow' to use the DataFrame Interchange Protocol."
+        )
diff --git a/altair/vegalite/v5/api.py b/altair/vegalite/v5/api.py
@@ -97,13 +97,16 @@ def _prepare_data(data, context=None):
         return data
 
     # convert dataframes  or objects with __geo_interface__ to dict
-    if isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
+    elif isinstance(data, pd.DataFrame) or hasattr(data, "__geo_interface__"):
         data = _pipe(data, data_transformers.get())
 
     # convert string input to a URLData
-    if isinstance(data, str):
+    elif isinstance(data, str):
         data = core.UrlData(data)
 
+    elif hasattr(data, "__dataframe__"):
+        data = _pipe(data, data_transformers.get())
+
     # consolidate inline data to top-level datasets
     if context is not None and data_transformers.consolidate_datasets:
         data = _consolidate_data(data, context)

diff --git a/doc/releases/changes.rst b/doc/releases/changes.rst
@@ -22,6 +22,7 @@ Enhancements
 - The documentation page has been revamped, both in terms of appearance and content.
 - More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846)
 - Improved error messages (#2842)
+- Include experimental support for the DataFrame Interchange Protocol (through the ``__dataframe__`` attribute). This requires ``pyarrow>=11.0.0`` (#2888).
 
 Grammar Changes
 ~~~~~~~~~~~~~~~

diff --git a/doc/user_guide/data.rst b/doc/user_guide/data.rst
@@ -21,6 +21,7 @@ there are many different ways of specifying a dataset:
 - as a url string pointing to a ``json`` or ``csv`` formatted text file
 - as a `geopandas GeoDataFrame <http://geopandas.org/data_structures.html#geodataframe>`_, `Shapely Geometries <https://shapely.readthedocs.io/en/latest/manual.html#geometric-objects>`_, `GeoJSON Objects <https://github.com/jazzband/geojson#geojson-objects>`_ or other objects that support the ``__geo_interface__``
 - as a generated dataset such as numerical sequences or geographic reference elements
+- as a DataFrame that supports the DataFrame Interchange Protocol (contains a ``__dataframe__`` attribute). This is experimental and requires ``pyarrow>=11.0.0``.
 
 When data is specified as a DataFrame, the encoding is quite simple, as Altair
 uses the data type information provided by pandas to automatically determine