diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index d8b97bb92c3b0b..c87143a959cf2f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -367,6 +367,7 @@ I/O - Improved the explanation for the failure when value labels are repeated in Stata dta files and suggested work-arounds (:issue:`25772`) - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`) - Fixed bug in loading objects from S3 that contain ``#`` characters in the URL (:issue:`25945`) +- Added ``use_bqstorage_api`` parameter to :func:`read_gbq` to speed up downloads of large data frames. This feature requires version 0.10.0 or later of the ``pandas-gbq`` library as well as the ``google-cloud-bigquery-storage`` and ``fastavro`` libraries. (:issue:`26104`) Plotting ^^^^^^^^ diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index a6cec7ea8fb163..871bc4a8221c20 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -1,7 +1,5 @@ """ Google BigQuery support """ -import warnings - def _try_import(): # since pandas is a dependency of pandas-gbq @@ -26,7 +24,7 @@ def _try_import(): def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, auth_local_webserver=False, dialect=None, location=None, configuration=None, credentials=None, - private_key=None, verbose=None): + use_bqstorage_api=None, private_key=None, verbose=None): """ Load data from Google BigQuery. @@ -103,6 +101,21 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, *New in version 0.8.0 of pandas-gbq*. .. versionadded:: 0.24.0 + use_bqstorage_api : bool, default False + Use the `BigQuery Storage API + <https://cloud.google.com/bigquery/docs/reference/storage/>`__ to + download query results quickly, but at an increased cost. To use this + API, first `enable it in the Cloud Console + <https://console.cloud.google.com/apis/library/bigquerystorage.googleapis.com>`__. + You must also have the `bigquery.readsessions.create + <https://cloud.google.com/bigquery/docs/access-control#roles>`__ + permission on the project you are billing queries to. 
+ + This feature requires version 0.10.0 or later of the ``pandas-gbq`` + package. It also requires the ``google-cloud-bigquery-storage`` and + ``fastavro`` packages. + + .. versionadded:: 0.25.0 private_key : str, deprecated Deprecated in pandas-gbq version 0.8.0. Use the ``credentials`` parameter and @@ -131,22 +144,27 @@ def read_gbq(query, project_id=None, index_col=None, col_order=None, """ pandas_gbq = _try_import() - if dialect is None: - dialect = "legacy" - warnings.warn( - 'The default value for dialect is changing to "standard" in a ' - 'future version of pandas-gbq. Pass in dialect="legacy" to ' - "disable this warning.", - FutureWarning, - stacklevel=2, - ) + kwargs = {} + + # START: new kwargs. Don't populate unless explicitly set. + if use_bqstorage_api is not None: + kwargs["use_bqstorage_api"] = use_bqstorage_api + # END: new kwargs + + # START: deprecated kwargs. Don't populate unless explicitly set. + if verbose is not None: + kwargs["verbose"] = verbose + + if private_key is not None: + kwargs["private_key"] = private_key + # END: deprecated kwargs return pandas_gbq.read_gbq( query, project_id=project_id, index_col=index_col, col_order=col_order, reauth=reauth, auth_local_webserver=auth_local_webserver, dialect=dialect, location=location, configuration=configuration, - credentials=credentials, verbose=verbose, private_key=private_key) + credentials=credentials, **kwargs) def to_gbq(dataframe, destination_table, project_id=None, chunksize=None, diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py index 87ffc94f7d0469..21e0a63bf4ce79 100644 --- a/pandas/tests/io/test_gbq.py +++ b/pandas/tests/io/test_gbq.py @@ -8,7 +8,6 @@ import pandas as pd from pandas import DataFrame -import pandas.util.testing as tm api_exceptions = pytest.importorskip("google.api_core.exceptions") bigquery = pytest.importorskip("google.cloud.bigquery") @@ -90,16 +89,59 @@ def make_mixed_dataframe_v2(test_size): index=range(test_size)) -def 
test_read_gbq_without_dialect_warns_future_change(monkeypatch): - # Default dialect is changing to standard SQL. See: - # https://github.com/pydata/pandas-gbq/issues/195 +def test_read_gbq_with_deprecated_kwargs(monkeypatch): + captured_kwargs = {} - def mock_read_gbq(*args, **kwargs): + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) return DataFrame([[1.0]]) - monkeypatch.setattr(pandas_gbq, 'read_gbq', mock_read_gbq) - with tm.assert_produces_warning(FutureWarning): - pd.read_gbq("SELECT 1") + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + private_key = object() + pd.read_gbq("SELECT 1", verbose=True, private_key=private_key) + + assert captured_kwargs["verbose"] + assert captured_kwargs["private_key"] is private_key + + +def test_read_gbq_without_deprecated_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1") + + assert "verbose" not in captured_kwargs + assert "private_key" not in captured_kwargs + + +def test_read_gbq_with_new_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1", use_bqstorage_api=True) + + assert captured_kwargs["use_bqstorage_api"] + + +def test_read_gbq_without_new_kwargs(monkeypatch): + captured_kwargs = {} + + def mock_read_gbq(sql, **kwargs): + captured_kwargs.update(kwargs) + return DataFrame([[1.0]]) + + monkeypatch.setattr("pandas_gbq.read_gbq", mock_read_gbq) + pd.read_gbq("SELECT 1") + + assert "use_bqstorage_api" not in captured_kwargs @pytest.mark.single