Skip to content

MNT: Bump dev pin on NumPy #60987

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 16 commits into from
Apr 3, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions asv_bench/benchmarks/indexing_engines.py
Original file line number Diff line number Diff line change
@@ -67,6 +67,14 @@ class NumericEngineIndexing:
def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype

if (
index_type == "non_monotonic"
and dtype in [np.int16, np.int8, np.uint8]
and unique
):
# Values overflow
raise NotImplementedError

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
@@ -115,6 +123,14 @@ def setup(self, engine_and_dtype, index_type, unique, N):
engine, dtype = engine_and_dtype
dtype = dtype.lower()

if (
index_type == "non_monotonic"
and dtype in ["int16", "int8", "uint8"]
and unique
):
# Values overflow
raise NotImplementedError

if index_type == "monotonic_incr":
if unique:
arr = np.arange(N * 3, dtype=dtype)
4 changes: 2 additions & 2 deletions doc/source/getting_started/comparison/comparison_with_r.rst
Original file line number Diff line number Diff line change
@@ -383,7 +383,7 @@ In Python, since ``a`` is a list, you can simply use list comprehension.

.. ipython:: python

a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4)
a = np.array(list(range(1, 24)) + [np.nan]).reshape(2, 3, 4)
pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)])

meltlist
@@ -402,7 +402,7 @@ In Python, this list would be a list of tuples, so

.. ipython:: python

a = list(enumerate(list(range(1, 5)) + [np.NAN]))
a = list(enumerate(list(range(1, 5)) + [np.nan]))
pd.DataFrame(a)

For more details and examples see :ref:`the Intro to Data Structures
4 changes: 2 additions & 2 deletions doc/source/user_guide/basics.rst
Original file line number Diff line number Diff line change
@@ -2064,12 +2064,12 @@ different numeric dtypes will **NOT** be combined. The following example will gi

.. ipython:: python

df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float32")
df1 = pd.DataFrame(np.random.randn(8, 1), columns=["A"], dtype="float64")
df1
df1.dtypes
df2 = pd.DataFrame(
{
"A": pd.Series(np.random.randn(8), dtype="float16"),
"A": pd.Series(np.random.randn(8), dtype="float32"),
"B": pd.Series(np.random.randn(8)),
"C": pd.Series(np.random.randint(0, 255, size=8), dtype="uint8"), # [0,255] (range of uint8)
}
2 changes: 2 additions & 0 deletions doc/source/user_guide/enhancingperf.rst
Original file line number Diff line number Diff line change
@@ -171,6 +171,7 @@ can be improved by passing an ``np.ndarray``.
In [4]: %%cython
...: cimport numpy as np
...: import numpy as np
...: np.import_array()
...: cdef double f_typed(double x) except? -2:
...: return x * (x - 1)
...: cpdef double integrate_f_typed(double a, double b, int N):
@@ -225,6 +226,7 @@ and ``wraparound`` checks can yield more performance.
...: cimport cython
...: cimport numpy as np
...: import numpy as np
...: np.import_array()
...: cdef np.float64_t f_typed(np.float64_t x) except? -2:
...: return x * (x - 1)
...: cpdef np.float64_t integrate_f_typed(np.float64_t a, np.float64_t b, np.int64_t N):
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.11.0.rst
Original file line number Diff line number Diff line change
@@ -74,10 +74,10 @@ Numeric dtypes will propagate and can coexist in DataFrames. If a dtype is passe

.. ipython:: python

df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float32')
df1 = pd.DataFrame(np.random.randn(8, 1), columns=['A'], dtype='float64')
df1
df1.dtypes
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float16'),
df2 = pd.DataFrame({'A': pd.Series(np.random.randn(8), dtype='float32'),
'B': pd.Series(np.random.randn(8)),
'C': pd.Series(range(8), dtype='uint8')})
df2
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
@@ -23,7 +23,7 @@ dependencies:

# required dependencies
- python-dateutil
- numpy<2
- numpy<3

# optional dependencies
- beautifulsoup4>=4.11.2
4 changes: 2 additions & 2 deletions pandas/compat/numpy/__init__.py
Original file line number Diff line number Diff line change
@@ -36,8 +36,8 @@
r".*In the future `np\.long` will be defined as.*",
FutureWarning,
)
np_long = np.long # type: ignore[attr-defined]
np_ulong = np.ulong # type: ignore[attr-defined]
np_long = np.long
np_ulong = np.ulong
except AttributeError:
np_long = np.int_
np_ulong = np.uint
2 changes: 1 addition & 1 deletion pandas/core/accessor.py
Original file line number Diff line number Diff line change
@@ -351,7 +351,7 @@ def register_dataframe_accessor(name: str) -> Callable[[TypeT], TypeT]:
AttributeError: The series must contain integer data only.
>>> df = pd.Series([1, 2, 3])
>>> df.int_accessor.sum()
6"""
np.int64(6)"""


@doc(_register_accessor, klass="Series", examples=_register_series_examples)
18 changes: 9 additions & 9 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
@@ -941,7 +941,7 @@ def argmin(self, skipna: bool = True) -> int:
--------
>>> arr = pd.array([3, 1, 2, 5, 4])
>>> arr.argmin()
1
np.int64(1)
"""
# Implementer note: You have two places to override the behavior of
# argmin.
@@ -975,7 +975,7 @@ def argmax(self, skipna: bool = True) -> int:
--------
>>> arr = pd.array([3, 1, 2, 5, 4])
>>> arr.argmax()
3
np.int64(3)
"""
# Implementer note: You have two places to override the behavior of
# argmax.
@@ -1959,10 +1959,10 @@ def _formatter(self, boxed: bool = False) -> Callable[[Any], str | None]:
--------
>>> class MyExtensionArray(pd.arrays.NumpyExtensionArray):
... def _formatter(self, boxed=False):
... return lambda x: "*" + str(x) + "*" if boxed else repr(x) + "*"
... return lambda x: "*" + str(x) + "*"
>>> MyExtensionArray(np.array([1, 2, 3, 4]))
<MyExtensionArray>
[1*, 2*, 3*, 4*]
[*1*, *2*, *3*, *4*]
Length: 4, dtype: int64
"""
if boxed:
@@ -2176,15 +2176,15 @@ def _reduce(
Examples
--------
>>> pd.array([1, 2, 3])._reduce("min")
1
np.int64(1)
>>> pd.array([1, 2, 3])._reduce("max")
3
np.int64(3)
>>> pd.array([1, 2, 3])._reduce("sum")
6
np.int64(6)
>>> pd.array([1, 2, 3])._reduce("mean")
2.0
np.float64(2.0)
>>> pd.array([1, 2, 3])._reduce("median")
2.0
np.float64(2.0)
"""
meth = getattr(self, name, None)
if meth is None:
2 changes: 1 addition & 1 deletion pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
@@ -275,7 +275,7 @@ def _unbox_scalar(
--------
>>> arr = pd.array(np.array(["1970-01-01"], "datetime64[ns]"))
>>> arr._unbox_scalar(arr[0])
numpy.datetime64('1970-01-01T00:00:00.000000000')
np.datetime64('1970-01-01T00:00:00.000000000')
"""
raise AbstractMethodError(self)

3 changes: 2 additions & 1 deletion pandas/core/arrays/interval.py
Original file line number Diff line number Diff line change
@@ -1775,7 +1775,8 @@ def to_tuples(self, na_tuple: bool = True) -> np.ndarray:
[(0, 1], (1, 2]]
Length: 2, dtype: interval[int64, right]
>>> idx.to_tuples()
array([(0, 1), (1, 2)], dtype=object)
array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))],
dtype=object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In https://github.com/pandas-dev/pandas/pull/60987/files#r2016346826 I commented because the dtype of the array was the same as the dtype of the scalars.

Thinking some more, we currently have...

pd.Series([123,"123"])
# 0    123
# 1    123
# dtype: object

So, for consistency, even object arrays should probably not show the NEP 51 repr?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that's an option here, this is what happens when you call str on a tuple of NumPy scalars.

print(str((np.int64(0), np.int64(1))))
# (np.int64(0), np.int64(1))

arr = pd.array([(np.int64(0), np.int64(1)), (np.int64(1), np.int64(2))], dtype=object)
print(type(arr[0]), str(arr[0]))
# <class 'tuple'> (np.int64(0), np.int64(1))

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems reasonable; however, extending the example I gave for the strings in an object array...

s = pd.Series([123, "123", ("123", "123")])
print(s)
print(s[2])
# 0           123
# 1           123
# 2    (123, 123)
# dtype: object
# ('123', '123')

However, we don't currently display the repr for strings in a collection when the array is displayed either?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, the NumPy array of values does show the repr for the strings in a tuple.

s.values
# array([123, '123', ('123', '123')], dtype=object)


For :class:`pandas.IntervalIndex`:

32 changes: 16 additions & 16 deletions pandas/core/arrays/masked.py
Original file line number Diff line number Diff line change
@@ -1378,25 +1378,25 @@ def any(
skips NAs):

>>> pd.array([True, False, True]).any()
True
np.True_
>>> pd.array([True, False, pd.NA]).any()
True
np.True_
>>> pd.array([False, False, pd.NA]).any()
False
np.False_
>>> pd.array([], dtype="boolean").any()
False
np.False_
>>> pd.array([pd.NA], dtype="boolean").any()
False
np.False_
>>> pd.array([pd.NA], dtype="Float64").any()
False
np.False_

With ``skipna=False``, the result can be NA if this is logically
required (whether ``pd.NA`` is True or False influences the result):

>>> pd.array([True, False, pd.NA]).any(skipna=False)
True
np.True_
>>> pd.array([1, 0, pd.NA]).any(skipna=False)
True
np.True_
>>> pd.array([False, False, pd.NA]).any(skipna=False)
<NA>
>>> pd.array([0, 0, pd.NA]).any(skipna=False)
@@ -1466,17 +1466,17 @@ def all(
skips NAs):

>>> pd.array([True, True, pd.NA]).all()
True
np.True_
>>> pd.array([1, 1, pd.NA]).all()
True
np.True_
>>> pd.array([True, False, pd.NA]).all()
False
np.False_
>>> pd.array([], dtype="boolean").all()
True
np.True_
>>> pd.array([pd.NA], dtype="boolean").all()
True
np.True_
>>> pd.array([pd.NA], dtype="Float64").all()
True
np.True_

With ``skipna=False``, the result can be NA if this is logically
required (whether ``pd.NA`` is True or False influences the result):
@@ -1486,9 +1486,9 @@ def all(
>>> pd.array([1, 1, pd.NA]).all(skipna=False)
<NA>
>>> pd.array([True, False, pd.NA]).all(skipna=False)
False
np.False_
>>> pd.array([1, 0, pd.NA]).all(skipna=False)
False
np.False_
"""
nv.validate_all((), kwargs)

4 changes: 2 additions & 2 deletions pandas/core/arrays/sparse/accessor.py
Original file line number Diff line number Diff line change
@@ -297,7 +297,7 @@ class SparseFrameAccessor(BaseAccessor, PandasDelegate):
--------
>>> df = pd.DataFrame({"a": [1, 2, 0, 0], "b": [3, 0, 0, 4]}, dtype="Sparse[int]")
>>> df.sparse.density
0.5
np.float64(0.5)
"""

def _validate(self, data) -> None:
@@ -459,7 +459,7 @@ def density(self) -> float:
--------
>>> df = pd.DataFrame({"A": pd.arrays.SparseArray([0, 1, 0, 1])})
>>> df.sparse.density
0.5
np.float64(0.5)
"""
tmp = np.mean([column.array.density for _, column in self._parent.items()])
return tmp
10 changes: 5 additions & 5 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
@@ -804,9 +804,9 @@ def argmax(
dtype: float64

>>> s.argmax()
2
np.int64(2)
>>> s.argmin()
0
Comment on lines 806 to -809
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are a few of these where I'm wondering if we should be returning Python scalars instead of NumPy. Should issues be opened for these?

cc @pandas-dev/pandas-core

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think generally we always want to return Python scalars (IIRC we got a lot of issues about this in iteration and iteration-like APIs in the past)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Even just wrapping the result of Series._reduce with maybe_box_native breaks 692 tests. From a cursory look, they're tests that are expecting a NumPy scalar back. Many of them, however, are of the form op(data).any().any(), written that way so that they work with both DataFrame and Series. I plan to bring this up in the next dev meeting.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree we should always return Python scalars. I'm surprised at the amount of failures that expect NumPy scalars

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd think you need a deprecation on this, because people may have code that depends on the result being a numpy scalar. I think that the tests we have in pandas-stubs for typing may depend on this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could put it behind a future option, maybe something like always_return_python_scalars.

np.int64(0)

The maximum cereal calories is the third element and
the minimum cereal calories is the first element,
@@ -1360,7 +1360,7 @@ def factorize(
dtype: int64

>>> ser.searchsorted(4)
3
np.int64(3)

>>> ser.searchsorted([0, 4])
array([0, 3])
@@ -1379,7 +1379,7 @@ def factorize(
dtype: datetime64[s]

>>> ser.searchsorted('3/14/2000')
3
np.int64(3)

>>> ser = pd.Categorical(
... ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
@@ -1389,7 +1389,7 @@ def factorize(
Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

>>> ser.searchsorted('bread')
1
np.int64(1)

>>> ser.searchsorted(['bread'], side='right')
array([3])
3 changes: 2 additions & 1 deletion pandas/core/common.py
Original file line number Diff line number Diff line change
@@ -246,7 +246,8 @@ def asarray_tuplesafe(values: Iterable, dtype: NpDtype | None = None) -> ArrayLi
with warnings.catch_warnings():
# Can remove warning filter once NumPy 1.24 is min version
if not np_version_gte1p24:
warnings.simplefilter("ignore", np.VisibleDeprecationWarning)
# np.VisibleDeprecationWarning only in np.exceptions in 2.0
warnings.simplefilter("ignore", np.VisibleDeprecationWarning) # type: ignore[attr-defined]
result = np.asarray(values, dtype=dtype)
except ValueError:
# Using try/except since it's more performant than checking is_list_like
6 changes: 3 additions & 3 deletions pandas/core/dtypes/missing.py
Original file line number Diff line number Diff line change
@@ -428,9 +428,9 @@ def array_equivalent(
Examples
--------
>>> array_equivalent(np.array([1, 2, np.nan]), np.array([1, 2, np.nan]))
True
np.True_
>>> array_equivalent(np.array([1, np.nan, 2]), np.array([1, 2, np.nan]))
False
np.False_
"""
left, right = np.asarray(left), np.asarray(right)

@@ -626,7 +626,7 @@ def na_value_for_dtype(dtype: DtypeObj, compat: bool = True):
>>> na_value_for_dtype(np.dtype("bool"))
False
>>> na_value_for_dtype(np.dtype("datetime64[ns]"))
numpy.datetime64('NaT')
np.datetime64('NaT')
"""

if isinstance(dtype, ExtensionDtype):
8 changes: 4 additions & 4 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
@@ -887,7 +887,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
dtype: int64

>>> even_primes.squeeze()
2
np.int64(2)

Squeezing objects with more than one value in every axis does nothing:

@@ -945,7 +945,7 @@ def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame:
Squeezing all axes will project directly into a scalar:

>>> df_0a.squeeze()
1
np.int64(1)
"""
axes = range(self._AXIS_LEN) if axis is None else (self._get_axis_number(axis),)
result = self.iloc[
@@ -7954,7 +7954,7 @@ def asof(self, where, subset=None):
dtype: float64

>>> s.asof(20)
2.0
np.float64(2.0)

For a sequence `where`, a Series is returned. The first value is
NaN, because the first element of `where` is before the first
@@ -7969,7 +7969,7 @@ def asof(self, where, subset=None):
NaN, even though NaN is at the index location for ``30``.

>>> s.asof(30)
2.0
np.float64(2.0)

Take all columns into consideration

Loading
Oops, something went wrong.