Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TRACKER: milestones #44

Open
24 of 32 tasks
DiegoAlbertoTorres opened this issue May 28, 2020 · 11 comments
Open
24 of 32 tasks

TRACKER: milestones #44

DiegoAlbertoTorres opened this issue May 28, 2020 · 11 comments

Comments

@DiegoAlbertoTorres
Copy link
Collaborator

DiegoAlbertoTorres commented May 28, 2020

In order in which we should tackle them:

@jreback
Copy link
Collaborator

jreback commented Jun 3, 2020

@jreback
Copy link
Collaborator

jreback commented Aug 13, 2020

@mroeschke added 14. above, but i think should be straightforward, if you can work on this when you have a chance.

@RhysU
Copy link

RhysU commented Aug 19, 2020

@mroeschke, below please find some sample code that can be used to exercise pandas-dev#35690. Apologies for the inline-- couldn't get an attachment up on this tracker even applying some obvious trickery. Stick this into, say, foo.py and run with python -m unittest foo.py -v:

## Implementation file
import datetime
import numbers
import typing

import pandas as pd


T = typing.TypeVar("T")
U = typing.TypeVar("U")


def half_open(df: T, start: typing.Optional[U], stop: typing.Optional[U]) -> T:
    """
    Return the portion of df within the half-open interval [start, stop).

    See half_open_slice(...) for how DatetimeIndex-related slicing is handled.
    """
    # Multiple lines to make stack traces more informative
    slyce = half_open_slice(df=df, s=slice(start, stop, None))
    subset = df[slyce]
    return subset


# The following is an altogether unpleasant function hiding many gotchas
# See https://github.com/pandas-dev/pandas/issues/35690 for some dragons
def half_open_slice(df: typing.Union[pd.DataFrame, pd.Series], s: slice) -> slice:
    """
    Return slice(start, stop, step) matching [s.start, s.stop) within df.

    When the DataFrame has a DatetimeIndex-like index with non-None tzinfo,
    this routine infers the UTC timezone for any slice start or stop
    value that lacks tzinfo entirely or that has tzinfo is None.
    This UTC-inference allows pre-1.0 Pandas logic to work unchanged post-1.0.
    """
    # Check incoming df then grab the index for brevity
    assert isinstance(df, pd.DataFrame) or isinstance(df, pd.Series), type(df)
    index = df.index

    # Unpack incoming s for brevity and retain original s in local variables
    assert isinstance(s, slice), type(s)
    start, stop, step = s.start, s.stop, s.step

    # Additional start/stop coercion for only DatetimeIndex-like indices
    # Use Timestamps due to https://github.com/pandas-dev/pandas/issues/35690
    # TODO After #35690 fixed, converting start/stop to Timestamp may not be needed
    attr_tz = "tzinfo"
    if hasattr(index, attr_tz):
        # Below, calls to pd.Timestamp(...) surprisingly interpret integers.
        # Refuse ambiguous requests to reduce likelihood for surprising bugs.
        if isinstance(start, numbers.Number) or isinstance(stop, numbers.Number):
            raise TypeError(
                f"{type(start)}, {type(stop)} prohibited: {start}, {stop}."
                f' Strings like "20200817" can succinctly express dates.'
                f" Supply pandas.Timestamps to express fine-grained times like"
                f" Timestamp('1970-01-01 00:00:00.020200817+0000', tz='UTC')."
            )
        index_has_tz = index.tzinfo is not None
        infer_start = index_has_tz and getattr(start, attr_tz, None) is None
        infer_stop = index_has_tz and getattr(stop, attr_tz, None) is None
        if start is not None:
            start = pd.Timestamp(
                start, tz=datetime.timezone.utc if infer_start else None
            )
        if stop is not None:
            stop = pd.Timestamp(stop, tz=datetime.timezone.utc if infer_stop else None)

    # Convert start to Optional[int]-valued index location start_idx
    # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
    # When below index.min() return 0 as if index.get_slice_bound(...) ran
    # TODO After #35690 fixed, may not need start < index.min() clause.
    # TODO After #35690 fixed, may not need start > index.max() clause.
    if start is None:
        start_idx = None
    elif start < index.min():
        start_idx = 0
    elif start > index.max():  # TODO Branch not unit tested below
        start_idx = len(df)
    else:
        start_idx = index.get_slice_bound(start, side="left", kind="ix")

    # Convert stop to Optional[int]-valued index location stop_idx
    # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
    # When above index.max() return len as if index.get_slice_bound(...) ran
    # TODO After #35690 fixed, may not need stop < index.min() clause.
    # TODO After #35690 fixed, may not need stop > index.max() clause.
    if stop is None:
        stop_idx = None
    elif stop < index.min():  # TODO Branch not unit tested below
        stop_idx = 0
    elif stop > index.max():
        stop_idx = len(df)
    else:
        stop_idx = index.get_slice_bound(stop, side="left", kind="ix")

    # Return the requested half-open slice
    return slice(start_idx, stop_idx, step)


## Test file
import datetime
import unittest
import typing

import numpy as np
import numpy.testing as npt
import pandas as pd
import pandas.util.testing as put  # Later, pandas.testing!
import pytz


# MAINTAINERS:
#
#   First, understand the contract implied by TZ_PAIRS_ACCEPTED/REJECTED.
#   Second, make the routine test_series_with_datetimeindex_accepted pass.
#   If that is not working, say after a Pandas upgrade, nothing else will.
#   After that, proceed down the file in the order tests are written.
#
# Beefy coverage due to https://github.com/pandas-dev/pandas/issues/35690
class HalfOpen(unittest.TestCase):
    """Test half_open(...) and half_open_slice(...)."""

    # Which pairs of timezones should run to completion without raising?
    # "Europe/Amsterdam" is a stand-in for something very different from UTC.
    # "Etc/Greenwich" is chosen as tzinfos UTC != Greenwich but times align.
    # In particular, times aligning eases reusing the same tests/assertions.
    TZ_PAIRS_ACCEPTED = (
        # No timezone information in either the Pandas Index or slicing
        (None, None),
        # UTC information in both the Index and slicing
        ("UTC", "utc"),
        # Something other than UTC in both spots
        ("Europe/Amsterdam", "Europe/Amsterdam"),
        # UTC in the Pandas Index and None slicing implies UTC slicing
        ("utc", None),
        # Two different timezones with UTC on the DatetimeIndex
        ("UTC", "Etc/Greenwich"),
        # Two different timezones with Greenwich on the DatetimeIndex
        ("Etc/Greenwich", "utc"),
        # Greenwich in the Pandas Index and None slicing implies UTC slicing
        ("Etc/Greenwich", None),
    )

    # Which pairs of timezones should be confirmed as raising errors?
    TZ_PAIRS_REJECTED = (
        # No timezone on the Pandas index but UTC slicing
        (None, "utc"),
        # No timezone on the Pandas index but Amsterdam slicing
        (None, "Europe/Amsterdam"),
        # No timezone on the Pandas index but Greenwich slicing
        (None, "Etc/Greenwich"),
    )

    def series_with_datetimeindex(self, index_tz: str) -> pd.DataFrame:
        """Construct Series with multiple entries per index Timestamp."""
        # Construct sample with multiple entries per index Timestamp
        # Then defensively confirm construction is as expected.
        s = pd.concat(
            [
                put.makeTimeSeries(nper=10, freq="B"),
                put.makeTimeSeries(nper=10, freq="B"),
            ]
        ).sort_index()
        s = self.localize(s, index_tz)
        self.assertEqual(s.index[+0], pd.Timestamp("2000-01-03", tz=index_tz))
        self.assertEqual(s.index[-1], pd.Timestamp("2000-01-14", tz=index_tz))
        return s

    def localize(self, df, tz):
        """Ensure DataFrame or Series is appropriately tz-localized."""
        self.assertIsNone(df.index.tzinfo)
        if tz is not None:
            self.assertIsInstance(tz, str)
            df = df.tz_localize(tz)
            self.assertEqual(df.index.tzinfo, pytz.timezone(tz))
        return df

    def test_series_with_datetimeindex_accepted(self):
        """Confirm [start, stop) successfully slices a Series."""
        for index_tz, tz in self.TZ_PAIRS_ACCEPTED:
            with self.subTest(index_tz=index_tz, tz=tz):
                # Prepare the sample Series
                s = self.series_with_datetimeindex(index_tz=index_tz)

                # Degenerate slices retrieve all information
                a1 = half_open(s, start=None, stop=None)
                a2 = half_open(
                    s, start=pd.Timestamp("1970-01-01", tz=tz), stop=None
                )
                a3 = half_open(
                    s, start=None, stop=pd.Timestamp("2038-01-01", tz=tz)
                )
                a4 = half_open(
                    s,
                    start=pd.Timestamp("1970-01-01", tz=tz),
                    stop=pd.Timestamp("2038-01-01", tz=tz),
                )
                put.assert_series_equal(a1, s)
                put.assert_series_equal(a2, s)
                put.assert_series_equal(a3, s)
                put.assert_series_equal(a4, s)

                # Degenerate slices retrieve no information
                # TODO Test start > s.index.max()
                # TODO Test stop < s.index.min()

                # Half-open with left boundary both finite and internal
                b = half_open(
                    s, start=pd.Timestamp("2000-01-04", tz=tz), stop=None
                )
                put.assert_series_equal(b, s.iloc[+2:])

                # Half-open with right boundary both finite and internal
                c = half_open(
                    s, start=None, stop=pd.Timestamp("2000-01-14", tz=tz)
                )
                put.assert_series_equal(c, s.iloc[:-2])

                # Half-open with both boundaries finite and internal
                d = half_open(
                    s,
                    start=pd.Timestamp("2000-01-04", tz=tz),
                    stop=pd.Timestamp("2000-01-14", tz=tz),
                )
                put.assert_series_equal(d, s.iloc[+2:-2])

    def test_series_with_datetimeindex_no_number_footcannons(self):
        """Confirm [start, stop) is rejected whenever an integer is given."""
        # Both Pandas and half_open_slice(...) can emit TypeErrors
        # A regex ensures the ones below are raised by half_open_slice(...).
        regex = " prohibited"

        # Generate sample data
        index_tz, _ = self.TZ_PAIRS_ACCEPTED[0]
        s = self.series_with_datetimeindex(index_tz=index_tz)

        # An number in either or both slots should raise
        with self.assertRaisesRegex(TypeError, regex):
            half_open(s, start=123, stop=None)
        with self.assertRaisesRegex(TypeError, regex):
            half_open(s, start=None, stop=456)
        with self.assertRaisesRegex(TypeError, regex):
            half_open(s, start=123.0, stop=456.0)

    def test_slicing_with_int64index(self):
        """
        half_open_slice(...) must work for integer-valued, non-unique indices.

        Generally, the method will be used on DatetimeIndex-es, but it
        should turn off cleanly when some other index type is supplied.
        Additionally, non-None slice steps are tested here.
        """
        # Create an index with duplicates like [0, 0, 1, 1, ..., 4, 4].
        df = pd.concat(
            (pd.DataFrame(data=list("abcde")), pd.DataFrame(data=list("abcde"))), axis=0
        )
        df.sort_index(inplace=True)
        self.assertEqual(10, len(df))
        self.assertEqual("int64", df.index.dtype.name)

        # Degenerate slices retrieve the entire range
        for step in (None, 1, 3, -2):
            self.assertEqual(
                slice(None, None, step),
                half_open_slice(df, slice(None, None, step)),
            )
            self.assertEqual(
                slice(0, None, step),
                half_open_slice(df, slice(-999999, None, step)),
            )
            self.assertEqual(
                slice(None, 10, step),
                half_open_slice(df, slice(None, +999999, step)),
            )
            self.assertEqual(
                slice(0, 10, step),
                half_open_slice(df, slice(-999999, +999999, step)),
            )

        # Degenerate slices retrieve no information
        # TODO Test start > s.index.max()
        # TODO Test stop < s.index.min()

        # Half-open with left boundary both finite and internal
        for step in (None, 1, 3, -2):
            self.assertEqual(
                slice(2, None, step), half_open_slice(df, slice(1, None, step))
            )

        # Half-open with right boundary both finite and internal
        for step in (None, 1, 3, -2):
            self.assertEqual(
                slice(None, 6, step), half_open_slice(df, slice(None, 3, step))
            )

        # Half-open with both boundaries finite and internal
        for step in (None, 1, 3, -2):
            self.assertEqual(
                slice(4, 8, step), half_open_slice(df, slice(2, 4, step))
            )

        # Defend against off-by-one coding mistakes at the left boundary
        self.assertEqual(slice(0, None), half_open_slice(df, slice(-1, None)))
        self.assertEqual(slice(0, None), half_open_slice(df, slice(+0, None)))
        self.assertEqual(slice(2, None), half_open_slice(df, slice(+1, None)))

        # Defend against off-by-one coding mistakes at the right boundary
        self.assertEqual(slice(None, 8), half_open_slice(df, slice(None, +4)))
        self.assertEqual(slice(None, 10), half_open_slice(df, slice(None, +5)))
        self.assertEqual(slice(None, 10), half_open_slice(df, slice(None, +6)))

    def test_slicing_with_object(self):
        """
        half_open_slice(...) must work for string-valued keys.

        Generally, the method will be used on DatetimeIndex-es, but it
        should turn off cleanly when some other index type is supplied.
        In particular, keys like "2020-01-01" should be dates for
        DatetimeIndex-es but they should be strings for other indices.
        """
        # Create an index like ["b", "c", "d", "e", "f"]
        df = pd.DataFrame(index=list("bcdef"), data=(0, 1, 2, 3, 4))
        self.assertEqual(5, len(df))
        self.assertEqual("object", df.index.dtype.name)

        # Degenerate slices retrieve the entire range
        self.assertEqual(
            slice(None, None), half_open_slice(df, slice(None, None))
        )
        self.assertEqual(slice(0, None), half_open_slice(df, slice("a", None)))
        self.assertEqual(slice(None, 5), half_open_slice(df, slice(None, "g")))
        self.assertEqual(slice(0, 5), half_open_slice(df, slice("a", "g")))

        # Degenerate slices retrieve no information
        # TODO Test start > s.index.max()
        # TODO Test stop < s.index.min()

        # Date-ish strings are treated lexicographically for this index
        self.assertEqual(
            slice(0, 0), half_open_slice(df, slice("2020-08-13", "2020-08-14"))
        )

        # Half-open with left boundary both finite and internal
        self.assertEqual(slice(2, None), half_open_slice(df, slice("d", None)))

        # Half-open with right boundary both finite and internal
        self.assertEqual(slice(None, 4), half_open_slice(df, slice(None, "f")))

        # Half-open with both boundaries finite and internal
        self.assertEqual(slice(1, 4), half_open_slice(df, slice("c", "f")))

    def dataframe_with_datetimeindex(self, index_tz: str) -> pd.DataFrame:
        """Construct DataFrame with multiple entries per index Timestamp."""
        df = []
        for tid in (3, 5, 7):
            df.append(put.makeTimeDataFrame(nper=10, freq="B"))
            df[-1]["tid"] = tid
        df = pd.concat(df).sort_index()
        df = self.localize(df, index_tz)
        self.assertEqual(df.index[+0], pd.Timestamp("2000-01-03", tz=index_tz))
        self.assertEqual(df.index[-1], pd.Timestamp("2000-01-14", tz=index_tz))
        return df

    @staticmethod
    def constructors(
        tz: str
    ) -> typing.Iterable[
        typing.Tuple[str, typing.Callable[[int, int, int], typing.Any]]
    ]:
        """
        An iterable of named, date-like object *constructors* for testing.

        Some of the test cases below iterate across these various objects.
        The names allow self.subTest(...) can identify specific constructors.
        """
        # Pandas' Timestamp object.  Nothing below will pass if this does not.
        yield (
            "Timestamp",
            lambda y, m, d: pd.Timestamp(datetime.date(year=y, month=m, day=d), tz=tz),
        )

        # The Python standard datetime.datetime using pytz for tzinfo
        #
        # Beware that datetime.datetime(..., tzinfo=...) is misleading per
        # https://typesandtimes.net/2019/05/royal-astronomical-society-python.
        #
        # Beware also pd.to_datetime(...) standardizes timezones in ways that
        # pd.Timestamp(...) does not.  In particular, "fixing" the API above.
        yield (
            "datetime",
            (
                lambda y, m, d: datetime.datetime(year=y, month=m, day=d)
                if tz is None
                else pytz.timezone(tz).localize(
                    datetime.datetime(year=y, month=m, day=d)
                )
            ),
        )

        # A datetime.date cannot possess a timezone so only test when possible
        if tz is None:
            yield ("date", lambda y, m, d: datetime.date(y, m, d))

        # A str can specify a timezone but here restrict to the case without
        if tz is None:
            yield ("str1", lambda y, m, d: f"{y:d}-{m:02d}-{d:02d}")  # Hypens
            yield ("str2", lambda y, m, d: f"{y:d}{m:02d}{d:02d}")  # None

        # A str that may contain a timezone specification
        # For example, "2020-01-04T00:00:00+01:00"
        yield (
            "isoformat",
            lambda y, m, d: pd.Timestamp(
                datetime.date(year=y, month=m, day=d), tz=tz
            ).isoformat(),
        )

    def test_dataframe_with_datetimeindex_accepted(self):
        """Confirm [start, stop) successfully slices a DataFrame."""
        for index_tz, tz in self.TZ_PAIRS_ACCEPTED:
            for name, constructor in self.constructors(tz):
                with self.subTest(index_tz=index_tz, tz=tz, name=name):
                    # Prepare the sample Dataframe
                    df = self.dataframe_with_datetimeindex(index_tz=index_tz)

                    # Degenerate slices retrieve all information
                    a1 = half_open(df, start=None, stop=None)
                    a2 = half_open(df, start=constructor(1970, 1, 1), stop=None)
                    a3 = half_open(df, start=None, stop=constructor(2038, 1, 1))
                    a4 = half_open(
                        df, start=constructor(1970, 1, 1), stop=constructor(2038, 1, 1)
                    )
                    put.assert_frame_equal(a1, df)
                    put.assert_frame_equal(a2, df)
                    put.assert_frame_equal(a3, df)
                    put.assert_frame_equal(a4, df)

                    # Degenerate slices retrieve no information
                    # TODO Test start > s.index.max()
                    # TODO Test stop < s.index.min()

                    # Half-open with left boundary both finite and internal
                    b = half_open(df, start=constructor(2000, 1, 4), stop=None)
                    put.assert_frame_equal(b, df.iloc[+3:, :])

                    # Half-open with right boundary both finite and internal
                    c = half_open(df, start=None, stop=constructor(2000, 1, 14))
                    put.assert_frame_equal(c, df.iloc[:-3, :])

                    # Half-open with both boundaries finite and internal
                    d = half_open(
                        df, start=constructor(2000, 1, 4), stop=constructor(2000, 1, 14)
                    )
                    put.assert_frame_equal(d, df.iloc[+3:-3, :])

    # No Series variant of this function because underlying logic identical
    def test_dataframe_with_datetimeindex_rejected(self):
        """Confirm invalid timezone pairs are rejected."""
        for index_tz, tz in self.TZ_PAIRS_REJECTED:
            for name, constructor in self.constructors(tz):
                with self.subTest(index_tz=index_tz, tz=tz, name=name):
                    # Prepare the sample Dataframe
                    df = self.dataframe_with_datetimeindex(index_tz=index_tz)

                    # Finite slices outside the bounds of the DatetimeIndex
                    with self.assertRaises(TypeError):
                        half_open(df, start=constructor(1970, 1, 1), stop=None)
                    with self.assertRaises(TypeError):
                        half_open(df, start=None, stop=constructor(2038, 1, 1))
                    with self.assertRaises(TypeError):
                        half_open(
                            df,
                            start=constructor(1970, 1, 1),
                            stop=constructor(2038, 1, 1),
                        )

                    # Degenerate slices retrieve no information
                    # TODO Test start > s.index.max()
                    # TODO Test stop < s.index.min()

                    # Half-open with left boundary both finite and internal
                    with self.assertRaises(TypeError):
                        half_open(df, start=constructor(2000, 1, 4), stop=None)

                    # Half-open with right boundary both finite and internal
                    with self.assertRaises(TypeError):
                        half_open(df, start=None, stop=constructor(2000, 1, 14))

                    # Half-open with both boundaries finite and internal
                    with self.assertRaises(TypeError):
                        half_open(
                            df,
                            start=constructor(2000, 1, 4),
                            stop=constructor(2000, 1, 14),
                        )

@mroeschke
Copy link
Collaborator

@RhysU I have a PR in the pandas repo that addresses the get_slice_bound bugs: pandas-dev#35848

It passed your unit tests on my branch; I adapted some of the tests you had there in my PR

@RhysU
Copy link

RhysU commented Aug 22, 2020

@mroeschke, thank you. I pulled down pandas-dev#35848. I confirm that it passes the unit tests from above after I change my kind="ix" to kind="loc".

I then modified my half_open_slice(...) logic to remove the start < index.min()-like workarounds as follows and re-tested:

diff --git a/foo.py b/foo.py
index 3666ce3..6a2e55e 100644
--- a/foo.py
+++ b/foo.py
@@ -68,30 +68,18 @@ def half_open_slice(df: typing.Union[pd.DataFrame, pd.Series], s: slice) -> slic
     # Convert start to Optional[int]-valued index location start_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When below index.min() return 0 as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need start < index.min() clause.
-    # TODO After #35690 fixed, may not need start > index.max() clause.
     if start is None:
         start_idx = None
-    elif start < index.min():
-        start_idx = 0
-    elif start > index.max():  # TODO Branch not unit tested below
-        start_idx = len(df)
     else:
-        start_idx = index.get_slice_bound(start, side="left", kind="ix")
+        start_idx = index.get_slice_bound(start, side="left", kind="loc")
 
     # Convert stop to Optional[int]-valued index location stop_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When above index.max() return len as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need stop < index.min() clause.
-    # TODO After #35690 fixed, may not need stop > index.max() clause.
     if stop is None:
         stop_idx = None
-    elif stop < index.min():  # TODO Branch not unit tested below
-        stop_idx = 0
-    elif stop > index.max():
-        stop_idx = len(df)
     else:
-        stop_idx = index.get_slice_bound(stop, side="left", kind="ix")
+        stop_idx = index.get_slice_bound(stop, side="left", kind="loc")
 
     # Return the requested half-open slice
     return slice(start_idx, stop_idx, step)

I observe that when the DatetimeIndex.tzinfo is None the changes in pandas-dev#35848 do not raise TypeErrors when the location includes a non-None tzinfo. There are 6 failures in the above test suite then, and they all look like:

======================================================================
FAIL: test_dataframe_with_datetimeindex_rejected (foo.HalfOpen) (index_tz=None, tz='utc', name='Timestamp')
Confirm invalid timezone pairs are rejected.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/rhys/tmp/foo.py", line 471, in test_dataframe_with_datetimeindex_rejected
    half_open(df, start=constructor(2000, 1, 4), stop=None)
AssertionError: TypeError not raised

This behavior goes to some of my uncertainty in pandas-dev#35690. My tests assert that a TypeError arises when questions are asked about index.tzinfo is None using a pd.Timestamp.tzinfo is not None because I don't believe there's a well-defined answer. That said, pandas may have some coercion policy of which I'm not aware.

@mroeschke
Copy link
Collaborator

Looks like pandas currently allows indexing a tz-naive index with a tz-aware argument but with the implicit assumption to index the tz-naive index "as UTC"

In [11]: df = pd.DataFrame(range(3), pd.date_range('2020', freq='D', periods=3))

In [12]: df
Out[12]:
            0
2020-01-01  0
2020-01-02  1
2020-01-03  2

In [13]: df.loc[pd.Timestamp('2020-01-01', tz='UTC'), 0]
Out[13]: 0

In [14]: df.loc['2020-01-01 00:00:00+00:00', 0]
Out[14]: 0

In [15]: df.loc[pd.Timestamp('2019-12-31 16:00:00', tz='US/Pacific'), 0]
Out[15]: 0

So given this behavior, I would say is okay to have get_slice_bounds accept a tz-aware Timestamp when the index is a tz-naive DatetimeIndex

@RhysU
Copy link

RhysU commented Aug 24, 2020

'Evening @mroeschke.

I don't disagree with your assessment but I'll note that there seems to be little consistency in inferring UTC for a None tzinfo:

>>> import pandas as pd
>>> a = pd.Timestamp('2020-01-01', tz='UTC')
>>> b = pd.Timestamp('2020-01-01', tz=None)
>>> a < b
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "pandas/_libs/tslibs/timestamps.pyx", line 255, in pandas._libs.tslibs.timestamps._Timestamp.__richcmp__
  File "pandas/_libs/tslibs/timestamps.pyx", line 303, in pandas._libs.tslibs.timestamps._Timestamp._assert_tzawareness_compat
TypeError: Cannot compare tz-naive and tz-aware timestamps

I see the same for a == b. That inconsistency in inferring UTC makes me think it'll cause odd edge cases. That looks out-of-scope for this ticket but it'll reappear below.

Returning to my original tests, if I understand your point correctly then I expect that all None/non-None pairs of timezones should work. Going back to the above file and applying the following diff is equivalent to saying that None/UTC, None/Greenwich, etc should pass my tests:

diff --git a/foo.py b/foo.py
index 3666ce3..0a278b1 100644
--- a/foo.py
+++ b/foo.py
@@ -68,30 +68,18 @@ def half_open_slice(df: typing.Union[pd.DataFrame, pd.Series], s: slice) -> slic
     # Convert start to Optional[int]-valued index location start_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When below index.min() return 0 as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need start < index.min() clause.
-    # TODO After #35690 fixed, may not need start > index.max() clause.
     if start is None:
         start_idx = None
-    elif start < index.min():
-        start_idx = 0
-    elif start > index.max():  # TODO Branch not unit tested below
-        start_idx = len(df)
     else:
-        start_idx = index.get_slice_bound(start, side="left", kind="ix")
+        start_idx = index.get_slice_bound(start, side="left", kind="loc")
 
     # Convert stop to Optional[int]-valued index location stop_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When above index.max() return len as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need stop < index.min() clause.
-    # TODO After #35690 fixed, may not need stop > index.max() clause.
     if stop is None:
         stop_idx = None
-    elif stop < index.min():  # TODO Branch not unit tested below
-        stop_idx = 0
-    elif stop > index.max():
-        stop_idx = len(df)
     else:
-        stop_idx = index.get_slice_bound(stop, side="left", kind="ix")
+        stop_idx = index.get_slice_bound(stop, side="left", kind="loc")
 
     # Return the requested half-open slice
     return slice(start_idx, stop_idx, step)
@@ -139,10 +127,6 @@ class HalfOpen(unittest.TestCase):
         ("Etc/Greenwich", "utc"),
         # Greenwich in the Pandas Index and None slicing implies UTC slicing
         ("Etc/Greenwich", None),
-    )
-
-    # Which pairs of timezones should be confirmed as raising errors?
-    TZ_PAIRS_REJECTED = (
         # No timezone on the Pandas index but UTC slicing
         (None, "utc"),
         # No timezone on the Pandas index but Amsterdam slicing
@@ -151,6 +135,10 @@ class HalfOpen(unittest.TestCase):
         (None, "Etc/Greenwich"),
     )
 
+    # Which pairs of timezones should be confirmed as raising errors?
+    TZ_PAIRS_REJECTED = (
+    )
+
     def series_with_datetimeindex(self, index_tz: str) -> pd.DataFrame:
         """Construct Series with multiple entries per index Timestamp."""
         # Construct sample with multiple entries per index Timestamp

That is, all TZ_PAIRS should be acceptable and len(TZ_PAIRS_REJECTED) == 0. (I have to remove the start < index.min() workarounds now because they run afoul of my a < b counterexample and get_slice_bound would otherwise not be invoked). Running with that edit, I observe:

ERROR: test_dataframe_with_datetimeindex_accepted (foo.HalfOpen) (index_tz=None, tz='utc', name='Timestamp')
Confirm [start, stop) successfully slices a DataFrame.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/rhys/tmp/pandas/pandas/core/indexes/base.py", line 2876, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 409, in pandas._libs.index.DatetimeEngine.get_loc
  File "pandas/_libs/index.pyx", line 437, in pandas._libs.index.DatetimeEngine.get_loc
  File "pandas/_libs/index.pyx", line 116, in pandas._libs.index.IndexEngine._get_loc_duplicates
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rhys/tmp/pandas/pandas/core/indexes/datetimes.py", line 589, in get_loc
    return Index.get_loc(self, key, method, tolerance)
  File "/home/rhys/tmp/pandas/pandas/core/indexes/base.py", line 2878, in get_loc
    raise KeyError(key) from err
KeyError: Timestamp('1970-01-01 00:00:00')

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/rhys/tmp/pandas/pandas/core/indexes/base.py", line 5059, in get_slice_bound
    slc = self.get_loc(label)
  File "/home/rhys/tmp/pandas/pandas/core/indexes/datetimes.py", line 591, in get_loc
    raise KeyError(orig_key) from err
KeyError: Timestamp('1970-01-01 00:00:00+0000', tz='UTC')

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/rhys/tmp/foo.py", line 416, in test_dataframe_with_datetimeindex_accepted
    a2 = half_open(df, start=constructor(1970, 1, 1), stop=None)
  File "/home/rhys/tmp/foo.py", line 20, in half_open
    slyce = half_open_slice(df=df, s=slice(start, stop, None))
  File "/home/rhys/tmp/foo.py", line 74, in half_open_slice
    start_idx = index.get_slice_bound(start, side="left", kind="loc")
  File "/home/rhys/tmp/pandas/pandas/core/indexes/base.py", line 5062, in get_slice_bound
    return self._searchsorted_monotonic(label, side)
  File "/home/rhys/tmp/pandas/pandas/core/indexes/base.py", line 5013, in _searchsorted_monotonic
    return self.searchsorted(label, side=side)
  File "/home/rhys/tmp/pandas/pandas/core/indexes/datetimelike.py", line 206, in searchsorted
    return self._data.searchsorted(value, side=side, sorter=sorter)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimelike.py", line 929, in searchsorted
    value = self._validate_searchsorted_value(value)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimelike.py", line 851, in _validate_searchsorted_value
    return self._unbox(value)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimelike.py", line 892, in _unbox
    other = self._unbox_scalar(other)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimes.py", line 451, in _unbox_scalar
    self._check_compatible_with(value)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimes.py", line 460, in _check_compatible_with
    self._assert_tzawareness_compat(other)
  File "/home/rhys/tmp/pandas/pandas/core/arrays/datetimes.py", line 644, in _assert_tzawareness_compat
    raise TypeError(
TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.

where something down in the guts disagrees with the None/utc pair. It looks like my a < b counterexample is happening somewhere but it may not be the root cause.

@mroeschke
Copy link
Collaborator

Thanks for bring up that example and testing out my patch.

Taking a step back, I am not 100% sure where pandas lies today in regard to operations that involve an operation with a tz-naive and a tz-aware elements (indexing, comparisons, etc). Ideally it would be nice if there was consistency across all operations. I think there might be a push towards disallowing tz-naive and tz-aware objects from interacting, but that might need a larger discussion in pandas-dev#35690 or with the core team.

What would be your opinion if in get_slice_bound the timezones needed to match e.g.

print(index)
# DatetimeIndex(['2000-01-03 00:00:00+00:00', '2000-01-04 00:00:00+00:00',
#                ...
#                '2000-02-10 00:00:00+00:00', '2000-02-11 00:00:00+00:00'],
#               dtype='datetime64[ns, UTC]', freq='B')

Would raise: TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.
index.get_slice_bounds(date.datetime(2020, 2, 10), ...)

Would raise: TypeError: Cannot compare tz-naive and tz-aware datetime-like objects.
index.get_slice_bounds(pd.Timestamp(2020, 2, 10), ...)

@RhysU
Copy link

RhysU commented Aug 25, 2020

@mroeschke, The timezones needing to match (in the sense that mixing tz-naive and tz-aware raises TypeError) in get_slice_bound is A-OK by me. So long as that implementation is consistent in its behavior, users can accomplish something else with a helper like my half_open(...).

@mroeschke
Copy link
Collaborator

Sorry for the delay @RhysU. I think my PR to fix get_slice_bound is pretty much ready to go. I think it will be easier if I address the interaction between tz-aware index and tz-naive argument in another PR where I can address other indexing methods like loc (since these methods share the same internal methods)

@RhysU
Copy link

RhysU commented Sep 2, 2020

Thank you for following-up @mroeschke. On the PR I now see consistency in index.get_slice_bound(...) in that the behavior is as I expect then the bound is before, internal, or after the data in the index. On the PR I now also see consistency in handling of Timestamps, datetimes, etc. I don't see rejection of mixing tz-naive and tz-aware data, but that's to be expected given your comments.

Concretely, with python -m unittest -v foo.py my prior tests now pass with the following diff:

diff --git a/foo.py b/foo.py
index 3666ce3..68481a0 100644
--- a/foo.py
+++ b/foo.py
@@ -43,7 +43,6 @@ def half_open_slice(df: typing.Union[pd.DataFrame, pd.Series], s: slice) -> slic
 
     # Additional start/stop coercion for only DatetimeIndex-like indices
     # Use Timestamps due to https://github.com/pandas-dev/pandas/issues/35690
-    # TODO After #35690 fixed, converting start/stop to Timestamp may not be needed
     attr_tz = "tzinfo"
     if hasattr(index, attr_tz):
         # Below, calls to pd.Timestamp(...) surprisingly interpret integers.
@@ -55,43 +54,22 @@ def half_open_slice(df: typing.Union[pd.DataFrame, pd.Series], s: slice) -> slic
                 f" Supply pandas.Timestamps to express fine-grained times like"
                 f" Timestamp('1970-01-01 00:00:00.020200817+0000', tz='UTC')."
             )
-        index_has_tz = index.tzinfo is not None
-        infer_start = index_has_tz and getattr(start, attr_tz, None) is None
-        infer_stop = index_has_tz and getattr(stop, attr_tz, None) is None
-        if start is not None:
-            start = pd.Timestamp(
-                start, tz=datetime.timezone.utc if infer_start else None
-            )
-        if stop is not None:
-            stop = pd.Timestamp(stop, tz=datetime.timezone.utc if infer_stop else None)
 
     # Convert start to Optional[int]-valued index location start_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When below index.min() return 0 as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need start < index.min() clause.
-    # TODO After #35690 fixed, may not need start > index.max() clause.
     if start is None:
         start_idx = None
-    elif start < index.min():
-        start_idx = 0
-    elif start > index.max():  # TODO Branch not unit tested below
-        start_idx = len(df)
     else:
-        start_idx = index.get_slice_bound(start, side="left", kind="ix")
+        start_idx = index.get_slice_bound(start, side="left", kind="loc")
 
     # Convert stop to Optional[int]-valued index location stop_idx
     # Clipping sidesteps https://github.com/pandas-dev/pandas/issues/35690
     # When above index.max() return len as if index.get_slice_bound(...) ran
-    # TODO After #35690 fixed, may not need stop < index.min() clause.
-    # TODO After #35690 fixed, may not need stop > index.max() clause.
     if stop is None:
         stop_idx = None
-    elif stop < index.min():  # TODO Branch not unit tested below
-        stop_idx = 0
-    elif stop > index.max():
-        stop_idx = len(df)
     else:
-        stop_idx = index.get_slice_bound(stop, side="left", kind="ix")
+        stop_idx = index.get_slice_bound(stop, side="left", kind="loc")
 
     # Return the requested half-open slice
     return slice(start_idx, stop_idx, step)
@@ -131,8 +109,6 @@ class HalfOpen(unittest.TestCase):
         ("UTC", "utc"),
         # Something other than UTC in both spots
         ("Europe/Amsterdam", "Europe/Amsterdam"),
-        # UTC in the Pandas Index and None slicing implies UTC slicing
-        ("utc", None),
         # Two different timezones with UTC on the DatetimeIndex
         ("UTC", "Etc/Greenwich"),
         # Two different timezones with Greenwich on the DatetimeIndex
@@ -142,14 +118,17 @@ class HalfOpen(unittest.TestCase):
     )
 
     # Which pairs of timezones should be confirmed as raising errors?
-    TZ_PAIRS_REJECTED = (
-        # No timezone on the Pandas index but UTC slicing
-        (None, "utc"),
-        # No timezone on the Pandas index but Amsterdam slicing
-        (None, "Europe/Amsterdam"),
-        # No timezone on the Pandas index but Greenwich slicing
-        (None, "Etc/Greenwich"),
-    )
+    TZ_PAIRS_REJECTED = tuple()
+    #TZ_PAIRS_REJECTED = (
+    #    # UTC in the Pandas Index and None slicing implies UTC slicing
+    #    ("utc", None),
+    #    # No timezone on the Pandas index but UTC slicing
+    #    (None, "utc"),
+    #    # No timezone on the Pandas index but Amsterdam slicing
+    #    (None, "Europe/Amsterdam"),
+    #    # No timezone on the Pandas index but Greenwich slicing
+    #    (None, "Etc/Greenwich"),
+    #)
 
     def series_with_datetimeindex(self, index_tz: str) -> pd.DataFrame:
         """Construct Series with multiple entries per index Timestamp."""

In any follow-on work, please do check that any ValueErrors are consistent across the various data types.

Appreciate all your effort here. Land away!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

4 participants