Skip to content

Commit

Permalink
PERF: Datetime/Timestamp.normalize for timezone naive datetimes (pand…
Browse files Browse the repository at this point in the history
  • Loading branch information
mroeschke authored and tm9k1 committed Nov 19, 2018
1 parent f46013b commit 46d3891
Show file tree
Hide file tree
Showing 13 changed files with 60 additions and 31 deletions.
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/timestamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def time_microsecond(self, tz, freq):


class TimestampOps(object):
params = [None, 'US/Eastern']
params = [None, 'US/Eastern', 'UTC']
param_names = ['tz']

def setup(self, tz):
Expand All @@ -102,6 +102,9 @@ def time_replace_None(self, tz):
def time_to_pydatetime(self, tz):
self.ts.to_pydatetime()

def time_normalize(self, tz):
# Benchmark body: a single call to normalize() on self.ts. `tz` is the
# benchmark parameter named in `param_names`; it is not used directly here.
self.ts.normalize()


class TimestampAcrossDst(object):
def setup(self):
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,7 @@ Performance Improvements
- Improved performance of :func:`IndexEngine.get_indexer_non_unique` for sorted, non-unique indexes (:issue:`9466`)
- Improved performance of :func:`PeriodIndex.unique` (:issue:`23083`)
- Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`)
- Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`)


.. _whatsnew_0240.docs:
Expand Down
3 changes: 3 additions & 0 deletions pandas/_libs/tslibs/ccalendar.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ DAYS_FULL = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday',
int_to_weekday = {num: name for num, name in enumerate(DAYS)}
weekday_to_int = {int_to_weekday[key]: key for key in int_to_weekday}

# Number of seconds in one day (24 * 60 * 60) and in one hour (60 * 60).
DAY_SECONDS = 86400
HOUR_SECONDS = 3600

# ----------------------------------------------------------------------


Expand Down
31 changes: 11 additions & 20 deletions pandas/_libs/tslibs/conversion.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ from cpython.datetime cimport (datetime, tzinfo,
PyDateTime_CheckExact, PyDateTime_IMPORT)
PyDateTime_IMPORT

from ccalendar import DAY_SECONDS, HOUR_SECONDS

from np_datetime cimport (check_dts_bounds,
npy_datetimestruct,
pandas_datetime_to_datetimestruct, _string_to_dts,
Expand All @@ -41,8 +43,6 @@ from nattype cimport NPY_NAT, checknull_with_nat
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL
cdef int64_t HOURS_NS = 3600000000000
NS_DTYPE = np.dtype('M8[ns]')
TD_DTYPE = np.dtype('m8[ns]')

Expand Down Expand Up @@ -875,6 +875,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
int64_t *tdata
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
int64_t HOURS_NS = HOUR_SECONDS * 1000000000
ndarray[int64_t] result, result_a, result_b, dst_hours
npy_datetimestruct dts
bint infer_dst = False, is_dst = False, fill = False
Expand Down Expand Up @@ -931,10 +932,10 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None,
result_b[:] = NPY_NAT

idx_shifted_left = (np.maximum(0, trans.searchsorted(
vals - DAY_NS, side='right') - 1)).astype(np.int64)
vals - DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)

idx_shifted_right = (np.maximum(0, trans.searchsorted(
vals + DAY_NS, side='right') - 1)).astype(np.int64)
vals + DAY_SECONDS * 1000000000, side='right') - 1)).astype(np.int64)

for i in range(n):
val = vals[i]
Expand Down Expand Up @@ -1116,9 +1117,9 @@ def normalize_date(dt: object) -> datetime:
@cython.boundscheck(False)
def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
"""
Normalize each of the (nanosecond) timestamps in the given array by
rounding down to the beginning of the day (i.e. midnight). If `tz`
is not None, then this is midnight for this timezone.
Normalize each of the (nanosecond) timezone-aware timestamps in the given
array by rounding down to the beginning of the day (i.e. midnight) in
the timezone `tz`.
Parameters
----------
Expand All @@ -1130,21 +1131,11 @@ def normalize_i8_timestamps(int64_t[:] stamps, object tz=None):
result : int64 ndarray of normalized nanosecond timestamps
"""
cdef:
Py_ssize_t i, n = len(stamps)
npy_datetimestruct dts
Py_ssize_t n = len(stamps)
int64_t[:] result = np.empty(n, dtype=np.int64)

if tz is not None:
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)
else:
with nogil:
for i in range(n):
if stamps[i] == NPY_NAT:
result[i] = NPY_NAT
continue
dt64_to_dtstruct(stamps[i], &dts)
result[i] = _normalized_stamp(&dts)
tz = maybe_get_tz(tz)
result = _normalize_local(stamps, tz)

return result.base # .base to access underlying np.ndarray

Expand Down
5 changes: 3 additions & 2 deletions pandas/_libs/tslibs/fields.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ cimport numpy as cnp
from numpy cimport ndarray, int64_t, int32_t, int8_t
cnp.import_array()

from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL
from ccalendar import get_locale_names, MONTHS_FULL, DAYS_FULL, DAY_SECONDS
from ccalendar cimport (get_days_in_month, is_leapyear, dayofweek,
get_week_of_year, get_day_of_year)
from np_datetime cimport (npy_datetimestruct, pandas_timedeltastruct,
Expand All @@ -36,7 +36,8 @@ def get_time_micros(ndarray[int64_t] dtindex):
cdef:
ndarray[int64_t] micros

micros = np.mod(dtindex, 86400000000000, dtype=np.int64) // 1000LL
micros = np.mod(dtindex, DAY_SECONDS * 1000000000, dtype=np.int64)
micros //= 1000LL
return micros


Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/tslibs/np_datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef extern from "src/datetime/np_datetime_strings.h":
npy_datetimestruct *out,
int *out_local, int *out_tzoffset)


# ----------------------------------------------------------------------
# numpy object inspection

Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ from util cimport (is_timedelta64_object, is_datetime64_object,
is_integer_object, is_float_object,
is_string_object)

from ccalendar import DAY_SECONDS

from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct,
pandas_timedeltastruct)

Expand All @@ -38,8 +40,6 @@ from offsets cimport to_offset
# ----------------------------------------------------------------------
# Constants

cdef int64_t DAY_NS = 86400000000000LL

# components named tuple
Components = collections.namedtuple('Components', [
'days', 'hours', 'minutes', 'seconds',
Expand Down Expand Up @@ -266,10 +266,10 @@ cdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
m = 1000000000L * 2629746
p = 9
elif unit == 'W':
m = 1000000000L * 86400 * 7
m = 1000000000L * DAY_SECONDS * 7
p = 9
elif unit == 'D' or unit == 'd':
m = 1000000000L * 86400
m = 1000000000L * DAY_SECONDS
p = 9
elif unit == 'h':
m = 1000000000L * 3600
Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ from util cimport (is_datetime64_object, is_timedelta64_object,
is_offset_object)

cimport ccalendar
from ccalendar import DAY_SECONDS
from conversion import tz_localize_to_utc, normalize_i8_timestamps
from conversion cimport (tz_convert_single, _TSObject,
convert_to_tsobject, convert_datetime_to_tsobject)
Expand Down Expand Up @@ -1285,6 +1286,10 @@ class Timestamp(_Timestamp):
Normalize Timestamp to midnight, preserving
tz information.
"""
if self.tz is None or is_utc(self.tz):
DAY_NS = DAY_SECONDS * 1000000000
normalized_value = self.value - (self.value % DAY_NS)
return Timestamp(normalized_value).tz_localize(self.tz)
normalized_value = normalize_i8_timestamps(
np.array([self.value], dtype='i8'), tz=self.tz)[0]
return Timestamp(normalized_value).tz_localize(self.tz)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timezones.pxd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

cdef bint is_utc(object tz)
cpdef bint is_utc(object tz)
cdef bint is_tzlocal(object tz)

cdef bint treat_tz_as_pytz(object tz)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/tslibs/timezones.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ cdef int64_t NPY_NAT = get_nat()

# ----------------------------------------------------------------------

cdef inline bint is_utc(object tz):
cpdef inline bint is_utc(object tz):
return tz is UTC or isinstance(tz, _dateutil_tzutc)


Expand Down
11 changes: 9 additions & 2 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pandas._libs import lib, tslib
from pandas._libs.tslib import Timestamp, NaT, iNaT
from pandas._libs.tslibs import (
normalize_date,
ccalendar, normalize_date,
conversion, fields, timezones,
resolution as libresolution)

Expand Down Expand Up @@ -853,7 +853,14 @@ def normalize(self):
'2014-08-01 00:00:00+05:30'],
dtype='datetime64[ns, Asia/Calcutta]', freq=None)
"""
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
if self.tz is None or timezones.is_utc(self.tz):
not_null = self.notna()
DAY_NS = ccalendar.DAY_SECONDS * 1000000000
new_values = self.asi8.copy()
adjustment = (new_values[not_null] % DAY_NS)
new_values[not_null] = new_values[not_null] - adjustment
else:
new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz)
return type(self)(new_values, freq='infer').tz_localize(self.tz)

def to_period(self, freq=None):
Expand Down
6 changes: 6 additions & 0 deletions pandas/tests/indexes/datetimes/test_scalar_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,12 @@ def test_normalize(self):
assert result.is_normalized
assert not rng.is_normalized

def test_normalize_nat(self):
# A NaT entry must remain NaT after normalize(), while the valid
# timestamp next to it is floored to midnight of the same date.
dti = DatetimeIndex([pd.NaT, Timestamp('2018-01-01 01:00:00')])
result = dti.normalize()
expected = DatetimeIndex([pd.NaT, Timestamp('2018-01-01')])
tm.assert_index_equal(result, expected)


class TestDateTimeIndexToJulianDate(object):

Expand Down
11 changes: 11 additions & 0 deletions pandas/tests/scalar/timestamp/test_unary_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,17 @@ def test_replace_dst_border(self):
expected = Timestamp('2013-11-3 03:00:00', tz='America/Chicago')
assert result == expected

# --------------------------------------------------------------
# Timestamp.normalize

@pytest.mark.parametrize('arg', ['2013-11-30', '2013-11-30 12:00:00'])
def test_normalize(self, tz_naive_fixture, arg):
# Both an input already at midnight and a midday input are expected to
# normalize to midnight of 2013-11-30 in the timezone supplied by the
# fixture.
tz = tz_naive_fixture
ts = Timestamp(arg, tz=tz)
result = ts.normalize()
expected = Timestamp('2013-11-30', tz=tz)
assert result == expected

# --------------------------------------------------------------

@td.skip_if_windows
Expand Down

0 comments on commit 46d3891

Please sign in to comment.