# 日期和时间数据类型及工具

In [2]:
from datetime import datetime

In [3]:
now = datetime.now()

In [4]:
now

datetime.datetime(2019, 3, 15, 17, 14, 16, 397894)

In [5]:
now.year, now.month, now.day

(2019, 3, 15)

In [6]:
# datetime stores both the date and the time, down to the microsecond
# (not the millisecond); a timedelta is the difference between two datetimes
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)

In [7]:
delta

datetime.timedelta(926, 56700)

In [8]:
delta.days

926

In [9]:
delta.seconds

56700

In [10]:
# adding (or subtracting) a timedelta, or a multiple of one, to a datetime
# yields a new, shifted datetime object
from datetime import timedelta

In [11]:
start = datetime(2011, 1, 7)

In [12]:
start + timedelta(12)

datetime.datetime(2011, 1, 19, 0, 0)

In [13]:
start - 2 * timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

## 字符串和datetime的相互转换

In [14]:
# datetime objects (and pandas Timestamp objects) can be rendered as strings
# with str() or with strftime(), which takes a format string
stamp = datetime(2011, 1, 3)

In [15]:
str(stamp)

'2011-01-03 00:00:00'

In [16]:
stamp.strftime('%Y-%m-%d')

'2011-01-03'

In [17]:
# datetime.strptime converts a string to a datetime using the same format codes
value = '2011-01-03'

In [18]:
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2011, 1, 3, 0, 0)

In [19]:
datestrs = ['7/6/2011', '8/6/2011']

In [20]:
[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

In [21]:
# datetime.strptime parses dates whose format is known in advance;
# dateutil's parser.parse can be used instead, without a format string
from dateutil.parser import parse

In [22]:
parse('2011-01-03')

datetime.datetime(2011, 1, 3, 0, 0)

In [23]:
# dateutil can parse almost any human-readable date representation
parse('Jan 31, 1997 10:45 PM')

datetime.datetime(1997, 1, 31, 22, 45)

In [24]:
# in international usage the day commonly comes before the month;
# dayfirst=True tells the parser to read the string that way
parse('6/12/2011', dayfirst = True)

datetime.datetime(2011, 12, 6, 0, 0)

In [25]:
# pandas usually works with whole arrays of dates, whether they serve as an
# axis index or as a DataFrame column
# to_datetime parses many different date representations
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']

In [27]:
import pandas as pd
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

In [28]:
# to_datetime also handles missing values (None, empty strings)
idx = pd.to_datetime(datestrs + [None])

In [29]:
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [30]:
# NaT (Not a Time) is pandas's null value for timestamp data
idx[2]

NaT

In [31]:
pd.isnull(idx)

array([False, False,  True])

# 时间序列基础

In [32]:
# the most basic pandas time series is a Series indexed by timestamps
from datetime import datetime

In [33]:
# six irregularly spaced days in January 2011, used as the index of the
# sample series built in the next cell
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]

In [35]:
import numpy as np
ts = pd.Series(np.random.randn(6), index = dates)

In [36]:
ts

2011-01-02   -1.162415
2011-01-05    0.522776
2011-01-07    1.210968
2011-01-08   -0.170009
2011-01-10    0.612213
2011-01-12   -0.323268
dtype: float64

In [37]:
# under the hood these datetime objects are held in a DatetimeIndex
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

In [38]:
# arithmetic between differently indexed time series aligns on the dates
# ts[::2] takes every second element (step of 2), starting with the first
ts + ts[::2]

2011-01-02   -2.324829
2011-01-05         NaN
2011-01-07    2.421937
2011-01-08         NaN
2011-01-10    1.224426
2011-01-12         NaN
dtype: float64

In [40]:
# pandas stores timestamps as NumPy datetime64 values at nanosecond resolution
ts.index.dtype

dtype('<M8[ns]')

In [41]:
# the scalar values of a DatetimeIndex are pandas Timestamp objects
stamp = ts.index[0]

In [42]:
stamp

Timestamp('2011-01-02 00:00:00')

## 索引、选取、子集构造

In [43]:
# when selecting by label, a time series behaves much like any other pandas Series
stamp = ts.index[2]

In [44]:
ts[stamp]

1.2109682718692396

In [45]:
# a string that can be interpreted as a date also works as a label
ts['1/10/2011']

0.6122131613922823

In [46]:
ts['20110110']

0.6122131613922823

In [47]:
# for longer time series, passing just a year or a year-month selects a slice
longer_ts = pd.Series(np.random.randn(1000),
                     index = pd.date_range('1/1/2000', periods = 1000))

In [48]:
longer_ts

2000-01-01   -1.271530
2000-01-02    0.335965
2000-01-03   -3.308510
2000-01-04    1.504856
2000-01-05   -1.466635
2000-01-06    1.185935
2000-01-07   -0.676281
2000-01-08   -0.561876
2000-01-09    1.121762
2000-01-10   -0.036170
2000-01-11    0.504041
2000-01-12    1.566320
2000-01-13    0.522217
2000-01-14   -0.663521
2000-01-15   -1.676687
2000-01-16    0.106562
2000-01-17    1.657472
2000-01-18    1.073067
2000-01-19   -0.397158
2000-01-20    0.713463
2000-01-21    0.183137
2000-01-22    0.953563
2000-01-23   -0.390028
2000-01-24    1.839432
2000-01-25   -0.897447
2000-01-26   -0.739479
2000-01-27   -1.019371
2000-01-28    0.658014
2000-01-29    1.265184
2000-01-30    0.132942
                ...   
2002-08-28    0.266780
2002-08-29   -1.110509
2002-08-30   -0.587794
2002-08-31   -0.606861
2002-09-01    0.190302
2002-09-02    0.002222
2002-09-03    0.057842
2002-09-04    0.607159
2002-09-05    0.025267
2002-09-06    0.436024
2002-09-07   -0.740153
2002-09-08   -0.466390
2002-09-09 

In [49]:
longer_ts['2001']

2001-01-01   -0.064978
2001-01-02   -0.112674
2001-01-03    2.581254
2001-01-04    0.319592
2001-01-05   -1.706345
2001-01-06    0.527732
2001-01-07    1.647522
2001-01-08   -1.534138
2001-01-09    0.301600
2001-01-10   -0.862641
2001-01-11   -0.490166
2001-01-12   -0.351058
2001-01-13   -0.491127
2001-01-14   -0.802293
2001-01-15    0.040909
2001-01-16   -2.966855
2001-01-17   -0.121505
2001-01-18    0.452879
2001-01-19    0.172654
2001-01-20    1.553781
2001-01-21   -0.218264
2001-01-22    0.211947
2001-01-23   -0.014758
2001-01-24    1.067640
2001-01-25   -0.683667
2001-01-26   -0.277548
2001-01-27    0.741239
2001-01-28   -1.307261
2001-01-29   -1.349601
2001-01-30   -1.081751
                ...   
2001-12-02   -1.959626
2001-12-03   -0.290138
2001-12-04    0.115474
2001-12-05   -0.889708
2001-12-06   -0.163085
2001-12-07   -1.105574
2001-12-08   -0.039035
2001-12-09    0.621195
2001-12-10   -0.630941
2001-12-11    0.869105
2001-12-12   -0.426507
2001-12-13    0.677093
2001-12-14 

In [50]:
# the string '2001' in the previous cell is interpreted as a year and selects
# that whole period; a year-month string works the same way
longer_ts['2001-05']

2001-05-01    1.425968
2001-05-02    0.808835
2001-05-03   -0.526030
2001-05-04    2.166207
2001-05-05    0.596710
2001-05-06   -0.328866
2001-05-07    1.450597
2001-05-08    0.786515
2001-05-09   -0.066507
2001-05-10    0.058046
2001-05-11    0.844817
2001-05-12   -0.282076
2001-05-13    0.079508
2001-05-14   -0.094524
2001-05-15    1.161613
2001-05-16   -0.045819
2001-05-17   -0.546205
2001-05-18    1.000352
2001-05-19   -0.883132
2001-05-20   -1.227942
2001-05-21   -0.136271
2001-05-22    0.352286
2001-05-23   -0.547628
2001-05-24    0.657594
2001-05-25    2.867253
2001-05-26    1.233002
2001-05-27    0.356025
2001-05-28   -0.050438
2001-05-29    0.723436
2001-05-30   -0.690240
2001-05-31    0.603828
Freq: D, dtype: float64

In [51]:
# datetime objects can be used as slice bounds too
ts[datetime(2011, 1, 7):]

2011-01-07    1.210968
2011-01-08   -0.170009
2011-01-10    0.612213
2011-01-12   -0.323268
dtype: float64

In [52]:
# slice bounds need not be present in the index: slicing with timestamps that
# are absent performs a range query (the series is displayed here for reference)
ts

2011-01-02   -1.162415
2011-01-05    0.522776
2011-01-07    1.210968
2011-01-08   -0.170009
2011-01-10    0.612213
2011-01-12   -0.323268
dtype: float64

In [53]:
ts['1/6/2011':'1/11/2011']

2011-01-07    1.210968
2011-01-08   -0.170009
2011-01-10    0.612213
dtype: float64

切片时可以传入字符串日期、datetime或Timestamp。这样切片产生的是原时间序列的视图，跟NumPy数组的切片运算是一样的。

这意味着没有数据被复制，对切片进行修改会反映到原始数据上。

In [55]:
# truncate is an equivalent instance method for cutting a time series between
# two dates (here only the upper bound, after=, is given)
ts.truncate(after='1/9/2011')

2011-01-02   -1.162415
2011-01-05    0.522776
2011-01-07    1.210968
2011-01-08   -0.170009
dtype: float64

In [56]:
# the same date-based selection works on a DataFrame
dates = pd.date_range('1/1/2000', periods = 100, freq = 'W-WED')

In [58]:
long_df = pd.DataFrame(np.random.randn(100, 4),
                      index = dates,
                      columns = ['Colorado', 'Texas', 'New York', 'Ohio'])

In [60]:
long_df.loc['5-2001']

Unnamed: 0,Colorado,Texas,New York,Ohio
2001-05-02,-0.880092,0.380188,-1.228872,0.443889
2001-05-09,0.080631,-0.928498,-1.006751,0.173439
2001-05-16,-0.876454,0.546974,0.345031,-0.383402
2001-05-23,1.32244,-0.392473,0.555951,-1.989313
2001-05-30,0.980688,0.60869,1.569649,-0.446002


## 带有重复索引的时间序列

In [61]:
# several observations can fall on the same timestamp: 2000-01-02 appears
# three times in this index
dates = pd.DatetimeIndex(['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000'])

In [62]:
dup_ts = pd.Series(np.arange(5), index = dates)

In [63]:
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [64]:
# the is_unique attribute tells whether the index labels are all distinct
dup_ts.index.is_unique

False

In [65]:
# indexing this series yields either a scalar or a slice, depending on
# whether the requested timestamp is duplicated
dup_ts['1/3/2000']

4

In [67]:
dup_ts['1/2/2000']

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [68]:
# to aggregate data with non-unique timestamps, group by the index itself
# by passing level = 0
grouped = dup_ts.groupby(level = 0)

In [69]:
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [70]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

# 日期的范围、频率以及移动

In [71]:
# goal of this section: convert this irregular series to a fixed (daily) frequency
ts

2011-01-02   -1.162415
2011-01-05    0.522776
2011-01-07    1.210968
2011-01-08   -0.170009
2011-01-10    0.612213
2011-01-12   -0.323268
dtype: float64

In [72]:
# 'D' means daily frequency
# NOTE(review): resample appears to only build a resampler object here; a
# follow-up such as .asfreq() or an aggregation is presumably needed to get
# the actual daily series — confirm
resampler = ts.resample('D')

## 生成日期范围

In [74]:
# pandas.date_range generates a DatetimeIndex of a given length at a given frequency
index = pd.date_range('2012-04-01', '2012-06-01')

In [75]:
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [76]:
# by default date_range generates daily timestamps
# when only a start or an end date is given, the number of periods must be passed too
pd.date_range(start = '2012-04-01', periods = 20)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

In [77]:
pd.date_range(end = '2012-06-01', periods = 20)

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

In [78]:
# the 'BM' frequency yields the last business day of each month; only dates
# that fall inside the interval (or exactly on its bounds) and match the
# frequency are included
# NOTE(review): recent pandas releases deprecate the 'BM' alias in favor of
# 'BME' — confirm against the pandas version this notebook targets
pd.date_range('2000-01-01', '2000-12-01', freq = 'BM')

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31', '2000-04-28',
               '2000-05-31', '2000-06-30', '2000-07-31', '2000-08-31',
               '2000-09-29', '2000-10-31', '2000-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [80]:
# by default date_range keeps the time-of-day of the start/end timestamp, if any
pd.date_range('2012-05-02 12:56:31', periods = 5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [81]:
# normalize=True normalizes the generated timestamps to midnight
pd.date_range('2012-05-02 12:56:31', periods = 5, normalize = True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

## 频率和日期偏移量

In [82]:
# an hourly frequency can be represented by the Hour offset class
from pandas.tseries.offsets import Hour, Minute

In [83]:
hour = Hour()

In [84]:
hour

<Hour>

In [85]:
# passing an integer defines a multiple of the offset
four_hours = Hour(4)

In [86]:
four_hours

<4 * Hours>

In [87]:
# in a frequency string, an integer placed before the base frequency creates a multiple
pd.date_range('2000-01-01', '2000-01-03 23:59', freq = '4h')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [88]:
# most offset objects can be combined by addition
Hour(2) + Minute(30)

<150 * Minutes>

In [89]:
# a combined frequency can also be passed as a frequency string
pd.date_range('2000-01-01', periods = 10, freq = '1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

### WOM日期

In [1]:
# WOM = week of month; 'WOM-3FRI' yields the third Friday of every month
import pandas as pd
rng = pd.date_range('2012-01-01', '2012-09-01', freq = 'WOM-3FRI')

In [2]:
list(rng)

[Timestamp('2012-01-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-02-17 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2012-08-17 00:00:00', freq='WOM-3FRI')]

## 移动（超前和滞后）数据

In [4]:
# shifting means moving data backward or forward along the time axis
# Series and DataFrame both have a shift method that moves the values naively
# while leaving the index untouched
# NOTE(review): recent pandas releases deprecate the 'M' alias in favor of
# 'ME' — confirm against the pandas version this notebook targets
import numpy as np
ts = pd.Series(np.random.randn(4),
              index = pd.date_range('1/1/2000', periods = 4, freq = 'M'))

In [5]:
ts

2000-01-31   -0.136713
2000-02-29    0.748671
2000-03-31    1.830436
2000-04-30   -0.859817
Freq: M, dtype: float64

In [6]:
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.136713
2000-04-30    0.748671
Freq: M, dtype: float64

In [7]:
ts.shift(-2)

2000-01-31    1.830436
2000-02-29   -0.859817
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [None]:
# shifting like this introduces missing data at the start or the end of the series
# a common use of shift is computing percent changes in one or several time series
ts/ts.shift(1) - 1

In [8]:
# when the frequency is known it can be passed to shift, so that the
# timestamps are moved instead of the values
ts.shift(2, freq = 'M')

2000-03-31   -0.136713
2000-04-30    0.748671
2000-05-31    1.830436
2000-06-30   -0.859817
Freq: M, dtype: float64

In [10]:
ts.shift(3, freq = 'D')

2000-02-03   -0.136713
2000-03-03    0.748671
2000-04-03    1.830436
2000-05-03   -0.859817
dtype: float64

In [11]:
# a frequency string such as '90T' (90 minutes) moves every timestamp by that amount
# NOTE(review): recent pandas releases deprecate the 'T' alias in favor of
# 'min' — confirm against the pandas version this notebook targets
ts.shift(1, freq = '90T')

2000-01-31 01:30:00   -0.136713
2000-02-29 01:30:00    0.748671
2000-03-31 01:30:00    1.830436
2000-04-30 01:30:00   -0.859817
Freq: M, dtype: float64

### 通过偏移量对日期进行位移