# 时间序列

In [1]:
# Imports for the whole notebook: standard library first, then third-party.
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so the np.random.randn examples are reproducible.
np.random.seed(12345)

# Display configuration.
plt.rc('figure', figsize=(10, 6))
# Remember the current row limit before overriding it.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
np.set_printoptions(precision=4, suppress=True)

## 日期和时间数据类型及工具
+ Python标准库包含用于日期（date）和时间（time）数据的类型
+ 会用到datetime、time以及calendar模块，`datetime.datetime`是用的最多的。

In [2]:
# Capture the current date and time; the recorded output is therefore
# different on every run.
import datetime
now = datetime.datetime.now()
# NOTE(review): a bare expression in the middle of a cell is only echoed when
# the shell is configured to display every expression — the recorded output
# shows both values, so that setting appears to be enabled; confirm.
now
# Individual calendar fields of the datetime.
now.year, now.month, now.day

datetime.datetime(2019, 4, 16, 9, 57, 14, 379971)

(2019, 4, 16)

In [3]:
# Subtracting one datetime from another yields a timedelta.
later_moment = datetime.datetime(2011, 1, 7)
earlier_moment = datetime.datetime(2008, 6, 24, 8, 15)
delta = later_moment - earlier_moment
delta

datetime.timedelta(926, 56700)

In [4]:
delta.days

926

In [5]:
delta.seconds

56700

In [6]:
# Adding a timedelta to a datetime moves it forward by that span (12 days).
start = datetime.datetime(year=2011, month=1, day=7)
twelve_days = datetime.timedelta(days=12)
start + twelve_days

datetime.datetime(2011, 1, 19, 0, 0)

In [7]:
start - 2 * datetime.timedelta(12)

datetime.datetime(2010, 12, 14, 0, 0)

### 字符串和datetime的相互转换
+ 利用`str`或`strftime`可以将日期转化为字符串

In [8]:
stamp = datetime.datetime(2011, 1, 3)
str(stamp)

'2011-01-03 00:00:00'

In [9]:
stamp.strftime('%Y-%m-%d')

'2011-01-03'

datetime的格式定义

| 代码 | 说明 |
|----|----|
|%Y|4位数的年份|
|%y|2位数的年份|
|%m|2位数的月份|
|%d|2位数的日|
|%H|24小时制的时|
|%I|12小时制的时|
|%M|2位数的分钟|
|%S|2位数的秒|

+ `datetime.strptime`可以将格式化编码字符串转化为日期

In [10]:
value = '2011-01-03'
datetime.datetime.strptime(value, '%Y-%m-%d')


datetime.datetime(2011, 1, 3, 0, 0)

In [11]:
# Parse month/day/year strings into datetime objects, one per input string.
datestrs = ['7/6/2011', '8/6/2011']
list(map(lambda text: datetime.datetime.strptime(text, '%m/%d/%Y'), datestrs))

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

+ Pandas经常用于处理成组日期，不管这些日期是DataFrame的轴索引，还是列。
+ `to_datetime`方法可以解析多种不同的日期表示形式。

In [12]:
datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

+ pd.to_datetime可以处理缺失值 （None、空字符串等）

In [13]:
idx = pd.to_datetime(datestrs + [None])
idx

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

In [14]:
idx[2]

NaT

In [15]:
pd.isnull(idx)

array([False, False,  True], dtype=bool)

## 时间序列基础
pandas最基本的时间序列类型就是以时间戳（Python字符串或datetime对象表示）为索引的Series

In [16]:
# Seven irregularly spaced dates in 2011, written as (month, day) pairs.
month_day_pairs = [(1, 2), (1, 5), (1, 7), (1, 8), (1, 10), (1, 12), (10, 1)]
dates = [datetime.datetime(2011, month, day) for month, day in month_day_pairs]
# One random observation per date; pandas turns the datetimes into a DatetimeIndex.
random_values = np.random.randn(len(dates))
ts = pd.Series(random_values, index=dates)
ts

2011-01-02   -0.204708
2011-01-05    0.478943
2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
2011-01-12    1.393406
2011-10-01    0.092908
dtype: float64

In [17]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12', '2011-10-01'],
              dtype='datetime64[ns]', freq=None)

+ **不同索引的时间序列之间的算术运算会自动按日期对齐**

In [18]:
ts + ts[::2]

2011-01-02   -0.409415
2011-01-05         NaN
2011-01-07   -1.038877
2011-01-08         NaN
2011-01-10    3.931561
2011-01-12         NaN
2011-10-01    0.185816
dtype: float64

In [19]:
stamp = ts.index[0]
stamp

Timestamp('2011-01-02 00:00:00')

### 索引、选取和子集构造
时间序列就是以DatetimeIndex为索引的Series，所以在索引以及数据选取方面与普通Series行为一致。（注：旧版pandas中的TimeSeries子类已在新版本中移除。）

In [20]:
ts

2011-01-02   -0.204708
2011-01-05    0.478943
2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
2011-01-12    1.393406
2011-10-01    0.092908
dtype: float64

In [21]:
stamp = ts.index[2]
ts[stamp]

-0.51943871505673811

In [22]:
ts['10/1/2011']

0.092907876743717671

In [23]:
ts['20110110']

1.9657805725027142

+ 可以发现，传入一个可被解释为日期的字符串，可以进行索引。
+ 对于较长的时间序列，只需传入**年**或**年月**即可对数据进行切片。

In [24]:
# 1000 consecutive days starting 2017-01-01, each with one random value.
daily_index = pd.date_range(start='1/1/2017', periods=1000)
longer_ts = pd.Series(np.random.randn(len(daily_index)), index=daily_index)
longer_ts

2017-01-01    0.281746
2017-01-02    0.769023
2017-01-03    1.246435
2017-01-04    1.007189
2017-01-05   -1.296221
2017-01-06    0.274992
2017-01-07    0.228913
2017-01-08    1.352917
2017-01-09    0.886429
2017-01-10   -2.001637
                ...   
2019-09-18   -1.159926
2019-09-19    0.618965
2019-09-20    1.373890
2019-09-21   -0.983505
2019-09-22    0.930944
2019-09-23   -0.811676
2019-09-24   -1.830156
2019-09-25   -0.138730
2019-09-26    0.334088
2019-09-27    0.488675
Freq: D, Length: 1000, dtype: float64

In [25]:
longer_ts['2017']

2017-01-01    0.281746
2017-01-02    0.769023
2017-01-03    1.246435
2017-01-04    1.007189
2017-01-05   -1.296221
2017-01-06    0.274992
2017-01-07    0.228913
2017-01-08    1.352917
2017-01-09    0.886429
2017-01-10   -2.001637
                ...   
2017-12-22    1.014042
2017-12-23   -1.135008
2017-12-24   -0.263371
2017-12-25    1.306425
2017-12-26   -1.610841
2017-12-27   -1.026621
2017-12-28    1.241573
2017-12-29   -0.156760
2017-12-30   -2.449096
2017-12-31   -1.033948
Freq: D, Length: 365, dtype: float64

In [26]:
longer_ts['2017-05']

2017-05-01   -0.606545
2017-05-02   -0.417064
2017-05-03   -0.017007
2017-05-04   -1.224145
2017-05-05   -1.800840
2017-05-06    1.634736
2017-05-07    0.989008
2017-05-08    0.457940
2017-05-09    0.555154
2017-05-10    1.306720
                ...   
2017-05-22    0.680321
2017-05-23    0.635512
2017-05-24   -0.757177
2017-05-25    0.718086
2017-05-26   -0.304273
2017-05-27   -1.677790
2017-05-28    0.426986
2017-05-29   -1.563740
2017-05-30   -0.367488
2017-05-31    1.045913
Freq: D, Length: 31, dtype: float64

In [27]:
ts[datetime.datetime(2011, 1, 7):]

2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
2011-01-12    1.393406
2011-10-01    0.092908
dtype: float64

In [28]:
ts

2011-01-02   -0.204708
2011-01-05    0.478943
2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
2011-01-12    1.393406
2011-10-01    0.092908
dtype: float64

In [29]:
ts['1/6/2011':'1/10/2011']

2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
dtype: float64

In [30]:
ts.truncate(after='1/9/2011')

2011-01-02   -0.204708
2011-01-05    0.478943
2011-01-07   -0.519439
2011-01-08   -0.555730
dtype: float64

In [31]:
# 100 weekly (Wednesday) timestamps and one random column per state.
state_names = ['Colorado', 'Texas', 'New York', 'Ohio']
dates = pd.date_range(start='5/8/2018', periods=100, freq='W-WED')
random_block = np.random.randn(len(dates), len(state_names))
long_df = pd.DataFrame(random_block, index=dates, columns=state_names)
long_df

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,-0.178098,2.122315,0.061192,0.884111
2018-05-16,-0.608506,-0.072052,0.544066,0.323886
2018-05-23,-1.683325,0.526860,1.858791,-0.548419
2018-05-30,-0.279397,-0.021299,-0.287990,0.089175
2018-06-06,0.522858,0.572796,-1.760372,1.128179
2018-06-13,1.568606,-0.342277,-0.009813,0.053072
2018-06-20,-0.041943,-1.694312,-0.362381,1.127453
2018-06-27,2.183830,-1.186623,0.334270,-1.028389
2018-07-04,-0.168496,-0.998948,-1.531151,-1.079111
2018-07-11,-1.097732,0.405148,-0.227228,-1.442792


In [32]:
long_df.loc['5-2018']

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,-0.178098,2.122315,0.061192,0.884111
2018-05-16,-0.608506,-0.072052,0.544066,0.323886
2018-05-23,-1.683325,0.52686,1.858791,-0.548419
2018-05-30,-0.279397,-0.021299,-0.28799,0.089175


### 带有重复索引的时间序列

In [33]:
# Five observations in which 2000-01-02 occurs three times in the index.
date_labels = ['1/1/2000'] + ['1/2/2000'] * 3 + ['1/3/2000']
dates = pd.DatetimeIndex(date_labels)
dup_ts = pd.Series(np.arange(len(dates)), index=dates)
dup_ts

2000-01-01    0
2000-01-02    1
2000-01-02    2
2000-01-02    3
2000-01-03    4
dtype: int32

In [34]:
dup_ts.index.is_unique

False

In [35]:
dup_ts['1/3/2000']  # not duplicated

4

In [36]:
dup_ts['1/2/2000']  # duplicated

2000-01-02    1
2000-01-02    2
2000-01-02    3
dtype: int32

In [37]:
# Group on the index itself (level 0) so that rows sharing a timestamp are
# aggregated together; here the aggregate is the mean per distinct timestamp.
grouped = dup_ts.groupby(level=0)
grouped.mean()

2000-01-01    0
2000-01-02    2
2000-01-03    4
dtype: int32

In [38]:
grouped.count()

2000-01-01    1
2000-01-02    3
2000-01-03    1
dtype: int64

## 日期的范围、频率以及移动

In [39]:
ts

2011-01-02   -0.204708
2011-01-05    0.478943
2011-01-07   -0.519439
2011-01-08   -0.555730
2011-01-10    1.965781
2011-01-12    1.393406
2011-10-01    0.092908
dtype: float64

In [40]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12', '2011-10-01'],
              dtype='datetime64[ns]', freq=None)

In [41]:
resampler = ts.resample('D')
resampler

DatetimeIndexResampler [freq=<Day>, axis=0, closed=left, label=left, convention=start, base=0]

### 生成日期范围

In [42]:
index = pd.date_range(start='2012-04-01',end='2012-06-01')
index

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

In [43]:
pd.date_range(start='2018-05-08', periods=20)

DatetimeIndex(['2018-05-08', '2018-05-09', '2018-05-10', '2018-05-11',
               '2018-05-12', '2018-05-13', '2018-05-14', '2018-05-15',
               '2018-05-16', '2018-05-17', '2018-05-18', '2018-05-19',
               '2018-05-20', '2018-05-21', '2018-05-22', '2018-05-23',
               '2018-05-24', '2018-05-25', '2018-05-26', '2018-05-27'],
              dtype='datetime64[ns]', freq='D')

In [44]:
pd.date_range(end='2018-05-08', periods=20)

DatetimeIndex(['2018-04-19', '2018-04-20', '2018-04-21', '2018-04-22',
               '2018-04-23', '2018-04-24', '2018-04-25', '2018-04-26',
               '2018-04-27', '2018-04-28', '2018-04-29', '2018-04-30',
               '2018-05-01', '2018-05-02', '2018-05-03', '2018-05-04',
               '2018-05-05', '2018-05-06', '2018-05-07', '2018-05-08'],
              dtype='datetime64[ns]', freq='D')

In [45]:
# 'BM' (business month end): an index made of the last business day of each month.
pd.date_range('2018-01-01', '2018-12-01', freq='BM')

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-30', '2018-04-30',
               '2018-05-31', '2018-06-29', '2018-07-31', '2018-08-31',
               '2018-09-28', '2018-10-31', '2018-11-30'],
              dtype='datetime64[ns]', freq='BM')

In [46]:
pd.date_range('2018-01-01', '2018-12-01', freq='M')

DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31',
               '2018-09-30', '2018-10-31', '2018-11-30'],
              dtype='datetime64[ns]', freq='M')

In [47]:
pd.date_range('2012-05-02 12:56:31', periods=5)

DatetimeIndex(['2012-05-02 12:56:31', '2012-05-03 12:56:31',
               '2012-05-04 12:56:31', '2012-05-05 12:56:31',
               '2012-05-06 12:56:31'],
              dtype='datetime64[ns]', freq='D')

In [48]:
# normalize=True produces timestamps that are normalized to midnight.
pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)

DatetimeIndex(['2012-05-02', '2012-05-03', '2012-05-04', '2012-05-05',
               '2012-05-06'],
              dtype='datetime64[ns]', freq='D')

### 频率和日期偏移量
+ pandas中的频率是由**一个基础频率（base frequency）和一个乘数**组成
+ 基础频率通常以一个字符串别名表示，譬如“M”表示月，“H”表示小时，“D”表示天。

In [49]:
pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00', '2000-01-03 04:00:00',
               '2000-01-03 08:00:00', '2000-01-03 12:00:00',
               '2000-01-03 16:00:00', '2000-01-03 20:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [50]:
pd.date_range('2000-01-01', periods=10, freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

#### Week of month dates

In [51]:
# 'WOM-3FRI' (week of month): the third Friday of every month in the range.
rng = pd.date_range(start='2018-01-01', end='2018-09-01', freq='WOM-3FRI')
[third_friday for third_friday in rng]

[Timestamp('2018-01-19 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-02-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-03-16 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-04-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-05-18 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-06-15 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-07-20 00:00:00', freq='WOM-3FRI'),
 Timestamp('2018-08-17 00:00:00', freq='WOM-3FRI')]

### 移动（超前和滞后）数据/Shifting (Leading and Lagging) Data
+ 移动 是指沿着时间轴将数据前移或后移。
+ Series和DataFrame都有一个shift方法，用于执行单纯的前移或后移操作，保持索引不变。

In [52]:
# Four month-end timestamps starting in January 2000, one random value each.
month_ends = pd.date_range(start='1/1/2000', periods=4, freq='M')
ts = pd.Series(np.random.randn(len(month_ends)), index=month_ends)
ts

2000-01-31    0.838639
2000-02-29   -0.117388
2000-03-31   -0.517795
2000-04-30   -0.116696
Freq: M, dtype: float64

In [53]:
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.838639
2000-04-30   -0.117388
Freq: M, dtype: float64

In [54]:
ts.shift(-2)

2000-01-31   -0.517795
2000-02-29   -0.116696
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

shift通常用于计算一个时间序列或多个时间序列（DataFrame的列）中的百分比变化。

ts / ts.shift(1) - 1

In [55]:
ts

2000-01-31    0.838639
2000-02-29   -0.117388
2000-03-31   -0.517795
2000-04-30   -0.116696
Freq: M, dtype: float64

In [56]:
ts/ts.shift(1) - 1

2000-01-31         NaN
2000-02-29   -1.139975
2000-03-31    3.410958
2000-04-30   -0.774629
Freq: M, dtype: float64

+ 由于单纯的shifting不会修改索引，所以部分数据会被丢弃。因此，如果频率已知，则可以将其传递给shift，**实现对时间戳的位移**，而不是数据的位移。

In [57]:
ts.shift(2, freq='M')

2000-03-31    0.838639
2000-04-30   -0.117388
2000-05-31   -0.517795
2000-06-30   -0.116696
Freq: M, dtype: float64

In [58]:
ts.shift(3, freq='D')

2000-02-03    0.838639
2000-03-03   -0.117388
2000-04-03   -0.517795
2000-05-03   -0.116696
dtype: float64

In [59]:
ts.shift(1, freq='90T')

2000-01-31 01:30:00    0.838639
2000-02-29 01:30:00   -0.117388
2000-03-31 01:30:00   -0.517795
2000-04-30 01:30:00   -0.116696
Freq: M, dtype: float64

## 时期及其算术运算
时期（period）表示时间的区间，即一段时间。

In [60]:
# 'A-DEC' is an annual frequency whose year ends in December, so p stands for
# the whole of 2007.
p = pd.Period(2007, freq='A-DEC')
p

Period('2007', 'A-DEC')

Period对象加上或减去一个整数，即可达到根据其频率进行位移的效果。

In [61]:
p + 5

Period('2012', 'A-DEC')

In [62]:
p - 2

Period('2005', 'A-DEC')

In [63]:
pd.Period('2014', freq='A-DEC') - p

7

+ period_range函数可用于创建规则的时期范围。

In [64]:
# Monthly periods from 2000-01 through 2000-06; each element represents a
# whole month rather than a single day.
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [65]:
pd.Series(np.random.randn(6), index=rng)

2000-01    2.389645
2000-02   -0.932454
2000-03   -0.229331
2000-04   -1.140330
2000-05    0.439920
2000-06   -0.823758
Freq: M, dtype: float64

In [66]:
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC') # quarterly frequency, year ends in December
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

### 时期的频率转换（Period Frequency Conversion）
+ Period和PeriodIndex对象可以通过`asfreq`方法转换为其他频率。

In [67]:
p = pd.Period('2007', freq='A-NOV')
p

Period('2007', 'A-NOV')

In [68]:
p.asfreq('M', how='start')

Period('2006-12', 'M')

In [69]:
p.asfreq('M', how='end')

Period('2007-11', 'M')

In [70]:
p = pd.Period('2007', freq='A-JUN') # year frequency, year ends in June
p

Period('2007', 'A-JUN')

In [71]:
p.asfreq('M', 'start')

Period('2006-07', 'M')

In [72]:
p.asfreq('M', 'end')

Period('2007-06', 'M')

+ 将高频率转化为低频率时，超时期(superperiod) 是由子时期（subperiod）所属的位置决定的。

In [73]:
p = pd.Period('Aug-2007', 'M')
p.asfreq('A-JUN')

Period('2008', 'A-JUN')

In [74]:
# Annual periods 2006-2009 (fiscal year ending in November), one random value each.
rng = pd.period_range(start='2006', end='2009', freq='A-NOV')
annual_values = np.random.randn(len(rng))
ts = pd.Series(annual_values, index=rng)
ts

2006   -0.520930
2007    0.350282
2008    0.204395
2009    0.133445
Freq: A-NOV, dtype: float64

In [75]:
ts.asfreq('M', how='start')

2005-12   -0.520930
2006-12    0.350282
2007-12    0.204395
2008-12    0.133445
Freq: M, dtype: float64

In [76]:
ts.asfreq('M', how='end')

2006-11   -0.520930
2007-11    0.350282
2008-11    0.204395
2009-11    0.133445
Freq: M, dtype: float64

In [77]:
# 'B' is the business-day frequency; how='end' maps each annual period to its
# last business day.
ts.asfreq('B', how='end')

2006-11-30   -0.520930
2007-11-30    0.350282
2008-11-28    0.204395
2009-11-30    0.133445
Freq: B, dtype: float64

### 按季度计算的时期频率（Quarterly Period Frequencies）
+ 季度型数据在会计、金融领域非常常见，譬如“季报”“年报”等。
+ pandas支持12种可能的季度型频率，即Q-JAN,Q-FEB,...,Q-DEC

In [78]:
p = pd.Period('2012Q4', freq='Q-JAN') #quarterly frequency, year ends in January
p

Period('2012Q4', 'Q-JAN')

In [79]:
p.asfreq('D', 'start')

Period('2011-11-01', 'D')

In [80]:
p.asfreq('D', 'end')

Period('2012-01-31', 'D')

+ period的算术运算，以及将period转化为时间戳

In [81]:
# Timestamp for 4 PM on the second-to-last business day of the quarter:
#   asfreq('B', 'end') - 1    -> the business day before the quarter's last one
#   .asfreq('T', 'start')     -> the first minute of that day
#   + 16 * 60                 -> advance 16 hours' worth of minutes (16:00)
p4pm = (p.asfreq('B', 'end') - 1).asfreq('T', 'start') + 16 * 60
p4pm

Period('2012-01-30 16:00', 'T')

In [82]:
p4pm.to_timestamp()

Timestamp('2012-01-30 16:00:00')

In [83]:
# Six quarters (fiscal year ending in January) numbered 0..5.
rng = pd.period_range(start='2011Q3', end='2012Q4', freq='Q-JAN')
quarter_numbers = np.arange(len(rng))
ts = pd.Series(quarter_numbers, index=rng)
ts

2011Q3    0
2011Q4    1
2012Q1    2
2012Q2    3
2012Q3    4
2012Q4    5
Freq: Q-JAN, dtype: int32

In [84]:
# For every quarter in rng: step back one business day from the quarter's last
# business day, take the first minute of that day, then add 16 * 60 minutes
# to land on 16:00.
new_rng = (rng.asfreq('B', 'end') - 1).asfreq('T', 'start') + 16 * 60
# Replace the period index of ts with the corresponding timestamps (in place).
ts.index = new_rng.to_timestamp()
ts

2010-10-28 16:00:00    0
2011-01-28 16:00:00    1
2011-04-28 16:00:00    2
2011-07-28 16:00:00    3
2011-10-28 16:00:00    4
2012-01-30 16:00:00    5
dtype: int32

### 时间戳转化为period

In [85]:
# Three month-end timestamps with one random value each.
rng = pd.date_range(start='2000-01-01', periods=3, freq='M')
monthly_values = np.random.randn(len(rng))
ts = pd.Series(monthly_values, index=rng)
ts

2000-01-31    0.327905
2000-02-29    0.072153
2000-03-31    0.131678
Freq: M, dtype: float64

In [86]:
pts = ts.to_period()
pts

2000-01    0.327905
2000-02    0.072153
2000-03    0.131678
Freq: M, dtype: float64

In [87]:
# Six consecutive days that straddle the January/February boundary.
rng = pd.date_range(start='1/29/2000', periods=6, freq='D')
daily_values = np.random.randn(len(rng))
ts2 = pd.Series(daily_values, index=rng)
ts2

2000-01-29   -1.297459
2000-01-30    0.997747
2000-01-31    0.870955
2000-02-01   -0.991253
2000-02-02    0.151699
2000-02-03    1.266151
Freq: D, dtype: float64

In [88]:
ts2.to_period('M')

2000-01   -1.297459
2000-01    0.997747
2000-01    0.870955
2000-02   -0.991253
2000-02    0.151699
2000-02    1.266151
Freq: M, dtype: float64

In [89]:
pts = ts2.to_period()
pts

2000-01-29   -1.297459
2000-01-30    0.997747
2000-01-31    0.870955
2000-02-01   -0.991253
2000-02-02    0.151699
2000-02-03    1.266151
Freq: D, dtype: float64

In [90]:
pts.to_timestamp(how='end')

2000-01-29   -1.297459
2000-01-30    0.997747
2000-01-31    0.870955
2000-02-01   -0.991253
2000-02-02    0.151699
2000-02-03    1.266151
Freq: D, dtype: float64

### 通过数组创建PeriodIndex（Creating a PeriodIndex from Arrays）

In [91]:
# Load the CSV file; the path is relative to the notebook's working directory,
# so the notebook must be run from the folder that contains `examples/`.
data = pd.read_csv('examples/macrodata.csv')
# Preview the first five rows.
data.head(5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [92]:
data.year

0      1959.0
1      1959.0
2      1959.0
3      1959.0
4      1960.0
5      1960.0
6      1960.0
7      1960.0
8      1961.0
9      1961.0
        ...  
193    2007.0
194    2007.0
195    2007.0
196    2008.0
197    2008.0
198    2008.0
199    2008.0
200    2009.0
201    2009.0
202    2009.0
Name: year, Length: 203, dtype: float64

In [93]:
data.quarter

0      1.0
1      2.0
2      3.0
3      4.0
4      1.0
5      2.0
6      3.0
7      4.0
8      1.0
9      2.0
      ... 
193    2.0
194    3.0
195    4.0
196    1.0
197    2.0
198    3.0
199    4.0
200    1.0
201    2.0
202    3.0
Name: quarter, Length: 203, dtype: float64

In [94]:
# Build a quarterly PeriodIndex (year ending in December) from the separate
# year and quarter columns of the data.
# NOTE(review): recent pandas releases deprecate passing year=/quarter= to the
# constructor in favour of PeriodIndex.from_fields — confirm against the
# pandas version this notebook targets.
index = pd.PeriodIndex(year=data.year, quarter=data.quarter,
                       freq='Q-DEC')
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')

In [95]:
data.index = index
data

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
1959Q1,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0.00,0.00
1959Q2,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74
1959Q3,1959.0,3.0,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09
1959Q4,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.370,140.0,4.33,5.6,179.386,0.27,4.06
1960Q1,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19
1960Q2,1960.0,2.0,2834.390,1792.9,298.152,460.400,1966.1,29.550,140.2,2.68,5.2,180.671,0.14,2.55
1960Q3,1960.0,3.0,2839.022,1785.8,296.375,474.676,1967.8,29.750,140.9,2.36,5.6,181.528,2.70,-0.34
1960Q4,1960.0,4.0,2802.616,1788.2,259.764,476.434,1966.6,29.840,141.1,2.29,6.3,182.287,1.21,1.08
1961Q1,1961.0,1.0,2819.264,1787.7,266.405,475.854,1984.5,29.810,142.1,2.37,6.8,182.992,-0.40,2.77
1961Q2,1961.0,2.0,2872.005,1814.3,286.246,480.328,2014.4,29.920,142.9,2.29,7.0,183.691,1.47,0.81


## 重采样及频率转换（Resampling and Frequency Conversion）
+ 重采样，即将时间序列从一个频率转换到另一个频率。
+ 高频数据聚合到低频数据，称为**降采样（downsampling)** 。
+ 低频数据转换到高频数据，称为**升采样（upsampling）** 。

In [96]:
# 100 consecutive days starting 2000-01-01, one random value per day.
rng = pd.date_range(start='2000-01-01', periods=100, freq='D')
daily_noise = np.random.randn(len(rng))
ts = pd.Series(daily_noise, index=rng)
ts

2000-01-01   -0.202469
2000-01-02    0.050718
2000-01-03    0.639869
2000-01-04    0.597594
2000-01-05   -0.797246
2000-01-06    0.472879
2000-01-07    0.522356
2000-01-08   -0.546348
2000-01-09   -0.733537
2000-01-10    1.302736
                ...   
2000-03-31    0.028558
2000-04-01    1.129605
2000-04-02   -0.374173
2000-04-03   -0.011401
2000-04-04    0.272924
2000-04-05   -0.601544
2000-04-06    0.574265
2000-04-07   -0.194115
2000-04-08    0.202225
2000-04-09   -0.505124
Freq: D, Length: 100, dtype: float64

In [97]:
ts.resample('M',label='left',closed='right').mean()

1999-12-31    0.000007
2000-01-31   -0.049287
2000-02-29   -0.040121
2000-03-31    0.054740
Freq: M, dtype: float64

In [98]:
# Aggregate to monthly means on the timestamp index, then relabel the result
# with monthly periods. This gives the same output as resample's `kind`
# keyword, which newer pandas releases deprecate in favour of converting the
# index explicitly.
ts.resample('M').mean().to_period('M')

2000-01    0.000007
2000-02   -0.049287
2000-03   -0.040121
2000-04    0.054740
Freq: M, dtype: float64

### Downsampling

需考虑2个问题：
+ 各区间哪边是闭合的，left or right？  `closed`参数
+ 如何标记各个聚合面元，用区间的开头，还是结尾？ `label`参数

```
Series.resample(rule, how=None, axis=0, fill_method=None, closed=None, label=None, 
    convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None)
```

In [99]:
# Twelve one-minute timestamps carrying the integers 0..11.
rng = pd.date_range(start='2000-01-01', periods=12, freq='T')
minute_counter = np.arange(len(rng))
ts = pd.Series(minute_counter, index=rng)
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [100]:
ts.resample('5min', closed='right').sum()

1999-12-31 23:55:00     0
2000-01-01 00:00:00    15
2000-01-01 00:05:00    40
2000-01-01 00:10:00    11
Freq: 5T, dtype: int32

In [101]:
ts.resample('5min', closed='left').sum()

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int32

In [102]:
ts.resample('5min', closed='right', label='right').sum()

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int32

In [103]:
# Sum into right-closed, right-labelled 5-minute bins, then move every label
# back by one second. The shift is applied to the result's index directly
# because resample's `loffset` argument was deprecated and later removed from
# pandas; the resulting labels are the same as with loffset='-1s'.
shifted_sums = ts.resample('5min', closed='right', label='right').sum()
shifted_sums.index = shifted_sums.index - pd.Timedelta(seconds=1)
shifted_sums

1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int32

#### Open-High-Low-Close (OHLC) resampling

注：旧写法 `how='ohlc'` 已弃用（*FutureWarning: how in .resample() is deprecated the new syntax is .resample(...).ohlc()*），应改用 `.resample(...).ohlc()`。

In [104]:
ts.resample('5min').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


### Upsampling and Interpolation
resampling的填充和插值方式跟fillna和reindex一致

```
dataframe.resample(rule, how=None, axis=0, fill_method=None, closed=None, label=None, 
    convention='start', kind=None, loffset=None, limit=None, base=0, on=None, level=None)
```

In [105]:
# Two weekly (Wednesday) rows with one random column per state.
state_columns = ['Colorado', 'Texas', 'New York', 'Ohio']
random_block = np.random.randn(2, len(state_columns))
weekly_index = pd.date_range(start='5/8/2018', periods=2, freq='W-WED')
frame = pd.DataFrame(random_block, index=weekly_index, columns=state_columns)
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,2.954439,-2.630247,-0.352453,-0.477808
2018-05-16,0.161594,1.686833,0.821965,-0.667406


In [106]:
df_daily = frame.resample('D').asfreq()
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,2.954439,-2.630247,-0.352453,-0.477808
2018-05-10,,,,
2018-05-11,,,,
2018-05-12,,,,
2018-05-13,,,,
2018-05-14,,,,
2018-05-15,,,,
2018-05-16,0.161594,1.686833,0.821965,-0.667406


In [107]:
frame.resample('D').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,2.954439,-2.630247,-0.352453,-0.477808
2018-05-10,2.954439,-2.630247,-0.352453,-0.477808
2018-05-11,2.954439,-2.630247,-0.352453,-0.477808
2018-05-12,2.954439,-2.630247,-0.352453,-0.477808
2018-05-13,2.954439,-2.630247,-0.352453,-0.477808
2018-05-14,2.954439,-2.630247,-0.352453,-0.477808
2018-05-15,2.954439,-2.630247,-0.352453,-0.477808
2018-05-16,0.161594,1.686833,0.821965,-0.667406


In [108]:
frame.resample('D').ffill(limit=2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-09,2.954439,-2.630247,-0.352453,-0.477808
2018-05-10,2.954439,-2.630247,-0.352453,-0.477808
2018-05-11,2.954439,-2.630247,-0.352453,-0.477808
2018-05-12,,,,
2018-05-13,,,,
2018-05-14,,,,
2018-05-15,,,,
2018-05-16,0.161594,1.686833,0.821965,-0.667406


In [109]:
frame.resample('W-THU').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2018-05-10,2.954439,-2.630247,-0.352453,-0.477808
2018-05-17,0.161594,1.686833,0.821965,-0.667406


### 通过时期重采样（Resampling with Periods）

In [110]:
# 24 monthly periods (2000-01 .. 2001-12) with one random column per state.
state_columns = ['Colorado', 'Texas', 'New York', 'Ohio']
random_block = np.random.randn(24, len(state_columns))
monthly_periods = pd.period_range(start='1-2000', end='12-2001', freq='M')
frame = pd.DataFrame(random_block, index=monthly_periods, columns=state_columns)
frame.head(5)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,0.468489,0.966343,0.520443,-0.118905
2000-02,0.033498,-0.225585,0.105737,0.303241
2000-03,-0.195686,0.764936,-0.449838,1.038124
2000-04,1.573183,1.282892,0.280971,-0.892036
2000-05,0.517564,-0.007852,-1.638806,1.401227


In [111]:
annual_frame = frame.resample('A-DEC').mean()
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,0.602853,0.43428,-0.159423,0.248538
2001,0.050228,-0.116932,-0.383992,0.465679


+ 升采样时需决定在新的频率中各区间的哪端用于放置原来的值，`convention`默认为 `start`

In [112]:
# Q-DEC: quarterly frequency, year ending in December.
# No convention is passed, so each annual value is placed at the first quarter
# of its year and forward-filled through the remaining quarters.
annual_frame.resample('Q-DEC').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,0.602853,0.43428,-0.159423,0.248538
2000Q2,0.602853,0.43428,-0.159423,0.248538
2000Q3,0.602853,0.43428,-0.159423,0.248538
2000Q4,0.602853,0.43428,-0.159423,0.248538
2001Q1,0.050228,-0.116932,-0.383992,0.465679
2001Q2,0.050228,-0.116932,-0.383992,0.465679
2001Q3,0.050228,-0.116932,-0.383992,0.465679
2001Q4,0.050228,-0.116932,-0.383992,0.465679


In [113]:
annual_frame.resample('Q-DEC', convention='end').bfill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,0.602853,0.43428,-0.159423,0.248538
2001Q1,0.050228,-0.116932,-0.383992,0.465679
2001Q2,0.050228,-0.116932,-0.383992,0.465679
2001Q3,0.050228,-0.116932,-0.383992,0.465679
2001Q4,0.050228,-0.116932,-0.383992,0.465679


In [114]:
# NOTE(review): this cell repeats the previous cell verbatim (same call, same
# output). It was presumably meant to show a different convention/fill
# combination — confirm the intent or remove the duplicate.
annual_frame.resample('Q-DEC', convention='end').bfill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,0.602853,0.43428,-0.159423,0.248538
2001Q1,0.050228,-0.116932,-0.383992,0.465679
2001Q2,0.050228,-0.116932,-0.383992,0.465679
2001Q3,0.050228,-0.116932,-0.383992,0.465679
2001Q4,0.050228,-0.116932,-0.383992,0.465679


In [115]:
annual_frame.resample('Q-MAR').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,0.602853,0.43428,-0.159423,0.248538
2001Q1,0.602853,0.43428,-0.159423,0.248538
2001Q2,0.602853,0.43428,-0.159423,0.248538
2001Q3,0.602853,0.43428,-0.159423,0.248538
2001Q4,0.050228,-0.116932,-0.383992,0.465679
2002Q1,0.050228,-0.116932,-0.383992,0.465679
2002Q2,0.050228,-0.116932,-0.383992,0.465679
2002Q3,0.050228,-0.116932,-0.383992,0.465679


## Conclusion