In [1]:
import pandas as pd

In [2]:
import pytz

In [3]:
pytz.common_timezones[-5:]

['US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific', 'UTC']

In [4]:
tz = pytz.timezone('Asia/Shanghai')

In [5]:
tz

<DstTzInfo 'Asia/Shanghai' LMT+8:06:00 STD>

In [6]:
rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')

In [7]:
import numpy as np

In [8]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [9]:
ts

2012-03-09 09:30:00    0.938396
2012-03-10 09:30:00   -0.933687
2012-03-11 09:30:00    0.887078
2012-03-12 09:30:00   -0.593173
2012-03-13 09:30:00    0.358411
2012-03-14 09:30:00    0.490689
Freq: D, dtype: float64

In [10]:
print(ts.index.tz) # tz-naive index: no time zone is attached, so this prints None

None


In [11]:
pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC') # build the range with an explicit time zone (UTC)

DatetimeIndex(['2012-03-09 09:30:00+00:00', '2012-03-10 09:30:00+00:00',
               '2012-03-11 09:30:00+00:00', '2012-03-12 09:30:00+00:00',
               '2012-03-13 09:30:00+00:00', '2012-03-14 09:30:00+00:00',
               '2012-03-15 09:30:00+00:00', '2012-03-16 09:30:00+00:00',
               '2012-03-17 09:30:00+00:00', '2012-03-18 09:30:00+00:00'],
              dtype='datetime64[ns, UTC]', freq='D')

In [12]:
ts_utc = ts.tz_localize('UTC')

In [13]:
ts_utc

2012-03-09 09:30:00+00:00    0.938396
2012-03-10 09:30:00+00:00   -0.933687
2012-03-11 09:30:00+00:00    0.887078
2012-03-12 09:30:00+00:00   -0.593173
2012-03-13 09:30:00+00:00    0.358411
2012-03-14 09:30:00+00:00    0.490689
Freq: D, dtype: float64

In [14]:
ts_utc.index.tz

<UTC>

In [15]:
ts_utc.tz_convert('America/New_York') # note the DST transition: the offset changes from -05:00 to -04:00 on 2012-03-11

2012-03-09 04:30:00-05:00    0.938396
2012-03-10 04:30:00-05:00   -0.933687
2012-03-11 05:30:00-04:00    0.887078
2012-03-12 05:30:00-04:00   -0.593173
2012-03-13 05:30:00-04:00    0.358411
2012-03-14 05:30:00-04:00    0.490689
Freq: D, dtype: float64

In [16]:
ts_eastern = ts.tz_localize('America/New_York')

In [17]:
ts_eastern.tz_convert('UTC')

2012-03-09 14:30:00+00:00    0.938396
2012-03-10 14:30:00+00:00   -0.933687
2012-03-11 13:30:00+00:00    0.887078
2012-03-12 13:30:00+00:00   -0.593173
2012-03-13 13:30:00+00:00    0.358411
2012-03-14 13:30:00+00:00    0.490689
Freq: D, dtype: float64

In [18]:
ts_eastern.tz_convert('Europe/Berlin')

2012-03-09 15:30:00+01:00    0.938396
2012-03-10 15:30:00+01:00   -0.933687
2012-03-11 14:30:00+01:00    0.887078
2012-03-12 14:30:00+01:00   -0.593173
2012-03-13 14:30:00+01:00    0.358411
2012-03-14 14:30:00+01:00    0.490689
Freq: D, dtype: float64

In [19]:
stamp = pd.Timestamp('2011-03-12 04:00')

In [20]:
stamp_utc = stamp.tz_localize('utc')

In [21]:
stamp_utc.tz_convert('Asia/Shanghai')

Timestamp('2011-03-12 12:00:00+0800', tz='Asia/Shanghai')

In [22]:
stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')

In [23]:
stamp_moscow

Timestamp('2011-03-12 04:00:00+0300', tz='Europe/Moscow')

In [24]:
stamp_utc.value

1299902400000000000

In [25]:
stamp_utc.tz_convert('Asia/Shanghai').value # the stored value is still the UTC-based one; it equals stamp_utc.value

1299902400000000000

In [26]:
from pandas.tseries.offsets import Hour

In [27]:
stamp = pd.Timestamp('2012-03-11 01:30', tz='US/Eastern')

In [28]:
stamp

Timestamp('2012-03-11 01:30:00-0500', tz='US/Eastern')

In [29]:
stamp + Hour() # crosses the start of DST: 01:30 -0500 plus one hour is 03:30 -0400

Timestamp('2012-03-11 03:30:00-0400', tz='US/Eastern')

In [30]:
stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')

In [31]:
stamp

Timestamp('2012-11-04 00:30:00-0400', tz='US/Eastern')

In [32]:
stamp + 2 * Hour() # crosses the end of DST: 00:30 -0400 plus two hours is 01:30 -0500

Timestamp('2012-11-04 01:30:00-0500', tz='US/Eastern')

In [33]:
rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')

In [34]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [35]:
ts

2012-03-07 09:30:00   -0.225652
2012-03-08 09:30:00   -0.672390
2012-03-09 09:30:00    2.945786
2012-03-12 09:30:00   -0.369407
2012-03-13 09:30:00   -0.296977
2012-03-14 09:30:00   -0.465153
2012-03-15 09:30:00    0.817062
2012-03-16 09:30:00   -1.172516
2012-03-19 09:30:00    1.605668
2012-03-20 09:30:00    1.012487
Freq: B, dtype: float64

In [36]:
ts1 = ts[:7].tz_localize('Europe/London')

In [37]:
ts2 = ts1[2:].tz_convert('Europe/Moscow')

In [38]:
result = ts1 + ts2

In [39]:
result # combining series from different time zones yields a UTC-indexed result

2012-03-07 09:30:00+00:00         NaN
2012-03-08 09:30:00+00:00         NaN
2012-03-09 09:30:00+00:00    5.891571
2012-03-12 09:30:00+00:00   -0.738813
2012-03-13 09:30:00+00:00   -0.593955
2012-03-14 09:30:00+00:00   -0.930306
2012-03-15 09:30:00+00:00    1.634124
Freq: B, dtype: float64

In [40]:
p = pd.Period(2007, freq='A-DEC')

In [41]:
p

Period('2007', 'A-DEC')

In [42]:
p + 5

Period('2012', 'A-DEC')

In [43]:
p - 2

Period('2005', 'A-DEC')

In [44]:
pd.Period('2014', freq='A-DEC') - p

7

In [45]:
rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')

In [46]:
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]', freq='M')

In [47]:
pd.Series(np.random.randn(6), index=rng) # a PeriodIndex can be used directly as a Series index

2000-01   -0.670137
2000-02    1.130576
2000-03   -1.532474
2000-04   -0.475492
2000-05   -0.341900
2000-06   -1.938967
Freq: M, dtype: float64

In [48]:
values = ['2001Q3', '2002Q2', '2003Q1']

In [49]:
index = pd.PeriodIndex(values, freq='Q-DEC')

In [50]:
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]', freq='Q-DEC')

In [51]:
p = pd.Period('2007', freq='A-DEC')

In [52]:
p

Period('2007', 'A-DEC')

In [53]:
p.asfreq('M', how='start') # convert from a lower frequency (annual) to a higher one (monthly)

Period('2007-01', 'M')

In [54]:
p.asfreq('M', how='end')

Period('2007-12', 'M')

In [55]:
p = pd.Period('2007', freq='A-JUN') # fiscal year that ends in June rather than December

In [56]:
p.asfreq('M', 'start')

Period('2006-07', 'M')

In [57]:
p.asfreq('M', 'end')

Period('2007-06', 'M')

In [58]:
p = pd.Period('Aug-2007', 'M')

In [59]:
p.asfreq('A-JUN') # higher to lower frequency: August 2007 falls in fiscal year 2008 under A-JUN

Period('2008', 'A-JUN')

In [60]:
rng = pd.period_range('2006', '2009', freq='A-DEC')

In [61]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [62]:
ts

2006   -0.765133
2007   -1.064712
2008   -0.209551
2009   -0.985876
Freq: A-DEC, dtype: float64

In [63]:
ts.asfreq('M', how='start')

2006-01   -0.765133
2007-01   -1.064712
2008-01   -0.209551
2009-01   -0.985876
Freq: M, dtype: float64

In [64]:
ts.asfreq('B', how='end')

2006-12-29   -0.765133
2007-12-31   -1.064712
2008-12-31   -0.209551
2009-12-31   -0.985876
Freq: B, dtype: float64

In [65]:
p = pd.Period('2012Q4', freq='Q-JAN')

In [66]:
p.asfreq('D', 'start') # fiscal year ending in January: Q4 covers November, December and January

Period('2011-11-01', 'D')

In [67]:
# Last business day of the quarter, minus one business day, converted to the
# first minute of that day, then advanced by 16 * 60 minutes (i.e. 16:00).
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60

In [68]:
p4pm # 4 PM on the second-to-last business day of the quarter (see the arithmetic in the previous cell)

Period('2012-01-30 16:00', 'T')

In [69]:
p4pm.to_timestamp()

Timestamp('2012-01-30 16:00:00')

In [70]:
rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')

In [71]:
ts = pd.Series(np.arange(len(rng)), index=rng)

In [72]:
ts

2011Q3    0
2011Q4    1
2012Q1    2
2012Q2    3
2012Q3    4
2012Q4    5
Freq: Q-JAN, dtype: int32

In [73]:
# Same arithmetic as for the scalar Period, applied to the whole PeriodIndex:
# second-to-last business day of each quarter, at minute frequency, plus 16 hours.
new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60

In [74]:
ts.index = new_rng.to_timestamp()

In [75]:
ts

2010-10-28 16:00:00    0
2011-01-28 16:00:00    1
2011-04-28 16:00:00    2
2011-07-28 16:00:00    3
2011-10-28 16:00:00    4
2012-01-30 16:00:00    5
dtype: int32

In [76]:
rng = pd.date_range('2000-01-01', periods=3, freq='M')

In [77]:
ts = pd.Series(np.random.randn(3), index=rng)

In [78]:
ts

2000-01-31    0.044033
2000-02-29   -1.194691
2000-03-31    0.214495
Freq: M, dtype: float64

In [79]:
pts = ts.to_period() # convert the timestamp index to a period index

In [80]:
pts

2000-01    0.044033
2000-02   -1.194691
2000-03    0.214495
Freq: M, dtype: float64

In [81]:
rng = pd.date_range('1/29/2000', periods=6, freq='D')

In [82]:
ts2 = pd.Series(np.random.randn(6), index=rng)

In [83]:
ts2

2000-01-29    0.325904
2000-01-30   -1.291129
2000-01-31    0.541025
2000-02-01    1.106569
2000-02-02    0.248568
2000-02-03   -0.145031
Freq: D, dtype: float64

In [84]:
ts2.to_period('M') # duplicate periods are allowed in the resulting index

2000-01    0.325904
2000-01   -1.291129
2000-01    0.541025
2000-02    1.106569
2000-02    0.248568
2000-02   -0.145031
Freq: M, dtype: float64

In [85]:
pts.to_timestamp(how='end') # convert back to timestamps

2000-01-31    0.044033
2000-02-29   -1.194691
2000-03-31    0.214495
Freq: M, dtype: float64

In [86]:
data = pd.read_csv('data/macrodata.csv')

In [87]:
data.year[:5]

0    1959.0
1    1959.0
2    1959.0
3    1959.0
4    1960.0
Name: year, dtype: float64

In [88]:
data.quarter[:5]

0    1.0
1    2.0
2    3.0
3    4.0
4    1.0
Name: quarter, dtype: float64

In [89]:
# Build a quarterly (Q-DEC) PeriodIndex from the year and quarter columns of the data
index = pd.PeriodIndex(year=data.year, quarter=data.quarter,
                       freq='Q-DEC')

In [90]:
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203, freq='Q-DEC')

In [91]:
data.index = index

In [92]:
data.infl[:5]

1959Q1    0.00
1959Q2    2.34
1959Q3    2.74
1959Q4    0.27
1960Q1    2.31
Freq: Q-DEC, Name: infl, dtype: float64

In [93]:
rng = pd.date_range('2000-01-01', periods=100, freq='D')

In [94]:
ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [95]:
ts[:5]

2000-01-01   -0.560466
2000-01-02    0.841401
2000-01-03    1.048079
2000-01-04   -0.864989
2000-01-05    0.067318
Freq: D, dtype: float64

In [96]:
ts.resample('M').mean()

2000-01-31   -0.160998
2000-02-29   -0.240901
2000-03-31   -0.197364
2000-04-30   -0.091195
Freq: M, dtype: float64

In [97]:
# resample() labels its bins with timestamps by default; convert the result
# explicitly to get a PeriodIndex. The `kind=` argument of resample() is
# deprecated in recent pandas, and the explicit conversion gives the same output.
ts.resample('M').mean().to_period('M')

2000-01   -0.160998
2000-02   -0.240901
2000-03   -0.197364
2000-04   -0.091195
Freq: M, dtype: float64

In [98]:
rng = pd.date_range('2000-01-01', periods=12, freq='T')

In [99]:
ts = pd.Series(np.arange(12), index=rng)

In [100]:
ts

2000-01-01 00:00:00     0
2000-01-01 00:01:00     1
2000-01-01 00:02:00     2
2000-01-01 00:03:00     3
2000-01-01 00:04:00     4
2000-01-01 00:05:00     5
2000-01-01 00:06:00     6
2000-01-01 00:07:00     7
2000-01-01 00:08:00     8
2000-01-01 00:09:00     9
2000-01-01 00:10:00    10
2000-01-01 00:11:00    11
Freq: T, dtype: int32

In [101]:
ts.resample('5min').sum() # bins are left-closed by default (closed='left'): the 00:00:00 label covers 00:00:00 through 00:04:59

2000-01-01 00:00:00    10
2000-01-01 00:05:00    35
2000-01-01 00:10:00    21
Freq: 5T, dtype: int32

In [102]:
ts.resample('5min', closed='right', label='right').sum() # label selects whether each bin is indexed by its left or right edge

2000-01-01 00:00:00     0
2000-01-01 00:05:00    15
2000-01-01 00:10:00    40
2000-01-01 00:15:00    11
Freq: 5T, dtype: int32

In [103]:
# Shift the bin labels back by one second so each label falls just inside its
# right-closed bin. The `loffset` argument of resample() was deprecated and later
# removed from pandas, so the labels are shifted on the resampled result instead.
ts.resample('5min', closed='right',
            label='right').sum().shift(freq=pd.Timedelta(seconds=-1))

1999-12-31 23:59:59     0
2000-01-01 00:04:59    15
2000-01-01 00:09:59    40
2000-01-01 00:14:59    11
Freq: 5T, dtype: int32

In [104]:
ts.resample('5min').ohlc()

Unnamed: 0,open,high,low,close
2000-01-01 00:00:00,0,4,0,4
2000-01-01 00:05:00,5,9,5,9
2000-01-01 00:10:00,10,11,10,11


In [105]:
rng = pd.date_range('1/1/2000', periods=100, freq='D')

In [106]:
ts = pd.Series(np.arange(100), index=rng)

In [107]:
ts.groupby(lambda x: x.month).mean() # groupby can also do a form of downsampling, but the result is keyed by month number rather than by timestamps

1    15
2    45
3    75
4    95
dtype: int32

In [108]:
# Group by day of week (Monday=0 ... Sunday=6) using the index's vectorized
# accessor. `Timestamp.weekday` is a method, so `lambda x: x.weekday` only
# yields integers when pandas happens to apply the function to the whole index;
# applied element-wise it returns bound methods instead of day numbers.
ts.groupby(ts.index.dayofweek).mean()

0    47.5
1    48.5
2    49.5
3    50.5
4    51.5
5    49.0
6    50.0
dtype: float64

In [109]:
# Two rows of random values on a weekly, Wednesday-anchored date index,
# with one column per state.
frame = pd.DataFrame(
    data=np.random.randn(2, 4),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
    index=pd.date_range(start='1/1/2000', periods=2, freq='W-WED'),
)

In [110]:
frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.481773,-1.155842,-1.154738,-2.671541
2000-01-12,-1.627464,1.51497,-1.482503,-0.631939


In [111]:
df_daily = frame.resample('D').asfreq() # upsample to daily frequency; asfreq() does no filling, so the new rows are NaN

In [112]:
df_daily

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.481773,-1.155842,-1.154738,-2.671541
2000-01-06,,,,
2000-01-07,,,,
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-1.627464,1.51497,-1.482503,-0.631939


In [113]:
frame.resample('D').ffill() # forward fill

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.481773,-1.155842,-1.154738,-2.671541
2000-01-06,1.481773,-1.155842,-1.154738,-2.671541
2000-01-07,1.481773,-1.155842,-1.154738,-2.671541
2000-01-08,1.481773,-1.155842,-1.154738,-2.671541
2000-01-09,1.481773,-1.155842,-1.154738,-2.671541
2000-01-10,1.481773,-1.155842,-1.154738,-2.671541
2000-01-11,1.481773,-1.155842,-1.154738,-2.671541
2000-01-12,-1.627464,1.51497,-1.482503,-0.631939


In [114]:
# Forward fill, but carry each observation across at most two new rows
frame.resample('D').ffill(limit=2)

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-05,1.481773,-1.155842,-1.154738,-2.671541
2000-01-06,1.481773,-1.155842,-1.154738,-2.671541
2000-01-07,1.481773,-1.155842,-1.154738,-2.671541
2000-01-08,,,,
2000-01-09,,,,
2000-01-10,,,,
2000-01-11,,,,
2000-01-12,-1.627464,1.51497,-1.482503,-0.631939


In [115]:
frame.resample('W-THU').ffill() # the new index does not have to line up with the original dates

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01-06,1.481773,-1.155842,-1.154738,-2.671541
2000-01-13,-1.627464,1.51497,-1.482503,-0.631939


In [116]:
# 24 rows of random values indexed by monthly periods from January 2000
# through December 2001, with one column per state.
frame = pd.DataFrame(
    data=np.random.randn(24, 4),
    columns=['Colorado', 'Texas', 'New York', 'Ohio'],
    index=pd.period_range(start='1-2000', end='12-2001', freq='M'),
)

In [117]:
frame[:5]

Unnamed: 0,Colorado,Texas,New York,Ohio
2000-01,0.744094,0.787148,1.475356,-1.224909
2000-02,0.087562,0.276697,0.033537,-0.375532
2000-03,-0.719425,-1.115222,-1.656455,-0.692428
2000-04,1.245263,-0.254698,1.202921,-0.096528
2000-05,0.28619,0.560131,-1.789713,0.114794


In [118]:
annual_frame = frame.resample('A-DEC').mean() # resample the period-indexed data to annual means

In [119]:
annual_frame

Unnamed: 0,Colorado,Texas,New York,Ohio
2000,0.228341,-0.128055,0.070747,-0.297555
2001,0.15075,-0.54396,0.198888,-0.200442


In [120]:
annual_frame.resample('Q-DEC').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q1,0.228341,-0.128055,0.070747,-0.297555
2000Q2,0.228341,-0.128055,0.070747,-0.297555
2000Q3,0.228341,-0.128055,0.070747,-0.297555
2000Q4,0.228341,-0.128055,0.070747,-0.297555
2001Q1,0.15075,-0.54396,0.198888,-0.200442
2001Q2,0.15075,-0.54396,0.198888,-0.200442
2001Q3,0.15075,-0.54396,0.198888,-0.200442
2001Q4,0.15075,-0.54396,0.198888,-0.200442


In [121]:
# With convention='end' each annual value is placed in the last quarter of its
# year, so the quarterly range starts at 2000Q4 before forward filling
annual_frame.resample('Q-DEC', convention='end').ffill()

Unnamed: 0,Colorado,Texas,New York,Ohio
2000Q4,0.228341,-0.128055,0.070747,-0.297555
2001Q1,0.228341,-0.128055,0.070747,-0.297555
2001Q2,0.228341,-0.128055,0.070747,-0.297555
2001Q3,0.228341,-0.128055,0.070747,-0.297555
2001Q4,0.15075,-0.54396,0.198888,-0.200442
