# 11 时间序列
时间戳：具体的时刻
固定的时间区间
时间间隔

## 11.1 日期和时间数据的类型及工具
Python标准库包含了日期和时间数据的类型
1 datetime
2 time
3 calendar

In [43]:
from datetime import datetime
import pandas as pd
import numpy as np

import pandas as pd

now = datetime.now()
now

datetime.datetime(2022, 5, 8, 21, 32, 31, 382187)

In [4]:
now.year, now.month, now.day

(2022, 5, 8)

In [6]:
# timedelta表示两个时间之差
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(days=926, seconds=56700)

In [7]:
delta.days

926

In [8]:
delta.seconds

56700

In [9]:
from datetime import timedelta

start = datetime(2022, 1, 1)
# 可以通过加上或减去一个timedelta或其整数倍来产生一个新的datetime
start + timedelta(12)

datetime.datetime(2022, 1, 13, 0, 0)

In [10]:
start - 2 * timedelta(12)

datetime.datetime(2021, 12, 8, 0, 0)

In [14]:
start.tzinfo

### 11.1.1 字符串与datetime互转

In [16]:
stamp = datetime(2022, 1, 1)
str(stamp)

'2022-01-01 00:00:00'

In [17]:
stamp.strftime('%Y-%m-%d')

'2022-01-01'

In [21]:
stamp.strftime('%Y-%m-%d %H:%M:%S')

'2022-01-01 00:00:00'

In [25]:
# 将字符串转化为datetime
value = '2022-01-01'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2022, 1, 1, 0, 0)

In [35]:
datestrs = ['2022-01-01', '2022-05-08']
[datetime.strptime(x, '%Y-%m-%d') for x in datestrs]

[datetime.datetime(2022, 1, 1, 0, 0), datetime.datetime(2022, 5, 8, 0, 0)]

In [36]:
# datetime.strptime将字符串转化为时间，需要传入指定的格式
# 每次传入比较麻烦，对于通用的日期，可以使用第三方库
from dateutil.parser import parse

parse('2022-01-01')

datetime.datetime(2022, 1, 1, 0, 0)

In [37]:
# dateutil能够解析大部分人类可理解的日期
parse('Mar 31, 2022 10:45 PM')

datetime.datetime(2022, 3, 31, 22, 45)

In [41]:
# 国际场合下，一般日期出现在月份之前，可以传递dayfirst=True
parse('Mar 31, 2022 10:45 PM', dayfirst=True)

datetime.datetime(2022, 3, 31, 22, 45)

In [39]:
parse('8/5/2022', dayfirst=True)

datetime.datetime(2022, 5, 8, 0, 0)

> dateutil.parser.parse 很方便，但并不完美：它可能把一些本不是日期的字符串也识别为日期，存在识别不准的情况

In [44]:
# to_datetime方法可以转换很多不同的日期表示格式
datestrs = ['2022-05-08 21:00:00', '2022-05-09 22:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2022-05-08 21:00:00', '2022-05-09 22:00:00'], dtype='datetime64[ns]', freq=None)

In [45]:
# pandas.to_datetime方法可以处理那些被认为缺失值的值
idx = pd.to_datetime(datestrs + [None])
idx
# NaT（Not a Time）是pandas中时间戳数据的null值

DatetimeIndex(['2022-05-08 21:00:00', '2022-05-09 22:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

## 11.2 时间序列基础

In [47]:
from datetime import datetime

dates = [datetime(2022, 1, 1),
         datetime(2022, 2, 1),
         datetime(2022, 3, 1),
         datetime(2022, 4, 1),
         datetime(2022, 5, 1),
         datetime(2022, 6, 1), ]
# datetime对象可以被放入DatetimeIndex
ts = pd.Series(np.random.randn(6), index=dates)
ts

2022-01-01    0.391336
2022-02-01    0.084591
2022-03-01    0.590092
2022-04-01   -1.074037
2022-05-01    1.132917
2022-06-01    0.507165
dtype: float64

In [48]:
ts.index

DatetimeIndex(['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01',
               '2022-05-01', '2022-06-01'],
              dtype='datetime64[ns]', freq=None)

In [49]:
ts.index.dtype

dtype('<M8[ns]')

In [50]:
# DatetimeIndex中的标量值是pandas的Timestamp对象
stamp = ts.index[0]
stamp

Timestamp('2022-01-01 00:00:00')

### 11.2.1 索引、选择、子集

In [51]:
stamp = ts.index[2]
# 可以通过值来索引出数据
ts[stamp]

0.5900924647158913

In [52]:
# 直接传入一个可解释的日期的字符串也可以
ts['2022/5/1']

1.1329168626760033

In [53]:
# 对于时间序列，可以传递一个年份或者月份来获取数据的切片
longer_ts = pd.Series(np.random.randn(1000),
                      index=pd.date_range('2022/1/1', periods=1000))
longer_ts[:5]

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

In [55]:
# 传入年份，获取2022年的数据
longer_ts['2022'].head()

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

In [57]:
# 传入月份也可以
longer_ts['2022-05'].head()

2022-05-01    0.160175
2022-05-02    0.104173
2022-05-03   -0.618365
2022-05-04    0.999042
2022-05-05    0.140988
Freq: D, dtype: float64

In [58]:
# 传入时间区间
longer_ts['2022/1/1':'2022/1/5']

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

> 传入字符串形式的日期、datetime对象或时间戳进行切片时，这种方式产生的是原时间序列的视图，类似于NumPy数组的切片。这意味着没有数据被复制，并且在切片上的修改会反映在原始数据上。

In [61]:
# truncate可以在两个日期之间对Series进行切片（这里只传入before，即丢弃该日期之前的数据）
ts.truncate(before='2022/1/5')

2022-02-01    0.084591
2022-03-01    0.590092
2022-04-01   -1.074037
2022-05-01    1.132917
2022-06-01    0.507165
dtype: float64

In [63]:
# 在DataFrame操作同样适用
dates = pd.date_range('1/1/2022', periods=100, freq='W-WED')
long_df = pd.DataFrame(np.random.randn(100, 4),
                       index=dates,
                       columns=list('ABCD'))
long_df

Unnamed: 0,A,B,C,D
2022-01-05,-0.400783,-1.264017,0.379966,-0.256141
2022-01-12,-0.571639,2.131241,-0.241245,0.509657
2022-01-19,-0.401982,-0.158788,1.402505,-1.186511
2022-01-26,-0.574415,-0.485531,-0.615474,1.202686
2022-02-02,1.220108,-2.251149,2.631229,-0.791016
...,...,...,...,...
2023-11-01,-0.910823,-1.091824,-1.579518,-1.195703
2023-11-08,-0.278071,0.558738,-0.077413,-2.918578
2023-11-15,0.766577,0.550362,0.619179,0.456066
2023-11-22,0.149930,0.328295,-1.395013,1.267319


In [65]:
long_df.loc['1/2022']

Unnamed: 0,A,B,C,D
2022-01-05,-0.400783,-1.264017,0.379966,-0.256141
2022-01-12,-0.571639,2.131241,-0.241245,0.509657
2022-01-19,-0.401982,-0.158788,1.402505,-1.186511
2022-01-26,-0.574415,-0.485531,-0.615474,1.202686


### 11.2.2 含有重复索引的时间序列

In [66]:
dates = pd.DatetimeIndex(['1/1/2022', '1/1/2022', '1/1/2022', '1/5/2022', '5/5/2022', '5/5/2022'])
dup_ts = pd.Series(np.random.randn(6), index=dates)
dup_ts

2022-01-01    0.768743
2022-01-01    0.719777
2022-01-01   -1.804380
2022-01-05   -0.025937
2022-05-05   -0.277915
2022-05-05   -0.414315
dtype: float64

In [69]:
dup_ts.index.is_unique

False

In [70]:
# 使用groupby聚合数据
grouped = dup_ts.groupby(level=0)
grouped.mean()

2022-01-01   -0.105287
2022-01-05   -0.025937
2022-05-05   -0.346115
dtype: float64

In [71]:
grouped.count()

2022-01-01    3
2022-01-05    1
2022-05-05    2
dtype: int64

## 11.3 日期范围、频率和移位

In [72]:
# 调用resample方法将样本时间序列转换为固定的每日频率数据
ts

2022-01-01    0.391336
2022-02-01    0.084591
2022-03-01    0.590092
2022-04-01   -1.074037
2022-05-01    1.132917
2022-06-01    0.507165
dtype: float64

In [76]:
# 字符串D被解释为每日频率
resampler = ts.resample('D')

### 11.3.1 生成日期范围

In [77]:
# pandas.date_range生成日期范围序列
date_range = pd.date_range('1/1/2022', '10/1/2022')
date_range

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10',
               ...
               '2022-09-22', '2022-09-23', '2022-09-24', '2022-09-25',
               '2022-09-26', '2022-09-27', '2022-09-28', '2022-09-29',
               '2022-09-30', '2022-10-01'],
              dtype='datetime64[ns]', length=274, freq='D')

In [78]:
# 可以针对freq传入不同的值，可以生成指定的序列
pd.date_range('1/1/2022', '31/12/2022', freq='BM')  # BM为工作日的月底日期

# （以下为警告输出的残留）'31/12/2022' 是日/月/年写法，pandas 解析时很可能给出了 UserWarning；建议改用 '2022-12-31' 这类无歧义的格式
  exec(code_obj, self.user_global_ns, self.user_ns)


DatetimeIndex(['2022-01-31', '2022-02-28', '2022-03-31', '2022-04-29',
               '2022-05-31', '2022-06-30', '2022-07-29', '2022-08-31',
               '2022-09-30', '2022-10-31', '2022-11-30', '2022-12-30'],
              dtype='datetime64[ns]', freq='BM')

In [79]:
# 默认情况下，date_range保留开始或结束时间戳的时间
pd.date_range('2022-1-1 12:34:56', periods=10)

DatetimeIndex(['2022-01-01 12:34:56', '2022-01-02 12:34:56',
               '2022-01-03 12:34:56', '2022-01-04 12:34:56',
               '2022-01-05 12:34:56', '2022-01-06 12:34:56',
               '2022-01-07 12:34:56', '2022-01-08 12:34:56',
               '2022-01-09 12:34:56', '2022-01-10 12:34:56'],
              dtype='datetime64[ns]', freq='D')

In [80]:
# 如果想生成标准化的零点时间戳，加上normalize=True
pd.date_range('2022-1-1 12:34:56', '2022-1-10 12:23:34', normalize=True)

DatetimeIndex(['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04',
               '2022-01-05', '2022-01-06', '2022-01-07', '2022-01-08',
               '2022-01-09', '2022-01-10'],
              dtype='datetime64[ns]', freq='D')

### 11.3.2 频率和日期偏置

In [81]:
from pandas.tseries.offsets import Hour, Minute

# 小时偏置频率
hour = Hour()
hour

<Hour>

In [83]:
# 生成4小时的日期序列
pd.date_range('2000-1-1', '2000-1-3', freq='4H')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

In [84]:
Hour(2) + Minute(40)

<160 * Minutes>

In [85]:
# 还可以传递字符串频率
pd.date_range('2000-1-1', '2000-1-2', freq='1h30min')

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 01:30:00',
               '2000-01-01 03:00:00', '2000-01-01 04:30:00',
               '2000-01-01 06:00:00', '2000-01-01 07:30:00',
               '2000-01-01 09:00:00', '2000-01-01 10:30:00',
               '2000-01-01 12:00:00', '2000-01-01 13:30:00',
               '2000-01-01 15:00:00', '2000-01-01 16:30:00',
               '2000-01-01 18:00:00', '2000-01-01 19:30:00',
               '2000-01-01 21:00:00', '2000-01-01 22:30:00',
               '2000-01-02 00:00:00'],
              dtype='datetime64[ns]', freq='90T')

In [86]:
# 月中某个星期的日期
# freq='WOM-3FRI'：每个月第三个星期五
rng = pd.date_range('2000-1-1', '2000-2-1', freq='WOM-3FRI')
rng

DatetimeIndex(['2000-01-21'], dtype='datetime64[ns]', freq='WOM-3FRI')

### 11.3.3 移位（前向和后向）日期

In [87]:
ts = pd.Series(np.random.randn(4),
               index=pd.date_range('1/1/2000', periods=4, freq='M'))
ts

2000-01-31    0.411399
2000-02-29   -0.915257
2000-03-31   -0.267982
2000-04-30   -1.108570
Freq: M, dtype: float64

In [88]:
# 将数据向前移位2
ts.shift(2)

2000-01-31         NaN
2000-02-29         NaN
2000-03-31    0.411399
2000-04-30   -0.915257
Freq: M, dtype: float64

In [89]:
# 将数据向后移动2
ts.shift(-2)

2000-01-31   -0.267982
2000-02-29   -1.108570
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

In [90]:
# 移动时间戳而不是数据
ts.shift(2, freq='M')

2000-03-31    0.411399
2000-04-30   -0.915257
2000-05-31   -0.267982
2000-06-30   -1.108570
Freq: M, dtype: float64

In [91]:
# 按天数移动
ts.shift(3, freq='D')

2000-02-03    0.411399
2000-03-03   -0.915257
2000-04-03   -0.267982
2000-05-03   -1.108570
dtype: float64

In [94]:
# 使用偏置移动日期
from pandas.tseries.offsets import Day, MonthEnd

now = datetime(2000, 1, 1)
now + 3 * Day()

Timestamp('2000-01-04 00:00:00')

In [95]:
now + MonthEnd()

Timestamp('2000-01-31 00:00:00')

In [96]:
now + MonthEnd(3)

Timestamp('2000-03-31 00:00:00')

In [100]:
offset = MonthEnd()
ts = pd.Series(np.random.randn(20),
               index=pd.date_range('1/15/2000', periods=20, freq='4d'))
ts.groupby(offset.rollforward).mean()

2000-01-31    0.175464
2000-02-29    0.098912
2000-03-31    0.776743
dtype: float64

## 11.4 时区处理

..................

## 11.5 时间区间和区间算术

In [102]:
p = pd.Period(2007, freq='A-DEC')
p

Period('2007', 'A-DEC')

In [103]:
# 对时间段增加或减去整数可以方便地根据他们的频率移位
p + 5

Period('2012', 'A-DEC')

In [104]:
p - 2

Period('2005', 'A-DEC')

In [106]:
# 如果两个区间拥有相同的频率，则它们之间的差是它们之间的单位数
pd.Period('2014', freq='A-DEC') - p

<7 * YearEnds: month=12>

In [108]:
# 使用period_range函数可以构造区间序列
rng = pd.period_range('2000-1-1', '2000-6-30', freq='M')
rng

PeriodIndex(['2000-01', '2000-02', '2000-03', '2000-04', '2000-05', '2000-06'], dtype='period[M]')

In [109]:
pd.Series(np.random.randn(6), index=rng)

2000-01   -0.584653
2000-02    0.137891
2000-03    0.221958
2000-04    1.089043
2000-05    1.083146
2000-06   -0.456654
Freq: M, dtype: float64

In [112]:
# 可以使用PeriodIndex类将字符串数组转化为PeriodIndex
values = ['2001Q3', '2002Q2', '2003Q1']
index = pd.PeriodIndex(values, freq='Q-DEC')
index

PeriodIndex(['2001Q3', '2002Q2', '2003Q1'], dtype='period[Q-DEC]')

### 11.5.1 区间频率转换

In [114]:
# A-DEC表示年末是12月
p = pd.Period('2007', freq='A-DEC')
p

Period('2007', 'A-DEC')

In [115]:
# 使用asfreq将区间和PeriodIndex转换为其他的频率
p.asfreq('M', how='start')

Period('2007-01', 'M')

In [116]:
p.asfreq('D', how='end')

Period('2007-12-31', 'D')

In [117]:
# 完整的PeriodIndex对象或时间序列可以按照相同的语义进行转换
rng = pd.period_range('2006', '2009', freq='A-DEC')
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts

2006   -1.554496
2007    0.012153
2008    0.919082
2009    0.517960
Freq: A-DEC, dtype: float64

In [118]:
ts.asfreq('M', how='start')

2006-01   -1.554496
2007-01    0.012153
2008-01    0.919082
2009-01    0.517960
Freq: M, dtype: float64

In [120]:
# 季度区间频率
p = pd.Period('2012Q4', freq='Q-JAN')
p

Period('2012Q4', 'Q-JAN')

In [121]:
p.asfreq('D', how='start')

Period('2011-11-01', 'D')

In [123]:
p.asfreq('D', how='end')

Period('2012-01-31', 'D')

In [124]:
# 获取该季度倒数第二个工作日下午4点对应的区间（分钟频率）
p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60
p4pm

Period('2012-01-30 16:00', 'T')

### 11.5.3 时间戳与区间互转

In [125]:
rng = pd.date_range('2000-1-1', periods=3, freq='M')
rng

DatetimeIndex(['2000-01-31', '2000-02-29', '2000-03-31'], dtype='datetime64[ns]', freq='M')

In [126]:
ts = pd.Series(np.random.randn(3), index=rng)
ts

2000-01-31   -0.300656
2000-02-29   -2.818111
2000-03-31    0.249771
Freq: M, dtype: float64

In [127]:
ts.to_period()

2000-01   -0.300656
2000-02   -2.818111
2000-03    0.249771
Freq: M, dtype: float64

In [128]:
# 包含重复区间也没有问题
rng = pd.date_range('1/29/2000', periods=6, freq='D')
ts2 = pd.Series(np.random.randn(6), index=rng)
ts2.to_period('M')

2000-01    2.667197
2000-01    0.019008
2000-01    0.556190
2000-02   -0.994354
2000-02   -0.783380
2000-02   -0.082355
Freq: M, dtype: float64

In [129]:
# 先用to_period转换为区间；之后使用to_timestamp可以将区间再转换回时间戳
pts = ts2.to_period()
pts

2000-01-29    2.667197
2000-01-30    0.019008
2000-01-31    0.556190
2000-02-01   -0.994354
2000-02-02   -0.783380
2000-02-03   -0.082355
Freq: D, dtype: float64

In [130]:
pts.to_timestamp(how='end')

2000-01-29 23:59:59.999999999    2.667197
2000-01-30 23:59:59.999999999    0.019008
2000-01-31 23:59:59.999999999    0.556190
2000-02-01 23:59:59.999999999   -0.994354
2000-02-02 23:59:59.999999999   -0.783380
2000-02-03 23:59:59.999999999   -0.082355
Freq: D, dtype: float64

### 11.5.4 从数组生成PeriodIndex

In [131]:
data = pd.read_csv('./examples/macrodata.csv')
data.head(5)

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.98,139.7,2.82,5.8,177.146,0.0,0.0
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.15,141.7,3.08,5.1,177.83,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.26,1916.4,29.35,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.37,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.54,139.6,3.5,5.2,180.007,2.31,1.19


In [132]:
data.year

0      1959.0
1      1959.0
2      1959.0
3      1959.0
4      1960.0
        ...  
198    2008.0
199    2008.0
200    2009.0
201    2009.0
202    2009.0
Name: year, Length: 203, dtype: float64

In [133]:
# 将year、quarter数组和频率一起传递给PeriodIndex，可以把它们组合成DataFrame的索引
index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')
index

PeriodIndex(['1959Q1', '1959Q2', '1959Q3', '1959Q4', '1960Q1', '1960Q2',
             '1960Q3', '1960Q4', '1961Q1', '1961Q2',
             ...
             '2007Q2', '2007Q3', '2007Q4', '2008Q1', '2008Q2', '2008Q3',
             '2008Q4', '2009Q1', '2009Q2', '2009Q3'],
            dtype='period[Q-DEC]', length=203)

In [137]:
data.index = index
data.infl.head(5)

1959Q1    0.00
1959Q2    2.34
1959Q3    2.74
1959Q4    0.27
1960Q1    2.31
Freq: Q-DEC, Name: infl, dtype: float64