# 11 时间序列
- 时间戳：具体的时刻
- 固定的时间区间：例如2022年1月，或整个2022年
- 时间间隔：由开始时间戳和结束时间戳表示

## 11.1 日期和时间数据的类型及工具
Python标准库包含了用于处理日期和时间数据的类型，主要涉及以下模块：
1. datetime
2. time
3. calendar

In [43]:
from datetime import datetime
import pandas as pd
import numpy as np

import pandas as pd

now = datetime.now()
now

datetime.datetime(2022, 5, 8, 21, 32, 31, 382187)

In [4]:
now.year, now.month, now.day

(2022, 5, 8)

In [6]:
# timedelta表示两个时间之差
delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)
delta

datetime.timedelta(days=926, seconds=56700)

In [7]:
delta.days

926

In [8]:
delta.seconds

56700

In [9]:
from datetime import timedelta

start = datetime(2022, 1, 1)
# 可以通过加上或减去一个timedelta或其整数倍来产生一个新的datetime
start + timedelta(12)

datetime.datetime(2022, 1, 13, 0, 0)

In [10]:
start - 2 * timedelta(12)

datetime.datetime(2021, 12, 8, 0, 0)

In [14]:
start.tzinfo

### 11.1.1 字符串与datetime互转

In [16]:
stamp = datetime(2022, 1, 1)
str(stamp)

'2022-01-01 00:00:00'

In [17]:
stamp.strftime('%Y-%m-%d')

'2022-01-01'

In [21]:
stamp.strftime('%Y-%m-%d %H:%M:%S')

'2022-01-01 00:00:00'

In [25]:
# 将字符串转化为datetime
value = '2022-01-01'
datetime.strptime(value, '%Y-%m-%d')

datetime.datetime(2022, 1, 1, 0, 0)

In [35]:
datestrs = ['2022-01-01', '2022-05-08']
[datetime.strptime(x, '%Y-%m-%d') for x in datestrs]

[datetime.datetime(2022, 1, 1, 0, 0), datetime.datetime(2022, 5, 8, 0, 0)]

In [36]:
# datetime.strptime将字符串转化为时间，需要传入指定的格式
# 每次传入比较麻烦，对于通用的日期，可以使用第三方库
from dateutil.parser import parse

parse('2022-01-01')

datetime.datetime(2022, 1, 1, 0, 0)

In [37]:
# dateutil能够解析大部分人类可理解的日期
parse('Mar 31, 2022 10:45 PM')

datetime.datetime(2022, 3, 31, 22, 45)

In [41]:
# 国际场合下，一般日期出现在月份之前，可以传递dayfirst=True
parse('Mar 31, 2022 10:45 PM', dayfirst=True)

datetime.datetime(2022, 3, 31, 22, 45)

In [39]:
parse('8/5/2022', dayfirst=True)

datetime.datetime(2022, 5, 8, 0, 0)

> dateutil.parser.parse并不完美，可能会把某些字符串识别成并非预期的日期，使用时需要留意解析结果

In [44]:
# to_datetime方法可以转换很多不同的日期表示格式
datestrs = ['2022-05-08 21:00:00', '2022-05-09 22:00:00']
pd.to_datetime(datestrs)

DatetimeIndex(['2022-05-08 21:00:00', '2022-05-09 22:00:00'], dtype='datetime64[ns]', freq=None)

In [45]:
# pandas.to_datetime方法可以处理那些被认为是缺失值的值（例如None）
idx = pd.to_datetime(datestrs + [None])
idx
# NaT（Not a Time）是pandas中时间戳数据的null值

DatetimeIndex(['2022-05-08 21:00:00', '2022-05-09 22:00:00', 'NaT'], dtype='datetime64[ns]', freq=None)

## 11.2 时间序列基础

In [47]:
from datetime import datetime

# First day of each month, January through June 2022.
dates = [datetime(2022, month, 1) for month in range(1, 7)]
# A list of datetime objects passed as the index yields a DatetimeIndex.
ts = pd.Series(data=np.random.randn(len(dates)), index=dates)
ts

2022-01-01    0.391336
2022-02-01    0.084591
2022-03-01    0.590092
2022-04-01   -1.074037
2022-05-01    1.132917
2022-06-01    0.507165
dtype: float64

In [48]:
ts.index

DatetimeIndex(['2022-01-01', '2022-02-01', '2022-03-01', '2022-04-01',
               '2022-05-01', '2022-06-01'],
              dtype='datetime64[ns]', freq=None)

In [49]:
ts.index.dtype

dtype('<M8[ns]')

In [50]:
# DatetimeIndex中的标量值是pandas的Timestamp对象
stamp = ts.index[0]
stamp

Timestamp('2022-01-01 00:00:00')

### 11.2.1 索引、选择、子集

In [51]:
stamp = ts.index[2]
# 可以通过值来索引出数据
ts[stamp]

0.5900924647158913

In [52]:
# 直接传入一个可以被解析为日期的字符串也可以
ts['2022/5/1']

1.1329168626760033

In [53]:
# For a time series, passing just a year or a year-month string selects
# the matching slice of data. Build 1000 consecutive daily observations
# starting at 2022-01-01 to demonstrate this.
longer_ts = pd.Series(
    data=np.random.randn(1000),
    index=pd.date_range(start='2022/1/1', periods=1000),
)
longer_ts.head(5)

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

In [55]:
# 传入年份，获取2022年的数据
longer_ts['2022'].head()

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

In [57]:
# 传入月份也可以
longer_ts['2022-05'].head()

2022-05-01    0.160175
2022-05-02    0.104173
2022-05-03   -0.618365
2022-05-04    0.999042
2022-05-05    0.140988
Freq: D, dtype: float64

In [58]:
# 传入时间区间
longer_ts['2022/1/1':'2022/1/5']

2022-01-01   -0.519933
2022-01-02   -0.297715
2022-01-03   -0.094866
2022-01-04    0.248127
2022-01-05   -1.748985
Freq: D, dtype: float64

> 传入字符串形式的日期、datetime对象或时间戳进行切片时，这种方式产生的是原时间序列的视图，类似于NumPy数组的切片。这意味着没有数据被复制，并且在切片上的修改会反映在原始数据上。

In [61]:
# truncate也可以在两个日期之间对Series进行切片（before之前、after之后的数据会被截掉）
ts.truncate(before='2022/1/5')

2022-02-01    0.084591
2022-03-01    0.590092
2022-04-01   -1.074037
2022-05-01    1.132917
2022-06-01    0.507165
dtype: float64

In [63]:
# The same date-based selection also applies to a DataFrame.
# 100 weekly timestamps that fall on Wednesdays, generated from 2022-01-01.
dates = pd.date_range(start='1/1/2022', periods=100, freq='W-WED')
long_df = pd.DataFrame(
    data=np.random.randn(len(dates), 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
long_df

Unnamed: 0,A,B,C,D
2022-01-05,-0.400783,-1.264017,0.379966,-0.256141
2022-01-12,-0.571639,2.131241,-0.241245,0.509657
2022-01-19,-0.401982,-0.158788,1.402505,-1.186511
2022-01-26,-0.574415,-0.485531,-0.615474,1.202686
2022-02-02,1.220108,-2.251149,2.631229,-0.791016
...,...,...,...,...
2023-11-01,-0.910823,-1.091824,-1.579518,-1.195703
2023-11-08,-0.278071,0.558738,-0.077413,-2.918578
2023-11-15,0.766577,0.550362,0.619179,0.456066
2023-11-22,0.149930,0.328295,-1.395013,1.267319


In [65]:
long_df.loc['1/2022']

Unnamed: 0,A,B,C,D
2022-01-05,-0.400783,-1.264017,0.379966,-0.256141
2022-01-12,-0.571639,2.131241,-0.241245,0.509657
2022-01-19,-0.401982,-0.158788,1.402505,-1.186511
2022-01-26,-0.574415,-0.485531,-0.615474,1.202686


### 11.2.2 含有重复索引的时间序列

In [66]:
# Index labels repeat on purpose: three entries for 2022-01-01, one for
# 2022-01-05 and two for 2022-05-05, so the index is not unique.
dates = pd.DatetimeIndex(
    ['1/1/2022'] * 3 + ['1/5/2022'] + ['5/5/2022'] * 2
)
dup_ts = pd.Series(data=np.random.randn(len(dates)), index=dates)
dup_ts

2022-01-01    0.768743
2022-01-01    0.719777
2022-01-01   -1.804380
2022-01-05   -0.025937
2022-05-05   -0.277915
2022-05-05   -0.414315
dtype: float64

In [69]:
dup_ts.index.is_unique

False

In [70]:
# 使用groupby聚合数据
grouped = dup_ts.groupby(level=0)
grouped.mean()

2022-01-01   -0.105287
2022-01-05   -0.025937
2022-05-05   -0.346115
dtype: float64

In [71]:
grouped.count()

2022-01-01    3
2022-01-05    1
2022-05-05    2
dtype: int64

## 11.3 日期范围、频率和移位