# 第7章 时间序列数据分析

## 7.1 时间序列基础

### 7.1.1 创建时间序列数据

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
pd.to_datetime('20180828')   # Parse a date string into a Timestamp object

Timestamp('2018-08-28 00:00:00')

In [2]:
# Passing a list of date strings yields a DatetimeIndex instead of a single Timestamp
date_index = pd.to_datetime(['20180820', '20180828', '20180908'])
date_index

DatetimeIndex(['2018-08-20', '2018-08-28', '2018-09-08'], dtype='datetime64[ns]', freq=None)

In [3]:
date_index[0]   # Take the first timestamp out of the index

Timestamp('2018-08-20 00:00:00')

In [6]:
# Build a time-series Series: the values are indexed by the DatetimeIndex
date_ser = pd.Series([11, 22, 33], index=date_index)
date_ser

# Partial-string indexing: a 'YYYY-MM' key selects every entry in that month
date_ser['2018-08']

2018-08-20    11
2018-08-28    22
dtype: int64

In [7]:
# Index a Series with a list of plain datetime objects
date_list = [
    datetime(2018, month, day)
    for month, day in ((1, 1), (1, 15), (2, 20), (4, 1), (5, 5), (6, 1))
]
# One integer value (0, 1, 2, ...) per date
time_se = pd.Series(np.arange(len(date_list)), index=date_list)
time_se

2018-01-01    0
2018-01-15    1
2018-02-20    2
2018-04-01    3
2018-05-05    4
2018-06-01    5
dtype: int32

In [9]:
# Build a DataFrame whose row index is a list of (unsorted) datetime objects
data_demo = [[11, 22, 33], [44, 55, 66],
             [77, 88, 99], [12, 23, 34]]
date_list = [datetime(2018, 1, 23), datetime(2018, 2, 15),
             datetime(2018, 5, 22), datetime(2018, 3, 30)]
time_df = pd.DataFrame(data_demo, index=date_list)
# Select the rows of 2018 through .loc. Row lookup by a date string with
# time_df['2018'] was deprecated in pandas 1.2 and removed in 2.0, where []
# with a string always means column selection.
time_df.loc['2018']

Unnamed: 0,0,1,2
2018-01-23,11,22,33
2018-02-15,44,55,66
2018-05-22,77,88,99
2018-03-30,12,23,34


### 7.1.2 通过时间戳索引选取子集

In [10]:
# Date strings written in several different formats
date_list = ['2015/05/30', '2017/02/01',
             '2015.6.1', '2016.4.1',
             '2017.6.1', '2018.1.23']
# Convert the strings to a DatetimeIndex, parsing each one on its own.
# pd.to_datetime(date_list) would apply the format inferred from the first
# element to the whole list on pandas >= 2.0 and raise on '2015.6.1';
# element-wise parsing works on both old and new pandas versions.
date_index = pd.DatetimeIndex([pd.to_datetime(text) for text in date_list])
# Build a Series indexed by that DatetimeIndex
date_se = pd.Series(np.arange(6), index=date_index)
date_se

2015-05-30    0
2017-02-01    1
2015-06-01    2
2016-04-01    3
2017-06-01    4
2018-01-23    5
dtype: int32

In [11]:
# Fetch a value by integer position. Use .iloc explicitly: on a
# datetime-indexed Series, time_se[3] relies on the positional fallback of []
# that pandas 2.1 deprecated.
time_se.iloc[3]

3

In [12]:
# Look up a single value using a datetime object as the key
date_time = datetime(2015, 6, 1)
date_se[date_time]

2

In [13]:
date_se['20150530']   # Date-string key in compact YYYYMMDD form

2015-05-30    0
dtype: int32

In [14]:
date_se['2016-04-01']   # Date-string key in YYYY-MM-DD form

2016-04-01    3
dtype: int32

In [15]:
date_se['2018/01/23']   # Date-string key in YYYY/MM/DD form

2018-01-23    5
dtype: int32

In [16]:
date_se['6/1/2017']   # Date-string key in M/D/YYYY form (resolves to 2017-06-01)

2017-06-01    4
dtype: int32

In [17]:
date_se['2015']  # Select every entry from the year 2015

2015-05-30    0
2015-06-01    2
dtype: int32

In [18]:
# Drop the data before 2016-1-1; the index is put in chronological order first
sorted_se = date_se.sort_index()
tt = sorted_se.truncate(before='2016-1-1')
tt

2016-04-01    3
2017-02-01    1
2017-06-01    4
2018-01-23    5
dtype: int32

In [50]:
# Drop the data after 2016-7-31
sorted_se.truncate(after='2016-7-31')

2015-05-30    0
2015-06-01    2
2016-04-01    3
dtype: int32

## 7.2 固定频率的时间序列

### 7.2.1 创建固定频率的时间序列

In [19]:
# Build a DatetimeIndex from only a start date and an end date (one entry per day)
pd.date_range('2018/08/10', '2018/08/20')

DatetimeIndex(['2018-08-10', '2018-08-11', '2018-08-12', '2018-08-13',
               '2018-08-14', '2018-08-15', '2018-08-16', '2018-08-17',
               '2018-08-18', '2018-08-19', '2018-08-20'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# Build a DatetimeIndex from a start date and a number of periods
pd.date_range(start='2018/08/10', periods=5)

DatetimeIndex(['2018-08-10', '2018-08-11', '2018-08-12', '2018-08-13',
               '2018-08-14'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Build a DatetimeIndex from an end date and a number of periods, counting back from the end
pd.date_range(end='2018/08/10', periods=5)

DatetimeIndex(['2018-08-06', '2018-08-07', '2018-08-08', '2018-08-09',
               '2018-08-10'],
              dtype='datetime64[ns]', freq='D')

In [22]:
# Five weekly timestamps anchored on Sundays, counted from the start date 2018-01-01
dates_index = pd.date_range(start='2018-01-01', periods=5, freq='W-SUN')
dates_index

DatetimeIndex(['2018-01-07', '2018-01-14', '2018-01-21', '2018-01-28',
               '2018-02-04'],
              dtype='datetime64[ns]', freq='W-SUN')

In [23]:
# Attach the values to the weekly DatetimeIndex, one value per timestamp
ser_data = [12, 56, 89, 99, 31]
pd.Series(data=ser_data, index=dates_index)

2018-01-07    12
2018-01-14    56
2018-01-21    89
2018-01-28    99
2018-02-04    31
Freq: W-SUN, dtype: int64

In [24]:
# Build a DatetimeIndex from a start timestamp and a number of periods,
# keeping the default frequency and attaching a time zone
pd.date_range(start='2018/8/1 12:13:30', periods=5, 
              tz='Asia/Hong_Kong')

DatetimeIndex(['2018-08-01 12:13:30+08:00', '2018-08-02 12:13:30+08:00',
               '2018-08-03 12:13:30+08:00', '2018-08-04 12:13:30+08:00',
               '2018-08-05 12:13:30+08:00'],
              dtype='datetime64[ns, Asia/Hong_Kong]', freq='D')

In [25]:
# Normalize the timestamps: the time of day is reset to midnight
pd.date_range(start='2018/8/1 12:13:30', periods=5, 
              normalize=True, tz='Asia/Hong_Kong')

DatetimeIndex(['2018-08-01 00:00:00+08:00', '2018-08-02 00:00:00+08:00',
               '2018-08-03 00:00:00+08:00', '2018-08-04 00:00:00+08:00',
               '2018-08-05 00:00:00+08:00'],
              dtype='datetime64[ns, Asia/Hong_Kong]', freq='D')

### 7.2.2 时间序列的频率、偏移量

In [26]:
# One timestamp every 5 days between the start and end dates
pd.date_range(start='2018/2/1', end='2018/2/28', freq='5D')

DatetimeIndex(['2018-02-01', '2018-02-06', '2018-02-11', '2018-02-16',
               '2018-02-21', '2018-02-26'],
              dtype='datetime64[ns]', freq='5D')

In [31]:
# The wildcard import is kept on purpose: later cells use Week, Hour, Minute
# and Second unqualified.
from pandas.tseries.offsets import *
# A calendar-aware offset of 4 months plus 5 days
dayOff = DateOffset(months=4, days=5)
# pd.datetime (an alias of datetime.datetime) was deprecated in pandas 1.0 and
# removed in 2.0; pd.Timestamp is the datetime-compatible replacement.
t = pd.Timestamp(2019, 10, 1)
print(t)
print(t + dayOff)

2019-10-01 00:00:00
2020-02-06 00:00:00


In [32]:
# Adding offsets of different units combines them into one duration
# (the recorded output shows a single Timedelta)
Week(2) + Minute(10)+Second(5)

Timedelta('14 days 00:10:05')

In [33]:
# Build a combined offset (2 weeks + 10 hours) and use it as the step between timestamps
date_offset  = Week(2) + Hour(10)
pd.date_range('2018/3/1', '2018/12/31', freq=date_offset)

DatetimeIndex(['2018-03-01 00:00:00', '2018-03-15 10:00:00',
               '2018-03-29 20:00:00', '2018-04-13 06:00:00',
               '2018-04-27 16:00:00', '2018-05-12 02:00:00',
               '2018-05-26 12:00:00', '2018-06-09 22:00:00',
               '2018-06-24 08:00:00', '2018-07-08 18:00:00',
               '2018-07-23 04:00:00', '2018-08-06 14:00:00',
               '2018-08-21 00:00:00', '2018-09-04 10:00:00',
               '2018-09-18 20:00:00', '2018-10-03 06:00:00',
               '2018-10-17 16:00:00', '2018-11-01 02:00:00',
               '2018-11-15 12:00:00', '2018-11-29 22:00:00',
               '2018-12-14 08:00:00', '2018-12-28 18:00:00'],
              dtype='datetime64[ns]', freq='346H')

### 7.2.3 时间序列数据的移动

In [34]:
# Five consecutive daily timestamps carrying the values 1 through 5
date_index = pd.date_range(start='2018/01/01', periods=5)
time_ser = pd.Series(np.arange(1, 6), index=date_index)
time_ser

2018-01-01    1
2018-01-02    2
2018-01-03    3
2018-01-04    4
2018-01-05    5
Freq: D, dtype: int32

In [38]:
# Shift the values 4 periods later along the index; the vacated leading slots become NaN
time_ser.shift(4)

2018-01-01    NaN
2018-01-02    NaN
2018-01-03    NaN
2018-01-04    NaN
2018-01-05    1.0
Freq: D, dtype: float64

In [36]:
# Shift the values 1 period earlier along the index; the vacated last slot becomes NaN
time_ser.shift(-1)

2018-01-01    2.0
2018-01-02    3.0
2018-01-03    4.0
2018-01-04    5.0
2018-01-05    NaN
Freq: D, dtype: float64

## 7.3 时间周期及计算

### 7.3.1 创建时期对象

In [43]:
# Create a Period covering the span from 2018-01-01 through 2018-12-31
pd.Period(2018) 

Period('2018', 'A-DEC')

In [44]:
# A Period covering the whole month from 2017-06-01 through 2017-06-30
period = pd.Period('2017/6')
period

Period('2017-06', 'M')

In [45]:
period + 1   # Adding an integer moves the Period forward by that many units of its frequency

Period('2017-07', 'M')

In [46]:
period - 5    # Subtracting an integer moves the Period back by that many units of its frequency

Period('2017-01', 'M')

In [49]:
# Create another Period with the same (monthly) frequency as `period`
other_period = pd.Period(201201, freq='M')
# The difference between two Periods of the same frequency is printed as a
# multiple of that frequency's offset
print(period - other_period)

<65 * MonthEnds>


In [50]:
# Monthly PeriodIndex covering the months between the two dates;
# .values exposes the underlying Period objects
period_index = pd.period_range('2012.1.8', '2012.5.31', freq='M')
print(period_index.values)

[Period('2012-01', 'M') Period('2012-02', 'M') Period('2012-03', 'M')
 Period('2012-04', 'M') Period('2012-05', 'M')]


In [51]:
# Build a PeriodIndex from year strings with an explicit annual (December year-end) frequency
str_list = ['2010', '2011', '2012']
pd.PeriodIndex(str_list, freq='A-DEC')

PeriodIndex(['2010', '2011', '2012'], dtype='period[A-DEC]', freq='A-DEC')

In [52]:
# Use the monthly PeriodIndex as the index of a Series
period_ser = pd.Series(np.arange(5), period_index)
period_ser

2012-01    0
2012-02    1
2012-03    2
2012-04    3
2012-05    4
Freq: M, dtype: int32

### 7.3.2 时期的频率转换

In [40]:
# Create an annual Period, then convert it to monthly frequency,
# anchored at the first month of the year
period = pd.Period('2017', freq='A-DEC')
period.asfreq('M', how='start')

Period('2017-01', 'M')

In [41]:
period.asfreq('M', how='end')   # Convert to monthly frequency, anchored at the Period's last month

Period('2017-12', 'M')