In [1]:
import pandas as pd
import numpy as np

## 生成时间序列

pd.date_range(start, end, periods, freq, ...)

- start: 开始日期
- end: 结束日期
- periods: 生成的日期个数
- freq: 日期频率，可以是字符串，也可以是pandas的DateOffset对象。常用的频率如下：

| 别名 | 描述 |
| --- | --- |
| B | 工作日频率 |
| C | 自定义工作日频率 |
| D | 日历日频率 |
| W | 周频率 |
| ME | 月末频率 |
| MS | 月初频率 |
| h | 每小时频率 |
| min | 每分钟频率 |
| s | 每秒频率 |
| ms | 毫秒 |
| us | 微秒 |
| ns | 纳秒 |

等，参见 https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases


In [2]:
# 从2018-01-01开始，生成10个日期，间隔为1天
pd.date_range('2018-01-01', periods=10, freq='D')

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# 从2018-01-01开始，至2018-01-10，生成日期，间隔为3天
pd.date_range('2018-01-01', '2018-01-10', freq='3D')

DatetimeIndex(['2018-01-01', '2018-01-04', '2018-01-07', '2018-01-10'], dtype='datetime64[ns]', freq='3D')

In [4]:
# 从2018-01-15开始，生成10个月末时间
pd.date_range('2018-01-15', periods=10, freq='ME')

  pd.date_range('2018-01-15', periods=10, freq='M')


DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
               '2018-05-31', '2018-06-30', '2018-07-31', '2018-08-31',
               '2018-09-30', '2018-10-31'],
              dtype='datetime64[ns]', freq='ME')

In [6]:
# 从2018-01-01开始，生成10个日期，间隔为2小时
pd.date_range('2018-01-01', periods=10, freq='2H')

  pd.date_range('2018-01-01', periods=10, freq='2H')


DatetimeIndex(['2018-01-01 00:00:00', '2018-01-01 02:00:00',
               '2018-01-01 04:00:00', '2018-01-01 06:00:00',
               '2018-01-01 08:00:00', '2018-01-01 10:00:00',
               '2018-01-01 12:00:00', '2018-01-01 14:00:00',
               '2018-01-01 16:00:00', '2018-01-01 18:00:00'],
              dtype='datetime64[ns]', freq='2h')

## 字符类型转换为时间序列

pd.to_datetime(arg, format, errors, ...)

- arg: 需要转换的日期字符串，支持很多格式
- format: 日期字符串的格式，有中文一般需要指定格式
    - ### 年份 ###
    - %Y: 2022, 2023, 2024, ...
    - %y: 22, 23, 24, ...
    - ### 月份 ###
    - %B: January, February, ...
    - %b: Jan, Feb, ...
    - %m: 01, 02, 03, ...
    - ### 日期 ###
    - %d: 2位数日期
    - %A: Sunday, Monday, ...
    - %a: Sun, Mon, ...
    - ### 时间 ###
    - %H: 24小时制小时
    - %M: 分钟
    - %S: 秒
    - 等，参照 https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes
- errors: 如果转换失败，如何处理
    - 'raise': 抛出异常
    - 'coerce': 转换为NaT
    - 'ignore': 忽略错误

In [7]:
# 将字符串转换为时间序列
pd.to_datetime('2022-01-01')

Timestamp('2022-01-01 00:00:00')

## 时间序列重采样

df.resample(rule, axis, closed, label, ...)

使用前提是，时间序列作为索引，可以这样操作：
```python
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
```

In [9]:
# 创建时间序列

df1 = pd.DataFrame(np.random.uniform(0, 100, (100, 1)), index=pd.date_range('2022-01-01', periods=100, freq='D'), columns=['data'])
df1

Unnamed: 0,data
2022-01-01,56.900771
2022-01-02,53.453226
2022-01-03,17.532452
2022-01-04,37.360228
2022-01-05,68.987939
...,...
2022-04-06,89.005287
2022-04-07,34.598012
2022-04-08,28.536810
2022-04-09,36.281415


In [12]:
df1.resample('ME')

<pandas.core.resample.DatetimeIndexResampler object at 0x0000022154F05490>

In [13]:
# 按月重采样，取平均值
df1.resample('ME').mean()

Unnamed: 0,data
2022-01-31,58.052047
2022-02-28,42.205474
2022-03-31,50.275925
2022-04-30,37.119606


In [11]:
# 按10天重采样，计数
df1.resample('10D').count()

Unnamed: 0,data
2022-01-01,10
2022-01-11,10
2022-01-21,10
2022-01-31,10
2022-02-10,10
2022-02-20,10
2022-03-02,10
2022-03-12,10
2022-03-22,10
2022-04-01,10


## 多列数据合成时间序列

pd.PeriodIndex.from_field(year, month, day, hour, minute, second, freq, ...)

In [15]:
# 示例数据
data = {
    'year': [2022, 2022, 2022, 2022, 2022],
    'month': [1, 1, 1, 1, 1],
    'day': [1, 2, 3, 4, 5],
    'hour': [0, 1, 2, 3, 4],
    'minute': [0, 0, 0, 0, 0],
    'second': [0, 0, 0, 0, 0],
    'data': np.random.uniform(0, 100, 5)
}
df2 = pd.DataFrame(data)
df2

Unnamed: 0,year,month,day,hour,minute,second,data
0,2022,1,1,0,0,0,23.390849
1,2022,1,2,1,0,0,82.712738
2,2022,1,3,2,0,0,10.00248
3,2022,1,4,3,0,0,20.054213
4,2022,1,5,4,0,0,33.021155


In [24]:
# 合成时间序列
period_index = pd.PeriodIndex.from_fields(year=df2['year'], month=df2['month'], day=df2['day'], freq='h')

In [28]:
# 将合成时间序列设置为索引
df2['date'] = period_index.to_timestamp() # 将PeriodIndex转换为时间序列再设置为索引
df2.set_index('date', inplace=True)
df2

Unnamed: 0_level_0,year,month,day,hour,minute,second,data
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-01,2022,1,1,0,0,0,23.390849
2022-01-02,2022,1,2,1,0,0,82.712738
2022-01-03,2022,1,3,2,0,0,10.00248
2022-01-04,2022,1,4,3,0,0,20.054213
2022-01-05,2022,1,5,4,0,0,33.021155


In [29]:
df2.resample('D').mean()

Unnamed: 0_level_0,year,month,day,hour,minute,second,data
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-01,2022.0,1.0,1.0,0.0,0.0,0.0,23.390849
2022-01-02,2022.0,1.0,2.0,1.0,0.0,0.0,82.712738
2022-01-03,2022.0,1.0,3.0,2.0,0.0,0.0,10.00248
2022-01-04,2022.0,1.0,4.0,3.0,0.0,0.0,20.054213
2022-01-05,2022.0,1.0,5.0,4.0,0.0,0.0,33.021155


In [31]:
df2.resample('ME').mean()

Unnamed: 0_level_0,year,month,day,hour,minute,second,data
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2022-01-31,2022.0,1.0,3.0,2.0,0.0,0.0,33.836287
