# 重采样

In [1]:
# ch1/ch1.ipynb
# 第三方库
import numpy as np
import pandas as pd

# Build a high-frequency (hourly) series covering [00:00, 08:00) UTC.
# `inclusive='left'` replaces the `closed='left'` keyword, which was deprecated
# in pandas 1.4 and removed in pandas 2.0. The lowercase 'h' alias replaces
# 'H', which is deprecated since pandas 2.2.
date = pd.date_range(
    '2022-01-01 00:00:00',  # start (included)
    '2022-01-01 08:00:00',  # end (excluded because inclusive='left')
    freq='1h',  # one sample per hour
    inclusive='left',  # left-closed, right-open interval
    tz='UTC'  # timezone of the generated timestamps
)
value = np.random.randint(1, 100, len(date))  # random integers in [1, 100)
data = pd.DataFrame({'value': value}, index=date)
print('原始1H间隔数据')
print(data)

# Downsample to 3-hour bins; each bin holds the sum of its hourly values.
data = data.resample('3h').sum()
print()
print('降采样3H间隔数据')
print(data)


原始1H间隔数据
                           value
2022-01-01 00:00:00+00:00     40
2022-01-01 01:00:00+00:00     28
2022-01-01 02:00:00+00:00     63
2022-01-01 03:00:00+00:00     87
2022-01-01 04:00:00+00:00     91
2022-01-01 05:00:00+00:00     39
2022-01-01 06:00:00+00:00      1
2022-01-01 07:00:00+00:00     97

降采样3H间隔数据
                           value
2022-01-01 00:00:00+00:00    131
2022-01-01 03:00:00+00:00    217
2022-01-01 06:00:00+00:00     98


# 缺失值补全

In [2]:
# ch1/ch1.ipynb
# 第三方库
import numpy as np
import pandas as pd

# Build a complete hourly data set covering [00:00, 08:00) UTC.
# `inclusive='left'` replaces the `closed='left'` keyword (removed in
# pandas 2.0); the lowercase 'h' alias replaces the deprecated 'H'.
date = pd.date_range(
    '2022-01-01 00:00:00',  # start (included)
    '2022-01-01 08:00:00',  # end (excluded because inclusive='left')
    freq='1h',  # one sample per hour
    inclusive='left',  # left-closed, right-open interval
    tz='UTC'  # timezone of the generated timestamps
)
# Stored as float from the start so that NaN can be written into the column
# without relying on an implicit int -> float upcast.
value = np.random.randint(1, 100, len(date)).astype(float)
data = pd.DataFrame({
    'date': date,
    'value': value
})

# Blank out two whole rows (timestamp and value) to simulate missing records.
data.iloc[[3, 6], :] = None
print('有缺失数据集')
print(data)

# Fill the gaps in one explicit, non-mutating chain:
#   1. drop rows whose timestamp is missing (they cannot be placed on the time axis),
#   2. index by time and resample to the hourly grid so the missing hours reappear as NaN,
#   3. interpolate linearly with respect to the time index,
#   4. restore the default integer index with 'date' as a regular column.
data = (
    data.dropna(subset=['date'])
    .set_index('date')
    .resample('1h')
    .mean()
    .interpolate(method='time')
    .reset_index()
)
print()
print('缺失值补全数据集')
print(data)


有缺失数据集
                       date  value
0 2022-01-01 00:00:00+00:00   96.0
1 2022-01-01 01:00:00+00:00   53.0
2 2022-01-01 02:00:00+00:00   11.0
3                       NaT    NaN
4 2022-01-01 04:00:00+00:00   46.0
5 2022-01-01 05:00:00+00:00   23.0
6                       NaT    NaN
7 2022-01-01 07:00:00+00:00   35.0

缺失值补全数据集
                       date  value
0 2022-01-01 00:00:00+00:00   96.0
1 2022-01-01 01:00:00+00:00   53.0
2 2022-01-01 02:00:00+00:00   11.0
3 2022-01-01 03:00:00+00:00   28.5
4 2022-01-01 04:00:00+00:00   46.0
5 2022-01-01 05:00:00+00:00   23.0
6 2022-01-01 06:00:00+00:00   29.0
7 2022-01-01 07:00:00+00:00   35.0


# 归一化和标准化

In [3]:
# ch1/ch1.ipynb
# 第三方库
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Synthetic data: uniform random values scaled into [0, 100).
train_x = np.random.random((500, 10)) * 100  # [num_train, H]
train_y = np.random.random((500, 1)) * 100  # [num_train, 1]
test_x = np.random.random((100, 10)) * 100  # [num_test, H]
test_y = np.random.random((100, 1)) * 100  # [num_test, 1]

# Shapes and value ranges of the raw (unscaled) data.
print(f'{train_x.shape=}, {train_y.shape=}')
print(f'{test_x.shape=}, {test_y.shape=}')
print(f'{train_x.min()=:.4f}, {train_x.max()=:.4f}')
print(f'{train_y.min()=:.4f}, {train_y.max()=:.4f}')
print(f'{test_x.min()=:.4f}, {test_x.max()=:.4f}')
print(f'{test_y.min()=:.4f}, {test_y.max()=:.4f}')

# The min-max normalization pass and the standardization pass differ only in
# the scaler used, so both run through the same loop body. Each pass uses one
# scaler for the features and a separate one for the targets.
scaler_pairs = (
    (MinMaxScaler(feature_range=(0, 1)), MinMaxScaler(feature_range=(0, 1))),
    (StandardScaler(), StandardScaler()),
)
for x_scalar, y_scalar in scaler_pairs:
    # Fit on the training split only, then apply the fitted scaler to the
    # test split.
    train_x_n = x_scalar.fit_transform(train_x)  # [num_train, H]
    test_x_n = x_scalar.transform(test_x)  # [num_test, H]
    train_y_n = y_scalar.fit_transform(train_y)  # [num_train, 1]
    test_y_n = y_scalar.transform(test_y)  # [num_test, 1]

    # Shapes and value ranges after scaling.
    print(f'\n{train_x_n.shape=}, {train_y_n.shape=}')
    print(f'{test_x_n.shape=}, {test_y_n.shape=}')
    print(f'{train_x_n.min()=:.4f}, {train_x_n.max()=:.4f}')
    print(f'{train_y_n.min()=:.4f}, {train_y_n.max()=:.4f}')
    print(f'{test_x_n.min()=:.4f}, {test_x_n.max()=:.4f}')
    print(f'{test_y_n.min()=:.4f}, {test_y_n.max()=:.4f}')


train_x.shape=(500, 10), train_y.shape=(500, 1)
test_x.shape=(100, 10), test_y.shape=(100, 1)
train_x.min()=0.0005, train_x.max()=99.8647
train_y.min()=0.1415, train_y.max()=99.8912
test_x.min()=0.0701, test_x.max()=99.6524
test_y.min()=1.8336, test_y.max()=96.8094

train_x_n.shape=(500, 10), train_y_n.shape=(500, 1)
test_x_n.shape=(100, 10), test_y_n.shape=(100, 1)
train_x_n.min()=0.0000, train_x_n.max()=1.0000
train_y_n.min()=0.0000, train_y_n.max()=1.0000
test_x_n.min()=0.0005, test_x_n.max()=1.0007
test_y_n.min()=0.0170, test_y_n.max()=0.9691

train_x_n.shape=(500, 10), train_y_n.shape=(500, 1)
test_x_n.shape=(100, 10), test_y_n.shape=(100, 1)
train_x_n.min()=-1.8114, train_x_n.max()=1.7516
train_y_n.min()=-1.7133, train_y_n.max()=1.7309
test_x_n.min()=-1.7773, test_x_n.max()=1.7413
test_y_n.min()=-1.6549, test_y_n.max()=1.6244


# 时间序列转监督学习

In [4]:
# ch1/ch1.ipynb
# 第三方库
import numpy as np
import pandas as pd


def series_to_supervised(series, H):
    """Convert a time series into a supervised-learning data set.

    Every window of ``H`` consecutive observations becomes one input sample,
    and the observation immediately following the window becomes its label.

    Args:
        series (list or numpy array): Time series, indexed by time along the
            first axis.
        H (int): Number of historical values per input sample. Must be >= 0.

    Returns:
        tuple[numpy.ndarray, numpy.ndarray]: ``(X, y)`` where ``X`` has shape
        ``(n, H, ...)`` and ``y`` has shape ``(n, ...)`` with
        ``n = max(len(series) - H, 0)``. When the series is too short to
        form a single sample, ``X`` still has shape ``(0, H, ...)``.

    Raises:
        ValueError: If ``H`` is negative.
    """
    if H < 0:
        raise ValueError(f'H must be non-negative, got {H}')

    values = np.asarray(series)
    num_samples = len(values) - H

    # Row i of `window_idx` is [i, i+1, ..., i+H-1]. A non-positive
    # `num_samples` yields an empty (0, H) index, which keeps the window
    # dimension in the result.
    window_idx = np.arange(num_samples)[:, None] + np.arange(H)[None, :]
    X = values[window_idx]

    # The label for window i is the value at position i+H. Copy so the result
    # never aliases the caller's array.
    y = values[H:].copy()
    return X, y


# Toy time series: ten evenly spaced values from 1 to 10.
data = np.linspace(start=1, stop=10, num=10)

# Split into sliding-window inputs of length 5 and their next-step labels.
X, y = series_to_supervised(series=data, H=5)
print(f'{X.shape=}, {y.shape=}')
print(f'{X=}, \n{y=}')


X.shape=(5, 5), y.shape=(5,)
X=array([[1., 2., 3., 4., 5.],
       [2., 3., 4., 5., 6.],
       [3., 4., 5., 6., 7.],
       [4., 5., 6., 7., 8.],
       [5., 6., 7., 8., 9.]]), 
y=array([ 6.,  7.,  8.,  9., 10.])
