#### 数据预处理

In [1]:
import pandas as pd
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor

In [2]:
# The training set is very large, so download train.csv from
# https://www.kaggle.com/competitions/store-sales-time-series-forecasting/data
# NOTE(review): the path below uses a different file name than train.csv —
# presumably the download is saved under this name; confirm.
train_csv = "../datasets/时间序列预测训练集.csv"
df_train = pd.read_csv(train_csv, parse_dates=["date"])  # read "date" as datetimes

df_train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [None]:
# Reshape the data into the layout used for AutoGluon: rows indexed by
# (item_id, timestamp), where every (family, store_nbr) pair is one series.
#
# The steps are chained without inplace=True so that the frame object already
# displayed by the loading cell is not mutated, and so that a failure in any
# step leaves df_train untouched instead of half-transformed.
df_train = (
    df_train
    # These two columns are not used by the rest of the notebook.
    .drop(columns=['id', 'onpromotion'])
    .sort_values(by=['family', 'store_nbr', 'date'])
    # Number each (family, store_nbr) group to get one integer id per series.
    .assign(item_id=lambda frame: frame.groupby(['family', 'store_nbr']).ngroup())
    .rename(columns={'date': 'timestamp'})
    .set_index(['item_id', 'timestamp'])
)

df_train

Unnamed: 0_level_0,Unnamed: 1_level_0,store_nbr,family,sales
item_id,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2013-01-01,1,AUTOMOTIVE,0.0
0,2013-01-02,1,AUTOMOTIVE,2.0
0,2013-01-03,1,AUTOMOTIVE,3.0
0,2013-01-04,1,AUTOMOTIVE,3.0
0,2013-01-05,1,AUTOMOTIVE,5.0
...,...,...,...,...
1781,2017-08-11,54,SEAFOOD,0.0
1781,2017-08-12,54,SEAFOOD,1.0
1781,2017-08-13,54,SEAFOOD,2.0
1781,2017-08-14,54,SEAFOOD,0.0


#### 模型训练

In [4]:
# Wrap the prepared frame in AutoGluon's time-series container.
data = TimeSeriesDataFrame(df_train)

# Configure the predictor first, then train it as a separate step.
predictor = TimeSeriesPredictor(
    path="../models/time_series_forecasting_model",  # directory the models are saved to
    target="sales",        # column to forecast
    freq="D",              # daily frequency
    prediction_length=16,  # number of future steps to forecast
    eval_metric="RMSLE",
)
predictor = predictor.fit(data)  # fit() returns the fitted predictor

Beginning AutoGluon training...
AutoGluon will save models to 'c:\Users\18664\Desktop\python-data-science\models\time_series_forecasting_model'
AutoGluon Version:  1.3.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.19045
CPU Count:          16
GPU Count:          0
Memory Avail:       13.05 GB / 23.29 GB (56.0%)
Disk Space Avail:   824.55 GB / 929.76 GB (88.7%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': RMSLE,
 'freq': 'D',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 16,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'sales',
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'D'.
Provided train_data has 3008016 rows (NaN fraction=0.2%), 1782 time series. Median time series 

In [13]:
# Show the table of trained models with their validation scores and timings.
predictor.leaderboard()

Unnamed: 0,model,score_val,pred_time_val,fit_time_marginal,fit_order
0,WeightedEnsemble,-0.462018,123.553494,2.443949,11
1,TemporalFusionTransformer,-0.485108,1.35681,250.060254,7
2,TiDE,-0.486261,2.286438,265.585489,10
3,RecursiveTabular,-0.492528,3.332961,115.004327,2
4,DeepAR,-0.498198,12.007168,122.309603,8
5,PatchTST,-0.5007,1.455447,62.693642,9
6,DirectTabular,-0.516305,1.921993,108.393407,3
7,AutoETS,-0.520597,35.037095,1.490065,6
8,DynamicOptimizedTheta,-0.528642,91.982368,1.492923,5
9,SeasonalNaive,-0.61704,3.294545,1.452921,1


#### 模型推理

In [5]:
# Restore the saved predictor from disk, then forecast from the training data.
model_dir = "../models/time_series_forecasting_model"
predictor = TimeSeriesPredictor.load(model_dir)
predictions = predictor.predict(data)

Loading predictor from path c:\Users\18664\Desktop\python-data-science\models\time_series_forecasting_model
data with frequency 'IRREG' has been resampled to frequency 'D'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


#### 将预测结果和测试集合并

In [6]:
# Turn the (item_id, timestamp) index back into columns, then keep only the
# rows dated after 2017-07-30.
df_train = (
    df_train
    .reset_index()
    .loc[lambda frame: frame["timestamp"] > "2017-7-30"]
)

df_train

Unnamed: 0,item_id,timestamp,store_nbr,family,sales
1668,0,2017-07-31,1,AUTOMOTIVE,8.0
1669,0,2017-08-01,1,AUTOMOTIVE,5.0
1670,0,2017-08-02,1,AUTOMOTIVE,4.0
1671,0,2017-08-03,1,AUTOMOTIVE,3.0
1672,0,2017-08-04,1,AUTOMOTIVE,8.0
...,...,...,...,...,...
3000883,1781,2017-08-11,54,SEAFOOD,0.0
3000884,1781,2017-08-12,54,SEAFOOD,1.0
3000885,1781,2017-08-13,54,SEAFOOD,2.0
3000886,1781,2017-08-14,54,SEAFOOD,0.0


In [7]:
# Keep only the two columns that label a series: store number and product family.
df_index = df_train.loc[:, ['store_nbr', 'family']]

df_index

Unnamed: 0,store_nbr,family
1668,1,AUTOMOTIVE
1669,1,AUTOMOTIVE
1670,1,AUTOMOTIVE
1671,1,AUTOMOTIVE
1672,1,AUTOMOTIVE
...,...,...
3000883,54,SEAFOOD
3000884,54,SEAFOOD
3000885,54,SEAFOOD
3000886,54,SEAFOOD


In [8]:
# Move the index levels into ordinary columns and keep just the forecast date
# and the mean forecast.
predictions = predictions.reset_index()[['timestamp', 'mean']]

predictions

Unnamed: 0,timestamp,mean
0,2017-08-16,4.345462
1,2017-08-17,4.077597
2,2017-08-18,5.049568
3,2017-08-19,4.973159
4,2017-08-20,2.209460
...,...,...
28507,2017-08-27,2.792339
28508,2017-08-28,1.843797
28509,2017-08-29,2.874664
28510,2017-08-30,2.042266


In [9]:
# Discard both existing indexes so the two frames can be paired row by row.
predictions = predictions.reset_index(drop=True)
df_index = df_index.reset_index(drop=True)

# pd.concat(axis=1) matches rows on the index; with unequal lengths it pads the
# shorter frame with NaN instead of failing, which would silently leave
# forecasts without labels (or labels without forecasts).
if len(predictions) != len(df_index):
    raise ValueError(
        "cannot pair forecasts with series labels row by row: "
        f"{len(predictions)} forecast rows vs {len(df_index)} label rows"
    )

# NOTE(review): this pairing is purely positional — it assumes both frames list
# the series in the same order with the same number of rows per series; confirm.
df_pre = pd.concat([predictions, df_index], axis=1)

df_pre

Unnamed: 0,timestamp,mean,store_nbr,family
0,2017-08-16,4.345462,1,AUTOMOTIVE
1,2017-08-17,4.077597,1,AUTOMOTIVE
2,2017-08-18,5.049568,1,AUTOMOTIVE
3,2017-08-19,4.973159,1,AUTOMOTIVE
4,2017-08-20,2.209460,1,AUTOMOTIVE
...,...,...,...,...
28507,2017-08-27,2.792339,54,SEAFOOD
28508,2017-08-28,1.843797,54,SEAFOOD
28509,2017-08-29,2.874664,54,SEAFOOD
28510,2017-08-30,2.042266,54,SEAFOOD


In [10]:
# Load the test set, reading the "date" column as datetimes.
test_csv = "../datasets/时间序列预测测试集.csv"
df_test = pd.read_csv(test_csv, parse_dates=["date"])

df_test

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [11]:
# Reduce the test set to the merge keys and rename "date" to "timestamp" so the
# column names match the forecast frame.
#
# Written as a chain without inplace=True: the frame displayed by the loading
# cell is not mutated, and the cell can be re-run safely — errors='ignore' skips
# columns that were already dropped, and rename() ignores a missing "date".
df_test = (
    df_test
    .drop(columns=['id', 'onpromotion'], errors='ignore')
    .rename(columns={'date': 'timestamp'})
)

df_test

Unnamed: 0,timestamp,store_nbr,family
0,2017-08-16,1,AUTOMOTIVE
1,2017-08-16,1,BABY CARE
2,2017-08-16,1,BEAUTY
3,2017-08-16,1,BEVERAGES
4,2017-08-16,1,BOOKS
...,...,...,...
28507,2017-08-31,9,POULTRY
28508,2017-08-31,9,PREPARED FOODS
28509,2017-08-31,9,PRODUCE
28510,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES


In [12]:
# Attach the mean forecast to every test row, matching on store, family and date.
predictions = pd.merge(
    df_test,
    df_pre,
    # Both frames use the same key names, so a single `on=` list is enough.
    on=["store_nbr", "family", "timestamp"],
    # Keep every test row: one without a forecast shows up with a NaN "mean"
    # instead of silently disappearing as it would with the default inner join.
    how="left",
    # Fail loudly if a key occurs more than once on either side, which would
    # otherwise duplicate rows in the result.
    validate="one_to_one",
)

predictions

Unnamed: 0,timestamp,store_nbr,family,mean
0,2017-08-16,1,AUTOMOTIVE,4.345462
1,2017-08-16,1,BABY CARE,0.000046
2,2017-08-16,1,BEAUTY,3.641806
3,2017-08-16,1,BEVERAGES,2223.354171
4,2017-08-16,1,BOOKS,0.060321
...,...,...,...,...
28507,2017-08-31,9,POULTRY,376.082631
28508,2017-08-31,9,PREPARED FOODS,97.132338
28509,2017-08-31,9,PRODUCE,1204.466542
28510,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,64.850734
