In [1]:
# Data manipulation
# ==============================================================================
import numpy as np
import pandas as pd

# Plots
# ==============================================================================
import matplotlib.pyplot as plt
# Apply the base style sheet first, then override individual settings on top of it.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    'lines.linewidth': 1.5,
    'font.size': 10,
    'figure.figsize': (16, 10),
})

# Modeling and Forecasting
# ==============================================================================
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster

from statsmodels.tsa.stattools import adfuller
# Warnings configuration
# ==============================================================================
import warnings
# warnings.filterwarnings('ignore')

In [2]:
# Select the Qt GUI backend for matplotlib; the inline backend is left
# commented out as the alternative.
# %matplotlib inline
%matplotlib qt

In [3]:
from lutils.fin.data_loader import load, load_tq, load_ctp

In [4]:
# Exchange code and contract symbol that are handed to the data loader.
exchange = 'SHFE'
symbol_underlying = 'rb2305'

In [5]:
# Load the raw data for the contract through the project loader. Per the
# recorded output it reads one tick archive plus per-day .h5 files from
# mapped drives (Z:/ and Y:/), so this cell only runs where those are mounted.
# NOTE(review): the returned schema is not visible here; later cells assume
# columns 'datetime', 'last_price', 'volume' and 'amount' exist.
df_underlying = load(exchange, symbol_underlying)

load Z:/tq_data/ticks\SHFE.rb2305.h5
load Y:/fin_data\2023-05-04\SHFE.rb2305.h5
load Y:/fin_data\2023-05-05\SHFE.rb2305.h5
load Y:/fin_data\2023-05-08\SHFE.rb2305.h5
load Y:/fin_data\2023-05-09\SHFE.rb2305.h5
load Y:/fin_data\2023-05-10\SHFE.rb2305.h5
load Y:/fin_data\2023-05-11\SHFE.rb2305.h5
load Y:/fin_data\2023-05-12\SHFE.rb2305.h5
load Y:/fin_data\2023-05-15\SHFE.rb2305.h5


In [6]:
# Keep only the timestamp, the last price and the volume/amount columns.
tick_columns = ['datetime', 'last_price', 'volume', 'amount']
df = df_underlying[tick_columns]

In [7]:
# Discard every row that has a missing value in any selected column.
df = df.dropna(axis=0, how='any')

In [8]:
# Index the ticks by timestamp; the 'datetime' column itself is kept as well.
df.index = df['datetime']

In [9]:
# Size (rows, columns) of the cleaned tick frame.
df.shape

(7034205, 4)

In [10]:
# Bucket the ticks into 1-second bins and keep the last observation of each bin;
# bins without any tick come out as NaN rows.
resample_1s = df.resample(rule='1s').last()

In [11]:
# Fill empty seconds with the most recent observation, then back-fill whatever
# is still missing at the very start of the series.
# NOTE: this rebinds `resample_1s`, so the unfilled frame is no longer available.
resample_1s = resample_1s.ffill().bfill()

In [12]:
# Size of the gap-filled 1-second frame (one row per second over the whole span).
resample_1s.shape

(31431022, 4)

In [13]:
# Intraday time windows to keep (both endpoints inclusive, as between_time
# does by default). Everything outside these windows is dropped.
session_windows = [
    ('09:00', '10:15'),
    ('10:30', '11:30'),
    ('13:30', '15:00'),
    ('21:00', '23:00'),
]
session_frames = [
    resample_1s.between_time(start, end) for start, end in session_windows
]
df_1s = pd.concat(session_frames, axis=0).sort_index()[['last_price', 'volume', 'amount']]

In [14]:
# Size after restricting the 1-second frame to the selected time windows.
df_1s.shape

(7536256, 3)

In [15]:
# Peek at the first ten 1-second rows.
df_1s.head(10)

Unnamed: 0_level_0,last_price,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-05-16 21:00:00,4430.0,70.0,3093500.0
2022-05-16 21:00:01,4430.0,75.0,3315000.0
2022-05-16 21:00:02,4480.0,78.0,3449400.0
2022-05-16 21:00:03,4480.0,78.0,3449400.0
2022-05-16 21:00:04,4480.0,84.0,3718200.0
2022-05-16 21:00:05,4480.0,86.0,3807800.0
2022-05-16 21:00:06,4480.0,87.0,3852600.0
2022-05-16 21:00:07,4480.0,87.0,3852600.0
2022-05-16 21:00:08,4480.0,87.0,3852600.0
2022-05-16 21:00:09,4480.0,87.0,3852600.0


In [16]:
# 5-minute time-based rolling mean and standard deviation of the 1-second
# last price. The window is written as '5min' because the 'T' minute alias is
# deprecated in recent pandas; both spell the same offset. The column is
# selected up front so only 'last_price' is involved in the rolling window.
rolmean = df_1s['last_price'].rolling('5min').mean()
rolstd = df_1s['last_price'].rolling('5min').std()

In [17]:
# Re-label the rolling mean five minutes earlier: with `freq` given, shift()
# moves the index and leaves the values untouched, so timestamp t carries the
# mean of the window that ends at t + 5 min.
# '5min' replaces the deprecated 'T' minute alias (same offset).
rolmean_shift = rolmean.shift(periods=-1, freq='5min')

In [17]:
# Overlay the 1-second price with its 5-minute rolling mean and rolling std.
for _series, _label in (
    (df_1s.last_price, '1s'),
    (rolmean, '5 mean'),
    (rolstd, '5 std'),
):
    plt.plot(_series, label=_label)
plt.legend()

<matplotlib.legend.Legend at 0x1d003633b08>

In [18]:
# Natural log of the 1-second last price, and its 5-minute time-based rolling
# mean. '5min' replaces the deprecated 'T' minute alias (same offset).
logprice = np.log(df_1s['last_price'])
logmean = logprice.rolling('5min').mean()

In [20]:
# Compare the log price with its rolling mean.
for _series, _label in ((logprice, 'log price'), (logmean, 'log price mean')):
    plt.plot(_series, label=_label)
plt.legend()

<matplotlib.legend.Legend at 0x1d06f5d64c8>

In [19]:
# Deviation of the log price from its own rolling mean.
log_mean_diff = logprice.sub(logmean)

In [20]:
# 5-minute rolling mean and std of the detrended log price, used to eyeball
# whether its level and spread stay stable over time.
# '5min' replaces the deprecated 'T' minute alias (same offset).
log_mean_diff_mean = log_mean_diff.rolling('5min').mean()
log_mean_diff_std = log_mean_diff.rolling('5min').std()

In [26]:
# Plot the detrended log price together with its rolling mean and rolling std.
for _series, _label in (
    (log_mean_diff, 'log diff'),
    (log_mean_diff_mean, 'log diff mean'),
    (log_mean_diff_std, 'log diff std'),
):
    plt.plot(_series, label=_label)
plt.legend()

<matplotlib.legend.Legend at 0x1d02427e608>

In [25]:
# All observations that fall on 2023-01-12 (partial-date label selection).
log_mean_diff.loc['2023-01-12']

datetime
2023-01-12 09:00:00    0.000000
2023-01-12 09:00:01    0.000000
2023-01-12 09:00:02    0.000649
2023-01-12 09:00:03    0.000122
2023-01-12 09:00:04   -0.000292
                         ...   
2023-01-12 22:59:56    0.000541
2023-01-12 22:59:57   -0.000190
2023-01-12 22:59:58   -0.000194
2023-01-12 22:59:59    0.000526
2023-01-12 23:00:00    0.000520
Name: last_price, Length: 20704, dtype: float64

In [21]:
# def test_stationarity(timeseries):
#     dftest = adfuller(timeseries, autolag='AIC')
#     dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
#     for key,value in dftest[4].items():
#         dfoutput['Critical Value (%s)'%key] = value
#     print(dfoutput)

In [27]:
# Stationarity check on a single day (2023-01-13) of the detrended log price.
test_stationarity(log_mean_diff.loc['2023-01-13'])

Test Statistic                -9.980621e+00
p-value                        2.119551e-17
#Lags Used                     9.000000e+00
Number of Observations Used    2.069400e+04
Critical Value (1%)           -3.430666e+00
Critical Value (5%)           -2.861680e+00
Critical Value (10%)          -2.566844e+00
dtype: float64


In [None]:
# NOTE(review): `X_train` is not defined anywhere in the visible notebook and
# this cell has no execution count, so it appears never to have been run. The
# fitted scaler is also not assigned to a name. Define the training matrix
# and keep the result before relying on this cell.
preprocessing.StandardScaler().fit(X_train)

In [11]:
# df.rolling('5T').last_price.mean()

datetime
2022-06-16 09:22:27.500    4480.000000
2022-06-16 09:22:28.000    4480.000000
2022-06-16 09:22:34.500    4480.000000
2022-06-16 09:22:35.000    4480.000000
2022-06-16 09:22:35.500    4480.000000
                              ...     
2023-05-26 22:59:57.000    3447.174603
2023-05-26 22:59:57.500    3447.023810
2023-05-26 22:59:58.000    3446.873016
2023-05-26 22:59:58.500    3446.722222
2023-05-26 23:00:00.500    3446.495935
Name: last_price, Length: 4773290, dtype: float64

In [12]:
# 5-minute time-based rolling mean of the last price, computed directly on the
# tick-level frame (irregular timestamps).
# '5min' replaces the deprecated 'T' minute alias (same offset).
df_5t = df['last_price'].rolling('5min').mean()

In [13]:
# Keep only the first value for every timestamp that occurs more than once.
is_repeated_timestamp = df_5t.index.duplicated(keep='first')
df_5t = df_5t[~is_repeated_timestamp]

In [14]:
# a = df_5t.index.duplicated(keep='first')

In [15]:
# a[a == True]

In [16]:
# First few rolling-mean values after removing repeated timestamps.
df_5t.head()

datetime
2022-06-16 09:22:27.500    4480.0
2022-06-16 09:22:28.000    4480.0
2022-06-16 09:22:34.500    4480.0
2022-06-16 09:22:35.000    4480.0
2022-06-16 09:22:35.500    4480.0
Name: last_price, dtype: float64

In [17]:
# Work on an independent copy so that re-indexing it leaves df_5t untouched.
df_5t_shift = df_5t.copy(deep=True)

In [18]:
# Move every timestamp five minutes earlier; the values stay where they are.
# NOTE: this mutates the index in place, so re-running the cell shifts again.
five_minutes_back = pd.DateOffset(minutes=-5)
df_5t_shift.index = df_5t_shift.index + five_minutes_back

In [19]:
# Inspect the series whose index was moved five minutes earlier.
df_5t_shift

datetime
2022-06-16 09:17:27.500    4480.000000
2022-06-16 09:17:28.000    4480.000000
2022-06-16 09:17:34.500    4480.000000
2022-06-16 09:17:35.000    4480.000000
2022-06-16 09:17:35.500    4480.000000
                              ...     
2023-05-26 22:54:57.000    3447.174603
2023-05-26 22:54:57.500    3447.023810
2023-05-26 22:54:58.000    3446.873016
2023-05-26 22:54:58.500    3446.722222
2023-05-26 22:55:00.500    3446.495935
Name: last_price, Length: 4773258, dtype: float64

In [None]:
# df_5t_shift.index.get_indexer([30, 25, 58, 50, 69], method="nearest")

In [20]:
# For every timestamp of df_5t, take the entry of the shifted series whose
# timestamp is nearest. get_indexer returns integer positions, so they must be
# consumed with .iloc: plain [] on a datetime-indexed Series only worked through
# the deprecated positional fallback and treats integers as labels otherwise.
a = df_5t_shift.iloc[df_5t_shift.index.get_indexer(df_5t.index, method='nearest')]

# df_5t_shift.iloc[df_5t.index.get_loc(dt, method='nearest')]

In [21]:
# Unshifted rolling mean, displayed for comparison with the shifted copy.
df_5t

datetime
2022-06-16 09:22:27.500    4480.000000
2022-06-16 09:22:28.000    4480.000000
2022-06-16 09:22:34.500    4480.000000
2022-06-16 09:22:35.000    4480.000000
2022-06-16 09:22:35.500    4480.000000
                              ...     
2023-05-26 22:59:57.000    3447.174603
2023-05-26 22:59:57.500    3447.023810
2023-05-26 22:59:58.000    3446.873016
2023-05-26 22:59:58.500    3446.722222
2023-05-26 23:00:00.500    3446.495935
Name: last_price, Length: 4773258, dtype: float64

In [22]:
# df_5t_shift.index.get_loc(df_5t.index[0], method='nearest')

In [29]:
# Plot the rolling mean against its five-minute-shifted copy.
for _series, _label in ((df_5t, '5t'), (df_5t_shift, '5 shift')):
    plt.plot(_series, label=_label)
plt.legend()

<matplotlib.legend.Legend at 0x1278c4f81c8>

In [30]:
# First timestamp of the rolling-mean series.
df_5t.index[0]

Timestamp('2022-06-16 09:22:27.500000')

In [20]:
# Leftover IPython help lookup (`?` suffix) for Index.get_indexer; exploration
# only, and not valid outside IPython.
df_5t.index.get_indexer?

In [10]:
# Aggregate the ticks into 1-minute bars. Bins are closed on the left and
# labelled with their right edge, so the bar stamped 09:23:00 covers
# [09:22:00, 09:23:00).
# ohlc() is called without arguments: `_method` was a private parameter that
# leaked from the old pandas implementation and is rejected by newer releases.
price_min = df['last_price'].resample('1min', closed='left', label='right').ohlc()

# 'volume' and 'amount' are differenced into per-tick increments before being
# summed per minute; the first tick has no predecessor and contributes 0.
# NOTE(review): this assumes both columns are running totals. If the totals
# reset between trading days, the diff at each reset is negative and distorts
# that minute's sum; confirm against the data source.
_volume = df['volume'].diff(1).fillna(0)
volume_min = _volume.resample('1min', closed='left', label='right').sum()
_amount = df['amount'].diff(1).fillna(0)
amount_min = _amount.resample('1min', closed='left', label='right').sum()

# Minutes without any tick keep NaN prices but 0 volume/amount.
df_min = pd.concat([price_min, volume_min, amount_min], axis=1)

In [11]:
# Inspect the 1-minute bars.
df_min

Unnamed: 0_level_0,open,high,low,close,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-16 09:23:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:24:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:25:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:26:00,,,,,0.0,0.0
2022-06-16 09:27:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
...,...,...,...,...,...,...
2023-05-24 22:57:00,3450.0,3451.0,3449.0,3449.0,91.0,3138860.0
2023-05-24 22:58:00,3449.0,3451.0,3449.0,3451.0,7.0,241450.0
2023-05-24 22:59:00,3451.0,3451.0,3451.0,3451.0,0.0,0.0
2023-05-24 23:00:00,3451.0,3458.0,3447.0,3452.0,63.0,2173610.0


In [12]:
# Look up the bar labelled 2022-06-16 09:43; in the recorded output all four
# price fields are NaN while volume and amount are 0.
df_min.loc['2022-06-16 09:43']

open      NaN
high      NaN
low       NaN
close     NaN
volume    0.0
amount    0.0
Name: 2022-06-16 09:43:00, dtype: float64

In [13]:
# List the bars that have no opening price.
df_min.loc[df_min['open'].isna()]

Unnamed: 0_level_0,open,high,low,close,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-16 09:26:00,,,,,0.0,0.0
2022-06-16 09:43:00,,,,,0.0,0.0
2022-06-16 09:44:00,,,,,0.0,0.0
2022-06-16 09:48:00,,,,,0.0,0.0
2022-06-16 09:53:00,,,,,0.0,0.0
...,...,...,...,...,...,...
2023-05-24 20:55:00,,,,,0.0,0.0
2023-05-24 20:56:00,,,,,0.0,0.0
2023-05-24 20:57:00,,,,,0.0,0.0
2023-05-24 20:58:00,,,,,0.0,0.0


In [14]:
# Preview of the bars with NaN rows removed. The result is only displayed;
# df_min itself is not modified.
df_min.dropna()

Unnamed: 0_level_0,open,high,low,close,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-16 09:23:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:24:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:25:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:27:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
2022-06-16 09:28:00,4480.0,4480.0,4480.0,4480.0,0.0,0.0
...,...,...,...,...,...,...
2023-05-24 22:57:00,3450.0,3451.0,3449.0,3449.0,91.0,3138860.0
2023-05-24 22:58:00,3449.0,3451.0,3449.0,3451.0,7.0,241450.0
2023-05-24 22:59:00,3451.0,3451.0,3451.0,3451.0,0.0,0.0
2023-05-24 23:00:00,3451.0,3458.0,3447.0,3452.0,63.0,2173610.0


In [56]:
# Display NumPy's NaN value (scratch check).
np.nan

nan

In [60]:
# Is the opening price of the third bar (position 2) missing?
pd.isna(df_min['open'].iloc[2])

False

In [None]:
# Display the 1-minute bars again (this cell has no execution count).
df_min

In [21]:
# Small sample: the first 20 tick rows.
# NOTE: `a` is also assigned by an earlier cell, where it holds a different object.
a = df.head(20)

In [23]:
# Display the tick sample.
a

Unnamed: 0_level_0,datetime,last_price,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-16 09:22:27.500,2022-06-16 09:22:27.500,4480.0,1,44800.0
2022-06-16 09:22:28.000,2022-06-16 09:22:28.000,4480.0,1,44800.0
2022-06-16 09:22:34.500,2022-06-16 09:22:34.500,4480.0,1,44800.0
2022-06-16 09:22:35.000,2022-06-16 09:22:35.000,4480.0,1,44800.0
2022-06-16 09:22:35.500,2022-06-16 09:22:35.500,4480.0,1,44800.0
2022-06-16 09:22:50.500,2022-06-16 09:22:50.500,4480.0,1,44800.0
2022-06-16 09:23:03.500,2022-06-16 09:23:03.500,4480.0,1,44800.0
2022-06-16 09:23:04.000,2022-06-16 09:23:04.000,4480.0,1,44800.0
2022-06-16 09:24:06.000,2022-06-16 09:24:06.000,4480.0,1,44800.0
2022-06-16 09:24:06.500,2022-06-16 09:24:06.500,4480.0,1,44800.0


In [31]:
# Demonstrate shift() with a frequency on the sample: the index moves five
# minutes earlier while the row values (including the 'datetime' column) stay
# unchanged. '5min' replaces the deprecated 'T' minute alias (same offset).
# Alternative spelling tried during exploration:
# a.shift(-1, freq=pd.Timedelta('5min'))
a.shift(-1, freq='5min')

Unnamed: 0_level_0,datetime,last_price,volume,amount
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-16 09:17:27.500,2022-06-16 09:22:27.500,4480.0,1,44800.0
2022-06-16 09:17:28.000,2022-06-16 09:22:28.000,4480.0,1,44800.0
2022-06-16 09:17:34.500,2022-06-16 09:22:34.500,4480.0,1,44800.0
2022-06-16 09:17:35.000,2022-06-16 09:22:35.000,4480.0,1,44800.0
2022-06-16 09:17:35.500,2022-06-16 09:22:35.500,4480.0,1,44800.0
2022-06-16 09:17:50.500,2022-06-16 09:22:50.500,4480.0,1,44800.0
2022-06-16 09:18:03.500,2022-06-16 09:23:03.500,4480.0,1,44800.0
2022-06-16 09:18:04.000,2022-06-16 09:23:04.000,4480.0,1,44800.0
2022-06-16 09:19:06.000,2022-06-16 09:24:06.000,4480.0,1,44800.0
2022-06-16 09:19:06.500,2022-06-16 09:24:06.500,4480.0,1,44800.0
