In [None]:
import pandas as pd
import numpy

In [None]:
import matplotlib.pyplot as plt

import matplotlib.cbook as cbook
import matplotlib.dates as mdates


In [None]:
# Record the interpreter version this notebook is executed with.
import sys
print(sys.version)

## Порядок полей в файлах
### Идентификатор инструмента (FIGI)
### Дата и время начала свечи (UTC)
### Цена открытия (open)
### Цена закрытия (close)
### Максимальная цена за интервал (high)
### Минимальная цена за интервал (low)
### Объем в лотах (volume)

In [None]:
# Names of the candle fields in file order. The read_csv calls in this notebook
# load only file columns 1-6, so index 0 ('figi') is never used for renaming.
COLS_NAMES = ['figi', 'utc', 'open', 'close', 'high', 'low', 'volume']

In [None]:
# Show the working directory; the relative ./data paths used in this notebook resolve against it.
!pwd

In [None]:
# Load a single candles file. Rows are read header-less as 8 positional fields,
# of which only fields 1-6 are kept.
df1 = pd.read_csv(
    './data/sber_2023/e6123145-9665-43e0-8413-cd61b8aa9b13_20230101.csv',
    sep=';',
    names=list(range(8)),
    usecols=range(1, 7),
)

In [None]:
# Map the positional column labels 1-6 to their names from COLS_NAMES.
df1.rename(columns=dict(zip(range(1, 7), COLS_NAMES[1:7])), inplace=True)

In [None]:
# Parse the candle start timestamps from text into datetimes.
df1['utc'] = df1['utc'].pipe(pd.to_datetime)

In [None]:
# Use the candle start time as the row index.
df1 = df1.set_index(keys='utc')

In [None]:
# Preview the first rows to check the parsed columns and index.
df1.head(3)

In [None]:
# List the contents of the data directory.
!ls ./data

In [None]:
import os

# Directory holding the candle CSV files.
CANDLES_DIR = './data/sber_2023/'

# Read every CSV into its own frame and concatenate once at the end: growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on each
# iteration. Entries that are not CSV files (e.g. checkpoint folders) are skipped
# so they cannot break read_csv, and the listing is sorted so the load order is
# deterministic.
candle_frames = []
for filename in sorted(os.listdir(CANDLES_DIR)):
    if not filename.lower().endswith('.csv'):
        continue
    df_tmp = pd.read_csv(
        os.path.join(CANDLES_DIR, filename),
        sep=';',
        names=list(range(8)),   # files carry 8 positional fields, no header row
        usecols=range(1, 7),    # keep fields 1-6 only
    )
    candle_frames.append(
        df_tmp.rename(columns={i: COLS_NAMES[i] for i in range(1, 7)})
    )

if not candle_frames:
    raise FileNotFoundError(f'No .csv files found in {CANDLES_DIR}')

df = pd.concat(candle_frames, ignore_index=True)
# Parse first, then sort, so ordering is chronological rather than lexicographic.
df['utc'] = pd.to_datetime(df['utc'])
df = df.sort_values(by='utc').set_index('utc')
df.head(3)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# One entry per panel: the axes to draw on and the lines to put there as
# (column, format string, legend label, extra line kwargs).
panel_series = [
    (axes[0], [('open', 'g-', 'open price', {}),
               ('close', 'r--', 'close price', {'linewidth': 2})]),
    (axes[1], [('high', 'g-', 'highest price', {}),
               ('low', 'r--', 'lowest price', {'linewidth': 2})]),
]

for ax, series in panel_series:
    for column, fmt, label, extra in series:
        ax.plot(column, fmt, data=df, label=label, alpha=0.9, **extra)
    # Shared cosmetics: automatic date ticks, grid, legend and y-axis label.
    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.grid(True)
    ax.legend()
    ax.set_ylabel(r'Price')

In [None]:
# Open vs. close price over the whole loaded period on a single axes.
fig, axes = plt.subplots(1, 1, figsize=(20, 8))

axes.plot('open', 'g-', data=df, label='open price', alpha=0.9)
axes.plot('close', 'r--', data=df, label='close price', alpha=0.9, linewidth=1)
axes.xaxis.set_major_locator(mdates.AutoDateLocator())
# Configure minor ticks on this figure's own `axes` — not on `ax`, the loop
# variable left over from the two-panel cell, which belongs to a different figure.
axes.xaxis.set_minor_locator(mdates.AutoDateLocator())
axes.grid(True)
axes.legend()
axes.set_ylabel(r'Price')
plt.show();

In [None]:
# Open vs. close price for the 2023-02-01 .. 2023-02-15 slice.
fig, axes = plt.subplots(1, 1, figsize=(20, 8))

df_feb = df['2023-02-01':'2023-02-15']  # slice once, reuse for both lines
axes.plot('open', 'g-', data=df_feb, label='open price', alpha=0.9)
axes.plot('close', 'r--', data=df_feb, label='close price', alpha=0.9, linewidth=1)
axes.xaxis.set_major_locator(mdates.AutoDateLocator())
# Configure minor ticks on this figure's own `axes` — not on `ax`, the loop
# variable left over from the two-panel cell, which belongs to a different figure.
axes.xaxis.set_minor_locator(mdates.AutoDateLocator())
axes.grid(True)
axes.legend()
axes.set_ylabel(r'Price')
plt.show();

In [None]:
import holidays

# Keep only candles after 2023-01-10 that fall on Monday-Friday.
df = df[df.index > '2023-01-10']
df = df[df.index.dayofweek < 5]

# Pre-populate the calendar for the years actually present in the data: a
# `holidays` object created without `years` starts empty and is only filled
# lazily on lookups, so `.keys()` would yield nothing to filter on.
ru_holidays = holidays.RUS(years=df.index.year.unique().tolist())
holiday_days = pd.to_datetime(list(ru_holidays.keys()))

# Compare calendar days, not raw timestamps: an intraday candle time never
# equals a midnight holiday date, so matching the index directly keeps every row.
candle_days = df.index.normalize().tz_localize(None)
df = df[~candle_days.isin(holiday_days)]

In [None]:
# Keep only candles whose index hour lies in the half-open range [7, 15).
candle_hours = df.index.hour
df = df[(candle_hours >= 7) & (candle_hours < 15)]

In [None]:
# Display the filtered frame (pandas truncates the rendering of long frames).
df

In [None]:
# NOTE(review): 63810 is presumably len(df) as displayed by the cell above, and
# 8 * 60 the number of one-minute candles in the 8 retained hours, making this an
# estimate of the number of full sessions — confirm; the literal goes stale
# whenever the filters or the input data change.
63810 / (8 * 60)

In [None]:
# Open vs. close price for 2023-02-01 .. 2023-02-15 after the calendar/hour filters.
fig, axes = plt.subplots(1, 1, figsize=(20, 8))

df_feb = df['2023-02-01':'2023-02-15']  # slice once, reuse for both lines
axes.plot('open', 'g-', data=df_feb, label='open price', alpha=0.9)
axes.plot('close', 'r--', data=df_feb, label='close price', alpha=0.9, linewidth=1)
axes.xaxis.set_major_locator(mdates.AutoDateLocator())
# Configure minor ticks on this figure's own `axes` — not on `ax`, the loop
# variable left over from the two-panel cell, which belongs to a different figure.
axes.xaxis.set_minor_locator(mdates.AutoDateLocator())
axes.grid(True)
axes.legend()
axes.set_ylabel(r'Price')
plt.show();

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from math import sqrt


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import torch

# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.report.report_deco import ReportDeco

In [None]:
# Run configuration for the AutoML experiment.
# N_THREADS is used for the torch thread count and for the AutoML cpu_limit / reader n_jobs.
N_THREADS = 4
# N_FOLDS and TEST_SIZE are not referenced in the visible cells of this notebook.
N_FOLDS = 5
RANDOM_STATE = 42
TEST_SIZE = 0.2
# Passed as `timeout` to TabularAutoML — presumably seconds; confirm against the LightAutoML docs.
TIMEOUT = 300
# Column the model is trained to predict.
TARGET_NAME = 'close'

# Regression task evaluated with MSE.
task = Task('reg', metric='mse')
# Column roles handed to fit_predict: the target column and the columns to drop.
roles = {
    'target': TARGET_NAME,
    'drop': ['utc']
}

In [None]:
# Seed NumPy's global RNG and cap the number of threads torch may use.
np.random.seed(RANDOM_STATE)
torch.set_num_threads(N_THREADS)

In [None]:
# Row count divided by 60 — presumably the number of hours covered, assuming one-minute candles; confirm.
len(df)/60

In [None]:
# Time-ordered cross-validation: 2 splits, each with a test fold of 150 rows.
tscv = TimeSeriesSplit(n_splits=2, test_size=150)

In [None]:
# Hourly bars: mean of the minute candles per hour, hours without data dropped.
tmp = df.resample('H').mean().dropna().reset_index()[['utc', 'close', 'volume']]
tmp['utc'] = tmp['utc'].dt.tz_convert(None)  # strip the timezone from the timestamps
print(tmp.head())

# Look-back length (in hourly rows) shared by the rolling indicators.
window_size = 10
returns = tmp['close'].pct_change()  # computed once, reused for ROC and Volatility

tmp['MA'] = tmp['close'].rolling(window=window_size).mean()
tmp['ROC'] = returns
# Rolling std of returns. A plain .std() over the whole series would broadcast one
# constant to every row: useless as a feature, and computed from rows that later
# land in the test folds.
tmp['Volatility'] = returns.rolling(window=window_size).std()
tmp['EMA'] = tmp['close'].ewm(span=10, adjust=False).mean()
tmp['StdDev'] = tmp['close'].rolling(window=window_size).std()
# Bollinger-style bands: moving average +/- 2 rolling standard deviations.
tmp['UpperBB'] = tmp['MA'] + (2 * tmp['StdDev'])
tmp['LowerBB'] = tmp['MA'] - (2 * tmp['StdDev'])
tmp['VolumeChange'] = tmp['volume'].pct_change()
tmp['Momentum'] = tmp['close'].pct_change(periods=window_size)

# RSI from rolling mean gains and losses of the hour-to-hour close difference.
delta = tmp['close'].diff()
up = delta.clip(lower=0)      # gains only (losses zeroed, NaN kept)
down = delta.clip(upper=0)    # losses only (gains zeroed, NaN kept)
avg_gain = up.rolling(window=window_size).mean()
avg_loss = abs(down.rolling(window=window_size).mean())
rs = avg_gain / avg_loss
tmp['RSI'] = 100 - (100 / (1 + rs))

# Target: the NEXT hour's close. All indicators above were computed from the
# current and past closes before this shift.
tmp['close'] = tmp['close'].shift(-1)
tmp = tmp.dropna(subset=['close'])  # the last row has no next-hour target

# Fill the warm-up NaNs of the rolling indicators with the first available value.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
tmp = tmp.bfill()



In [None]:
# Inspect the first rows of the feature table.
tmp.head(7)

In [None]:
# Fit a fresh AutoML model on each time-series fold, score it on the fold's test
# rows with RMSE, and plot train / test / predicted target values.
rmse_values = []
for train_index, test_index in tscv.split(tmp):
    train = tmp.iloc[train_index]
    test = tmp.iloc[test_index]

    automl = TabularAutoML(
        task=task,
        timeout=TIMEOUT,
        cpu_limit=N_THREADS,
        reader_params={'n_jobs': N_THREADS, 'random_state': RANDOM_STATE},
    )
    out_of_fold_predictions = automl.fit_predict(train, roles=roles, verbose=0)
    test_predictions = automl.predict(test)
    predicted_close = test_predictions.data[:, 0]  # first prediction column

    # Per-fold RMSE, rounded to 2 decimals before being collected.
    fold_mse = mean_squared_error(test['close'], predicted_close)
    rmse_values.append(round(sqrt(fold_mse), 2))

    fig, axes = plt.subplots(1, 1, figsize=(12, 7))
    curves = (
        (train.index, train['close'], 'royalblue', 'train'),
        (test.index, test['close'], 'green', 'test'),
        (test.index, predicted_close, 'red', 'predictions'),
    )
    for x_vals, y_vals, colour, caption in curves:
        axes.plot(x_vals, y_vals, color=colour, label=caption)
    plt.grid(True)
    plt.legend()
    plt.show();

# Mean of the per-fold RMSE values.
overall_rmse = round(np.mean(rmse_values), 2)
print("Overall RMSE:", overall_rmse)


In [None]:
import joblib

In [None]:
# Show the working directory; the relative models/ path used in the next cells resolves against it.
!pwd

In [None]:
import os

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs('models', exist_ok=True)
# Persist `automl` as left by the training loop, i.e. the model fitted on the
# last cross-validation fold (the loop rebinds the name on every iteration).
joblib.dump(automl, 'models/automl_model_v1.pkl')

In [None]:
# Reload the saved model from disk. joblib.load unpickles the file, which can
# execute arbitrary code — only load files from a trusted source.
model = joblib.load('models/automl_model_v1.pkl')

In [None]:
# Show the feature scores reported by the reloaded model.
model.get_feature_scores()