# Постановка задачи

## Task #1

**Цель**
Обучаем сеть корректной последовательности действий на бирже: ожидание покупки -> покупка -> ожидание продажи -> продажа.
 
**Observation**
 - Состояние сделки

**Действия**
 1. Ожидание момента для покупки
 2. Покупка (открытие позиции)
 3. Ожидание момента для продажи
 4. Продажа (закрытие позици)

**Награда/штраф**
Действия 1 и 2 возможны при отсутствии открытой позиции. Действия 3 и 4 допустимы только при открытой позиции. При нарушении этого требования - сеть штрафуется.
 
Сеть получает награду при закрытии позиции.

# Импорты

In [1]:
# Системные импорты и настройки
import os
import sys
import yaml
import random
import warnings
import ipynbname
import logging.config

warnings.filterwarnings('ignore')

# for local development
RT_LIBS_PATH = "/Users/alex/Dev_projects/MyOwnRepo/rt_libs/src"
BA_LIBS_PATH = "/Users/alex/Dev_projects/MyOwnRepo/basic_application/src"
sys.path.append(RT_LIBS_PATH)
sys.path.append(BA_LIBS_PATH)

# read config
with open('config.yaml', "r") as stream:
    config = yaml.safe_load(stream)
    
# set logging config
log_config = config.get("log", None)
logging.config.dictConfig(log_config)

# set notebook alias
ALIAS = ipynbname.name()
print(ALIAS)

gen12.1-Abstract-04-Complex


In [2]:
# DS frameworks
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

%matplotlib notebook

In [3]:
# NN Frameworks
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, Concatenate, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPool1D, AveragePooling1D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.python.keras.models import load_model, clone_model

devices = tf.config.list_physical_devices()
print(devices)

2023-09-23 12:01:22.336471: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [4]:
# RT packages
from rl import DQNAgent
from env import TradeEnv
from core_v2 import Constructor, Player


from core_v2.data_point import DataPointFactory

from core_v2.observation_builder.precompute import PrecomputeOrderbookDiffFeature

from train_tools import plot_and_go
from train_tools.train_plot import TrainPlot4
from train_tools.train_manager import TrainManager

In [5]:
seed_value= 0
#os.environ['PYTHONHASHSEED']=str(seed_value)
random.seed(seed_value)
#np.random.seed(seed_value)
#tf.random.set_seed(seed_value)

# Конфиг

In [6]:
observation_len = 1

# Параметры точки наблюдения
observation_config = {
    "observation_len": observation_len,             # Количество точек наблюдения в сэмпле
    "offset": observation_len,                      # Количество точек наблюдения в сэмпле
    "future_points": 0,                             # Количество будущех точек для предсказания тренда (временное решение)
    "step_size": 1,                                 # Шаг по датасету
 }

# Датасет

In [7]:
n_steps = 100

np.random.seed(1)

sample_num = 10
sample_size = n_steps//sample_num

shift_size = 8

open_signal = np.empty(0)
close_signal = np.empty(0)

for i in range(sample_num):
    sample_shift_size = np.random.randint(1, shift_size)
    shift = np.zeros(sample_shift_size)
    
    sample_size_ = sample_size - sample_shift_size
    
    total = 2
    
    while not total==1:
        sample = np.random.uniform(size=sample_size_) > 0.9
        total = sum(sample)
    
    
    open_signal = np.concatenate([open_signal, sample, shift])
    close_signal = np.concatenate([close_signal, shift, sample])
    
    
lowest_ask = np.ones(len(open_signal)) - open_signal*0.5
highest_bid = np.ones(len(open_signal)) + close_signal*0.5

    
dataset = np.concatenate([
    lowest_ask.reshape(-1,1), 
    highest_bid.reshape(-1,1), 
    open_signal.reshape(-1,1), 
    close_signal.reshape(-1,1)
], axis=1)
data_train = pd.DataFrame(dataset, columns=["lowest_ask", "highest_bid", "open_signal", "close_signal"], dtype=np.float32)

#data_train['open_signal']  = data_train['open_signal']  - 0.5
#data_train['open_signal'] = data_train['open_signal'] * 2
#data_train['close_signal']  = data_train['close_signal']  - 0.5
#data_train['close_signal'] = data_train['close_signal'] * 2

print(data_train.values[:20])
print(data_train.sum())

[[0.5 1.  1.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.5 0.  1. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [0.5 1.  1.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.  0.  0. ]
 [1.  1.5 0.  1. ]
 [1.  1.  0.  0. ]]
lowest_ask       95.0
highest_bid     105.0
open_signal      10.0
close_signal     10.0
dtype: float32


# Инициализация компонентов

## Datapoint factory

In [8]:
dpf_train = DataPointFactory(dataset=data_train, **observation_config)
dpf_test = DataPointFactory(dataset=data_train, **observation_config)

## Env

In [9]:
core_config = {
    "action_controller":{"class": "AbstractTrainControllerOpenCloseSignal", "params":{ 
            "penalty": -1, 
            "wait_scale": 0, 
            "open_scale": 0, 
            "hold_scale": 0, 
            "close_scale": 1, 
            "last_points_mean": 0
        },},


    "observation_builder":{
        "class": "ObservationBuilder",
        "inputs": [
            {"class": "Input1D", "features": [
                {"class": "RawContextFeature", "params": {"name":"is_open"}},
                {"class": "RawValueFeature", "params": {"name":"open_signal"}},
                {"class": "RawValueFeature", "params": {"name":"close_signal"}},
                {"class": "RawContextFeature", "params": {"name":"open_signal"}}
            ]},
    ]
    }
}
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
core_constructor = Constructor()
env_core = core_constructor.get_core(ALIAS, core_config)

# train environment
env = TradeEnv(env_core, dpf_train, alias=ALIAS, log=True, log_obs=True)

# Нейронная сеть

In [10]:
ACTIVATION = 'relu'
def create_q_model(env):
    num_actions = env.action_space
    #----------------------------------------------
    
    inp_static = Input(shape=env.observation_space[0])
    classif = Dense(64, activation=ACTIVATION)(inp_static)
    classif = Dense(64, activation=ACTIVATION)(classif)
    classif = Dense(64, activation=ACTIVATION)(classif)
    classif = Dense(64, activation=ACTIVATION)(classif)
    output = Dense(num_actions, activation='softmax')(classif)

    model = Model(inputs=inp_static, outputs=output)
    return model

model = create_q_model(env)
model_target = create_q_model(env)

print(model.summary())

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 64)                320       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 4)                 260       
                                                                 
Total params: 13,060
Trainable params: 13,060
Non-trainable p

2023-09-23 12:01:27.073433: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Обучение

In [18]:
random.seed(seed_value)

core_train = core_constructor.get_core("train", core_config)
env = TradeEnv(core_train, dpf_train, alias=ALIAS, log=True, log_obs=True)

model = create_q_model(env)
model_target = create_q_model(env)
agent = DQNAgent(env, model, model_target)

agent.epsilon_random_frames = 500
agent.epsilon_greedy_frames = 4000
agent.max_memory_length     = 4000
agent.max_steps_per_episode = 50000
agent.gamma = 0.95
agent.epsilon_min = 0.01
agent.batch_size = 64
agent.update_after_actions = 4
agent.update_target_network = 250
agent.loss_function = tf.keras.losses.Huber() #tf.keras.losses.MeanSquaredError()
agent.optimizer = Adam(learning_rate=0.001, clipnorm=0.001)    #Adam(learning_rate=learning_rate) RMSprop(learning_rate=learning_rate) SGD(learning_rate=learning_rate)


tp = TrainPlot4()
core_test = core_constructor.get_core("test", core_config)
tm = TrainManager(agent, core_test, dpf_test, tp, alias=ALIAS)

In [19]:
tp.init_plot(width=1000, height=800)
tp.update_plot(tm.history)

FigureWidget({
    'data': [{'legendgroup': '1',
              'line': {'color': '#109618', 'width': 1},
              'mode': 'lines',
              'name': 'Train',
              'type': 'scatter',
              'uid': 'dea61004-3e8d-4502-b868-630e8fd5e9b6',
              'xaxis': 'x',
              'yaxis': 'y'},
             {'legendgroup': '1',
              'line': {'color': '#FF9900', 'width': 1},
              'mode': 'lines',
              'name': 'Test',
              'type': 'scatter',
              'uid': '6698cbd6-975e-465f-ab29-3c8a2cf6e552',
              'xaxis': 'x',
              'yaxis': 'y'},
             {'legendgroup': '2',
              'line': {'color': '#D62728', 'width': 1},
              'mode': 'lines',
              'name': 'Train',
              'type': 'scatter',
              'uid': '4ba117a1-dcf3-4a2c-8802-05caf87657c9',
              'xaxis': 'x2',
              'yaxis': 'y3'},
             {'legendgroup': '2',
              'line': {'color': '#FF9900'

In [20]:
tm.go(max_frames=25000, test_every=100, snapshot_every=500000, update_plot_every=100, save_since=0.06)

13:24:31 Running reward: -61.00   at episode 3    | frame 250    | eps: 0.94 | Running loss: 0.29375
13:24:35 Running reward: -61.60   at episode 6    | frame 500    | eps: 0.88 | Running loss: 0.19809
13:24:38 Running reward: -60.00   at episode 8    | frame 750    | eps: 0.81 | Running loss: 0.15256
13:24:42 Running reward: -56.60   at episode 11   | frame 1000   | eps: 0.75 | Running loss: 0.11974
13:24:45 Running reward: -55.58   at episode 13   | frame 1250   | eps: 0.69 | Running loss: 0.10581
13:24:49 Running reward: -53.07   at episode 16   | frame 1500   | eps: 0.63 | Running loss: 0.09204
13:24:54 Running reward: -51.76   at episode 18   | frame 1750   | eps: 0.57 | Running loss: 0.08524
13:24:58 Running reward: -49.20   at episode 21   | frame 2000   | eps: 0.51 | Running loss: 0.07830
13:25:02 Running reward: -47.59   at episode 23   | frame 2250   | eps: 0.44 | Running loss: 0.07407
13:25:07 Running reward: -45.92   at episode 26   | frame 2500   | eps: 0.38 | Running loss

13:29:46 Running reward: 5.20     at episode 212  | frame 20750  | eps: 0.01 | Running loss: 0.03285
13:29:49 Running reward: 6.07     at episode 215  | frame 21000  | eps: 0.01 | Running loss: 0.03262
13:29:52 Running reward: 6.30     at episode 217  | frame 21250  | eps: 0.01 | Running loss: 0.03252
13:29:56 Running reward: 6.63     at episode 220  | frame 21500  | eps: 0.01 | Running loss: 0.03198
13:29:59 Running reward: 6.77     at episode 222  | frame 21750  | eps: 0.01 | Running loss: 0.03151
13:30:02 Running reward: 6.70     at episode 225  | frame 22000  | eps: 0.01 | Running loss: 0.03131
13:30:06 Running reward: 7.00     at episode 228  | frame 22250  | eps: 0.01 | Running loss: 0.03090
13:30:09 Running reward: 6.90     at episode 230  | frame 22500  | eps: 0.01 | Running loss: 0.03086
13:30:12 Running reward: 8.57     at episode 233  | frame 22750  | eps: 0.01 | Running loss: 0.03113
13:30:16 Running reward: 8.73     at episode 235  | frame 23000  | eps: 0.01 | Running loss

- Базовый конфиг: 11к, 8.8к*, 11к*, 9.7k, 12к Общая черта - нестабильность после выхода на оптимальный результат.
    - 4 слой 32 нейрона: 19к, 5к, 12к, 7к, 15к Стабильность результатов выше, но бывает что без колбасы, а бывает, что как на 3 слоях.
    - 4 слоя по 64 нейрона: 5к, 8к, 17к, 10к,  Небольшая нестабильность в начале
        - agent.epsilon_greedy_frames = 8000(было 2к): 28к, 52k, 20k. По ощущениям стало дольше обучаться и расколбас сохраняется
            -lr=0.00025 (было 0.0005): 28к
        - epsilon_greedy_frames = 20000(было 2к): 36к, 66к, 36к есть нестабильность
        
        - utn=500: 16к, 21k иногда есть нестабильность.
        - utn=250: 14к, 14к, 14k
            - 3 слоя по 64: 17k, 20k, 11k
        
    - 64-32-16-18 | utn=250 | egf=2000: 15k, x Нестабильно.

In [17]:
env.get_step_info()

{'cursor': 1,
 'state': True,
 'observation': '[1.,0.]',
 'action': 1,
 'reward': 0,
 'total_reward': 0,
 'balance': 0,
 'profit': None,
 'lowest_ask': 0,
 'highest_bid': 0}

In [None]:
import random


dp = env.dp_factory.get_current_step()
obs = env.core.get_observation(data_point=dp)

agent._sample_transformer(obs)

indices = random.sample(range(len(agent.done_history)), agent.batch_size)
state_sample = [agent.state_history[i] for i in indices]


agent._batch_transformer(state_sample)


# Train

In [None]:
%matplotlib notebook

model = create_q_model(env)
model_target = create_q_model(env)

env.live_train_plot.init_plot(fig_size_x=20, fig_size_y=8, dpi=50, update=1)

agent = DQN(env, model, model_target, dqn_conf)

env.live_train_plot.lines["TotalReward"].ax.set_ylim(ymin=-1, ymax=1)

In [None]:
agent.train(max_frames=100000, goal_reward=45)

In [None]:
agent.model.save('models/model_' + ALIAS)

# Итоги

Наиболее простая задача из всех.

Сеть обучилась эффективному алгоримту - открывает и сразу закрывает сделку, без ожидания. Чем больше количество закрытых сделок, тем выше совокупная награда.

При одном слое из 4-х нейронов к концу обучения не выходила на оптимальнsй результат. При увеличении кол-ва нейронов или глубины алгоритм стал сходиться.

In [13]:
!pip install plotly

