In [2]:
import pandas as pd
import os
import csv
import glob
import numpy as np
import collections
import matplotlib.pyplot as plt

# Container for one instrument's bar data. Elsewhere in this notebook each field is
# filled with a float32 NumPy array holding one value per bar.
Prices = collections.namedtuple('Prices', field_names=['open', 'high', 'low', 'close', 'volume'])

In [7]:
# Raw string: the Windows backslashes must not be interpreted as escape sequences.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
train_path = r"D:\Practical_Reinforcement_Learning\chapter8\data\ch08-small-quotes\YNDX_150101_151231.csv"

In [3]:
def bar2rel(df,tolerance):
    """
    Convert a DataFrame of bars into a ``Prices`` tuple of relative prices.

    Columns are read by position: the five columns at 0-based positions 2..6 are
    taken as open, high, low, close and volume (the same columns the
    auto-generated ``_3`` .. ``_7`` itertuples fields refer to).

    Whenever a bar's open differs from the previous bar's (unmodified) close by
    more than ``tolerance``, the open is replaced by that close and the bar's
    low/high are widened so they still contain the new open.

    :param df: DataFrame with at least seven columns laid out as described above
    :param tolerance: maximum allowed absolute gap between a bar's open and the
                      previous bar's close before the open is overwritten
    :return: result of ``prices_to_relative`` applied to the float32 price arrays
    """
    prev_close = None
    o, h, l, c, v = [], [], [], [], []
    # name=None yields plain tuples, so columns can be sliced by position
    # instead of relying on pandas' generated attribute names.
    for row in df.itertuples(index=False, name=None):
        po, ph, pl, pc, pv = row[2:7]
        if prev_close is not None and abs(po - prev_close) > tolerance:
            po = prev_close
            pl = min(pl, po)
            ph = max(ph, po)
        o.append(po)
        c.append(pc)
        h.append(ph)
        l.append(pl)
        v.append(pv)
        # Compare the next bar against this bar's close as read from the frame.
        prev_close = pc
    prices=Prices(open=np.array(o, dtype=np.float32),
                  high=np.array(h, dtype=np.float32),
                  low=np.array(l, dtype=np.float32),
                  close=np.array(c, dtype=np.float32),
                  volume=np.array(v, dtype=np.float32))
    return prices_to_relative(prices)

def prices_to_relative(prices):
    """
    Express high, low and close as fractional offsets from each bar's open.

    :param prices: Prices tuple with open, high, low, close and volume arrays
    :return: Prices tuple whose open and volume are passed through unchanged and
             whose high/low/close are replaced by (value - open) / open
    """
    assert isinstance(prices, Prices)
    base = prices.open
    # NOTE(review): a bar whose open is 0 divides by zero here -- confirm the
    # inputs can never contain a zero open.
    relative = {
        field: (getattr(prices, field) - base) / base
        for field in ('high', 'low', 'close')
    }
    return Prices(open=base, volume=prices.volume, **relative)

def preprocess(path):
    """
    Load a quotes CSV, min-max normalise it and convert it to relative prices.

    :param path: location of the CSV file; it must contain the columns
                 <OPEN>, <HIGH>, <LOW>, <CLOSE> and <VOL>
    :return: the ``Prices`` tuple produced by ``bar2rel``
    """
    # Read the file the caller asked for (previously the global train_path was
    # used regardless of the argument).
    df = pd.read_csv(os.path.abspath(path))

    index = ['<OPEN>', "<HIGH>", "<LOW>","<CLOSE>","<VOL>"]
    df[index] = df[index].astype(float)
    # Column-wise min-max scaling to [0, 1].
    # NOTE(review): this maps the smallest open to exactly 0, which
    # prices_to_relative later divides by -- confirm this is intended.
    df_normalized = (df - df.min()) / (df.max() - df.min())
    # Largest open/previous-close gap that bar2rel leaves untouched
    tolerance = 1e-8

    return bar2rel(df_normalized,tolerance)

In [None]:
import gym
import gym.spaces
from gym.utils import seeding
from gym.envs.registration import EnvSpec
import enum
import numpy as np


DEFAULT_BARS_COUNT = 10
DEFAULT_COMMISSION_PERC = 0.1



class Actions(enum.Enum):
    """Discrete actions available to the agent on every step."""
    # Do nothing on this bar.
    Skip = 0
    # Open a position; State.step only acts on it when no position is held.
    Buy = 1
    # Close the position; State.step only acts on it when a position is held.
    Close = 2

class State:
    """
    Position-tracking state over a ``Prices`` series.

    Holds the current bar offset, whether a position is open and at which price
    it was opened; encodes a window of the last ``bars_count`` bars as a flat
    observation and computes the reward for each action.
    """
    def __init__(self, bars_count, commission_perc,
                 reset_on_close, reward_on_close=True,
                 volumes=True):
        """
        :param bars_count: number of bars included in every observation (> 0)
        :param commission_perc: amount subtracted from the reward on every
                                executed Buy and Close (>= 0.0)
        :param reset_on_close: if True, closing a position ends the episode
        :param reward_on_close: if True, profit is rewarded only when the position
                                is closed; otherwise it is rewarded bar by bar
        :param volumes: if True, each bar's volume is part of the observation
        """
        assert isinstance(bars_count, int)
        assert bars_count > 0
        assert isinstance(commission_perc, float)
        assert commission_perc >= 0.0
        assert isinstance(reset_on_close, bool)
        assert isinstance(reward_on_close, bool)
        self.bars_count = bars_count
        self.commission_perc = commission_perc
        self.reset_on_close = reset_on_close
        self.reward_on_close = reward_on_close
        self.volumes = volumes

    def reset(self, prices, offset):
        """
        Start a new episode on ``prices`` at bar ``offset`` with no open position.

        :param prices: Prices tuple to walk over
        :param offset: index of the current bar; must leave room for a full
                       window of ``bars_count`` bars behind it
        """
        assert isinstance(prices, Prices)
        assert offset >= self.bars_count-1
        self.have_position = False
        self.open_price = 0.0
        self._prices = prices
        self._offset = offset

    @property
    def shape(self):
        """Shape of the encoded observation as a 1-tuple."""
        # [h, l, c] (+ volume when enabled) per bar + position_flag + rel_profit
        if self.volumes:
            return 4 * self.bars_count + 1 + 1,
        else:
            return 3*self.bars_count + 1 + 1,

    def encode(self):
        """
        Convert current state into numpy array.

        Layout: for each of the last ``bars_count`` bars (oldest first) the high,
        low, close and, when enabled, volume values; then the position flag;
        then the relative profit of the open position (0.0 when flat).
        """
        # Zero-initialised so no slot can ever expose uninitialised memory.
        res = np.zeros(shape=self.shape, dtype=np.float32)
        shift = 0
        for bar_idx in range(-self.bars_count+1, 1):
            ofs = self._offset + bar_idx

            res[shift] = self._prices.high[ofs]
            shift += 1
            res[shift] = self._prices.low[ofs]
            shift += 1
            res[shift] = self._prices.close[ofs]
            shift += 1
            if self.volumes:
                res[shift] = self._prices.volume[ofs]
                shift += 1
        res[shift] = float(self.have_position)
        shift += 1
        if not self.have_position:
            res[shift] = 0.0
        else:
            res[shift] = self._cur_close() / self.open_price - 1.0
        return res

    def _cur_close(self):
        """
        Calculate real close price for the current bar
        """
        open_price = self._prices.open[self._offset]
        rel_close = self._prices.close[self._offset]
        return open_price * (1.0 + rel_close)

    def step(self, action):
        """
        Perform one step in our price, adjust offset, check for the end of prices
        and handle position change
        :param action: member of Actions
        :return: reward, done
        """
        assert isinstance(action, Actions)
        reward = 0.0
        done = False
        close = self._cur_close()
        if action == Actions.Buy and not self.have_position:
            self.have_position = True
            self.open_price = close
            reward -= self.commission_perc
        elif action == Actions.Close and self.have_position:
            reward -= self.commission_perc
            done |= self.reset_on_close
            if self.reward_on_close:
                reward += 100.0 * (close / self.open_price - 1.0)
            self.have_position = False
            self.open_price = 0.0

        # Move to the next bar; the episode ends once the last bar is reached.
        self._offset += 1
        prev_close = close
        close = self._cur_close()
        done |= self._offset >= self._prices.close.shape[0]-1

        # Per-bar reward mode: pay the percentage price change while in position.
        if self.have_position and not self.reward_on_close:
            reward += 100.0 * (close / prev_close - 1.0)

        return reward, done


class StocksEnv(gym.Env):
    """
    Gym environment that lets an agent trade over a single ``Prices`` series.

    Observations are produced by ``State.encode`` and actions are the members
    of ``Actions`` addressed by their integer value.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self, prices: Prices, bars_count=DEFAULT_BARS_COUNT,
                 commission=DEFAULT_COMMISSION_PERC,
                 reset_on_close=True, state_1d=False,
                 random_ofs_on_reset=True, reward_on_close=False,
                 volumes=False):
        """
        :param prices: Prices tuple the environment walks over
        :param bars_count: number of bars in every observation
        :param commission: commission passed to State
        :param reset_on_close: end the episode when a position is closed
        :param state_1d: accepted for interface compatibility; not used here
        :param random_ofs_on_reset: start every episode at a random offset
        :param reward_on_close: reward only on close instead of on every bar
        :param volumes: include volumes in the observation
        """
        self._prices = prices
        self._state = State(
            bars_count, commission, reset_on_close,
            reward_on_close=reward_on_close, volumes=volumes)
        self.action_space = gym.spaces.Discrete(n=len(Actions))
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf,
            shape=self._state.shape, dtype=np.float32)
        self.random_ofs_on_reset = random_ofs_on_reset

    def seed(self, seed=None):
        """Seed the environment's random generator and return the seeds used."""
        self.np_random, seed1 = seeding.np_random(seed)
        seed2 = seeding.hash_seed(seed1 + 1) % 2 ** 31
        return [seed1, seed2]

    def reset(self):
        """
        Start a new episode and return the first observation.

        A field name of the price tuple is drawn at random and reported as
        ``instrument`` in the step info; the starting offset is drawn at random
        when ``random_ofs_on_reset`` is set, otherwise it is ``bars_count``.
        """
        # seed() is not called from __init__, so make sure a generator exists
        # before it is used (some gym versions create one lazily, others do not).
        if getattr(self, "np_random", None) is None:
            self.seed()
        self._instrument = self.np_random.choice(
            list(self._prices._fields))
        # Look the field up by name; identity comparison of strings ("is") is
        # unreliable and the former if-chain always fell through to its else.
        prices = getattr(self._prices, self._instrument)
        bars = self._state.bars_count
        if self.random_ofs_on_reset:
            offset = self.np_random.choice(
                prices.shape[0]-bars*10) + bars
        else:
            offset = bars

        self._state.reset(self._prices, offset)
        return self._state.encode()

    def step(self, action_idx):
        """
        Apply the action with integer value ``action_idx``.

        :return: observation, reward, done, info
        """
        action = Actions(action_idx)
        reward, done = self._state.step(action)
        obs = self._state.encode()
        info = {
            "instrument": self._instrument,
            "offset": self._state._offset
        }
        return obs, reward, done, info

    


In [None]:
# Build the relative-price data from the training CSV and wrap it in an environment.
# volumes=True adds each bar's volume to the observation; reward_on_close=False
# selects per-bar rewards while a position is held (see State.step).
rp=preprocess(train_path)
env  = StocksEnv(rp, bars_count=10,
                 commission=0.1,
                 reset_on_close=True, state_1d=False,
                 random_ofs_on_reset=True, reward_on_close=False,
                 volumes=True)

The environment is now set up.
Next, let's set up a model that estimates Q-values from our observations.

In [10]:
import math
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F



The `SimpleFFDQN` class below is a neural network model defined using PyTorch. It is a simple feed-forward implementation of the Dueling Deep Q-Network (Dueling DQN) architecture; the "double" Q-learning part is handled separately, in the loss calculation.

Here's what the forward function does:

Value Stream (self.fc_val):

The input x (which represents the state observation) is passed through a series of fully connected layers (nn.Linear) with ReLU activation functions (nn.ReLU).
The output of the last linear layer is a single scalar value, representing the estimated value of the state (hence the name val). This value represents the expected return (or total future reward) that can be obtained from being in the given state.
Advantage Stream (self.fc_adv):

Similar to the value stream, the input x is passed through a series of fully connected layers with ReLU activation functions.
The output of the last linear layer is a vector with actions_n elements, where each element represents the estimated advantage for each action available in the environment.
Combining Value and Advantage Streams:

The value stream and the advantage stream are combined to produce the final output of the network. This is done by adding the value estimates (val) to the advantages (adv) after centering the advantages by subtracting their mean (adv - adv.mean(dim=1, keepdim=True)). This step helps in stabilizing the learning process by ensuring that the network can learn relative advantages of different actions while still having a baseline value estimate for each state.
The final output of the network is a tensor with actions_n elements, where each element represents the estimated Q-value for each action, considering both the state value and the advantages of each action.

This architecture separates the representation of state values and action advantages, which is a key idea in Dueling DQN. It helps in more stable and efficient learning, especially in environments with a large number of actions or where the advantages of different actions can vary significantly.

In [11]:
class SimpleFFDQN(nn.Module):
    """
    Feed-forward dueling Q-network.

    Two independent three-layer perceptrons read the same observation: one
    produces a single state value, the other one advantage per action.
    """
    def __init__(self, obs_len, actions_n):
        super().__init__()
        # Built in this order (value first, then advantage).
        self.fc_val = self._make_stream(obs_len, 1)
        self.fc_adv = self._make_stream(obs_len, actions_n)

    @staticmethod
    def _make_stream(in_features, out_features, hidden=512):
        """Build one Linear-ReLU-Linear-ReLU-Linear stream."""
        return nn.Sequential(
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, out_features)
        )

    def forward(self, x):
        """
        Return one Q-value per action for every observation in the batch ``x``.

        Q = V + (A - mean(A)): the advantages are centred over the action
        dimension (dim=1) before the state value is added to each of them.
        """
        value = self.fc_val(x)
        advantage = self.fc_adv(x)
        centered = advantage - advantage.mean(dim=1, keepdim=True)
        return value + centered

In [12]:
import numpy as np

import torch
import torch.nn as nn

import warnings
from typing import Iterable
from datetime import datetime, timedelta

import ptan
from ignite.engine import Engine
from ignite.metrics import RunningAverage
from ignite.contrib.handlers import TensorboardLogger
from ignite.contrib.handlers import tensorboard_logger

@torch.no_grad()
def calc_values_of_states(states, net, device="cpu"):
    """
    Average the best Q-value that ``net`` predicts over a set of states.

    The states are split into at most 64 chunks; for each non-empty chunk the
    mean of the per-state maximum Q-value is taken, and the chunk means are
    then averaged.

    :param states: array of states, one per row
    :param net: callable mapping a batch tensor to per-action values
    :param device: device the batches are moved to before calling ``net``
    :return: mean of the chunk means, or NaN when ``states`` is empty
    """
    mean_vals = []
    for batch in np.array_split(states, 64):
        # array_split pads with empty chunks when there are fewer than 64
        # states; their mean would be NaN and poison the final average.
        if len(batch) == 0:
            continue
        states_v = torch.tensor(batch).to(device)
        action_values_v = net(states_v)
        best_action_values_v = action_values_v.max(1)[0]
        mean_vals.append(best_action_values_v.mean().item())
    if not mean_vals:
        return float("nan")
    return np.mean(mean_vals)


def unpack_batch(batch):
    """
    Turn a list of experience entries into parallel NumPy arrays.

    Each entry must expose ``state``, ``action``, ``reward`` and ``last_state``;
    ``last_state is None`` marks the end of an episode.

    :param batch: iterable of experience entries
    :return: (states, actions, rewards as float32, done flags as uint8,
             last states); for finished episodes the last state is a
             placeholder equal to the state
    """
    states, actions, rewards, dones, last_states = [], [], [], [], []
    for exp in batch:
        # asarray avoids a copy when possible and, unlike np.array(copy=False),
        # does not raise under NumPy 2 when a copy is required.
        state = np.asarray(exp.state)
        states.append(state)
        actions.append(exp.action)
        rewards.append(exp.reward)
        dones.append(exp.last_state is None)
        if exp.last_state is None:
            last_states.append(state)       # the result will be masked anyway
        else:
            last_states.append(np.asarray(exp.last_state))
    return np.asarray(states), np.array(actions), np.array(rewards, dtype=np.float32), \
           np.array(dones, dtype=np.uint8), np.asarray(last_states)


def calc_loss(batch, net, tgt_net, gamma, device="cpu"):
    """
    Mean-squared TD error in double-DQN form.

    The greedy next action is picked with ``net`` and its value is read from
    ``tgt_net``; transitions flagged as done contribute no bootstrap value.

    :param batch: experience entries accepted by ``unpack_batch``
    :param net: network being trained
    :param tgt_net: callable used to evaluate the next states
    :param gamma: factor applied to the next-state value
    :param device: device the tensors are moved to
    :return: scalar loss tensor
    """
    states, actions, rewards, dones, next_states = unpack_batch(batch)

    def on_device(array):
        return torch.tensor(array).to(device)

    states_v, next_states_v = on_device(states), on_device(next_states)
    actions_v, rewards_v = on_device(actions), on_device(rewards)
    done_mask = torch.BoolTensor(dones).to(device)

    # Q(s, a) for the actions that were actually taken
    q_all = net(states_v)
    q_taken = q_all.gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # Action selection by the online net, evaluation by the target net
    _, best_next_actions = net(next_states_v).max(1)
    next_q = tgt_net(next_states_v).gather(1, best_next_actions.unsqueeze(-1)).squeeze(-1)
    next_q[done_mask] = 0.0

    target = next_q.detach() * gamma + rewards_v
    criterion = nn.MSELoss()
    return criterion(q_taken, target)


def batch_generator(buffer: ptan.experience.ExperienceReplayBuffer,
                    initial: int, batch_size: int):
    """
    Endless generator of training batches drawn from ``buffer``.

    Calls ``buffer.populate(initial)`` once up front; afterwards every
    iteration calls ``buffer.populate(1)`` and yields
    ``buffer.sample(batch_size)``. The generator never terminates on its own,
    so the consumer has to bound the iteration.

    :param buffer: replay buffer to fill and sample from
    :param initial: argument of the one-off initial ``populate`` call
    :param batch_size: argument passed to every ``sample`` call
    """
    buffer.populate(initial)
    while True:
        buffer.populate(1)
        yield buffer.sample(batch_size)




In [13]:
import ptan
import pathlib
import argparse
import gym.wrappers
import numpy as np

import torch
import torch.optim as optim

from ignite.engine import Engine
from ignite.contrib.handlers import tensorboard_logger
# from ignite.handlers import tensorboard_logger

In [14]:
tensorboard_logger.OutputHandler

ignite.handlers.tensorboard_logger.OutputHandler

In [None]:
# Raw strings: the Windows backslashes must not be interpreted as escape sequences.
# NOTE(review): absolute, machine-specific paths -- consider a configurable data directory.
train_path = r"D:\Practical_Reinforcement_Learning\chapter8\data\ch08-small-quotes\YNDX_150101_151231.csv"
# NOTE(review): val_path points at the same file as train_path, so validation
# would run on the training data -- confirm this is intended.
val_path = r"D:\Practical_Reinforcement_Learning\chapter8\data\ch08-small-quotes\YNDX_150101_151231.csv"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


BATCH_SIZE = 32
BARS_COUNT = 10

EPS_START = 1.0
EPS_FINAL = 0.1
EPS_STEPS = 1000000

GAMMA = 0.99

REPLAY_SIZE = 100000
REPLAY_INITIAL = 10000
REWARD_STEPS = 2
LEARNING_RATE = 0.0001
STATES_TO_EVALUATE = 1000

# Training environment
tp= preprocess(train_path)
env = StocksEnv( tp,bars_count=BARS_COUNT,
                 commission=0.1,
                 reset_on_close=True, state_1d=False,
                 random_ofs_on_reset=True, reward_on_close=False,
                 volumes=True)
# env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
# Validation environment, configured identically
vp = preprocess(val_path)
env_val = StocksEnv(vp, bars_count=BARS_COUNT,
                 commission=0.1,
                 reset_on_close=True, state_1d=False,
                 random_ofs_on_reset=True, reward_on_close=False,
                 volumes=True)

# Online network sized from the environment's spaces, plus its target-network wrapper
net = SimpleFFDQN(env.observation_space.shape[0],
                            env.action_space.n).to(device)
tgt_net = ptan.agent.TargetNet(net)

In [16]:
from typing import Union
class EpsilonTracker:
    """
    Keeps an epsilon-greedy selector's epsilon on a linear decay schedule.

    Epsilon starts at ``eps_start``, drops by ``1 / eps_frames`` per frame and
    is never set below ``eps_final``.
    """
    def __init__(self, selector: ptan.actions.EpsilonGreedyActionSelector,
                 eps_start: Union[int, float],
                 eps_final: Union[int, float],
                 eps_frames: int):
        self.selector, self.eps_start = selector, eps_start
        self.eps_final, self.eps_frames = eps_final, eps_frames
        # Put the selector on the schedule's starting value right away.
        self.frame(0)

    def frame(self, frame: int):
        """Set the selector's epsilon to the scheduled value for ``frame``."""
        floor = self.eps_final
        decayed = self.eps_start - frame / self.eps_frames
        self.selector.epsilon = decayed if decayed > floor else floor

In [17]:


# Action selector created with epsilon = EPS_START; constructing the tracker
# immediately sets selector.epsilon to the schedule's value for frame 0.
selector = ptan.actions.EpsilonGreedyActionSelector(EPS_START)
eps_tracker = EpsilonTracker(
    selector, EPS_START, EPS_FINAL, EPS_STEPS)



In [18]:
# Agent that picks actions from net's outputs through the epsilon-greedy selector.
agent = ptan.agent.DQNAgent(net, selector, device=device)
# Experience source over the training env, configured with GAMMA and REWARD_STEPS.
# NOTE(review): presumably this yields REWARD_STEPS-step transitions with
# discounted rewards -- confirm against the ptan documentation.
exp_source = ptan.experience.ExperienceSourceFirstLast(
    env, agent, GAMMA, steps_count=REWARD_STEPS)
# Replay buffer fed by the experience source, created with size REPLAY_SIZE.
buffer = ptan.experience.ExperienceReplayBuffer(
    exp_source, REPLAY_SIZE)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

In [None]:
# batch_generator never stops yielding, so looping over it would run (and
# print) forever. Pull a single batch to inspect what a sample looks like.
batch = batch_generator(buffer,12,32)
experience = next(batch)
print(experience)
    