 This notebook tries to train a reinforcement learning agent to trade bitcoin. It uses OpenAI's baselines library.
 
 Data:
 - daily bitcoin returns
 - supplemental data
     - bitcoin returns
     - twitter, reddit sentiment
     - gold, steel, S&P500, and more
 
Reinforcement learning features:
 
 
 - memory: the agent remembers the last few states so it can make predictions based on the history
 - [prioritised experience replay](https://arxiv.org/abs/1511.05952)
 - [dueling](https://arxiv.org/pdf/1511.06581.pdf)
 - [double q learning](https://arxiv.org/abs/1509.06461): corrects for situations where the agent overestimates reward, which leads to better performance
 
Machine learning features:

- dropout
- regularisation
- batchnorm
 

References:
 
 I used the ["A Deep Reinforcement Learning Framework for the Financial Portfolio Management Problem"](https://arxiv.org/abs/1706.10059) paper a lot for understanding the problem and ideas for model design.
 
 The trading environment is modified from here https://github.com/hackthemarket/gym-trading/blob/master/gym_trading/envs/TradingEnv.ipynb

In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# numeric
import quandl
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm

# logging
import logging
logger = log = logging.getLogger(__name__)
# log.setLevel(logging.INFO)
logging.basicConfig()
log.info('%s logger started.', __name__)

In [2]:
# rl
import gym
from gym import error, spaces, utils
from gym.utils import seeding

import baselines.common.tf_util as U
from baselines import logger
from baselines import deepq
from baselines.common.schedules import LinearSchedule, PiecewiseSchedule
from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer
from baselines.common.misc_util import (
    boolean_flag,
    pickle_load,
    pretty_eta,
    relatively_safe_pickle_dump,
    set_global_seeds,
    RunningAvg,
    SimpleMonitor
)

# ml
import tensorflow as tf
import tensorflow.contrib.layers as layers

In [3]:
import os
os.sys.path.append(os.path.abspath('.'))
%reload_ext autoreload
%autoreload 2

# Parameters

In [4]:
import argparse

def parse_args(argv=None):
    """Build the DQN experiment argument parser and parse ``argv``.

    Parameters
    ----------
    argv: list of str or None
        Command-line style arguments to parse. When None (the default) the
        notebook's fixed configuration is used: dueling network, prioritized
        replay, 900000 steps, checkpoints in ``./models`` and a target network
        update every 10000 iterations.

    Returns
    -------
    argparse.Namespace
        The parsed options (dashes in option names become underscores).
    """
    if argv is None:
        argv = ['--dueling', '--prioritized', '--num-steps', '900000',
                "--save-dir", "./models", '--target-update-freq', '10000']

    parser = argparse.ArgumentParser("DQN experiments for Atari games")
    # Environment
    parser.add_argument("--env", type=str, default="Pong", help="name of the game")
    parser.add_argument("--seed", type=int, default=42, help="which seed to use")
    # Core DQN parameters
    parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size")
    parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer")
    parser.add_argument("--num-steps", type=int, default=int(2e8), help="total number of steps to run the environment for")
    parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time")
    parser.add_argument("--learning-freq", type=int, default=4, help="number of iterations between every optimization step")
    parser.add_argument("--target-update-freq", type=int, default=40000, help="number of iterations between every target network update")
    # Bells and whistles
    boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning")
    boolean_flag(parser, "dueling", default=False, help="whether or not to use dueling model")
    boolean_flag(parser, "prioritized", default=False, help="whether or not to use prioritized replay buffer")
    parser.add_argument("--prioritized-alpha", type=float, default=0.6, help="alpha parameter for prioritized replay buffer")
    parser.add_argument("--prioritized-beta0", type=float, default=0.4, help="initial value of beta parameters for prioritized replay")
    parser.add_argument("--prioritized-eps", type=float, default=1e-6, help="eps parameter for prioritized replay buffer")
    # Checkpointing
    parser.add_argument("--save-dir", type=str, default=None, help="directory in which training state and model should be saved.")
    parser.add_argument("--save-azure-container", type=str, default=None,
                        help="It present data will saved/loaded from Azure. Should be in format ACCOUNT_NAME:ACCOUNT_KEY:CONTAINER")
    # The default is an int so that it agrees with type=int (argparse does not
    # convert non-string defaults, so 1e6 would have stayed a float).
    parser.add_argument("--save-freq", type=int, default=int(1e6), help="save model once every time this many iterations are completed")
    boolean_flag(parser, "load-on-start", default=True, help="if true and model was previously saved then training will be resumed")
    return parser.parse_args(argv)
args = parse_args()
args

# Resolve where checkpoints go: a local directory and, optionally, a remote
# container they are mirrored to.
savedir = args.save_dir
if args.save_azure_container is not None:
    # Nothing else in this file brings Container into scope, so import it here,
    # on the only path that needs it.
    from baselines.common.azure_utils import Container

    account_name, account_key, container_name = args.save_azure_container.split(":")
    container = Container(account_name=account_name,
                          account_key=account_key,
                          container_name=container_name,
                          maybe_create=True)
    if savedir is None:
        # Careful! This will not get cleaned up.
        # mkdtemp() is used instead of TemporaryDirectory().name because a
        # TemporaryDirectory removes its directory as soon as the (unreferenced)
        # object is finalized, which would leave savedir pointing at a path
        # that no longer exists.
        savedir = tempfile.mkdtemp()
else:
    container = None

# Number of observations stacked into one agent state; used below for the
# FlatMemory window and as a convolution kernel size.
window_len = 7*8
window_len

56

# helpers

In [5]:
def maybe_save_model(savedir, container, state):
    """Checkpoint the model variables and the training state under ``savedir``.

    Does nothing when ``savedir`` is None. Otherwise three artefacts are
    written, in this order, and each one is pushed to ``container`` right after
    it is written (when a container is given):

    * ``model-<num_iters>/saved``  - variables, via ``U.save_state``
    * ``training_state.pkl.zip``   - the whole ``state`` dict, compressed
    * ``monitor_state.pkl``        - ``state["monitor_state"]`` only

    ``state`` must provide the keys ``"num_iters"`` and ``"monitor_state"``.
    """
    if savedir is None:
        return

    def mirror(relative_name):
        # Push an artefact that was just written locally to the container.
        if container is not None:
            container.put(os.path.join(savedir, relative_name), relative_name)

    began_at = time.time()

    checkpoint_name = "model-{}".format(state["num_iters"])
    U.save_state(os.path.join(savedir, checkpoint_name, "saved"))
    mirror(checkpoint_name)

    relatively_safe_pickle_dump(state, os.path.join(savedir, 'training_state.pkl.zip'), compression=True)
    mirror('training_state.pkl.zip')

    relatively_safe_pickle_dump(state["monitor_state"], os.path.join(savedir, 'monitor_state.pkl'))
    mirror('monitor_state.pkl')

    logger.log("Saved model in {} seconds\n".format(time.time() - began_at))


def maybe_load_model(savedir, container):
    """Restore the latest checkpoint found under ``savedir``, if any.

    Returns the unpickled training-state dict after loading the matching
    ``model-<num_iters>`` variables with ``U.load_state``. Returns None when
    ``savedir`` is None or no ``training_state.pkl.zip`` is available (locally,
    or from ``container`` when one is given).
    """
    if savedir is None:
        return None

    state_file = os.path.join(savedir, 'training_state.pkl.zip')
    if container is None:
        checkpoint_available = os.path.exists(state_file)
    else:
        logger.log("Attempting to download model from Azure")
        checkpoint_available = container.get(savedir, 'training_state.pkl.zip')

    if not checkpoint_available:
        return None

    # NOTE: this unpickles the state file; only point savedir/container at
    # checkpoints you trust.
    state = pickle_load(state_file, compression=True)
    checkpoint_name = "model-{}".format(state["num_iters"])
    if container is not None:
        container.get(savedir, checkpoint_name)
    U.load_state(os.path.join(savedir, checkpoint_name, "saved"))
    logger.log("Loaded models checkpoint at {} iterations".format(state["num_iters"]))
    return state


In [6]:
from rl.memory import Memory, SequentialMemory
class FlatMemory(Memory):
    """Memory whose recent-state window is returned as one numpy array."""

    def get_recent_state(self, current_observation):
        """Return the parent's recent-state window wrapped in ``np.array``.

        The original idea was to flatten the window because handling an extra
        dimension inside baselines is hard; the ``.flatten()`` call is
        currently disabled, so the window keeps its leading dimension.
        """
        window = super().get_recent_state(current_observation)
        return np.array(window)

In [7]:
# modified the baselines monitor to record the mean of ALL infos
class SimpleMonitor(gym.Wrapper):
    """gym wrapper that records per-episode reward, length, end time and mean info.

    Uses the underscore hooks (``_reset`` / ``_step``) of the gym wrapper API
    this notebook was written against.
    """

    def __init__(self, env=None):
        """Adds three quantities to info returned by every step:
            steps: int
                Number of steps taken so far
            rewards: [float]
                All the cumulative rewards for the episodes completed so far.
            infos: [dict]
                Per-episode mean of the numeric info values, for the episodes
                completed so far.
        """
        super().__init__(env)
        # current episode state
        self._current_reward = None
        # (was initialised as `_current_info`, a name no other method reads)
        self._current_infos = None
        self._num_steps = None
        # temporary monitor state that we do not save
        self._time_offset = None
        self._total_steps = None
        # monitor state
        self._episode_rewards = []
        self._episode_infos = []
        self._episode_lengths = []
        self._episode_end_times = []

    def _reset(self):
        """Reset the wrapped env, first archiving the episode that just ended (if any)."""
        obs = self.env.reset()
        # recompute temporary state if needed
        if self._time_offset is None:
            self._time_offset = time.time()
            if len(self._episode_end_times) > 0:
                self._time_offset -= self._episode_end_times[-1]
        if self._total_steps is None:
            self._total_steps = sum(self._episode_lengths)
        # update monitor state
        if self._current_reward is not None:
            self._episode_rewards.append(self._current_reward)
            # FIXME a bit slow to use a dataframe here
            # numeric_only=True: average only numeric info values instead of
            # relying on pandas to drop (or fail on) non-numeric columns.
            self._episode_infos.append(
                pd.DataFrame(self._current_infos).mean(numeric_only=True).to_dict())
            self._episode_lengths.append(self._num_steps)
            self._episode_end_times.append(time.time() - self._time_offset)
        # reset episode state
        self._current_reward = 0
        self._current_infos = []
        self._num_steps = 0

        return obs

    def _step(self, action):
        """Step the wrapped env and attach the monitor statistics to ``info``."""
        obs, rew, done, info = self.env.step(action)
        self._current_reward += rew
        self._num_steps += 1
        self._total_steps += 1
        info['steps'] = self._total_steps
        # Store a snapshot taken *before* the history lists are attached below:
        # storing `info` itself would put those ever-growing lists into every
        # per-step record that is later averaged in _reset.
        self._current_infos.append(dict(info))
        info['rewards'] = self._episode_rewards
        info['infos'] = self._episode_infos
        return (obs, rew, done, info)

    def get_state(self):
        """Return the persistent monitor state as a picklable dict."""
        # The env id is deliberately not recorded (check disabled in the original).
        return {
            'episode_data': {
                'episode_rewards': self._episode_rewards,
                'episode_infos': self._episode_infos,
                'episode_lengths': self._episode_lengths,
                'episode_end_times': self._episode_end_times,
                'initial_reset_time': 0,
            }
        }

    def set_state(self, state):
        """Restore the episode history from a dict produced by ``get_state``."""
        ed = state['episode_data']
        self._episode_rewards = ed['episode_rewards']
        self._episode_infos = ed['episode_infos']
        self._episode_lengths = ed['episode_lengths']
        self._episode_end_times = ed['episode_end_times']


# Environment

Day trading over 256 days. We scale and augment the training data.

You can see the base environment class [here](https://github.com/openai/gym/blob/master/gym/core.py#L13) and openai's nice docs [here](https://gym.openai.com/docs)

Our environment is based on https://github.com/hackthemarket/gym-trading/blob/master/gym_trading/envs/TradingEnv.ipynb.

In [8]:
from src.environments.portfolio import PortfolioEnv

In [9]:
# Training environment: built from the 'train' key of the HDF file, with
# scaling on and a small augmentation factor, then wrapped in the monitor.
df_train = pd.read_hdf('./data/poliniex_30m.hf',key='train')
env = PortfolioEnv(
    df=df_train,
    steps=128, 
    scale=True, 
    augument=0.0005    
)
# NOTE(review): this assigns an attribute named `seed`; if PortfolioEnv follows
# the gym API where `seed` is a method, this replaces the method instead of
# seeding the env - confirm against PortfolioEnv.
env.seed = 0   
monitored_env = SimpleMonitor(env)
monitored_env

# Test environment: same settings on the 'test' key, with augmentation set to 0.
df_test = pd.read_hdf('./data/poliniex_30m.hf',key='test')
env_test = PortfolioEnv(
    df=df_test,
    steps=128, 
    scale=True, 
    augument=0.00)
# NOTE(review): same attribute-vs-method question as for `env.seed` above.
env_test.seed = 0   
monitored_env_test = SimpleMonitor(env_test)
monitored_env_test.observation_space

Box(6, 8)

In [10]:
# Interactive TensorFlow session used by everything built below.
session = tf.InteractiveSession()

## SELU?

I tried SELU but it didn't help. It's meant to replace batchnorm and ELU with fewer parameters.
There have been varied reports about it; see this [reddit discussion]( https://www.reddit.com/r/MachineLearning/comments/6g5tg1/r_selfnormalizing_neural_networks_improved_elu/)

In [11]:
# https://stackoverflow.com/questions/44621731/how-to-handle-the-batchnorm-layer-when-training-fully-convolutional-networks-by
def selu(x, name="selu"):
    """Scaled ELU activation: ``scale * x`` where ``x >= 0``, else ``scale * alpha * elu(x)``.

    ``name`` is accepted for API compatibility but is not used.
    """
    alpha = 1.6732632423543772848170429916717
    scale = 1.0507009873554804934193349852946
    is_non_negative = x >= 0.0
    negative_branch = alpha * tf.nn.elu(x)
    selected = tf.where(is_non_negative, x, negative_branch)
    return scale * selected

# Model

arXiv:1612.01277 indicated that CNNs are just as effective. That's great because I like them: they are fast, so I can try more things and see the results sooner. So we will be using a CNN model.


In [12]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/models.py
import tensorflow as tf
import tensorflow.contrib.layers as layers

def _cnn_mlp(convs, hiddens, dueling, inpt, num_actions, scope, reuse=False):
    """CNN=>Dense model using dropout

    Parameters
    ----------
    convs: iterable of (num_outputs, kernel_size, stride)
        one convolution layer is built per tuple, in order.
    hiddens: iterable of int
        sizes of the fully connected layers in each head (may be empty).
    dueling: bool
        if true a separate state-value head is built and combined with the
        mean-centred action scores.
    inpt: tensor
        network input.
    num_actions: int
        width of the action output layer.
    scope: str
        variable scope the whole network is built in.
    reuse: bool
        passed to the outer variable scope.

    Returns
    -------
    tensor
        the softmax action scores, or (when dueling) the state score plus the
        mean-centred action scores.
    """
    with tf.variable_scope(scope, reuse=reuse):
        out = inpt
        with tf.variable_scope("convnet"):
            for num_outputs, kernel_size, stride in convs:
                out = layers.convolution2d(out,
                    num_outputs=num_outputs,
                    kernel_size=kernel_size,
                    stride=stride,
                    weights_regularizer=tf.contrib.layers.l2_regularizer(1e-8),
                    activation_fn=tf.nn.relu,
#                     normalizer_fn=tf.layers.batch_normalization,                    
                )
                # NOTE(review): the second positional argument of
                # tf.contrib.layers.dropout is keep_prob, so 0.3 would keep 30%
                # of the units, and is_training is left at its default - confirm
                # both are intended (same for the dropout calls below).
                out = layers.dropout(out, 0.3)
        out = layers.flatten(out)
        with tf.variable_scope("action_value"):
            action_out = out
            for hidden in hiddens:
                action_out = layers.fully_connected(
                    action_out, 
                    num_outputs=hidden, 
                    activation_fn=tf.nn.relu,
                    weights_regularizer=tf.contrib.layers.l2_regularizer(1e-8),
#                     normalizer_fn=tf.layers.batch_normalization, 
                )
#                 action_out = tf.layers.batch_normalization(action_out)
                action_out = layers.dropout(action_out, 0.3)
            # The action head ends in a softmax, so the action scores of one
            # sample are non-negative and sum to 1.
            action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=tf.nn.softmax)
        
        if dueling:
            with tf.variable_scope("state_value"):
                state_out = out
                for hidden in hiddens:
                    # NOTE(review): batch normalization is enabled only in this
                    # head (it is commented out in the others) - confirm.
                    state_out = layers.fully_connected(
                        state_out, num_outputs=hidden, activation_fn=tf.nn.relu,
                        weights_regularizer=tf.contrib.layers.l2_regularizer(1e-8),
                        normalizer_fn=tf.layers.batch_normalization,
                    )
#                     state_out = tf.layers.batch_normalization(state_out)
                    state_out = layers.dropout(state_out, 0.3)
                state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
            # Centre the action scores per sample before adding the state score.
            action_scores_mean = tf.reduce_mean(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
            return state_score + action_scores_centered
        else:
            return action_scores

def cnn_mlp(convs, hiddens, dueling):
    """Bind the architecture and return a model function.

    The returned callable forwards its positional and keyword arguments (the
    input tensor, number of actions, scope, reuse, ...) to ``_cnn_mlp`` after
    ``convs``, ``hiddens`` and ``dueling``.
    """
    def q_func(*args, **kwargs):
        return _cnn_mlp(convs, hiddens, dueling, *args, **kwargs)

    return q_func

# Q-network used for training: three convolutions given as
# (num_outputs, kernel_size, stride), the middle one with a kernel of
# window_len, no hidden dense layers, dueling head as set on the command line.
model = cnn_mlp(
    convs=[(12, 1, 1), (24, window_len, 4), (24, 1, 1)],#, (64, 4, 2), (64, 3, 1)],
#     convs=[(32, 8, 1), (64, 8, 4)],#, (64, 4, 2), (64, 3, 1)],
    hiddens=[],
    dueling=args.dueling
)

In [13]:
# remember our last observations: agent memory (window of window_len observations)
memory = FlatMemory(window_len)

In [14]:
deepq.build_train?
env.action_space.shape[0]

6

In [15]:
tf.where?

In [16]:
tf.random_uniform?
tf.stack([11])

<tf.Tensor 'stack:0' shape=(1,) dtype=int32>

In [17]:
# need custom build_train since we don't want to argmax it
import tensorflow as tf
import baselines.common.tf_util as U


def build_act(make_obs_ph, q_func, num_actions, scope="deepq", reuse=None):
    """Creates the act function.

    Unlike the stock baselines version, the action returned is the whole
    vector of q_func outputs (no argmax is taken).

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that take a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation. Called as
        act(observation, stochastic=True, update_eps=-1); a non-negative
        update_eps overwrites the stored exploration rate.
    """
    with tf.variable_scope(scope, reuse=reuse):
        observations_ph = U.ensure_tf_input(make_obs_ph("observation"))
        stochastic_ph = tf.placeholder(tf.bool, (), name="stochastic")
        update_eps_ph = tf.placeholder(tf.float32, (), name="update_eps")

        eps = tf.get_variable("eps", (), initializer=tf.constant_initializer(0))

        q_values = q_func(observations_ph.get(), num_actions, scope="q_func")
        # No argmax: the network output itself is the action vector.
        deterministic_actions = q_values

        batch_size = tf.shape(observations_ph.get())[0]
        action_shape = tf.stack([batch_size, num_actions])
        # tf.where needs both branches to share one dtype. The original drew
        # int64 values in [0, num_actions), which cannot be combined with the
        # float network output, so random actions are now drawn as floats.
        # NOTE(review): [0, 1) is chosen to match the range of a softmax
        # output - confirm this is the exploration distribution you want.
        random_actions = tf.random_uniform(action_shape, minval=0, maxval=1,
                                           dtype=deterministic_actions.dtype)
        # Exploration is decided independently for every element of the action vector.
        chose_random = tf.random_uniform(action_shape, minval=0, maxval=1, dtype=tf.float32) < eps
        stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions)

        # do either random or deterministic actions based on exploration
        output_actions = tf.cond(stochastic_ph, lambda: stochastic_actions, lambda: deterministic_actions)

        # A negative update_eps (the default given below) leaves eps unchanged.
        update_eps_expr = eps.assign(tf.cond(update_eps_ph >= 0, lambda: update_eps_ph, lambda: eps))

        act = U.function(inputs=[observations_ph, stochastic_ph, update_eps_ph],
                         outputs=output_actions,
                         givens={update_eps_ph: -1.0, stochastic_ph: True},
                         updates=[update_eps_expr])
        return act


def build_train(make_obs_ph, q_func, num_actions, optimizer, grad_norm_clipping=None, gamma=1.0, double_q=True, scope="deepq", reuse=None):
    """Creates the train function:

    Parameters
    ----------
    make_obs_ph: str -> tf.placeholder or TfInput
        a function that takes a name and creates a placeholder of input with that name
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    num_actions: int
        number of actions
    optimizer: tf.train.Optimizer
        optimizer to use for the Q-learning objective.
    grad_norm_clipping: float or None
        clip gradient norms to this value. If None no clipping is performed.
    gamma: float
        discount rate.
    double_q: bool
        if true will use Double Q Learning (https://arxiv.org/abs/1509.06461).
        In general it is a good idea to keep it enabled.
    scope: str or VariableScope
        optional scope for variable_scope.
    reuse: bool or None
        whether or not the variables should be reused. To be able to reuse the scope must be given.

    Returns
    -------
    act: (tf.Variable, bool, float) -> tf.Variable
        function to select an action given an observation (built by build_act).
    train: (object, np.array, np.array, object, np.array, np.array) -> np.array
        optimize the error in Bellman's equation. Inputs are, in order:
        obs_t, action, reward, obs_tp1, done, importance weight; the output is
        the TD error.
    update_target: () -> ()
        copy the parameters from optimized Q function to the target Q function.
    debug: {str: function}
        a bunch of functions to print debug data like q_values.
    """
    act_f = build_act(make_obs_ph, q_func, num_actions, scope=scope, reuse=reuse)

    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_t_input = U.ensure_tf_input(make_obs_ph("obs_t"))
        # NOTE(review): actions are fed here as one int index per sample, while
        # build_act returns the full q-value vector as the action - confirm the
        # two are meant to be used together.
        act_t_ph = tf.placeholder(tf.int32, [None], name="action")
        rew_t_ph = tf.placeholder(tf.float32, [None], name="reward")
        obs_tp1_input = U.ensure_tf_input(make_obs_ph("obs_tp1"))
        done_mask_ph = tf.placeholder(tf.float32, [None], name="done")
        importance_weights_ph = tf.placeholder(tf.float32, [None], name="weight")

        # q network evaluation
        q_t = q_func(obs_t_input.get(), num_actions, scope="q_func", reuse=True)  # reuse parameters from act
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # target q network evalution
        q_tp1 = q_func(obs_tp1_input.get(), num_actions, scope="target_q_func")
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(act_t_ph, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if double_q:
            # Double Q: the online network picks the action, the target network scores it.
            q_tp1_using_online_net = q_func(obs_tp1_input.get(), num_actions, scope="q_func", reuse=True)
            q_tp1_best_using_online_net = tf.arg_max(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        # no future value is added for transitions that ended an episode
        q_tp1_best_masked = (1.0 - done_mask_ph) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = rew_t_ph + gamma * q_tp1_best_masked

        # compute the error (potentially clipped)
        td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = U.huber_loss(td_error)
        weighted_error = tf.reduce_mean(importance_weights_ph * errors)
        # compute optimization op (potentially with gradient clipping)
        if grad_norm_clipping is not None:
            optimize_expr = U.minimize_and_clip(optimizer,
                                                weighted_error,
                                                var_list=q_func_vars,
                                                clip_val=grad_norm_clipping)
        else:
            optimize_expr = optimizer.minimize(weighted_error, var_list=q_func_vars)

        # update_target_fn will be called periodically to copy Q network to target Q network
        # (variables are paired by sorting both lists by name)
        update_target_expr = []
        for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name),
                                   sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        update_target_expr = tf.group(*update_target_expr)

        # Create callable functions
        train = U.function(
            inputs=[
                obs_t_input,
                act_t_ph,
                rew_t_ph,
                obs_tp1_input,
                done_mask_ph,
                importance_weights_ph
            ],
            outputs=td_error,
            updates=[optimize_expr]
        )
        update_target = U.function([], [], updates=[update_target_expr])

        q_values = U.function([obs_t_input], q_t)

        return act_f, train, update_target, {'q_values': q_values}

In [18]:
# Create training graph and replay buffer
# One agent state is a window of window_len observations.
input_shape = (window_len, )+env.observation_space.shape
# NOTE(review): this calls the stock deepq.build_train, not the custom
# build_train defined in this notebook - confirm which one is intended.
act, train, update_target, debug = deepq.build_train(
    # BatchInput is a float32 placeholder. Uint8Input (meant for image frames)
    # would coerce the scaled float observations to uint8 when they are fed.
    make_obs_ph=lambda name: U.BatchInput(shape=input_shape, name=name),
    q_func=model,
    num_actions=env.action_space.shape[0],
    optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=1e-4),
    gamma=0.99,
    grad_norm_clipping=10,
    double_q=args.double_q
)



[2017-07-15 14:37:06,771] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.




[2017-07-15 14:37:06,850] VARIABLES collection name is deprecated, please use GLOBAL_VARIABLES instead; VARIABLES will be removed after 2017-03-02.


In [19]:
# random vs predicted actions: agent playfullness
# Exploration schedule: fraction of random actions as a function of the
# iteration number, interpolated between the breakpoints below.
# NOTE(review): the divisor 4 presumably mirrors the default learning
# frequency - confirm.
approximate_num_iters = args.num_steps / 4
exploration = PiecewiseSchedule([
    (0, 1.0),                         #     0% of iters, 100% random actions
    (approximate_num_iters / 50, 0.1),#     2% of iters, 10%  random actions
    (approximate_num_iters / 10, 0.05),#   10% of iters,  5%  random actions
    (approximate_num_iters / 5, 0.01) # at 20% of iters,  1%  random actions
], outside_value=0.01)

In [20]:
# prioritised replay: agent dreaming which enhances learning speed
# Replay buffer: prioritized (with a beta schedule rising linearly from
# prioritized_beta0 to 1.0) or uniform. beta_schedule only exists in the
# prioritized case.
if args.prioritized:
    replay_buffer = PrioritizedReplayBuffer(args.replay_buffer_size, args.prioritized_alpha)
    beta_schedule = LinearSchedule(approximate_num_iters, initial_p=args.prioritized_beta0, final_p=1.0)
else:
    replay_buffer = ReplayBuffer(args.replay_buffer_size)

# Train

In [21]:
def plot_env(env):
    """Plot the close price with the actions taken, then the net asset values.

    Reads the most recent window of ``env.src.data`` and the tail of
    ``env.sim.to_df()``, aligns the two on the price index and draws two
    figures: 'Trades' and 'Net asset values'.

    NOTE(review): this relies on ``env.action_space.n``, ``env.sim.action_names``
    and an integer ``action`` column, i.e. a discrete action space - confirm it
    still matches the environment used in this notebook (the only call below is
    commented out).
    """
    # get data
    i1=env.src.idx
    di=env.src.step
    df_data = env.src.data[i1-di:i1-1]
    print('df_data', len(df_data))

    df_sim = env.sim.to_df()[-di:]
    print('df_sim', len(df_sim))

    # trim the price frame by one row when it is longer than the simulation
    # frame, then give both frames the same index
    if len(df_data)>len(df_sim):
        df_data=df_data[:-1]
    df_sim.index = df_data.index
    df = pd.merge(df_data,df_sim,left_index=True,right_index=True)

    # Plot prices
    df.Close.plot(alpha=0.5, figsize=(12,6), color="black")
    plt.title('Trades')

    # Plot actions
    colors = dict(
        LONG="green",
        SHORT="red",
        FLAT="blue"
    )
    # one scatter series per action id, coloured by action name when known
    for i in range(env.action_space.n):
        dfa=df[df.action==i]
        action_name = env.sim.action_names[i]
        plt.scatter(dfa.index,dfa.Close.values, s=15, marker='x',label='{}'.format(action_name), c=colors.get(action_name,None))
    plt.legend()
    plt.show()
    
    
    df[['bod_nav','mkt_nav']].plot(figsize=(12,6))    
    plt.title('Net asset values')
    plt.show()
    
# plot_env(env)

In [22]:
# https://github.com/openai/baselines/blob/master/baselines/deepq/experiments/atari/train.py

# Initialise all variables and make the target network a copy of the online one.
U.initialize()
update_target()

# # Load the model
# When a checkpoint is found, restore the iteration counter, the replay buffer
# and the monitor history from it.
mon_state = maybe_load_model(savedir, container)
if mon_state is not None:
    num_iters, replay_buffer = mon_state["num_iters"], mon_state["replay_buffer"],
    monitored_env.set_state(mon_state["monitor_state"])

In [23]:

log_intv = 2000
# Start from zero unless a checkpoint was restored above; in that case keep
# the iteration counter that was loaded together with the replay buffer.
if mon_state is None:
    num_iters = 0


start_time, start_steps = None, None
steps_per_iter = RunningAvg(0.999)
iteration_time_est = RunningAvg(0.999)
obs = monitored_env.reset()
state = memory.get_recent_state(obs)

# Main training loop
with tqdm(total=args.num_steps, mininterval=0.5) as progbar:
    while True:
        num_iters += 1

        # Take action and store transition in the replay buffer.
        action = act(np.array(state)[None], update_eps=exploration.value(num_iters))[0]
        new_obs, rew, done, info = monitored_env.step(action)
        # Feed the observation window. Without this the memory stays empty, so
        # every state is a single observation padded out by the memory and the
        # history check that gates learning below can never pass.
        # NOTE(review): follows keras-rl's Memory.append(observation, action,
        # reward, terminal) convention - confirm against the installed version.
        memory.append(obs, action, rew, done)
        new_state = memory.get_recent_state(new_obs) # add remembered observations
        replay_buffer.add(state, action, rew, new_state, float(done))
        obs, state = new_obs, new_state

        if done: # reset the game environment
            # Record the final observation as a non-terminal entry (as keras-rl
            # does at the end of an episode) so that the window of the next
            # episode does not reach back into this one.
            memory.append(new_obs, action, 0.0, False)
            obs = monitored_env.reset()
            state = memory.get_recent_state(obs)

        # Replay/Dream
        # Learn only after the warm-up period, every learning_freq iterations,
        # and once the observation window has been filled (the window is
        # bounded by window_len, hence >= rather than >).
        if (num_iters > max(5 * args.batch_size, args.replay_buffer_size // 20) and
                num_iters % args.learning_freq == 0 and
                len(memory.recent_observations) >= window_len):

            # Sample a bunch of transitions from replay buffer, in batch
            if args.prioritized:
                experience = replay_buffer.sample(args.batch_size, beta=beta_schedule.value(num_iters))
                (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
            else:
                obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size)
                weights = np.ones_like(rewards)

            # Minimize the error in Bellman's equation and compute TD-error
            td_errors = train(obses_t, actions, rewards, obses_tp1, dones, weights)

            # Update the priorities in the replay buffer
            if args.prioritized:
                new_priorities = np.abs(td_errors) + args.prioritized_eps
                replay_buffer.update_priorities(batch_idxes, new_priorities)

        # Update target network.
        if num_iters % args.target_update_freq == 0:
            update_target()

        # Stats
        if start_time is not None:
            steps_per_iter.update(info['steps'] - start_steps)
            iteration_time_est.update(time.time() - start_time)
        start_time, start_steps = time.time(), info["steps"]

        # Save the model and training state.
        if num_iters > 0 and (num_iters % args.save_freq == 0 or info["steps"] > args.num_steps):
            maybe_save_model(savedir, container, {
                'replay_buffer': replay_buffer,
                'num_iters': num_iters,
                'monitor_state': monitored_env.get_state()
            })

        if info["steps"] > args.num_steps:
            break

        # TODO validation test

        # log
        episodes = len(info["rewards"])
        if done and episodes%log_intv==0:
            steps_left = args.num_steps - info["steps"]
            completion = np.round(info["steps"] / args.num_steps, 1)

            # mean of the per-episode infos over the last log_intv episodes
            info_means = pd.DataFrame(info["infos"])[-log_intv:].mean()

            logger.record_tabular("% completion", completion)
            logger.record_tabular("steps", info["steps"])
            logger.record_tabular("iters", num_iters)
            logger.record_tabular("episodes", episodes)
            logger.record_tabular("reward (%s epi mean)" % log_intv, info_means.reward)
            logger.record_tabular("nav (%s epi mean)" % log_intv, info_means.nav)
            logger.record_tabular("nav_abv_mkt (mean)", info_means.nav_abv_mkt)
            logger.record_tabular("cost (%s epi mean)" % log_intv, info_means.costs)
            logger.record_tabular("exploration", exploration.value(num_iters))
            if args.prioritized:
                logger.record_tabular("max priority", replay_buffer._max_priority)
            # None until the running averages have a value; the original stored
            # the placeholder string here and then divided by it.
            fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6)
                            if steps_per_iter._value is not None else None)
            logger.dump_tabular()
            logger.log()
            logger.log("ETA: " + (pretty_eta(int(steps_left / fps_estimate))
                                  if fps_estimate is not None else "calculating..."))
            logger.log()

        # Probar
        progbar.desc = 'reward={reward: 2.4f}, nav={nav: 2.2f} nav_abv_mkt={nav_abv_mkt: 2.2f} costs={costs: 2.4f}'.format(
            reward=info['reward'],
            nav=info["nav"],
            nav_abv_mkt=info["nav_abv_mkt"],
            costs=info["costs"]
        )
        progbar.update(1)




TypeError: list indices must be integers or slices, not str

In [None]:
action

In [None]:
info_means = pd.DataFrame(info["infos"])[-log_intv:].mean()
info_means.nav_abv_mkt

In [None]:
# Manual checkpoint of the current training state (same payload as in the
# training loop), followed by the path of the model directory it wrote.
maybe_save_model(savedir, container, {
            'replay_buffer': replay_buffer,
            'num_iters': num_iters,
            'monitor_state': monitored_env.get_state()
        })
save_path = '{}/model-{}'.format(savedir, num_iters)
save_path

In [None]:
# pd.DataFrame(info["infos"])

In [None]:
# obses_t.shape, actions.shape, rewards.shape, obses_tp1.shape, dones.shape, weights.shape
# memory.recent_terminals

In [None]:
# history
# Per-episode mean nav and reward, indexed by episode number, each plotted
# against the episode number with a regression fit.
history = pd.DataFrame(info['infos'])[['nav', 'reward']]
history['episodes'] = history.index

g = sns.jointplot(x="episodes", y="nav", data=history, kind="reg")
plt.show()


g = sns.jointplot(x="episodes", y="reward", data=history, kind="reg")

# visualise

ideally a price with colored actions? like https://hackernoon.com/the-self-learning-quant-d3329fcc9915

In [None]:
# test, play
def test_env(env, verbose=True, *, epsilon=None):
    """Step the global `act` function through `env` and return the sim history.

    Relies on the notebook globals `memory`, `act` and `exploration`.

    Args:
        env: environment exposing `reset()`, `step(action)`, `days` and a
            `sim` object with `navs`, `mkt_nav` and `to_df()`.
        verbose: when true, print the last entries of `env.sim.navs` and
            `env.sim.mkt_nav` after the run.
        epsilon: value passed to `act` as `update_eps` on every step. When
            None (the default) `exploration.value(t)` is used, exactly as
            before this parameter existed.
            NOTE(review): `t` is the step index inside this test run, so the
            default reads the beginning of the training schedule; pass a
            fixed small value (e.g. 0.0) for a near-greedy evaluation —
            confirm against how `act` treats `update_eps`.

    Returns:
        The result of `env.sim.to_df()` after `env.days` steps.
    """
    obs = env.reset()
    state = memory.get_recent_state(obs)
    for t in range(env.days):
        eps = exploration.value(t) if epsilon is None else epsilon
        # state[None] adds a leading axis before the state is handed to `act`.
        action = act(state[None], update_eps=eps)[0]
        # Only the next observation is used; reward/done/info are ignored.
        obs, _reward, _done, _info = env.step(action)
        state = memory.get_recent_state(obs)
    if verbose:
        print('nav', env.sim.navs[-1], 'market_nav', env.sim.mkt_nav[-1])
    return env.sim.to_df()


df_test = test_env(env_test)
df_test[['bod_nav','mkt_nav']].plot()

In [None]:
df

# Test

In [None]:
# Run test_env 10 times on env_test and keep the last row of each returned frame.
tests={}
for i in tqdm(range(10)):
    df = test_env(env_test, verbose=False)
    tests[i]=df.iloc[-1]
# Transpose so that each run is a row and each field of that last row is a column.
tests = pd.DataFrame(tests).T
# Column means across the 10 runs (cell output).
tests.mean()

In [None]:
# Overlay the distributions of the `bod_nav` and `mkt_nav` columns of `tests`,
# labelled 'model nav' and 'holding nav' respectively.
sns.distplot(tests.bod_nav, label='model nav')
sns.distplot(tests.mkt_nav, label='holding nav')
plt.xlabel('Net Asset Value')
plt.legend()

# Test sim

In [None]:
# Constant-action baselines: for each fixed action, run 100 episodes of 252
# steps and plot the distribution of the resulting NAVs. The action never
# changes within a run, so these are not random-guessing baselines.
# NOTE(review): the labels follow the original plot titles
# (1 -> 'hold', 0 -> 'short', 2 -> 'long'); confirm against the env's action space.
for title, action in (('hold', 1), ('short', 0), ('long', 2)):
    navs = []
    for _ in range(100):
        obs = env.reset()
        state = memory.get_recent_state(obs)
        for t in range(252):
            obs, rew, done, info = env.step(action)
            state = memory.get_recent_state(obs)
        df_display = env.sim.to_df()
        # mean of the last 50 rows of bod_nav
        nav = df_display.bod_nav[-50:].mean()
        navs.append(nav)
    # show dist
    plt.title(title)
    sns.distplot(navs)
    plt.show()

# Dummy

In [None]:
# make X and y for traditional ML training
from keras.utils.np_utils import to_categorical

window_length = memory.window_length

# `.values` replaces `DataFrame.as_matrix()`, which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
D = env.src.data.values
D[np.isnan(D)] = 0

# at each step we get the past few observations: rows i-window_length .. i-1
X = np.array([D[i - window_length:i] for i in range(window_length, len(D))])

# convert y to down, flat, up categorical labels
# Row-over-row change in Close, padded with a leading 0 so it has one entry
# per row of the data, then trimmed to line up with X.
y = np.concatenate([[0], np.diff(env.src.data.Close)])
y = y[window_length:]

# The masks are computed before y is overwritten with the class ids.
short = y < 0
flat = y == 0
long = y > 0
y[short] = 0
y[flat] = 1
y[long] = 2

y = to_categorical(y)

print('X.shape', X.shape)
print('y.shape', y.shape)

In [None]:
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import train_test_split

# Flatten every window in X into a single feature vector per sample.
X_flat = X.reshape((len(X), -1))
X_train, X_test, y_train, y_test = train_test_split(X_flat, y, test_size=0.2, random_state=0)

# Collect (strategy, score) for every dummy baseline, in evaluation order.
dummy_scores = []
baselines_to_try = (
    (DummyClassifier, ['most_frequent', 'uniform', 'prior', 'stratified']),
    (DummyRegressor, ['mean', 'median']),
)
for estimator_cls, strategies in baselines_to_try:
    for strategy in strategies:
        clf = estimator_cls(strategy=strategy)
        clf.fit(X_train, y_train)
        # score() makes its own predictions, so no separate predict() call is needed.
        score = clf.score(X_test, y_test)
        dummy_scores.append((strategy, score))
        print(strategy, score)

# Direct model training

In [None]:
# split
# Same split settings as the dummy-baseline cell (test_size=0.2, random_state=0),
# but on the un-flattened X rather than X_flat.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Compile and train `model` (defined elsewhere in the notebook) on the training split.
# NOTE(review): the loss is 'mse' while the metric is accuracy and y went
# through to_categorical — confirm this loss is intended.
model.compile('adam', 'mse', metrics=['accuracy'])
history = model.fit(
    X_train, y_train,
    verbose=True,
    # `epochs` is the Keras 2 name of the deprecated `nb_epoch` argument.
    epochs=100,
)


In [None]:
# Evaluate on the test split, then pair each returned value with its name
# from model.metrics_names so the cell output is a readable dict.
score = model.evaluate(X_test,y_test)
score = dict(zip(model.metrics_names,score))
score

In [None]:
# Predict on the test split and show, per sample, the index of the largest
# output along axis 1.
y_pred = model.predict(X_test)
y_pred.argmax(1)