This notebook is a little easier for beginners because it uses pytorch. You need to clone a repo to get it working:

```sh
# you need this repo, so clone it
git clone https://github.com/wassname/DeepRL.git
cd DeepRL
git reset --hard aeae2c5d585e5853dc638968b1f090eb60abd351
cd ..
mkdir data log evaluation_log
```

This contains some minor modifications from https://github.com/ShangtongZhang/DeepRL.git

The notebook tries DDPG with the [EIIE model](https://arxiv.org/pdf/1706.10059.pdf).

I also uncommented reward normalization in DDPG_agent.py#L64 because otherwise my small rewards lead to large Q's, inf losses, and NaN actions and weights.

In [1]:
# plotting
%matplotlib notebook
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# numeric
import numpy as np
from numpy import random
import pandas as pd

# utils
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import tempfile
import logging
import time
import datetime

# Logging: install the default root handler first, then create this notebook's
# logger (available under both `log` and `logger`) at INFO verbosity.
logging.basicConfig()
log = logging.getLogger(__name__)
logger = log
logger.setLevel(logging.INFO)
logger.info('%s logger started.', __name__)

INFO:__main__:__main__ logger started.


In [2]:
import os
import sys

# Make the notebook directory and the cloned DeepRL checkout importable.
# `sys` is imported directly (reaching it through `os.sys` relies on an
# undocumented attribute of the os module), and each path is appended only
# when missing so re-running this cell does not pile up duplicate entries.
for _import_root in ('.', 'DeepRL'):
    _import_root = os.path.abspath(_import_root)
    if _import_root not in sys.path:
        sys.path.append(_import_root)
%reload_ext autoreload
%autoreload 2

In [3]:
# params
# NOTE(review): `window_length` is not referenced again in the visible cells;
# later code reads `src.window_length` from the environment instead.
window_length = 50
# Passed as `steps=` to PortfolioEnv when the train/test environments are built.
steps = 128



In [4]:
# save dir
import datetime

# UTC timestamp used to tag the artefacts written by this run.
ts = datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%d_%H-%M-%S')

save_path = './outputs/pytorch-DDPG/pytorch-DDPG-EIIE-action-crypto-%s.model' % ts
print(save_path)
# exist_ok=True tolerates an already existing directory while still surfacing
# genuine failures (e.g. permission errors), which the previous blanket
# `except OSError: pass` silently swallowed.
os.makedirs(os.path.dirname(save_path), exist_ok=True)

./outputs/pytorch-DDPG/pytorch-DDPG-EIIE-action-crypto-20200423_21-48-39.model


In [5]:
# Tensorboard: direct this run's scalars to runs/<tag> and print the command
# that opens them.
from tensorboard_logger import configure, log_value

tag = 'ddpg-' + ts
_run_dir = "runs/" + tag
print('tensorboard --logdir ' + _run_dir)
try:
    configure(_run_dir)
except ValueError as err:
    # A ValueError from configure() is reported but does not stop the notebook.
    print(err)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


tensorboard --logdir runs/ddpg-20200423_21-48-39


# Env

In [6]:
from rl_portfolio_management.environments.portfolio import PortfolioEnv
from rl_portfolio_management.util import MDD, sharpe, softmax
from rl_portfolio_management.wrappers import SoftmaxActions, TransposeHistory, ConcatStates

# Both splits live in the same HDF5 store under the keys 'train' and 'test'.
_data_file = './data/poloniex_30m.hf'
df_train = pd.read_hdf(_data_file, key='train')
df_test = pd.read_hdf(_data_file, key='test')


In [7]:
import gym
class DeepRLWrapper(gym.Wrapper):
    """Gym wrapper exposing the extra attributes the agent code reads.

    Adds `state_dim` / `action_dim` (taken from the wrapped spaces), a `name`
    (used when building save-file names) and a `success_threshold`
    (presumably consumed by DeepRL's run loop -- confirm), and multiplies
    every reward by `reward_scale`.

    Args:
        env: the environment to wrap.
        reward_scale: factor applied to each step reward. Defaults to 1e4,
            the value that used to be hard-coded in `step`.
    """

    def __init__(self, env, reward_scale=1e4):
        super().__init__(env)
        # When True, the wrapped env is rendered just before each reset.
        self.render_on_reset = False

        self.state_dim = self.observation_space.shape
        self.action_dim = self.action_space.shape[0]

        self.name = 'PortfolioEnv'
        self.success_threshold = 2
        self.reward_scale = reward_scale

    def normalize_state(self, state):
        """Return `state` unchanged (no normalisation is applied here)."""
        return state

    def step(self, action):
        """Step the wrapped env and return its result with the reward scaled."""
        state, reward, done, info = self.env.step(action)
        # often reward scaling is important sooo...
        return state, reward * self.reward_scale, done, info

    def reset(self):
        """Reset the wrapped env, optionally rendering it first."""
        # Rendering before the reset is a roundabout way to get a plot of the
        # episode that just finished.
        if self.render_on_reset:
            self.env.render('notebook')

        return self.env.reset()

In [8]:
def _make_portfolio_env(df):
    """Build a PortfolioEnv on `df` and apply the wrapper stack used for training and testing."""
    env = PortfolioEnv(df=df, steps=steps, output_mode='EIIE')
    # Wrappers are applied innermost-first, in the same order as before.
    for wrapper in (TransposeHistory, ConcatStates, SoftmaxActions, DeepRLWrapper):
        env = wrapper(env)
    return env


def task_fn():
    """Return a new wrapped environment built from the training data (`df_train`)."""
    return _make_portfolio_env(df_train)


def task_fn_test():
    """Return a new wrapped environment built from the test data (`df_test`)."""
    return _make_portfolio_env(df_test)
    
# sanity check: build one training env and display the observation shape
# returned by reset() and by a step taken with a randomly sampled action
# (the recorded output shows (4, 51, 3) for both).
task = task_fn()
task.reset().shape, task.step(task.action_space.sample())[0].shape

  data = df.as_matrix().reshape(


((4, 51, 3), (4, 51, 3))

# Agent and models

In [9]:
# load
import pickle
import shutil

def save_ddpg(agent):
    """Save `agent` under data/ and print the file name that was used.

    The file name combines the agent's class name, the global `config.tag`
    and `agent.task.name`; the writing itself is delegated to `agent.save`.
    """
    name_parts = (agent.__class__.__name__, config.tag, agent.task.name)
    target = 'data/%s-%s-model-%s.bin' % name_parts
    agent.save(target)
    print(target)
    

def load_ddpg(agent):
    """Load the weights stored for `agent` into `agent.worker_network`.

    The file name is derived exactly as in `save_ddpg` (agent class name,
    global `config.tag`, `agent.task.name`). The agent in this notebook
    writes the file with `torch.save`, so it is read back with `torch.load`
    (a plain `pickle.load` does not return the state dict for such files).

    Raises:
        FileNotFoundError: if no saved model exists for this agent/tag/task.
    """
    import torch  # local import: this cell is defined before the notebook imports torch

    agent_type = agent.__class__.__name__
    save_file = 'data/%s-%s-model-%s.bin' % (agent_type, config.tag, agent.task.name)
    with open(save_file, 'rb') as f:
        # Keep tensors on CPU while loading; load_state_dict copies them into
        # the network's existing parameters.
        new_states = torch.load(f, map_location=lambda storage, loc: storage)
    agent.worker_network.load_state_dict(new_states)


def load_stats_ddpg(agent):
    """Load the training and test statistics stored for `agent`.

    File names are built from the agent's class name, the global
    `config.tag` and `agent.task.name`. A missing file is treated as
    "no data yet" and yields an empty frame instead of an error.

    Returns:
        (df_online, df):
            df_online -- one row per entry of the online stats, with columns
                'steps', 'rewards' and 'step' (cumulative sum of 'steps').
            df -- the "test_rewards" entry of the all-stats file in a single
                'rewards' column.
        Both frames are indexed by 'episodes'.
    """
    # NOTE: pickle can execute arbitrary code; only load files this notebook
    # (or another trusted run) produced.
    agent_type = agent.__class__.__name__
    online_stats_file = 'data/%s-%s-online-stats-%s.bin' % (
        agent_type, config.tag, agent.task.name)
    try:
        with open(online_stats_file, 'rb') as f:
            steps, rewards = pickle.load(f)
    except FileNotFoundError:
        steps, rewards = [], []
    df_online = pd.DataFrame(np.array([steps, rewards]).T, columns=['steps', 'rewards'])
    # Always create 'step' so callers can rely on the column even when empty.
    df_online['step'] = df_online['steps'].cumsum()
    df_online.index.name = 'episodes'

    stats_file = 'data/%s-%s-all-stats-%s.bin' % (agent_type, config.tag, agent.task.name)
    try:
        with open(stats_file, 'rb') as f:
            stats = pickle.load(f)
    except FileNotFoundError:
        stats = {}
    # .get avoids the KeyError the empty fallback used to trigger.
    df = pd.DataFrame(stats.get("test_rewards", []), columns=['rewards'])
    df.index.name = 'episodes'
    return df_online, df

In [12]:
import logging
from DeepRL.agent import ProximalPolicyOptimization
from DeepRL.network import DisjointActorCriticNet #, DeterministicActorNet, DeterministicCriticNet
from DeepRL.component import GaussianPolicy, HighDimActionReplay, OrnsteinUhlenbeckProcess
from DeepRL.utils import Config, Logger
import gym
import torch
# Set the level of gym's logger to INFO.
gym.logger.setLevel(logging.INFO)

# Alg

In [13]:
# Modified from https://github.com/ShangtongZhang/DeepRL to log to tensorboard

from DeepRL.utils.normalizer import Normalizer

def null_normaliser(x):
    """Identity function: return the input unchanged."""
    return x

class DDPGAgent:
    """DDPG agent with a worker/target network pair, replay memory and tensorboard logging.

    Everything is built from factory callables on `config`: `task_fn`,
    `network_fn`, `actor_optimizer_fn`, `critic_optimizer_fn`, `replay_fn`
    and `random_process_fn`. The class body uses the names `nn`, `torch`,
    `Variable`, `np` and `log_value`, which must be in scope by the time an
    instance is created or `episode` is called.
    """

    def __init__(self, config):
        """Create the task, networks, optimizers, replay memory, noise process and normalizers."""
        self.config = config
        self.task = config.task_fn()
        self.worker_network = config.network_fn()
        self.target_network = config.network_fn()
        # start the target network as an exact copy of the worker network
        self.target_network.load_state_dict(self.worker_network.state_dict())
        self.actor_opt = config.actor_optimizer_fn(self.worker_network.actor.parameters())
        self.critic_opt = config.critic_optimizer_fn(self.worker_network.critic.parameters())
        self.replay = config.replay_fn()
        self.random_process = config.random_process_fn()
        self.criterion = nn.MSELoss()
        # counts non-deterministic (training) environment steps only
        self.total_steps = 0

        self.state_normalizer = Normalizer(self.task.state_dim) # alternative: null_normaliser
        self.reward_normalizer = Normalizer(1)

    def soft_update(self, target, src):
        """Blend `src` parameters into `target`: target <- (1 - mix) * target + mix * src.

        `mix` is `config.target_network_mix`.
        """
        for target_param, param in zip(target.parameters(), src.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.config.target_network_mix) +
                                    param.data * self.config.target_network_mix)

    def save(self, file_name):
        """Write the worker network's state dict to `file_name` with torch.save."""
        with open(file_name, 'wb') as f:
            torch.save(self.worker_network.state_dict(), f)

    def episode(self, deterministic=False, video_recorder=None):
        """Run one episode and return `(total_reward, steps)`.

        Args:
            deterministic: when True no exploration noise is added, nothing is
                fed to the replay memory and no network update is performed.
            video_recorder: optional object whose `capture_frame()` is called
                after every environment step.

        Returns:
            total_reward: sum of the rewards returned by the task (before the
                reward normalizer is applied).
            steps: number of environment steps taken in this episode.
        """
        self.random_process.reset_states()
        state = self.task.reset()
        state = self.state_normalizer(state)

        config = self.config
        actor = self.worker_network.actor
        critic = self.worker_network.critic
        target_actor = self.target_network.actor
        target_critic = self.target_network.critic

        steps = 0
        total_reward = 0.0
        while True:
            # --- act ---
            actor.eval()
            action = actor.predict(np.stack([state])).flatten()
            if not deterministic:
                action += self.random_process.sample()
            next_state, reward, done, info = self.task.step(action)
            if video_recorder is not None:
                video_recorder.capture_frame()
            # also stop once the configured maximum episode length is reached
            done = (done or (config.max_episode_length and steps >= config.max_episode_length))
            # NOTE(review): the normalized next state is multiplied by
            # config.reward_scaling, while the initial state from reset() above
            # is not scaled -- this looks unintended; confirm before changing.
            next_state = self.state_normalizer(next_state) * config.reward_scaling
            total_reward += reward

            # tensorboard logging
            prefix = 'test_' if deterministic else ''
            log_value(prefix + 'reward', reward, self.total_steps)
#             log_value(prefix + 'action', action, steps)
            log_value('memory_size', self.replay.size(), self.total_steps)
            for key in info:
                log_value(key, info[key], self.total_steps)

            # only the normalized reward is stored in the replay memory
            reward = self.reward_normalizer(reward)

            if not deterministic:
                self.replay.feed([state, action, reward, next_state, int(done)])
                self.total_steps += 1

            steps += 1
            state = next_state

            # the break sits before the update below, so no network update is
            # performed on the final step of an episode
            if done:
                break

            if not deterministic and self.replay.size() >= config.min_memory_size:
                # --- critic update: regress Q(s, a) onto r + discount * Q'(s', mu'(s')) ---
                self.worker_network.train()
                experiences = self.replay.sample()
                states, actions, rewards, next_states, terminals = experiences
                q_next = target_critic.predict(next_states, target_actor.predict(next_states))
                terminals = critic.to_torch_variable(terminals).unsqueeze(1)
                rewards = critic.to_torch_variable(rewards).unsqueeze(1)
                # terminal transitions contribute no bootstrapped value
                q_next = config.discount * q_next * (1 - terminals)
                q_next.add_(rewards)
                # the target is treated as a constant
                q_next = q_next.detach()
                q = critic.predict(states, actions)
                critic_loss = self.criterion(q, q_next)

                critic.zero_grad()
                self.critic_opt.zero_grad()
                critic_loss.backward()
                if config.gradient_clip:
                    # clipping is applied over all worker-network parameters (actor and critic)
                    grad_critic = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
                self.critic_opt.step()

                # --- actor update: push actions along dQ/da ---
                actions = actor.predict(states, False)
                # detached copy of the actions so dQ/da can be read from its .grad
                var_actions = Variable(actions.data, requires_grad=True)
                q = critic.predict(states, var_actions)
                q.backward(torch.ones(q.size()))

                actor.zero_grad()
                self.actor_opt.zero_grad()
                # back-propagate -dQ/da through the actor (gradient ascent on Q)
                actions.backward(-var_actions.grad.data)
                if config.gradient_clip:
                    grad_actor = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
                self.actor_opt.step()

                # tensorboard logging
                log_value('critic_loss', critic_loss.cpu().data.numpy().squeeze(), self.total_steps)
                log_value('loss_action', -q.sum(), self.total_steps)
                if config.gradient_clip:
                    log_value('grad_critic', grad_critic, self.total_steps)
                    log_value('grad_actor', grad_actor, self.total_steps)

                # let the target network slowly track the worker network
                self.soft_update(self.target_network, self.worker_network)

        return total_reward, steps

# Model

In [14]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [15]:
# Display the observation shape and the number of action components exposed by the wrapper.
task.state_dim, task.action_dim

((4, 51, 3), 4)

In [16]:

from DeepRL.network.base_network import BasicNet

class DeterministicActorNet(nn.Module, BasicNet):
    """Convolutional actor producing one score per asset plus a fixed cash entry.

    Layout assumed for the input (inferred from the indexing in `forward` and
    the recorded (4, 51, 3) state shape -- confirm against the env wrappers):
    (batch, features, 1 + window, assets), where row 0 of the third axis
    carries the previous portfolio weights and the remaining rows the history.

    Args:
        state_dim: observation shape; `state_dim[0]` is the number of input
            channels and `state_dim[1]` the length of the third input axis.
        action_dim: number of action components (not used by the layers).
        action_gate: optional callable applied to the flattened output.
        action_scale: factor applied to the gated output.
        gpu: forwarded to `BasicNet.__init__`.
        batch_norm: insert batch-norm layers after conv1 and before conv3.
        non_linear: activation used after conv1 and conv2.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 action_gate,
                 action_scale,
                 gpu=False,
                 batch_norm=False,
                 non_linear=F.relu):
        super(DeterministicActorNet, self).__init__()

        # history rows left after dropping the weights row (-1) and applying
        # the 3-row conv1 kernel (-2); conv2 collapses them into a single row
        stride_time = state_dim[1] - 1 - 2
        # use the constructor argument rather than the notebook-global `task`
        features = state_dim[0]
        h0 = 2
        h1 = 30
        self.conv1 = nn.Conv2d(features, h0, (3, 1))
        self.conv2 = nn.Conv2d(h0, h1, (stride_time, 1), stride=(stride_time, 1))
        # +1 channel for the previous weights concatenated in forward()
        self.conv3 = nn.Conv2d((h1 + 1), 1, (1, 1))

        self.action_scale = action_scale
        self.action_gate = action_gate
        self.non_linear = non_linear

        if batch_norm:
            # The activations are 4-D conv outputs, so BatchNorm2d is required;
            # bn2 is applied after the weights channel is concatenated, hence
            # h1 + 1 features.
            self.bn1 = nn.BatchNorm2d(h0)
            self.bn2 = nn.BatchNorm2d(h1 + 1)

        self.batch_norm = batch_norm
        BasicNet.__init__(self, None, gpu, False)

    def forward(self, x):
        """Return the flattened action tensor for a batch of states `x`."""
        x = self.to_torch_variable(x)

        w0 = x[:, :1, :1, :]  # weights from last step
        x = x[:, :, 1:, :]

        phi0 = self.non_linear(self.conv1(x))
        if self.batch_norm:
            phi0 = self.bn1(phi0)
        phi1 = self.non_linear(self.conv2(phi0))
        h = torch.cat([phi1, w0], 1)
        if self.batch_norm:
            h = self.bn2(h)

        action = self.conv3(h)

        # add cash_bias before we softmax; the cash entry is placed FIRST
        cash_bias_int = 0
        cash_bias = self.to_torch_variable(torch.ones(action.size())[:, :, :, :1] * cash_bias_int)
        action = torch.cat([cash_bias, action], -1)

        batch_size = action.size()[0]
        action = action.view((batch_size, -1))
        if self.action_gate:
            action = self.action_scale * self.action_gate(action)
        return action

    def predict(self, x, to_numpy=True):
        """Run `forward`; return a numpy array when `to_numpy` is True, else the tensor."""
        y = self.forward(x)
        if to_numpy:
            y = y.cpu().data.numpy()
        return y

class DeterministicCriticNet(nn.Module, BasicNet):
    """Convolutional critic estimating Q(state, action).

    Uses the same input layout as the actor (assumed to be
    (batch, features, 1 + window, assets) with the previous weights in row 0
    of the third axis -- confirm against the env wrappers). The action is
    expected with the cash entry first, as produced by the actor; that entry
    is dropped so the remaining entries line up with the asset axis.

    NOTE: the action slice was changed from `[:-1]` (which dropped the last
    asset and kept the cash entry) to `[1:]`. Layer shapes are unchanged, but
    critics trained with the old slicing saw a different input and should be
    retrained.

    Args:
        state_dim: observation shape; `state_dim[0]` is the number of input
            channels and `state_dim[1]` the length of the third input axis.
        action_dim: number of action components including cash.
        gpu: forwarded to `BasicNet.__init__`.
        batch_norm: insert batch-norm layers after conv1 and before the
            linear layer.
        non_linear: activation used after conv1 and conv2.
    """

    def __init__(self,
                 state_dim,
                 action_dim,
                 gpu=False,
                 batch_norm=False,
                 non_linear=F.relu):
        super(DeterministicCriticNet, self).__init__()
        # history rows left after dropping the weights row (-1) and applying
        # the 3-row conv1 kernel (-2); conv2 collapses them into a single row
        stride_time = state_dim[1] - 1 - 2
        # use the constructor argument rather than the notebook-global `task`
        self.features = features = state_dim[0]
        h0 = 2
        h1 = 20
        # number of non-cash action entries
        self.action = actions = action_dim - 1
        self.conv1 = nn.Conv2d(features, h0, (3, 1))
        self.conv2 = nn.Conv2d(h0, h1, (stride_time, 1), stride=(stride_time, 1))
        # +2 channels: previous weights and the action, concatenated in forward()
        self.layer3 = nn.Linear((h1 + 2) * actions, 1)
        self.non_linear = non_linear

        if batch_norm:
            # The activations are 4-D conv outputs, so BatchNorm2d is required;
            # bn2 is applied after weights and action are concatenated, hence
            # h1 + 2 features.
            self.bn1 = nn.BatchNorm2d(h0)
            self.bn2 = nn.BatchNorm2d(h1 + 2)
        self.batch_norm = batch_norm

        BasicNet.__init__(self, None, gpu, False)

    def forward(self, x, action):
        """Return the Q-value tensor, shape (batch, 1), for states `x` and `action`."""
        x = self.to_torch_variable(x)
        # remove the cash entry, which the actor places at index 0
        action = self.to_torch_variable(action)[:, None, None, 1:]

        w0 = x[:, :1, :1, :]  # weights from last step
        x = x[:, :, 1:, :]

        phi0 = self.non_linear(self.conv1(x))
        if self.batch_norm:
            phi0 = self.bn1(phi0)
        phi1 = self.non_linear(self.conv2(phi0))
        h = torch.cat([phi1, w0, action], 1)
        if self.batch_norm:
            h = self.bn2(h)

        batch_size = x.size()[0]
        q_value = self.layer3(h.view((batch_size, -1)))
        return q_value

    def predict(self, x, action):
        """Alias for `forward`, matching the actor's predict interface."""
        return self.forward(x, action)

# Config

In [17]:
config = Config()
config.task_fn = task_fn
# NOTE: rebinds the notebook-global `task`; the lambdas below read it lazily.
task = config.task_fn()

# --- networks and optimizers ---
config.actor_network_fn = lambda: DeterministicActorNet(
    task.state_dim, task.action_dim, action_gate=None, action_scale=1.0, non_linear=F.relu, batch_norm=False, gpu=False)
config.critic_network_fn = lambda: DeterministicCriticNet(
    task.state_dim, task.action_dim, non_linear=F.relu, batch_norm=False, gpu=False)
config.network_fn = lambda: DisjointActorCriticNet(config.actor_network_fn, config.critic_network_fn)
config.actor_optimizer_fn = lambda params: torch.optim.Adam(params, lr=4e-5)
config.critic_optimizer_fn =\
    lambda params: torch.optim.Adam(params, lr=5e-4, weight_decay=0.001)

# --- replay memory and exploration noise ---
config.replay_fn = lambda: HighDimActionReplay(memory_size=600, batch_size=64)
config.random_process_fn = \
    lambda: OrnsteinUhlenbeckProcess(size=task.action_dim, theta=0.15, sigma=0.2, sigma_min=0.00002, n_steps_annealing=10000)

# With a zero discount the critic target in DDPGAgent.episode reduces to the
# (normalized) immediate reward.
config.discount = 0.0

config.min_memory_size = 50
config.max_steps = 300000
config.max_episode_length = 3000
# This used to be assigned twice (0.001, then 0.01); only the last assignment
# ever took effect, so the dead one was removed.
config.target_network_mix = 0.01
config.noise_decay_interval = 100000
config.gradient_clip = 20
config.min_epsilon = 0.1

# Many papers have found rewards scaling to be an important parameter. But while they focus on the scaling factor
# I think they should focus on the end variance with a range of 200-400. e.g. https://arxiv.org/pdf/1709.06560.pdf
# Hard to tell for sure without experiments to prove it
config.reward_scaling = 1000

config.test_interval = 10 # ORIGINALLY
# config.test_interval = 2 # TODO: Remove (quick test)
config.test_repetitions = 1
config.save_interval = 40 # ORIGINALLY
# config.save_interval = 4 # TODO: Remove (quick test)
config.logger = Logger('./log', gym.logger)
# `tag` also names the files written/read by save_ddpg, load_ddpg and load_stats_ddpg
config.tag = tag
agent = DDPGAgent(config)
agent

  data = df.as_matrix().reshape(


<__main__.DDPGAgent at 0x29e8d503b70>

# Train

In [None]:
from DeepRL.utils import run_episodes

# Reset the env's `_plot` / `_plot2` attributes before training
# (presumably cached plot handles -- confirm against the env implementation).
agent.task._plot = agent.task._plot2 = None
try:
    run_episodes(agent)
except KeyboardInterrupt:
    # Manual interruption: save the current weights, then re-raise the same
    # exception with its original traceback.
    save_ddpg(agent)
    raise

I0423 17:48:51.617868 17664 misc.py:27] episode 1, reward -3.039298, avg reward -3.039298, total steps 128, episode step 128
[2020-04-23 17:48:51,617] episode 1, reward -3.039298, avg reward -3.039298, total steps 128, episode step 128
I0423 17:48:58.523808 17664 misc.py:27] episode 2, reward -4.202390, avg reward -3.620844, total steps 256, episode step 128
[2020-04-23 17:48:58,523] episode 2, reward -4.202390, avg reward -3.620844, total steps 256, episode step 128
I0423 17:49:05.608614 17664 misc.py:27] episode 3, reward -4.488160, avg reward -3.909950, total steps 384, episode step 128
[2020-04-23 17:49:05,608] episode 3, reward -4.488160, avg reward -3.909950, total steps 384, episode step 128
I0423 17:49:14.109929 17664 misc.py:27] episode 4, reward 2.910715, avg reward -2.204783, total steps 512, episode step 128
[2020-04-23 17:49:14,109] episode 4, reward 2.910715, avg reward -2.204783, total steps 512, episode step 128
I0423 17:49:22.608917 17664 misc.py:27] episode 5, reward 

I0423 17:52:47.689260 17664 misc.py:27] episode 31, reward -1.141515, avg reward -1.038701, total steps 3968, episode step 128
[2020-04-23 17:52:47,689] episode 31, reward -1.141515, avg reward -1.038701, total steps 3968, episode step 128
I0423 17:52:55.413120 17664 misc.py:27] episode 32, reward -1.039056, avg reward -1.038712, total steps 4096, episode step 128
[2020-04-23 17:52:55,413] episode 32, reward -1.039056, avg reward -1.038712, total steps 4096, episode step 128
I0423 17:53:03.094568 17664 misc.py:27] episode 33, reward -1.612786, avg reward -1.056108, total steps 4224, episode step 128
[2020-04-23 17:53:03,094] episode 33, reward -1.612786, avg reward -1.056108, total steps 4224, episode step 128
I0423 17:53:10.306246 17664 misc.py:27] episode 34, reward 6.352451, avg reward -0.838210, total steps 4352, episode step 128
[2020-04-23 17:53:10,306] episode 34, reward 6.352451, avg reward -0.838210, total steps 4352, episode step 128
I0423 17:53:17.381762 17664 misc.py:27] ep

I0423 17:56:49.054306 17664 misc.py:27] episode 63, reward -5.230516, avg reward -0.540569, total steps 8064, episode step 128
[2020-04-23 17:56:49,054] episode 63, reward -5.230516, avg reward -0.540569, total steps 8064, episode step 128
I0423 17:56:56.532104 17664 misc.py:27] episode 64, reward -0.512746, avg reward -0.540134, total steps 8192, episode step 128
[2020-04-23 17:56:56,532] episode 64, reward -0.512746, avg reward -0.540134, total steps 8192, episode step 128
I0423 17:57:04.019936 17664 misc.py:27] episode 65, reward -2.871350, avg reward -0.575999, total steps 8320, episode step 128
[2020-04-23 17:57:04,019] episode 65, reward -2.871350, avg reward -0.575999, total steps 8320, episode step 128
I0423 17:57:11.299302 17664 misc.py:27] episode 66, reward -0.286554, avg reward -0.571613, total steps 8448, episode step 128
[2020-04-23 17:57:11,299] episode 66, reward -0.286554, avg reward -0.571613, total steps 8448, episode step 128
I0423 17:57:18.814082 17664 misc.py:27] 

[2020-04-23 18:00:56,860] episode 94, reward -1.553189, avg reward -0.671692, total steps 12032, episode step 128
I0423 18:01:04.286479 17664 misc.py:27] episode 95, reward -4.874760, avg reward -0.715935, total steps 12160, episode step 128
[2020-04-23 18:01:04,286] episode 95, reward -4.874760, avg reward -0.715935, total steps 12160, episode step 128
I0423 18:01:11.006934 17664 misc.py:27] episode 96, reward -2.371484, avg reward -0.733180, total steps 12288, episode step 128
[2020-04-23 18:01:11,006] episode 96, reward -2.371484, avg reward -0.733180, total steps 12288, episode step 128
I0423 18:01:18.161355 17664 misc.py:27] episode 97, reward -1.172092, avg reward -0.737705, total steps 12416, episode step 128
[2020-04-23 18:01:18,161] episode 97, reward -1.172092, avg reward -0.737705, total steps 12416, episode step 128
I0423 18:01:24.597156 17664 misc.py:27] episode 98, reward -0.949174, avg reward -0.739863, total steps 12544, episode step 128
[2020-04-23 18:01:24,597] episod

I0423 18:05:02.866876 17664 misc.py:27] episode 126, reward 3.366789, avg reward -1.320511, total steps 16128, episode step 128
[2020-04-23 18:05:02,866] episode 126, reward 3.366789, avg reward -1.320511, total steps 16128, episode step 128
I0423 18:05:10.741499 17664 misc.py:27] episode 127, reward -1.304927, avg reward -1.292536, total steps 16256, episode step 128
[2020-04-23 18:05:10,741] episode 127, reward -1.304927, avg reward -1.292536, total steps 16256, episode step 128
I0423 18:05:18.679628 17664 misc.py:27] episode 128, reward -8.832321, avg reward -1.383169, total steps 16384, episode step 128
[2020-04-23 18:05:18,679] episode 128, reward -8.832321, avg reward -1.383169, total steps 16384, episode step 128
I0423 18:05:26.773726 17664 misc.py:27] episode 129, reward 11.892437, avg reward -1.238300, total steps 16512, episode step 128
[2020-04-23 18:05:26,773] episode 129, reward 11.892437, avg reward -1.238300, total steps 16512, episode step 128
I0423 18:05:35.051793 1766

I0423 18:08:59.323894 17664 misc.py:27] episode 157, reward -3.648484, avg reward -1.867772, total steps 20096, episode step 128
[2020-04-23 18:08:59,323] episode 157, reward -3.648484, avg reward -1.867772, total steps 20096, episode step 128
I0423 18:09:07.148843 17664 misc.py:27] episode 158, reward -2.311639, avg reward -1.863101, total steps 20224, episode step 128
[2020-04-23 18:09:07,148] episode 158, reward -2.311639, avg reward -1.863101, total steps 20224, episode step 128
I0423 18:09:16.414099 17664 misc.py:27] episode 159, reward -2.122421, avg reward -1.947701, total steps 20352, episode step 128
[2020-04-23 18:09:16,414] episode 159, reward -2.122421, avg reward -1.947701, total steps 20352, episode step 128
I0423 18:09:24.004026 17664 misc.py:27] episode 160, reward -4.416247, avg reward -2.047596, total steps 20480, episode step 128
[2020-04-23 18:09:24,004] episode 160, reward -4.416247, avg reward -2.047596, total steps 20480, episode step 128
I0423 18:09:24.012212 17

I0423 18:13:10.559180 17664 misc.py:27] episode 188, reward -2.824945, avg reward -2.432463, total steps 24064, episode step 128
[2020-04-23 18:13:10,559] episode 188, reward -2.824945, avg reward -2.432463, total steps 24064, episode step 128
I0423 18:13:18.842967 17664 misc.py:27] episode 189, reward 4.621853, avg reward -2.368850, total steps 24192, episode step 128
[2020-04-23 18:13:18,842] episode 189, reward 4.621853, avg reward -2.368850, total steps 24192, episode step 128
I0423 18:13:26.594666 17664 misc.py:27] episode 190, reward 3.797255, avg reward -2.299058, total steps 24320, episode step 128
[2020-04-23 18:13:26,594] episode 190, reward 3.797255, avg reward -2.299058, total steps 24320, episode step 128
I0423 18:13:26.598955 17664 misc.py:47] Testing...
[2020-04-23 18:13:26,598] Testing...
I0423 18:13:27.415964 17664 misc.py:55] Avg reward -4.458004(0.000000)
[2020-04-23 18:13:27,415] Avg reward -4.458004(0.000000)
I0423 18:13:35.912255 17664 misc.py:27] episode 191, rew

I0423 18:17:39.233002 17664 misc.py:27] episode 219, reward -2.432594, avg reward -2.203205, total steps 28032, episode step 128
[2020-04-23 18:17:39,233] episode 219, reward -2.432594, avg reward -2.203205, total steps 28032, episode step 128
I0423 18:17:47.427274 17664 misc.py:27] episode 220, reward 1.713356, avg reward -2.137580, total steps 28160, episode step 128
[2020-04-23 18:17:47,427] episode 220, reward 1.713356, avg reward -2.137580, total steps 28160, episode step 128
I0423 18:17:47.433345 17664 misc.py:47] Testing...
[2020-04-23 18:17:47,433] Testing...
I0423 18:17:48.564819 17664 misc.py:55] Avg reward 0.757251(0.000000)
[2020-04-23 18:17:48,564] Avg reward 0.757251(0.000000)
I0423 18:17:56.784764 17664 misc.py:27] episode 221, reward -3.146776, avg reward -2.137579, total steps 28288, episode step 128
[2020-04-23 18:17:56,784] episode 221, reward -3.146776, avg reward -2.137579, total steps 28288, episode step 128
I0423 18:18:05.334203 17664 misc.py:27] episode 222, rew

I0423 18:21:43.025469 17664 misc.py:27] episode 250, reward -13.712670, avg reward -2.519654, total steps 32000, episode step 128
[2020-04-23 18:21:43,025] episode 250, reward -13.712670, avg reward -2.519654, total steps 32000, episode step 128
I0423 18:21:43.028793 17664 misc.py:47] Testing...
[2020-04-23 18:21:43,028] Testing...
I0423 18:21:43.719505 17664 misc.py:55] Avg reward -11.325607(0.000000)
[2020-04-23 18:21:43,719] Avg reward -11.325607(0.000000)
I0423 18:21:50.490918 17664 misc.py:27] episode 251, reward -5.064067, avg reward -2.481428, total steps 32128, episode step 128
[2020-04-23 18:21:50,490] episode 251, reward -5.064067, avg reward -2.481428, total steps 32128, episode step 128
I0423 18:21:57.833001 17664 misc.py:27] episode 252, reward -2.219732, avg reward -2.554113, total steps 32256, episode step 128
[2020-04-23 18:21:57,833] episode 252, reward -2.219732, avg reward -2.554113, total steps 32256, episode step 128
I0423 18:22:05.411288 17664 misc.py:27] episode 

I0423 18:25:37.270391 17664 misc.py:47] Testing...
[2020-04-23 18:25:37,270] Testing...
I0423 18:25:38.102364 17664 misc.py:55] Avg reward 0.962874(0.000000)
[2020-04-23 18:25:38,102] Avg reward 0.962874(0.000000)
I0423 18:25:46.966449 17664 misc.py:27] episode 281, reward -7.959420, avg reward -2.549606, total steps 35968, episode step 128
[2020-04-23 18:25:46,966] episode 281, reward -7.959420, avg reward -2.549606, total steps 35968, episode step 128
I0423 18:25:57.443083 17664 misc.py:27] episode 282, reward -5.892843, avg reward -2.699007, total steps 36096, episode step 128
[2020-04-23 18:25:57,443] episode 282, reward -5.892843, avg reward -2.699007, total steps 36096, episode step 128
I0423 18:26:06.978196 17664 misc.py:27] episode 283, reward -7.001473, avg reward -2.741372, total steps 36224, episode step 128
[2020-04-23 18:26:06,978] episode 283, reward -7.001473, avg reward -2.741372, total steps 36224, episode step 128
I0423 18:26:15.337965 17664 misc.py:27] episode 284, r

[2020-04-23 18:30:09,518] episode 311, reward -3.017566, avg reward -3.236344, total steps 39808, episode step 128
I0423 18:30:15.151499 17664 misc.py:27] episode 312, reward -2.770409, avg reward -3.206015, total steps 39936, episode step 128
[2020-04-23 18:30:15,151] episode 312, reward -2.770409, avg reward -3.206015, total steps 39936, episode step 128
I0423 18:30:19.900205 17664 misc.py:27] episode 313, reward 0.750926, avg reward -3.195901, total steps 40064, episode step 128
[2020-04-23 18:30:19,900] episode 313, reward 0.750926, avg reward -3.195901, total steps 40064, episode step 128
I0423 18:30:24.675281 17664 misc.py:27] episode 314, reward -7.463086, avg reward -3.162997, total steps 40192, episode step 128
[2020-04-23 18:30:24,675] episode 314, reward -7.463086, avg reward -3.162997, total steps 40192, episode step 128
I0423 18:30:29.289466 17664 misc.py:27] episode 315, reward -4.312718, avg reward -3.245922, total steps 40320, episode step 128
[2020-04-23 18:30:29,289] 

[2020-04-23 18:32:43,238] episode 342, reward -8.643971, avg reward -3.657812, total steps 43776, episode step 128
I0423 18:32:48.493866 17664 misc.py:27] episode 343, reward -4.591731, avg reward -3.696953, total steps 43904, episode step 128
[2020-04-23 18:32:48,493] episode 343, reward -4.591731, avg reward -3.696953, total steps 43904, episode step 128


# History

In [None]:
# plot rewards
# Linear (order=1) regression of the online rewards against the cumulative
# step count; the 'step' column is built inside load_stats_ddpg.
plt.figure()
df_online, df = load_stats_ddpg(agent)
sns.regplot(x="step", y="rewards", data=df_online, order=1)

In [None]:
# monthly growth
# NOTE(review): nothing in this cell establishes a monthly horizon -- confirm the label.
# 1 + mean of the last 100 online reward entries.
# NOTE(review): rewards that pass through DeepRLWrapper.step are multiplied by
# 1e4; confirm the stored rewards are on the same scale as `returns` before
# comparing the two numbers.
portfolio_return = (1+df_online.rewards[-100:].mean())

# first entry along the first and last axes of the env's source data
# (presumably the price-change series of one asset -- verify against the data source)
returns = task.unwrapped.src.data[0,:,:1]
market_return = (1+returns).mean()
market_return, portfolio_return

# Test

In [None]:
def test_algo(env, algo, seed=0):
    """
    Runs and algo from https://github.com/Marigold/universal-portfolios on env
    
    https://github.com/Marigold/universal-portfolios/commit/e8970a82427522ef11b1c3cbf681e18b5fe8169c
    """
    env.seed(0)
    np.random.seed(0)

    state = env.reset()
    for i in range(env.unwrapped.sim.steps):
        
        history= pd.DataFrame(state[0,:,:], columns=env.unwrapped.src.asset_names)
        # MPT wants a cash column, and it should be first
        history['CASH']=1
        history=history[['CASH'] + env.unwrapped.src.asset_names]
#         cols = list(history.columns)
#         cols[0]='CASH'
#         history.columns = cols
        
        x=history.iloc[-1]
        
        last_b = env.unwrapped.sim.w0#[1:]

        algo.init_step(history)
        # some don't want history
        try:
            action = algo.step(x, last_b, history)
        except TypeError:
            action = algo.step(x, last_b)
        
        # might by dataframe
        action = getattr(action, 'value', action)
        
        # For upt
        if isinstance(action, np.matrixlib.defmatrix.matrix):
            action = np.array(action.tolist()).T[0]
            
        

        state, reward, done, info = env.step(action)

        if done:
            break   
    df = pd.DataFrame(env.unwrapped.infos)
    df.index = pd.to_datetime(df['date']*1e9)
    return df['portfolio_value'], df

In [None]:
# use test env
# Reload the test split, build a new wrapped test environment and point the
# trained agent at it.
df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
# upper bound on the episode length used by DDPGAgent.episode; the env itself
# is still built with `steps` by task_fn_test
test_steps=5000
env_test = task_fn_test()
agent.task = env_test
agent.config.max_episode_length = test_steps
agent.task.reset()
np.random.seed(0)

# run in deterministic mode, no training, no exploration
# (episode(True) adds no noise, feeds no replay memory and performs no update)
agent.episode(True)
agent.task.render('notebook')
agent.task.render('notebook', True)

# Collect the per-step info records of the test episode, indexed by datetime.
df = pd.DataFrame(agent.task.unwrapped.infos)
df.index = pd.to_datetime(df['date']*1e9)

In [None]:
from UniversalPortfolios.universal import algos
# NOTE(review): `env` is only referenced by a commented-out entry below.
env = task.unwrapped
# Turn the per-step 'price*' columns into cumulative products.
# NOTE: this overwrites `df` in place, so re-running the cell compounds the
# columns again.
price_cols = [col for col in df.columns if col.startswith('price')]
for col in price_cols:
    df[col]=df[col].cumprod()

df = df[price_cols + ['portfolio_value']]
    
# Benchmark algorithms to compare against the agent (others left disabled).
algo_dict=dict(
    # Pick the same is in https://arxiv.org/pdf/1706.10059.pdf
    # Benchmarks
#     UCRP=algos.UP(),
    
    # Follow the winner
    BestSoFar=algos.BestSoFar(cov_window=env_test.unwrapped.src.window_length-1),
#     UniversalPortfolio=algos.UP(eval_points=1000),
    ONS=algos.ONS(),
    
    # Follow the loser
#     OnlineMovingAverageReversion=algos.OLMAR(window=env.src.window_length-1, eps=10), 
    RMR=algos.RMR(window=env_test.unwrapped.src.window_length-1, eps=10),
#     PassiveAggressiveMeanReversion=algos.PAMR(),
    
    # Pattern matching
    #     CorrelationDrivenNonparametricLearning=algos.CORN(window=30),
)
# Run every benchmark on the test env and add its portfolio value as a column.
# Assigning `df.index` to `perf` requires both runs to have the same length.
for name, algo in algo_dict.items():
    print(name)
    perf, _ = test_algo(env_test, algo)
    perf.index=df.index
    df[name]=perf

# put portfolio value at end so we plot it on top and can therefore see it
cols = list(df.columns.drop('portfolio_value'))+['portfolio_value']
df=df[cols]


df.plot(alpha=0.5)