This notebook is a little easier for beginners because it uses pytorch. You need to clone a repo to get it working:

```sh
# you need this repo, so clone it
git clone https://github.com/wassname/DeepRL.git
cd DeepRL
git reset --hard aeae2c5d585e5853dc638968b1f090eb60abd351
cd ..
mkdir data log evaluation_log
```

This contains some minor modifications from https://github.com/ShangtongZhang/DeepRL.git

The notebook tries DPPG with the [EIIE model](https://arxiv.org/pdf/1706.10059.pdf)

I also uncommented reward normalization in DDPG_agent.py#L64 because otherwise my small reward les to large Q's, inf losses, and NaN actions and weights.

In [1]:
# plotting
%matplotlib notebook
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# numeric
import numpy as np
from numpy import random
import pandas as pd

# utils
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import tempfile
import logging
import time
import datetime

# logging
logger = log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
logging.basicConfig()
log.info('%s logger started.', __name__)

INFO:__main__:__main__ logger started.


In [2]:
import os
os.sys.path.append(os.path.abspath('.'))
os.sys.path.append(os.path.abspath('DeepRL'))
%reload_ext autoreload
%autoreload 2

In [3]:
# params
window_length = 50
steps = 128



In [4]:
# save dir
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')

save_path = './outputs/pytorch-DDPG/pytorch-DDPG-EIIE-action-crypto-%s.model' % ts
print(save_path)
try:
    os.makedirs(os.path.dirname(save_path))
except OSError:
    pass

./outputs/pytorch-DDPG/pytorch-DDPG-EIIE-action-crypto-20200502_00-32-15.model


In [5]:
# setup tensorboard logging
from tensorboard_logger import configure, log_value
tag = 'ddpg-' + ts
print('tensorboard --logdir '+"runs/" + tag)
try:
    configure("runs/" + tag)
except ValueError as e:
    print(e)
    pass

tensorboard --logdir runs/ddpg-20200502_00-32-15


# Env

In [6]:
from rl_portfolio_management.environments.portfolio import PortfolioEnv
from rl_portfolio_management.util import MDD, sharpe, softmax
from rl_portfolio_management.wrappers import SoftmaxActions, TransposeHistory, ConcatStates

df_train = pd.read_hdf('./data/poloniex_30m.hf',key='train')
df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')


In [7]:
import gym
class DeepRLWrapper(gym.Wrapper):
    def __init__(self, env):
        super().__init__(env)
        self.render_on_reset = False
        
        self.state_dim = self.observation_space.shape
        self.action_dim = self.action_space.shape[0]
        
        self.name = 'PortfolioEnv'
        self.success_threshold = 2
        
    def normalize_state(self, state):
        return state
    
    def step(self, action):
        state, reward, done, info =self.env.step(action)
        reward*=1e4 # often reward scaling is important sooo...
        return state, reward, done, info
    
    def reset(self):        
        # here's a roundabout way to get it to plot on reset
        if self.render_on_reset: 
            self.env.render('notebook')

        return self.env.reset()

In [8]:
def task_fn():
    env = PortfolioEnv(df=df_train, steps=steps, output_mode='EIIE')
    env = TransposeHistory(env)
    env = ConcatStates(env)
    env = SoftmaxActions(env)
    env = DeepRLWrapper(env)
    return env

def task_fn_test():
    env = PortfolioEnv(df=df_test, steps=steps, output_mode='EIIE')
    env = TransposeHistory(env)
    env = ConcatStates(env)
    env = SoftmaxActions(env)
    env = DeepRLWrapper(env)
    return env
    
# sanity check
task = task_fn()
task.reset().shape, task.step(task.action_space.sample())[0].shape

  data = df.as_matrix().reshape(


((4, 51, 3), (4, 51, 3))

# Agent and models

In [9]:
# load
import pickle
import shutil

def save_ddpg(agent):
    agent_type = agent.__class__.__name__
    save_file = 'data/%s-%s-model-%s.bin' % (agent_type, config.tag, agent.task.name)
    agent.save(save_file)
    print(save_file)
    

def load_ddpg(agent):
    agent_type = agent.__class__.__name__
    save_file = 'data/%s-%s-model-%s.bin' % (agent_type, config.tag, agent.task.name)
    new_states = pickle.load(open(save_file, 'rb'))
    states = agent.worker_network.load_state_dict(new_states)


def load_stats_ddpg(agent):
    agent_type = agent.__class__.__name__
    online_stats_file = 'data/%s-%s-online-stats-%s.bin' % (
                    agent_type, config.tag, agent.task.name)
    try:
        steps, rewards = pickle.load(open(online_stats_file, 'rb'))
    except FileNotFoundError:
        steps =[]
        rewards=[]
    df_online = pd.DataFrame(np.array([steps, rewards]).T, columns=['steps','rewards'])
    if len(df_online):
        df_online['step'] = df_online['steps'].cumsum()
        df_online.index.name = 'episodes'
    
    stats_file = 'data/%s-%s-all-stats-%s.bin' % (agent_type, config.tag, agent.task.name)

    try:
        stats = pickle.load(open(stats_file, 'rb'))
    except FileNotFoundError:
        stats = {}
    df = pd.DataFrame(stats["test_rewards"], columns=['rewards'])
    if len(df):
#         df["steps"]=range(len(df))*50

        df.index.name = 'episodes'
    return df_online, df

In [10]:
import logging
from DeepRL.agent import ProximalPolicyOptimization
from DeepRL.network import DisjointActorCriticNet #, DeterministicActorNet, DeterministicCriticNet
from DeepRL.component import GaussianPolicy, HighDimActionReplay, OrnsteinUhlenbeckProcess
from DeepRL.utils import Config, Logger
import gym
import torch
gym.logger.setLevel(logging.INFO)

# Alg

In [11]:
# Modified from https://github.com/ShangtongZhang/DeepRL to log to tensorboard

from DeepRL.utils.normalizer import Normalizer

null_normaliser = lambda x:x

class DDPGAgent:
    def __init__(self, config):
        self.config = config
        self.task = config.task_fn()
        self.worker_network = config.network_fn()
        self.target_network = config.network_fn()
        self.target_network.load_state_dict(self.worker_network.state_dict())
        self.actor_opt = config.actor_optimizer_fn(self.worker_network.actor.parameters())
        self.critic_opt = config.critic_optimizer_fn(self.worker_network.critic.parameters())
        self.replay = config.replay_fn()
        self.random_process = config.random_process_fn()
        self.criterion = nn.MSELoss()
        self.total_steps = 0

        self.state_normalizer = Normalizer(self.task.state_dim) # null_normaliser # 
        self.reward_normalizer = Normalizer(1)

    def soft_update(self, target, src):
        for target_param, param in zip(target.parameters(), src.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.config.target_network_mix) +
                                    param.data * self.config.target_network_mix)

    def save(self, file_name):
        with open(file_name, 'wb') as f:
            torch.save(self.worker_network.state_dict(), f)

    def episode(self, deterministic=False, video_recorder=None):
        self.random_process.reset_states()
        state = self.task.reset()
        state = self.state_normalizer(state)

        config = self.config
        actor = self.worker_network.actor
        critic = self.worker_network.critic
        target_actor = self.target_network.actor
        target_critic = self.target_network.critic

        steps = 0
        total_reward = 0.0
        while True:
            actor.eval()
            action = actor.predict(np.stack([state])).flatten()
            if not deterministic:
                action += self.random_process.sample()
            next_state, reward, done, info = self.task.step(action)
            if video_recorder is not None:
                video_recorder.capture_frame()
            done = (done or (config.max_episode_length and steps >= config.max_episode_length))
            next_state = self.state_normalizer(next_state) * config.reward_scaling
            total_reward += reward
            
            # tensorboard logging
            prefix = 'test_' if deterministic else ''
            log_value(prefix + 'reward', reward, self.total_steps)
#             log_value(prefix + 'action', action, steps)
            log_value('memory_size', self.replay.size(), self.total_steps)     
            for key in info:
                log_value(key, info[key], self.total_steps)     
            
            reward = self.reward_normalizer(reward)

            if not deterministic:
                self.replay.feed([state, action, reward, next_state, int(done)])
                self.total_steps += 1

            steps += 1
            state = next_state

            if done:
                break

            if not deterministic and self.replay.size() >= config.min_memory_size:
                self.worker_network.train()
                experiences = self.replay.sample()
                states, actions, rewards, next_states, terminals = experiences
                q_next = target_critic.predict(next_states, target_actor.predict(next_states))
                terminals = critic.to_torch_variable(terminals).unsqueeze(1)
                rewards = critic.to_torch_variable(rewards).unsqueeze(1)
                q_next = config.discount * q_next * (1 - terminals)
                q_next.add_(rewards)
                q_next = q_next.detach()
                q = critic.predict(states, actions)
                critic_loss = self.criterion(q, q_next)

                critic.zero_grad()
                self.critic_opt.zero_grad()
                critic_loss.backward()
                if config.gradient_clip:
                    grad_critic = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
                self.critic_opt.step()

                actions = actor.predict(states, False)
                var_actions = Variable(actions.data, requires_grad=True)
                q = critic.predict(states, var_actions)
                q.backward(torch.ones(q.size()))

                actor.zero_grad()
                self.actor_opt.zero_grad()
                actions.backward(-var_actions.grad.data)
                if config.gradient_clip:
                    grad_actor = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
                self.actor_opt.step()
                
                # tensorboard logging
                log_value('critic_loss', critic_loss.cpu().data.numpy().squeeze(), self.total_steps)
                log_value('loss_action', -q.sum(), self.total_steps)
                if config.gradient_clip:
                    log_value('grad_critic', grad_critic, self.total_steps)
                    log_value('grad_actor', grad_actor, self.total_steps)

                self.soft_update(self.target_network, self.worker_network)

        return total_reward, steps

# Model

In [12]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [13]:
task.state_dim, task.action_dim

((4, 51, 3), 4)

In [14]:

from DeepRL.network.base_network import BasicNet

class DeterministicActorNet(nn.Module, BasicNet):
    def __init__(self,
                 state_dim,
                 action_dim,
                 action_gate,
                 action_scale,
                 gpu=False,
                 batch_norm=False,
                 non_linear=F.relu):
        super(DeterministicActorNet, self).__init__()

        stride_time = state_dim[1] - 1 - 2 #
        features = task.state_dim[0]
        h0 = 2
        h1 = 30
        self.conv1 = nn.Conv2d(features, h0, (3, 1))
        self.conv2 = nn.Conv2d(h0, h1, (stride_time, 1), stride=(stride_time, 1))
        self.conv3 = nn.Conv2d((h1+1), 1, (1, 1))

        self.action_scale = action_scale
        self.action_gate = action_gate
        self.non_linear = non_linear

        if batch_norm:
            self.bn1 = nn.BatchNorm1d(h0)
            self.bn2 = nn.BatchNorm1d(h1)

        self.batch_norm = batch_norm
        BasicNet.__init__(self, None, gpu, False)


    def forward(self, x):
        x = self.to_torch_variable(x)
        
        w0 = x[:,:1,:1,:] # weights from last step 
        x = x[:,:,1:,:]
        
        phi0 = self.non_linear(self.conv1(x))
        if self.batch_norm:
            phi0 = self.bn1(phi0)
        phi1 = self.non_linear(self.conv2(phi0))
        h = torch.cat([phi1,w0], 1)
        if self.batch_norm:
            h = self.bn2(h)
        
        action = self.conv3(h)
        
        # add cash_bias before we softmax
        cash_bias_int = 0
        cash_bias = self.to_torch_variable(torch.ones(action.size())[:,:,:,:1] * cash_bias_int)
        action = torch.cat([cash_bias, action], -1)
        
        batch_size = action.size()[0]
        action = action.view((batch_size,-1))
        if self.action_gate:
            action = self.action_scale * self.action_gate(action)
        return action

    def predict(self, x, to_numpy=True):
        y = self.forward(x)
        if to_numpy:
            y = y.cpu().data.numpy()
        return y

class DeterministicCriticNet(nn.Module, BasicNet):
    def __init__(self,
                 state_dim,
                 action_dim,
                 gpu=False,
                 batch_norm=False,
                 non_linear=F.relu):
        super(DeterministicCriticNet, self).__init__()
        stride_time = state_dim[1] - 1 - 2 #
        self.features = features = task.state_dim[0]
        h0=2
        h1=20
        self.action = actions = action_dim -1
        self.conv1 = nn.Conv2d(features, h0, (3, 1))
        self.conv2 = nn.Conv2d(h0, h1, (stride_time, 1), stride=(stride_time, 1))
        self.layer3 = nn.Linear((h1+2)*actions, 1)
        self.non_linear = non_linear

        if batch_norm:
            self.bn1 = nn.BatchNorm1d(h0)
            self.bn2 = nn.BatchNorm1d(h1)
        self.batch_norm = batch_norm

        BasicNet.__init__(self, None, gpu, False)


    def forward(self, x, action):
        x = self.to_torch_variable(x)
        action = self.to_torch_variable(action)[:,None,None,:-1] # remove cash bias
        
        w0 = x[:,:1,:1,:] # weights from last step 
        x = x[:,:,1:,:]
        
        phi0 = self.non_linear(self.conv1(x))
        if self.batch_norm:
            phi0 = self.bn1(phi0)
        phi1 = self.non_linear(self.conv2(phi0))
        h = torch.cat([phi1,w0,action], 1)
        if self.batch_norm:
            h = self.bn2(h)
        
        batch_size = x.size()[0]
        action = self.layer3(h.view((batch_size,-1)))
        return action

    def predict(self, x, action):
        return self.forward(x, action)

# Config

In [15]:
config = Config()
config.task_fn = task_fn
task = config.task_fn()
config.actor_network_fn = lambda: DeterministicActorNet(
    task.state_dim, task.action_dim, action_gate=None, action_scale=1.0, non_linear=F.relu, batch_norm=False, gpu=False)
config.critic_network_fn = lambda: DeterministicCriticNet(
    task.state_dim, task.action_dim, non_linear=F.relu, batch_norm=False, gpu=False)
config.network_fn = lambda: DisjointActorCriticNet(config.actor_network_fn, config.critic_network_fn)
config.actor_optimizer_fn = lambda params: torch.optim.Adam(params, lr=4e-5)
config.critic_optimizer_fn =\
    lambda params: torch.optim.Adam(params, lr=5e-4, weight_decay=0.001)
config.replay_fn = lambda: HighDimActionReplay(memory_size=600, batch_size=64)
config.random_process_fn = \
    lambda: OrnsteinUhlenbeckProcess(size=task.action_dim, theta=0.15, sigma=0.2, sigma_min=0.00002, n_steps_annealing=10000)
config.discount = 0.0

config.min_memory_size = 50
config.target_network_mix = 0.001
config.max_steps = 300000
config.max_episode_length = 3000 
config.target_network_mix = 0.01
config.noise_decay_interval = 100000
config.gradient_clip = 20
config.min_epsilon = 0.1

# Many papers have found rewards scaling to be an important parameter. But while they focus on the scaling factor
# I think they should focus on the end variance with a range of 200-400. e.g. https://arxiv.org/pdf/1709.06560.pdf
# Hard to tell for sure without experiments to prove it
config.reward_scaling = 1000

# config.test_interval = 10 # ORIGINALLY
config.test_interval = 2 # TODO: Remove (quick test)
config.test_repetitions = 1
# config.save_interval = 40 # ORIGINALLY
config.save_interval = 4 # TODO: Remove (quick test)
config.logger = Logger('./log', gym.logger)
config.tag = tag
agent = DDPGAgent(config)
agent

  data = df.as_matrix().reshape(


<__main__.DDPGAgent at 0x1986f54bb50>

# Train

In [16]:
from DeepRL.utils import run_episodes
agent.task._plot = agent.task._plot2 = None
try:    
    run_episodes(agent)
except KeyboardInterrupt as e:
    save_ddpg(agent)
    raise(e)

  grad_critic = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
  grad_actor = nn.utils.clip_grad_norm(self.worker_network.parameters(), config.gradient_clip)
INFO:gym:episode 1, reward -29.587100, avg reward -29.587100, total steps 128, episode step 128
[2020-05-01 20:32:26,190] episode 1, reward -29.587100, avg reward -29.587100, total steps 128, episode step 128
INFO:gym:episode 2, reward -1.128720, avg reward -15.357910, total steps 256, episode step 128
[2020-05-01 20:32:29,091] episode 2, reward -1.128720, avg reward -15.357910, total steps 256, episode step 128
INFO:gym:Testing...
[2020-05-01 20:32:29,092] Testing...
INFO:gym:Avg reward -0.655801(0.000000)
[2020-05-01 20:32:29,759] Avg reward -0.655801(0.000000)
INFO:gym:episode 3, reward -0.221614, avg reward -10.312478, total steps 384, episode step 128
[2020-05-01 20:32:32,664] episode 3, reward -0.221614, avg reward -10.312478, total steps 384, episode step 128
INFO:gym:episode 4, reward -0.14

INFO:gym:Testing...
[2020-05-01 20:33:58,152] Testing...
INFO:gym:Avg reward -0.161165(0.000000)
[2020-05-01 20:33:58,801] Avg reward -0.161165(0.000000)
INFO:gym:episode 29, reward -0.200487, avg reward -1.361906, total steps 3712, episode step 128
[2020-05-01 20:34:01,968] episode 29, reward -0.200487, avg reward -1.361906, total steps 3712, episode step 128
INFO:gym:episode 30, reward -0.520730, avg reward -1.333866, total steps 3840, episode step 128
[2020-05-01 20:34:05,014] episode 30, reward -0.520730, avg reward -1.333866, total steps 3840, episode step 128
INFO:gym:Testing...
[2020-05-01 20:34:05,015] Testing...
INFO:gym:Avg reward -0.379305(0.000000)
[2020-05-01 20:34:05,679] Avg reward -0.379305(0.000000)
INFO:gym:episode 31, reward -0.658563, avg reward -1.312082, total steps 3968, episode step 128
[2020-05-01 20:34:08,729] episode 31, reward -0.658563, avg reward -1.312082, total steps 3968, episode step 128
INFO:gym:episode 32, reward -0.559527, avg reward -1.288565, tota

[2020-05-01 20:35:38,118] episode 57, reward -0.516704, avg reward -0.830167, total steps 7296, episode step 128
INFO:gym:episode 58, reward 0.002390, avg reward -0.815813, total steps 7424, episode step 128
[2020-05-01 20:35:40,985] episode 58, reward 0.002390, avg reward -0.815813, total steps 7424, episode step 128
INFO:gym:Testing...
[2020-05-01 20:35:40,987] Testing...
INFO:gym:Avg reward 0.112231(0.000000)
[2020-05-01 20:35:41,634] Avg reward 0.112231(0.000000)
INFO:gym:episode 59, reward -0.283493, avg reward -0.806790, total steps 7552, episode step 128
[2020-05-01 20:35:44,567] episode 59, reward -0.283493, avg reward -0.806790, total steps 7552, episode step 128
INFO:gym:episode 60, reward 0.075921, avg reward -0.792078, total steps 7680, episode step 128
[2020-05-01 20:35:47,493] episode 60, reward 0.075921, avg reward -0.792078, total steps 7680, episode step 128
INFO:gym:Testing...
[2020-05-01 20:35:47,496] Testing...
INFO:gym:Avg reward -0.606656(0.000000)
[2020-05-01 20:

[2020-05-01 20:37:18,502] episode 86, reward -0.357270, avg reward -0.658624, total steps 11008, episode step 128
INFO:gym:Testing...
[2020-05-01 20:37:18,504] Testing...
INFO:gym:Avg reward -0.641884(0.000000)
[2020-05-01 20:37:19,192] Avg reward -0.641884(0.000000)
INFO:gym:episode 87, reward -0.183894, avg reward -0.653168, total steps 11136, episode step 128
[2020-05-01 20:37:22,353] episode 87, reward -0.183894, avg reward -0.653168, total steps 11136, episode step 128
INFO:gym:episode 88, reward -0.070586, avg reward -0.646547, total steps 11264, episode step 128
[2020-05-01 20:37:25,344] episode 88, reward -0.070586, avg reward -0.646547, total steps 11264, episode step 128
INFO:gym:Testing...
[2020-05-01 20:37:25,347] Testing...
INFO:gym:Avg reward -0.282398(0.000000)
[2020-05-01 20:37:26,018] Avg reward -0.282398(0.000000)
INFO:gym:episode 89, reward -0.502793, avg reward -0.644932, total steps 11392, episode step 128
[2020-05-01 20:37:29,231] episode 89, reward -0.502793, avg

INFO:gym:Testing...
[2020-05-01 20:38:54,166] Testing...
INFO:gym:Avg reward -0.261867(0.000000)
[2020-05-01 20:38:54,791] Avg reward -0.261867(0.000000)
INFO:gym:episode 115, reward -0.631434, avg reward -0.307891, total steps 14720, episode step 128
[2020-05-01 20:38:57,676] episode 115, reward -0.631434, avg reward -0.307891, total steps 14720, episode step 128
INFO:gym:episode 116, reward -0.664625, avg reward -0.310813, total steps 14848, episode step 128
[2020-05-01 20:39:00,759] episode 116, reward -0.664625, avg reward -0.310813, total steps 14848, episode step 128
INFO:gym:Testing...
[2020-05-01 20:39:00,763] Testing...
INFO:gym:Avg reward 0.064010(0.000000)
[2020-05-01 20:39:01,446] Avg reward 0.064010(0.000000)
INFO:gym:episode 117, reward -0.113996, avg reward -0.316088, total steps 14976, episode step 128
[2020-05-01 20:39:04,593] episode 117, reward -0.113996, avg reward -0.316088, total steps 14976, episode step 128
INFO:gym:episode 118, reward -0.065186, avg reward -0.3

[2020-05-01 20:40:31,969] Avg reward -0.159929(0.000000)
INFO:gym:episode 143, reward 0.441708, avg reward -0.259617, total steps 18304, episode step 128
[2020-05-01 20:40:35,061] episode 143, reward 0.441708, avg reward -0.259617, total steps 18304, episode step 128
INFO:gym:episode 144, reward -0.238722, avg reward -0.256837, total steps 18432, episode step 128
[2020-05-01 20:40:38,117] episode 144, reward -0.238722, avg reward -0.256837, total steps 18432, episode step 128
INFO:gym:Testing...
[2020-05-01 20:40:38,120] Testing...
INFO:gym:Avg reward -0.193560(0.000000)
[2020-05-01 20:40:38,787] Avg reward -0.193560(0.000000)
INFO:gym:episode 145, reward 0.298651, avg reward -0.249619, total steps 18560, episode step 128
[2020-05-01 20:40:41,842] episode 145, reward 0.298651, avg reward -0.249619, total steps 18560, episode step 128
INFO:gym:episode 146, reward -0.035982, avg reward -0.248234, total steps 18688, episode step 128
[2020-05-01 20:40:44,914] episode 146, reward -0.035982,

[2020-05-01 20:42:11,054] episode 171, reward -0.012454, avg reward -0.206918, total steps 21888, episode step 128
INFO:gym:episode 172, reward -0.161521, avg reward -0.206383, total steps 22016, episode step 128
[2020-05-01 20:42:14,000] episode 172, reward -0.161521, avg reward -0.206383, total steps 22016, episode step 128
INFO:gym:Testing...
[2020-05-01 20:42:14,004] Testing...
INFO:gym:Avg reward -0.031882(0.000000)
[2020-05-01 20:42:14,617] Avg reward -0.031882(0.000000)
INFO:gym:episode 173, reward -0.207308, avg reward -0.206286, total steps 22144, episode step 128
[2020-05-01 20:42:17,547] episode 173, reward -0.207308, avg reward -0.206286, total steps 22144, episode step 128
INFO:gym:episode 174, reward -0.010998, avg reward -0.198367, total steps 22272, episode step 128
[2020-05-01 20:42:20,454] episode 174, reward -0.010998, avg reward -0.198367, total steps 22272, episode step 128
INFO:gym:Testing...
[2020-05-01 20:42:20,455] Testing...
INFO:gym:Avg reward 0.012138(0.0000

INFO:gym:episode 200, reward -0.020583, avg reward -0.159519, total steps 25600, episode step 128
[2020-05-01 20:43:48,660] episode 200, reward -0.020583, avg reward -0.159519, total steps 25600, episode step 128
INFO:gym:Testing...
[2020-05-01 20:43:48,664] Testing...
INFO:gym:Avg reward -0.176422(0.000000)
[2020-05-01 20:43:49,344] Avg reward -0.176422(0.000000)
INFO:gym:episode 201, reward -0.215749, avg reward -0.154739, total steps 25728, episode step 128
[2020-05-01 20:43:52,858] episode 201, reward -0.215749, avg reward -0.154739, total steps 25728, episode step 128
INFO:gym:episode 202, reward -0.007186, avg reward -0.152620, total steps 25856, episode step 128
[2020-05-01 20:43:56,075] episode 202, reward -0.007186, avg reward -0.152620, total steps 25856, episode step 128
INFO:gym:Testing...
[2020-05-01 20:43:56,077] Testing...
INFO:gym:Avg reward -0.060087(0.000000)
[2020-05-01 20:43:56,712] Avg reward -0.060087(0.000000)
INFO:gym:episode 203, reward -0.073093, avg reward -0

[2020-05-01 20:45:25,854] episode 228, reward -0.057741, avg reward -0.098399, total steps 29184, episode step 128
INFO:gym:Testing...
[2020-05-01 20:45:25,857] Testing...
INFO:gym:Avg reward -0.032594(0.000000)
[2020-05-01 20:45:26,493] Avg reward -0.032594(0.000000)
INFO:gym:episode 229, reward -0.517262, avg reward -0.101853, total steps 29312, episode step 128
[2020-05-01 20:45:29,676] episode 229, reward -0.517262, avg reward -0.101853, total steps 29312, episode step 128
INFO:gym:episode 230, reward -0.076241, avg reward -0.100970, total steps 29440, episode step 128
[2020-05-01 20:45:32,993] episode 230, reward -0.076241, avg reward -0.100970, total steps 29440, episode step 128
INFO:gym:Testing...
[2020-05-01 20:45:32,994] Testing...
INFO:gym:Avg reward -0.094437(0.000000)
[2020-05-01 20:45:33,631] Avg reward -0.094437(0.000000)
INFO:gym:episode 231, reward -0.226810, avg reward -0.101953, total steps 29568, episode step 128
[2020-05-01 20:45:36,867] episode 231, reward -0.2268

INFO:gym:Testing...
[2020-05-01 20:47:08,821] Testing...
INFO:gym:Avg reward -0.011902(0.000000)
[2020-05-01 20:47:09,515] Avg reward -0.011902(0.000000)
INFO:gym:episode 257, reward -0.001753, avg reward -0.083733, total steps 32896, episode step 128
[2020-05-01 20:47:12,888] episode 257, reward -0.001753, avg reward -0.083733, total steps 32896, episode step 128
INFO:gym:episode 258, reward -0.074350, avg reward -0.083595, total steps 33024, episode step 128
[2020-05-01 20:47:16,226] episode 258, reward -0.074350, avg reward -0.083595, total steps 33024, episode step 128
INFO:gym:Testing...
[2020-05-01 20:47:16,227] Testing...
INFO:gym:Avg reward -0.001723(0.000000)
[2020-05-01 20:47:16,823] Avg reward -0.001723(0.000000)
INFO:gym:episode 259, reward -0.052342, avg reward -0.082181, total steps 33152, episode step 128
[2020-05-01 20:47:20,322] episode 259, reward -0.052342, avg reward -0.082181, total steps 33152, episode step 128
INFO:gym:episode 260, reward -0.004324, avg reward -0

[2020-05-01 20:48:53,951] Avg reward -0.020206(0.000000)
INFO:gym:episode 285, reward -0.002733, avg reward -0.061995, total steps 36480, episode step 128
[2020-05-01 20:48:56,964] episode 285, reward -0.002733, avg reward -0.061995, total steps 36480, episode step 128
INFO:gym:episode 286, reward -0.012581, avg reward -0.062103, total steps 36608, episode step 128
[2020-05-01 20:49:00,059] episode 286, reward -0.012581, avg reward -0.062103, total steps 36608, episode step 128
INFO:gym:Testing...
[2020-05-01 20:49:00,061] Testing...
INFO:gym:Avg reward 0.002133(0.000000)
[2020-05-01 20:49:00,698] Avg reward 0.002133(0.000000)
INFO:gym:episode 287, reward 0.016240, avg reward -0.061146, total steps 36736, episode step 128
[2020-05-01 20:49:03,676] episode 287, reward 0.016240, avg reward -0.061146, total steps 36736, episode step 128
INFO:gym:episode 288, reward 0.052743, avg reward -0.059757, total steps 36864, episode step 128
[2020-05-01 20:49:06,659] episode 288, reward 0.052743, a

[2020-05-01 20:50:33,579] episode 313, reward -0.016821, avg reward -0.047985, total steps 40064, episode step 128
INFO:gym:episode 314, reward -0.019496, avg reward -0.048161, total steps 40192, episode step 128
[2020-05-01 20:50:36,841] episode 314, reward -0.019496, avg reward -0.048161, total steps 40192, episode step 128
INFO:gym:Testing...
[2020-05-01 20:50:36,843] Testing...
INFO:gym:Avg reward -0.180277(0.000000)
[2020-05-01 20:50:37,444] Avg reward -0.180277(0.000000)
INFO:gym:episode 315, reward -0.065780, avg reward -0.047646, total steps 40320, episode step 128
[2020-05-01 20:50:40,622] episode 315, reward -0.065780, avg reward -0.047646, total steps 40320, episode step 128
INFO:gym:episode 316, reward -0.089481, avg reward -0.047817, total steps 40448, episode step 128
[2020-05-01 20:50:43,749] episode 316, reward -0.089481, avg reward -0.047817, total steps 40448, episode step 128
INFO:gym:Testing...
[2020-05-01 20:50:43,754] Testing...
INFO:gym:Avg reward -0.036050(0.000

INFO:gym:episode 342, reward -0.001982, avg reward -0.034827, total steps 43776, episode step 128
[2020-05-01 20:52:19,373] episode 342, reward -0.001982, avg reward -0.034827, total steps 43776, episode step 128
INFO:gym:Testing...
[2020-05-01 20:52:19,374] Testing...
INFO:gym:Avg reward -0.001758(0.000000)
[2020-05-01 20:52:20,001] Avg reward -0.001758(0.000000)
INFO:gym:episode 343, reward -0.021335, avg reward -0.034425, total steps 43904, episode step 128
[2020-05-01 20:52:23,383] episode 343, reward -0.021335, avg reward -0.034425, total steps 43904, episode step 128
INFO:gym:episode 344, reward -0.027292, avg reward -0.036563, total steps 44032, episode step 128
[2020-05-01 20:52:26,594] episode 344, reward -0.027292, avg reward -0.036563, total steps 44032, episode step 128
INFO:gym:Testing...
[2020-05-01 20:52:26,598] Testing...
INFO:gym:Avg reward -0.107869(0.000000)
[2020-05-01 20:52:27,223] Avg reward -0.107869(0.000000)
INFO:gym:episode 345, reward -0.173317, avg reward -0

[2020-05-01 20:54:03,320] episode 370, reward -0.019608, avg reward -0.035133, total steps 47360, episode step 128
INFO:gym:Testing...
[2020-05-01 20:54:03,322] Testing...
INFO:gym:Avg reward -0.008990(0.000000)
[2020-05-01 20:54:03,950] Avg reward -0.008990(0.000000)
INFO:gym:episode 371, reward -0.002012, avg reward -0.035015, total steps 47488, episode step 128
[2020-05-01 20:54:07,391] episode 371, reward -0.002012, avg reward -0.035015, total steps 47488, episode step 128
INFO:gym:episode 372, reward -0.085379, avg reward -0.036436, total steps 47616, episode step 128
[2020-05-01 20:54:10,787] episode 372, reward -0.085379, avg reward -0.036436, total steps 47616, episode step 128
INFO:gym:Testing...
[2020-05-01 20:54:10,791] Testing...
INFO:gym:Avg reward 0.015271(0.000000)
[2020-05-01 20:54:11,437] Avg reward 0.015271(0.000000)
INFO:gym:episode 373, reward -0.001818, avg reward -0.036391, total steps 47744, episode step 128
[2020-05-01 20:54:14,953] episode 373, reward -0.001818

INFO:gym:Testing...
[2020-05-01 20:55:44,055] Testing...
INFO:gym:Avg reward -0.020468(0.000000)
[2020-05-01 20:55:44,700] Avg reward -0.020468(0.000000)
INFO:gym:episode 399, reward -0.001773, avg reward -0.038798, total steps 51072, episode step 128
[2020-05-01 20:55:47,890] episode 399, reward -0.001773, avg reward -0.038798, total steps 51072, episode step 128
INFO:gym:episode 400, reward -0.001736, avg reward -0.038275, total steps 51200, episode step 128
[2020-05-01 20:55:50,967] episode 400, reward -0.001736, avg reward -0.038275, total steps 51200, episode step 128
INFO:gym:Testing...
[2020-05-01 20:55:50,971] Testing...
INFO:gym:Avg reward -0.027118(0.000000)
[2020-05-01 20:55:51,644] Avg reward -0.027118(0.000000)
INFO:gym:episode 401, reward -0.003762, avg reward -0.037833, total steps 51328, episode step 128
[2020-05-01 20:55:54,688] episode 401, reward -0.003762, avg reward -0.037833, total steps 51328, episode step 128
INFO:gym:episode 402, reward -0.001815, avg reward -0

data/DDPGAgent-ddpg-20200502_00-32-15-model-PortfolioEnv.bin


KeyboardInterrupt: 

# History

In [17]:
# plot rewards
plt.figure()
df_online, df = load_stats_ddpg(agent)
sns.regplot(x="step", y="rewards", data=df_online, order=1)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x1986f531e80>

In [18]:
# monthly growth
portfolio_return = (1+df_online.rewards[-100:].mean())

returns = task.unwrapped.src.data[0,:,:1]
market_return = (1+returns).mean()
market_return, portfolio_return

(1.0091946, 0.9687866225294243)

# Test

In [19]:
def test_algo(env, algo, seed=0):
    """
    Runs and algo from https://github.com/Marigold/universal-portfolios on env
    
    https://github.com/Marigold/universal-portfolios/commit/e8970a82427522ef11b1c3cbf681e18b5fe8169c
    """
    env.seed(0)
    np.random.seed(0)

    state = env.reset()
    for i in range(env.unwrapped.sim.steps):
        
        history= pd.DataFrame(state[0,:,:], columns=env.unwrapped.src.asset_names)
        # MPT wants a cash column, and it should be first
        history['CASH']=1
        history=history[['CASH'] + env.unwrapped.src.asset_names]
#         cols = list(history.columns)
#         cols[0]='CASH'
#         history.columns = cols
        
        x=history.iloc[-1]
        
        last_b = env.unwrapped.sim.w0#[1:]

        algo.init_step(history)
        # some don't want history
        try:
            action = algo.step(x, last_b, history)
        except TypeError:
            action = algo.step(x, last_b)
        
        # might by dataframe
        action = getattr(action, 'value', action)
        
        # For upt
        if isinstance(action, np.matrixlib.defmatrix.matrix):
            action = np.array(action.tolist()).T[0]
            
        

        state, reward, done, info = env.step(action)

        if done:
            break   
    df = pd.DataFrame(env.unwrapped.infos)
    df.index = pd.to_datetime(df['date']*1e9)
    return df['portfolio_value'], df

In [21]:
df

Unnamed: 0_level_0,reward,log_return,portfolio_value,market_return,rate_of_return,weights_mean,weights_std,cost,weight_BTCBTC,price_BTCBTC,weight_DASHBTC,price_DASHBTC,weight_LTCBTC,price_LTCBTC,weight_XMRBTC,price_XMRBTC,market_value,date,steps
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-03-20 05:30:00,0.000000e+00,0.000000e+00,1.000000,0.995411,0.000000e+00,0.25,0.433013,0.000000e+00,1.0,1.0,0.000000e+00,0.989440,0.000000e+00,1.001146,0.000000e+00,0.991057,0.995411,1.489988e+09,1
2017-03-20 06:00:00,-7.871093e-10,-1.007500e-07,1.000000,1.018885,-1.007500e-07,0.25,0.433012,7.499998e-10,1.0,1.0,9.999997e-08,1.027814,9.999997e-08,1.011817,9.999997e-08,1.035908,1.014209,1.489990e+09,2
2017-03-20 06:30:00,-1.422230e-09,-1.820454e-07,1.000000,0.987012,-1.820454e-07,0.25,0.433012,1.298798e-11,1.0,1.0,9.999997e-08,0.977658,9.999997e-08,0.991942,9.999997e-08,0.978448,1.001036,1.489991e+09,3
2017-03-20 07:00:00,-1.381334e-09,-1.768108e-07,1.000000,1.000089,-1.768108e-07,0.25,0.433012,8.963981e-12,1.0,1.0,9.999997e-08,0.984766,9.999997e-08,0.997483,9.999997e-08,1.018105,1.001125,1.489993e+09,4
2017-03-20 07:30:00,-1.364748e-09,-1.746878e-07,0.999999,1.005392,-1.746878e-07,0.25,0.433012,7.486137e-12,1.0,1.0,9.999997e-08,1.015472,9.999997e-08,0.995812,9.999997e-08,1.010285,1.006523,1.489995e+09,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-03-22 19:30:00,-1.388046e-09,-1.776699e-07,0.999978,0.997924,-1.776699e-07,0.25,0.433012,2.130200e-12,1.0,1.0,9.999997e-08,1.000105,9.999997e-08,1.000003,9.999997e-08,0.991587,1.031679,1.490211e+09,125
2017-03-22 20:00:00,-1.377983e-09,-1.763819e-07,0.999978,1.001146,-1.763819e-07,0.25,0.433012,3.213684e-12,1.0,1.0,9.999997e-08,1.008720,9.999997e-08,0.998919,9.999997e-08,0.996946,1.032862,1.490213e+09,126
2017-03-22 20:30:00,-1.391110e-09,-1.780621e-07,0.999978,0.996947,-1.780621e-07,0.25,0.433012,3.756637e-12,1.0,1.0,9.999997e-08,0.991355,9.999997e-08,1.001408,9.999997e-08,0.995026,1.029709,1.490215e+09,127
2017-03-22 21:00:00,-1.342063e-09,-1.717841e-07,0.999978,1.012665,-1.717841e-07,0.25,0.433012,1.266464e-11,1.0,1.0,9.999997e-08,1.019874,9.999997e-08,1.003754,9.999997e-08,1.027031,1.042750,1.490216e+09,128


In [20]:
# use test env
df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
test_steps=5000
env_test = task_fn_test()
agent.task = env_test
agent.config.max_episode_length = test_steps
agent.task.reset()
np.random.seed(0)

# run in deterministic mode, no training, no exploration
agent.episode(True)
agent.task.render('notebook')
agent.task.render('notebook', True)

df = pd.DataFrame(agent.task.unwrapped.infos)
df.index = pd.to_datetime(df['date']*1e9)

  data = df.as_matrix().reshape(


<IPython.core.display.Javascript object>


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [27]:
from UniversalPortfolios.universal import algos
env = task.unwrapped
price_cols = [col for col in df.columns if col.startswith('price')]
for col in price_cols:
    df[col]=df[col].cumprod()

df = df[price_cols + ['portfolio_value']]
    
algo_dict=dict(
    # Pick the same is in https://arxiv.org/pdf/1706.10059.pdf
    # Benchmarks
#     UCRP=algos.UP(),
    
    # Follow the winner
    BestSoFar=algos.BestSoFar(cov_window=env_test.unwrapped.src.window_length-1),
#     UniversalPortfolio=algos.UP(eval_points=1000),
    ONS=algos.ONS(),
    
    # Follow the loser
#     OnlineMovingAverageReversion=algos.OLMAR(window=env.src.window_length-1, eps=10), 
    RMR=algos.RMR(window=env_test.unwrapped.src.window_length-1, eps=10),
#     PassiveAggressiveMeanReversion=algos.PAMR(),
    
    # Pattern matching
    #     CorrelationDrivenNonparametricLearning=algos.CORN(window=30),
)
for name, algo in algo_dict.items():
    print(name)
    perf, _ = test_algo(env_test, algo)
    perf.index=df.index
    df[name]=perf

# put portfolio value at end so we plot it on top and can therefore see it
cols = list(df.columns.drop('portfolio_value'))+['portfolio_value']
df=df[cols]


df.plot(alpha=0.5)

BestSoFar
ONS
RMR


<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2367974a9a0>