Uses tensorforce tensorforce-0.2.0

In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# numeric
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import glob
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm
import datetime

# logging
import logging
logger = log = logging.getLogger(__name__)
# log.setLevel(logging.INFO)
logging.basicConfig()
log.info('%s logger started.', __name__)

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding

In [3]:
import os
os.sys.path.append(os.path.abspath('.'))
%reload_ext autoreload
%autoreload 2

In [4]:
# params
window_length = 50
import datetime
ts = datetime.datetime.utcnow().strftime('%Y%m%d_%H-%M-%S')
save_path = './outputs/tensorforce-PPO-prioritised/tensorforce-PPO-prioritised_%s.model' % ts
save_path = './outputs/tensorforce-PPO-prioritised/tensorforce-PPO-prioritised_20171015_02-50-30.model'
save_path


'./outputs/tensorforce-PPO-prioritised/tensorforce-PPO-prioritised_20171015_02-50-30.model'

In [5]:
log_dir = os.path.join('logs', os.path.splitext(os.path.basename(save_path))[0])
try:
    os.makedirs(log_dir)
except OSError:
    pass
log_dir

'logs/tensorforce-PPO-prioritised_20171015_02-50-30'

# Enviroment

In [6]:
from src.environments.portfolio import PortfolioEnv

In [7]:
from rl.memory import  SequentialMemory, Memory
from collections import deque

class EnvWrapper(PortfolioEnv):
    """Wraps env to normalise and reshape action."""
    def __init__(self, window_length=50, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def step(self, action):
        # also it puts it in a list
        if isinstance(action, list):
            action = action[0]
        
        # we have to normalise for some reason softmax wont work
        if isinstance(action, dict):
            action = np.abs(list(action.values()))
            action /= action.sum()        
        
        return super().step(action) 

In [8]:
df_train = pd.read_hdf('./data/poloniex_30m.hf',key='train')
env = EnvWrapper(
    df=df_train,
    steps=300, 
    scale=True, 
    augment=0.000,
    trading_cost=0, # let just overfit first,
    window_length = window_length,
    
)
env.seed = 0   

df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
env_test = EnvWrapper(
    df=df_test,
    steps=300, 
    scale=True, 
    augment=0.00,
    trading_cost=0, # let just overfit first
    window_length=window_length,
)
env_test.seed = 0  

from tensorforce.contrib.openai_gym import OpenAIGym
environment = OpenAIGym('CartPole-v0')
environment.gym = env

environment_test = OpenAIGym('CartPole-v0')
environment_test.gym = env_test

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-10-15 10:50:30,622] Making new env: CartPole-v0
INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-10-15 10:50:30,629] Making new env: CartPole-v0


In [9]:
# check shapes
obs1, reward, done, info=env.step(np.random.random(env.action_space.shape))
print(reward, done, info)
obs2 = env.reset()
print(obs1.shape,obs2.shape)

0.0 False {'reward': 0.0, 'log_return': 0.0, 'portfolio_value': 1.0, 'return': 0.9994203267253029, 'rate_of_return': 0.0, 'weights_mean': 0.16666666666666666, 'weights_std': 0.092620963292867634, 'cost': 0.0, 'market_value': 0.99991966957965184, 'date': 1463925600.0, 'steps': 2}
(5, 50, 3) (5, 50, 3)


# Model

Derived from  https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py#L90

In [10]:
from tensorforce import Configuration
from tensorforce.agents import PPOAgent
from tensorforce.core.networks import layered_network_builder

In [11]:
# layer helpers from:
# https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py#L90
import tensorflow as tf
from math import sqrt
from tensorforce import util
from tensorforce import TensorForceError

def linear(x, size, bias=True, l2_regularization=0.0):
    if util.rank(x) != 2:
        raise TensorForceError('Invalid input rank for linear layer.')
    with tf.variable_scope('linear'):
        weights = tf.Variable(initial_value=tf.random_normal(shape=(x.get_shape()[1].value, size), stddev=sqrt(2.0 / (x.get_shape()[1].value + size))))
        if l2_regularization > 0.0:
            tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=weights))
        x = tf.matmul(a=x, b=weights)
        if bias:
            bias = tf.Variable(initial_value=tf.zeros(shape=(size,)))
            if l2_regularization > 0.0:
                tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=bias))
            x = tf.nn.bias_add(value=x, bias=bias)
    return x

def nonlinearity(x, name='relu'):
    with tf.variable_scope('nonlinearity'):
        if name == 'elu':
            x = tf.nn.elu(features=x)
        elif name == 'relu':
            x = tf.nn.relu(features=x)
        elif name == 'selu':
            # https://arxiv.org/pdf/1706.02515.pdf
            alpha = 1.6732632423543772848170429916717
            scale = 1.0507009873554804934193349852946
            negative = alpha * tf.nn.elu(features=x)
            x = scale * tf.where(condition=(x >= 0.0), x=x, y=negative)
        elif name == 'sigmoid':
            x = tf.sigmoid(x=x)
        elif name == 'softmax':
            x = tf.nn.softmax(logits=x)
        elif name == 'tanh':
            x = tf.nn.tanh(x=x)
        else:
            raise TensorForceError('Invalid nonlinearity.')
    return x

def dense(x, size, bias=True, activation='relu', l2_regularization=0.0):
    if util.rank(x) != 2:
        raise TensorForceError('Invalid input rank for dense layer.')
    with tf.variable_scope('dense'):
        x = linear(x=x, size=size, bias=bias, l2_regularization=l2_regularization)
        x = nonlinearity(x=x, name=activation)
    return x

def flatten(x):
    with tf.variable_scope('flatten'):
        x = tf.reshape(tensor=x, shape=(-1, util.prod(x.get_shape().as_list()[1:])))
    return x

def conv2d(x, size, window=(3,3), stride=(1,1), bias=False, activation='relu', l2_regularization=0.0, padding='SAME'):
    if util.rank(x) != 4:
        raise TensorForceError('Invalid input rank for conv2d layer.')
    with tf.variable_scope('conv2d'):
        filters = tf.Variable(initial_value=tf.random_normal(shape=(window[0], window[1], x.get_shape()[3].value, size), stddev=sqrt(2.0 / size)))
        if l2_regularization > 0.0:
            tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=filters))
        x = tf.nn.conv2d(input=x, filter=filters, strides=(1, stride[0], stride[1], 1), padding=padding)
        if bias:
            bias = tf.Variable(initial_value=tf.zeros(shape=(size,)))
            if l2_regularization > 0.0:
                tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=bias))
            x = tf.nn.bias_add(value=x, bias=bias)
        x = nonlinearity(x=x, name=activation)
    return x


In [12]:
# build a network for a given input
def network_builder(inputs, summary_level):
    if len(inputs) != 1:
        raise TensorForceError('Layered network must have only one input.')
    x = next(iter(inputs.values()))
    
    x = conv2d(x=x, size=2, window=(1,3), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = conv2d(x=x, size=20, window=(1,window_length-2), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = conv2d(x=x, size=1, window=(1,1), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = flatten(x)
    x = nonlinearity(x,name='softmax')
    
    return x
network=network_builder

# Agent

In [13]:
exploration=dict(
    type="epsilon_anneal",
    epsilon=1,
    epsilon_final= 0.005,
    epsilon_timesteps= 1e5,
    start_after=0,
)
{'action' + str(n): exploration for n in range(env.action_space.shape[0])},

({'action0': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'},
  'action1': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'},
  'action2': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'},
  'action3': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'},
  'action4': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'},
  'action5': {'epsilon': 1,
   'epsilon_final': 0.005,
   'epsilon_timesteps': 100000.0,
   'start_after': 0,
   'type': 'epsilon_anneal'}},)

In [14]:
batch_size=256
exploration=dict(
    type="epsilon_anneal",
    epsilon=1,
    epsilon_final= 0.005,
    epsilon_timesteps= 1e5,
    start_after=0,
)
config = Configuration(   
    # Each agent requires the following ``Configuration`` parameters:
    # https://github.com/reinforceio/tensorforce/blob/master/tensorforce/agents/agent.py#L32
    network=network,
    states=dict(shape=tuple(env.observation_space.shape), type='float'),
    actions={'action' + str(n): dict(continuous=True) for n in range(env.action_space.shape[0])},
    preprocessing = None,# dict or list containing state preprocessing configuration.
    exploration = {'action' + str(n): exploration for n in range(env.action_space.shape[0])}, # dict containing action exploration configuration.
      
    
    # The `MemoryAgent` class additionally requires the following parameters:
    first_update = batch_size*2, # integer indicating the number of steps to pass before the first update.
    memory_capacity = 300000, # integer of maximum experiences to store. (takes 2s to sample with 100k)
    memory = 'prioritized_replay', # string indicating memory type ('replay' or 'prioritized_replay').
    update_frequency = int(batch_size/2), # integer indicating the number of steps between model updates.
    repeat_update = 2, # integer indicating how often to repeat the model update.

    # Each model requires the following configuration parameters:
    # https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/model.py#L33
    discount = 0.97, # float of discount factor (gamma).
    learning_rate = 1e-3, # float of learning rate (alpha). (3e-4 in paper 1e-3 (atari) and 3e-4 in baselines)
    optimizer = 'adam', # string of optimizer to use (e.g. 'adam' in paper).
    device = None, # string of tensorflow device name.
#     tf_summary = log_dir, # string directory to write tensorflow summaries. Default None
#     tf_summary_level = 1, # int indicating which tensorflow summaries to create.
    tf_summary_interval = 1000, # int number of calls to get_action until writing tensorflow summaries on update.
    log_level = 'info', # string containing log level (e.g. 'info').
    distributed = False, # boolean indicating whether to use distributed tensorflow.
    global_model = False, # global model.
    session = None, # session to use. 

    # A Policy Gradient Model expects the following additional configuration parameters:
    # https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/policy_gradient_model.py#L35
    # I edited my tensorflow install to have a flatten layer to make this work (my branch is in requirements.txt)
    baseline=dict(
        type="mlp",
        sizes=[128, 128],
        epochs=1,
        update_batch_size=128,
        learning_rate=0.01
    ), # string indicating the baseline value function (currently 'linear' or 'mlp').
    gae_rewards= True, # boolean indicating whether to use GAE.
    gae_lambda= 0.97, # float of the Generalized Advantage Estimation lambda.
    normalize_rewards= False,# boolean indicating whether to normalize the advantage or not.
    
    # PPO Params 
    # https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/ppo_model.py
    entropy_penalty=0.01, # 0 and 0.01 in baselines
    loss_clipping=0.1,  # Trust region clipping
    epochs=4,  # Number of training epochs for SGD, data is repeated this much 4 (atari),10 in baselines, 10 in paper
    optimizer_batch_size=32,  # Batch size for optimiser, should be small (e.g. 64 in paper)
    random_sampling=True  # Sampling strategy for minibatch replay memory
)

# Create a Trust Region Policy Optimization agent
agent = PPOAgent(config=config)
agent

[2017-10-15 10:50:57,553] Configuration values not accessed: first_update, memory_capacity, memory, update_frequency, repeat_update


<tensorforce.agents.ppo_agent.PPOAgent at 0x7fc5592f0da0>

# Train

## Callbacks

In [15]:
from src.callbacks.tensorforce import EpisodeFinishedTQDM
from src.util import MDD, sharpe

## Train

In [16]:
from tensorforce.execution import Runner
runner = Runner(agent=agent, environment=environment, save_path=save_path, save_episodes=1000)

In [17]:
# Check my PR is included, https://github.com/wassname/tensorforce/tree/merged_6b
import tensorforce.core.memories
assert isinstance(runner.agent.memory,tensorforce.core.memories.PrioritizedReplay)
assert isinstance(runner.agent, tensorforce.agents.MemoryAgent)

In [None]:
# resume
saves=glob.glob(save_path+'-*')
if len(saves)>0:
    # load saved
    last_save = os.path.splitext(saves[0])[0]
    runner.agent.load_model(last_save)
    print('loaded', last_save)

In [None]:
episodes = int(6e6 / 30)
runner.run(
    episodes=episodes,
    max_timesteps=200,
    episode_finished=EpisodeFinishedTQDM(
        log_intv=100, 
        episodes=episodes,
        log_dir=log_dir,
        session=runner.agent.model.session, 
    )
)

TensorBoardLogger started. Run `tensorboard --logdir=/media/isisilon/Data/My_Documents/Documents/eclipse-workspace/rl_keras_finance/portfolio-rl-jiang_2017/logs/tensorforce-PPO-prioritised_20171015_02-50-30` to visualize


In [None]:
# save
agent.save_model(save_path)
save_path

# Test

In [None]:
# one big test
df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
steps=2400#len(df_test)-window_length-2
env_test = EnvWrapper(
    df=df_test,
    steps=steps, 
    scale=True, 
    augment=0.00,
    trading_cost=0, # let just overfit first
    window_length=window_length,
)
env_test.seed = 0  
environment_test = OpenAIGym('CartPole-v0')
environment_test.gym = env_test

agent.load_model(save_path)
runner_test = Runner(agent=agent, environment=environment_test)
runner_test.run(
episodes=1, max_timesteps=steps, episode_finished=EpisodeFinished(10))

df = pd.DataFrame(env_test.infos)
df.index=df['index']

s=sharpe(df.rate_of_return+1)
mdd=MDD(df.rate_of_return+1)
apv=df.portfolio_value.iloc[-1]
print('APV (Accumulated portfolio value): \t{: 2.6f}'.format(apv))
print('SR (Sharpe ratio):                 \t{: 2.6f}'.format( s))
print('MDD (max drawdown):                \t{: 2.6%}'.format( mdd))
print('MMR (mean market returns):         \t{: 2.6f}'.format(df.mean_market_returns.cumprod().iloc[-1]))
print('')

# show one run vs average market performance
plt.title('test MDD={}, Sharpe={}, APV={}'.format(mdd,s,apv))
df.portfolio_value.plot()
df.mean_market_returns.cumprod().plot(label='mean market performance')
plt.legend()

In [None]:


data=[]
for i in range(10):
    agent.load_model(save_path)
    df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
    
    env_test = EnvWrapper(
        df=df_test,
        steps=1800, 
        scale=True, 
        augment=0.00,
        trading_cost=0, # let just overfit first
        window_length=window_length,
    )
    env_test.seed = 0  


    environment_test = OpenAIGym('CartPole-v0')
    environment_test.gym = env_test

    runner_test = Runner(agent=agent, environment=environment_test)
    np.random.seed(i)
    runner_test.run(
    episodes=2, max_timesteps=32, episode_finished=EpisodeFinished(10))
    df = pd.DataFrame(environment_test.gym.infos)
#     df.index=df['index']
    
    s=sharpe(df.rate_of_return+1)
    mdd=MDD(df.rate_of_return+1)
    data.append(dict(sharpe=s,mdd=mdd))
    print('APV (Accumulated portfolio value): \t{: 2.6f}'.format(df.portfolio_value.iloc[-1]))
    print('SR (Sharpe ratio):                 \t{: 2.6f}'.format( s))
    print('MDD (max drawdown):                \t{: 2.6%}'.format( mdd))
    print('MMR (mean market returns):         \t{: 2.6f}'.format(df.mean_market_returns.cumprod().iloc[-1]))
    print('')
    df.portfolio_value.plot(label=str(i))
plt.legend()

In [None]:
data

In [None]:
# one big test over train
# one big test
df_train = pd.read_hdf('./data/poloniex_30m.hf',key='train')
steps=len(df_train)-window_length-2
env = EnvWrapper(
    df=df_train,
    steps=steps, 
    scale=True, 
    augment=0.00,
    trading_cost=0, # let just overfit first
    window_length=window_length,
)
env.seed = 0  
environment = OpenAIGym('CartPole-v0')
environment.gym = env

agent.load_model(save_path)
runner = Runner(agent=agent, environment=environment)
runner.run(
episodes=1, max_timesteps=steps, episode_finished=EpisodeFinished(10))

df = pd.DataFrame(env.infos)
df.index=df['index']

s=sharpe(df.rate_of_return+1)
mdd=MDD(df.rate_of_return+1)
data.append(dict(sharpe=s,mdd=mdd))
print('APV (Accumulated portfolio value): \t{: 2.6f}'.format(df.portfolio_value.iloc[-1]))
print('SR (Sharpe ratio):                 \t{: 2.6f}'.format( s))
print('MDD (max drawdown):                \t{: 2.6%}'.format( mdd))
print('')

# show one run vs average market performance
plt.title('train')
df.portfolio_value.plot()
df.mean_market_returns.cumprod().plot(label='mean market performance')
plt.legend()