In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# numeric
import quandl
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm

# logging
import logging
logger = log = logging.getLogger(__name__)
# log.setLevel(logging.INFO)
logging.basicConfig()
log.info('%s logger started.', __name__)

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding

In [3]:
import os
os.sys.path.append(os.path.abspath('.'))
%reload_ext autoreload
%autoreload 2

In [4]:
from src.environments.portfolio import PortfolioEnv

In [5]:
window_length = 50

In [6]:
from rl.memory import  SequentialMemory, Memory
from collections import deque

class EnvWrapper(PortfolioEnv):
    """Wraps env to normalise and reshape action."""
    def __init__(self, window_length=50, *args, **kwargs):
        self.memory = SequentialMemory(limit=window_length*2, window_length=window_length)
        super().__init__(*args, **kwargs)
        
    def step(self, action):
        # also it puts it in a list
        if isinstance(action, list):
            action = action[0]
        
        # we have to normalise for some reason softmax wont work
        if isinstance(action, dict):
            action = np.abs(list(action.values()))
            action /= action.sum()        
        
        return super().step(action) 

class MemoryWrapper(EnvWrapper):
    """Provides memory to env observations."""
    def __init__(self, window_length=50, *args, **kwargs):
        self.memory = SequentialMemory(limit=window_length*2, window_length=window_length)
        super().__init__(*args, **kwargs)
    def step(self, action):
        obs, reward, done, info = super().step(action)     
        obs = np.array(self.memory.get_recent_state(obs))
        return obs, reward, done, info
    
    def reset(self):
        self.memory.recent_terminals = deque(maxlen=window_length)
        return super().reset()



In [7]:
df_train = pd.read_hdf('./data/poliniex_30m.hf',key='train')
env = EnvWrapper(
    window_length=window_length,
    df=df_train,
    steps=30, 
    scale=True, 
    augument=0.0005    
)
env.seed = 0   

df_test = pd.read_hdf('./data/poliniex_30m.hf',key='test')
env_test = EnvWrapper(
    window_length=window_length,
    df=df_test,
    steps=30, 
    scale=True, 
    augument=0.00)
env_test.seed = 0  

from tensorforce.environments.openai_gym import OpenAIGym
environment = OpenAIGym('CartPole-v0')
environment.gym = env

environment_test = OpenAIGym('CartPole-v0')
environment_test.gym = env_test

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-16 12:56:32,157] Making new env: CartPole-v0
INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-16 12:56:32,163] Making new env: CartPole-v0


In [8]:
# check shapes
obs1, reward, done, info=env.step(np.random.random(env.action_space.shape))
print(reward, done, info)
obs2 = env.reset()
print(obs1.shape,obs2.shape)

-3.87413534225e-05 False {'reward': -3.874135342249592e-05, 'log_return': -0.0011622406026748776, 'portfolio_value': 0.99883843442119424, 'returns': 1.0042273348030557, 'rate_of_return': -0.0011615655788057566, 'cost': 0.0011614656949622685, 'steps': 2}
(5, 50, 3) (5, 50, 3)


In [9]:
from tensorforce import Configuration
from tensorforce.agents import VPGAgent
from tensorforce.core.networks import layered_network_builder

In [12]:
# https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py#L90
import tensorflow as tf
from math import sqrt
from tensorforce import util
from tensorforce import TensorForceError

def linear(x, size, bias=True, l2_regularization=0.0):
    if util.rank(x) != 2:
        raise TensorForceError('Invalid input rank for linear layer.')
    with tf.variable_scope('linear'):
        weights = tf.Variable(initial_value=tf.random_normal(shape=(x.get_shape()[1].value, size), stddev=sqrt(2.0 / (x.get_shape()[1].value + size))))
        if l2_regularization > 0.0:
            tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=weights))
        x = tf.matmul(a=x, b=weights)
        if bias:
            bias = tf.Variable(initial_value=tf.zeros(shape=(size,)))
            if l2_regularization > 0.0:
                tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=bias))
            x = tf.nn.bias_add(value=x, bias=bias)
    return x

def nonlinearity(x, name='relu'):
    with tf.variable_scope('nonlinearity'):
        if name == 'elu':
            x = tf.nn.elu(features=x)
        elif name == 'relu':
            x = tf.nn.relu(features=x)
        elif name == 'selu':
            # https://arxiv.org/pdf/1706.02515.pdf
            alpha = 1.6732632423543772848170429916717
            scale = 1.0507009873554804934193349852946
            negative = alpha * tf.nn.elu(features=x)
            x = scale * tf.where(condition=(x >= 0.0), x=x, y=negative)
        elif name == 'sigmoid':
            x = tf.sigmoid(x=x)
        elif name == 'softmax':
            x = tf.nn.softmax(logits=x)
        elif name == 'tanh':
            x = tf.nn.tanh(x=x)
        else:
            raise TensorForceError('Invalid nonlinearity.')
    return x

def dense(x, size, bias=True, activation='relu', l2_regularization=0.0):
    if util.rank(x) != 2:
        raise TensorForceError('Invalid input rank for dense layer.')
    with tf.variable_scope('dense'):
        x = linear(x=x, size=size, bias=bias, l2_regularization=l2_regularization)
        x = nonlinearity(x=x, name=activation)
    return x

def flatten(x):
    with tf.variable_scope('flatten'):
        x = tf.reshape(tensor=x, shape=(-1, util.prod(x.get_shape().as_list()[1:])))
    return x

def conv2d(x, size, window=(3,3), stride=(1,1), bias=False, activation='relu', l2_regularization=0.0, padding='SAME'):
    if util.rank(x) != 4:
        raise TensorForceError('Invalid input rank for conv2d layer.')
    with tf.variable_scope('conv2d'):
        filters = tf.Variable(initial_value=tf.random_normal(shape=(window[0], window[1], x.get_shape()[3].value, size), stddev=sqrt(2.0 / size)))
        if l2_regularization > 0.0:
            tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=filters))
        x = tf.nn.conv2d(input=x, filter=filters, strides=(1, stride[0], stride[1], 1), padding=padding)
        if bias:
            bias = tf.Variable(initial_value=tf.zeros(shape=(size,)))
            if l2_regularization > 0.0:
                tf.losses.add_loss(l2_regularization * tf.nn.l2_loss(t=bias))
            x = tf.nn.bias_add(value=x, bias=bias)
        x = nonlinearity(x=x, name=activation)
    return x

def network_builder(inputs):
    if len(inputs) != 1:
        raise TensorForceError('Layered network must have only one input.')
    x = next(iter(inputs.values()))
    
    x = conv2d(x=x, size=2, window=(1,3), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = conv2d(x=x, size=20, window=(1,window_length-2), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = conv2d(x=x, size=1, window=(1,1), bias=True, activation='relu', l2_regularization=1e-8, padding='VALID')
    x = flatten(x)
    x = dense(x, size=env.action_space.shape[0],activation='relu', l2_regularization=1e-8)
    x = nonlinearity(x,name='softmax')
    
    return x
network=network_builder

In [13]:
config = Configuration(   
    # Each agent requires the following ``Configuration`` parameters:
    network=network,
    states=dict(shape=(window_length,)+tuple(env.observation_space.shape), type='float'),
    actions={'action' + str(n): dict(continuous=True) for n in range(env.action_space.shape[0])},
    preprocessing = None,# dict or list containing state preprocessing configuration.
    exploration = dict(
        type='EpsilonDecay',
        kwargs=dict(epsilon=1, epsilon_final=0.01, epsilon_timesteps=1e4)
    ),

    # The `BatchAgent` class additionally requires the following parameters:
    batch_size = 50,# integer of the batch size.

    # A Policy Gradient Model expects the following additional configuration parameters:
    sample_actions= True,# boolean of whether to sample actions.
#     baseline='mlp' ,# string indicating the baseline value function (currently 'linear' or 'mlp').
#     baseline_args=dict(size=100, repeat_update=100) ,# list of arguments for the baseline value function.
    override_line_search=False,
    
#     baseline_kwargs= ,# dict of keyword arguments for the baseline value function.
    generalized_advantage_estimation= True ,# boolean indicating whether to use GAE.
    gae_lambda= 0.97,# float of the Generalized Advantage Estimation lambda.
    normalize_advantage= False,# boolean indicating whether to normalize the advantage or not.
    cg_iterations=20,
    max_kl_divergence=0.005,
    cg_damping=0.001,
    line_search_steps=20,
    loglevel="info",
)

# Create a Trust Region Policy Optimization agent
agent = VPGAgent(config=config)

# for some reason these are not set?
# agent.next_internal = agent.current_internal = []

TensorForceError: Invalid input rank for conv2d layer.

In [None]:
from tensorforce.execution import Runner
runner = Runner(agent=agent, environment=environment)

In [None]:
# Callback function printing episode statistics
class EpisodeFinished(object):
    """Logger callback for tensorforce runner."""
    
    def __init__(self, log_intv):
        self.log_intv = log_intv
        self.portfolio_values = [] 
    
    def __call__(self, r):
        if len(r.environment.gym.sim.infos):
            self.portfolio_values.append( r.environment.gym.sim.infos[-1]['portfolio_value'] )
        if r.episode % self.log_intv == 0:
#             df = pd.DataFrame(r.environment.gym.infos)
            print(
                "Finished episode {ep} after {ts} timesteps (reward: {reward: 2.4f} [{rewards_min: 2.4f}, {rewards_max: 2.4f}]) portfolio_value: {portfolio_value: 2.4f} [{portfolio_value_min: 2.4f}, {portfolio_value_max: 2.4f}]".
                format(
                    ep=r.episode,
                    ts=r.timestep,
                    reward=np.mean(r.episode_rewards[-self.log_intv:]),
                    rewards_min=np.min(r.episode_rewards[-self.log_intv:]),
                    rewards_max=np.max(r.episode_rewards[-self.log_intv:]),
                    portfolio_value=np.mean(self.portfolio_values[-self.log_intv:]),
                    portfolio_value_min=np.min(self.portfolio_values[-self.log_intv:]),
                    portfolio_value_max=np.max(self.portfolio_values[-self.log_intv:])
                )
            )
        return True

In [None]:
runner.run(
    episodes=2e6, max_timesteps=200, episode_finished=EpisodeFinished(1000))

In [None]:
# TODO turn off learning during test
runner_test = Runner(agent=agent, environment=environment_test)
runner_test.run(
    episodes=100, max_timesteps=128, episode_finished=EpisodeFinished(10))