In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# numeric
import quandl
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm

# logging
import logging
# Install the default root handler, then grab this notebook's logger.
logging.basicConfig()
log = logging.getLogger(__name__)
logger = log  # `logger` and `log` are two names for the same object
# log.setLevel(logging.INFO)
log.info('%s logger started.', __name__)

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding

In [3]:
import os
import sys

# Make the current working directory importable so that the `src.` package
# imports used by later cells resolve.
# `sys` is imported directly rather than reached through `os.sys`, which is
# an undocumented re-export. The membership check keeps the cell idempotent:
# re-running it no longer appends a duplicate entry each time.
project_root = os.path.abspath('.')
if project_root not in sys.path:
    sys.path.append(project_root)
# %reload_ext autoreload
# %autoreload 2

In [4]:
from src.environments.portfolio import PortfolioEnv

In [5]:
class EnvWrapper(PortfolioEnv):
    """PortfolioEnv variant that accepts the action formats the agent emits.

    ``step`` unwraps a list-wrapped action and converts a dict of per-action
    values into a non-negative weight vector that sums to one before
    delegating to ``PortfolioEnv.step``. Construction is inherited unchanged.

    NOTE(review): the weight order follows the iteration order of the action
    dict; confirm that this matches the asset order the environment expects.
    """

    def step(self, action):
        """Normalise ``action`` and forward it to the parent ``step``.

        Args:
            action: either the raw action, a list whose first element is the
                action, or a dict mapping action names to numeric values.

        Returns:
            Whatever ``PortfolioEnv.step`` returns for the processed action.
        """
        # The action may arrive wrapped in a list; only the first item is used.
        if isinstance(action, list):
            action = action[0]

        # Dict actions are normalised here because a softmax output layer
        # could not be used on the network side.
        if isinstance(action, dict):
            # Explicit float dtype: in-place true division on an integer
            # array raises a casting error.
            weights = np.abs(np.asarray(list(action.values()), dtype=float))
            total = weights.sum()
            if np.isfinite(total) and total > 0:
                weights = weights / total
            elif weights.size:
                # All-zero (or non-finite) output would otherwise become NaN
                # weights; fall back to an equal allocation instead.
                weights = np.full(weights.size, 1.0 / weights.size)
            action = weights

        return super().step(action)

In [6]:
# Training environment, built from the 'train' table of the HDF store.
df_train = pd.read_hdf('./data/poliniex_30m.hf', key='train')
env = EnvWrapper(
    df=df_train,
    steps=128,
    scale=True,
    augument=0.0005,  # presumably the amount of training-data augmentation; confirm in PortfolioEnv
)
# `seed` is a method on gym environments. The previous `env.seed = 0`
# replaced that method with the integer 0 and seeded nothing.
# NOTE(review): assumes PortfolioEnv derives from gym.Env; confirm.
env.seed(0)

# Evaluation environment: same settings on the 'test' table, with augument=0.
df_test = pd.read_hdf('./data/poliniex_30m.hf', key='test')
env_test = EnvWrapper(
    df=df_test,
    steps=128,
    scale=True,
    augument=0.00,
)
env_test.seed(0)

In [7]:
from tensorforce.environments.openai_gym import OpenAIGym
# Build tensorforce's OpenAIGym adapter around the registered CartPole-v0 env,
# then swap the wrapped gym env for the portfolio `env` created above.
# NOTE(review): presumably a workaround because the adapter is constructed from
# a registered gym id rather than an env instance; confirm against tensorforce.
environment = OpenAIGym('CartPole-v0')
environment.gym = env

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-15 16:58:37,983] Making new env: CartPole-v0


In [8]:
from tensorforce import Configuration
from tensorforce.agents import TRPOAgent
from tensorforce.core.networks import layered_network_builder

In [9]:
# Layer specs for the network builder: a flatten layer followed by two
# identical ReLU dense layers. Available layer types are listed at
# https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py
layers = [dict(type='flatten')]
layers += [
    dict(type='dense', size=32, l2_regularization=1e-8, activation='relu')
    for _hidden in range(2)
]
# Per the original note: the agent adds its own output head on top of this
# stack, so a softmax layer cannot be appended here.
network = layered_network_builder(layers_config=layers)


In [10]:
# Display the observation shape and the number of action dimensions; the same
# two expressions are used for the agent's `states` and `actions` settings.
tuple(env.observation_space.shape), env.action_space.shape[0]

((6, 8), 6)

In [11]:
import tensorforce

In [13]:

# NOTE(review): the recorded output of this cell is
# `ValueError: None values not supported.` The cause is not visible from this
# notebook. TODO: confirm which setting tensorforce resolves to None.

# All settings are gathered in one mapping and then passed to
# ``Configuration`` as keyword arguments.
trpo_settings = dict(
    # --- required by every agent ---
    network=network,
    states=dict(shape=tuple(env.observation_space.shape), type='float'),
    # one continuous action per entry of the env's action space
    actions={
        'action' + str(n): dict(continuous=True)
        for n in range(env.action_space.shape[0])
    },
    exploration=dict(
        type='EpsilonDecay',
        kwargs=dict(epsilon=1, epsilon_final=0.01, epsilon_timesteps=1e4),
    ),
    # preprocessing=None,

    # --- additionally required by `BatchAgent` ---
    batch_size=32,  # integer batch size

    # --- policy-gradient model settings ---
    sample_actions=True,  # whether to sample actions
    # baseline='linear',  # baseline value function (currently 'linear' or 'mlp')
    # baseline_args= ,    # list of arguments for the baseline value function
    # baseline_kwargs= ,  # dict of keyword arguments for the baseline value function
    generalized_advantage_estimation=True,  # whether to use GAE
    gae_lambda=0.97,  # GAE lambda
    normalize_advantage=True,  # whether to normalize the advantage

    # --- TRPO-specific settings ---
    # https://github.com/reinforceio/tensorforce/blob/master/examples/configs/trpo_cartpole.json
    # learning_rate=0.0002,  # learning rate (alpha)
    optimizer='adam',  # name of the optimizer to use
    cg_damping=0.001,  # damping factor for the conjugate gradient method
    line_search_steps=20,  # number of line-search steps
    max_kl_divergence=0.001,  # maximum KL divergence allowed per update
    cg_iterations=20,  # number of conjugate gradient iterations
)
config = Configuration(**trpo_settings)

# Create a Trust Region Policy Optimization agent
agent = TRPOAgent(config=config)

# for some reason these are not set?
# agent.next_internal = agent.current_internal = []

ValueError: None values not supported.

In [None]:
# TODO: why does a softmax output layer not work here, and how can the model be inspected?
# agent.model

In [None]:

# Short manual rollout: act, step the environment, and feed the outcome back
# to the agent, printing the raw action values at each step.
state = environment.reset()

for step_idx in range(10):
    # Get prediction from agent, execute
    action = agent.act(state=state)
    state, reward, done = environment.execute(action)

    # Add experience; the agent updates its model according to batch size.
    # The real terminal flag is passed on. It used to be hard-coded to False,
    # so the agent was never told when an episode ended.
    agent.observe(reward=reward, terminal=done)

    weights = np.array(list(action.values()))
    print(weights, weights.sum(), weights.min(), weights.max())

    # Start a fresh episode instead of stepping a finished environment.
    if done:
        state = environment.reset()

In [None]:
from tensorforce.execution import Runner
# Pair the TRPO agent with the wrapped environment for the training run below.
runner = Runner(agent=agent, environment=environment)

In [None]:
# Callback function printing episode statistics
def episode_finished(r, log_intv=200):
    """Runner callback: print a progress line every ``log_intv`` episodes.

    Args:
        r: the runner; ``episode``, ``timestep`` and ``episode_rewards`` are
            read from it.
        log_intv: number of episodes between progress lines. It is also the
            size of the window the reported reward is averaged over.

    Returns:
        Always ``True``.
    """
    if r.episode % log_intv == 0:
        # Reads the global `env`, not the runner's environment.
        df = pd.DataFrame(env.sim.infos)
        # Average over the most recent window. The previous
        # `episode_rewards[-log_intv]` picked the single reward from
        # `log_intv` episodes ago and raised IndexError when fewer rewards
        # had been recorded.
        recent = r.episode_rewards[-log_intv:]
        mean_reward = np.mean(recent) if len(recent) else float('nan')
        message = "Finished episode {ep} after {ts} timesteps (reward: {reward: 2.4f}) portfolio_value: {portfolio_value: 2.4f}".format(
            ep=r.episode,
            ts=r.timestep,
            reward=mean_reward,
            portfolio_value=df.portfolio_value[-1:].mean(),
        )
        print(message)
    return True

In [None]:
# Train for 90000 episodes, capped at 200 timesteps each. `episode_finished`
# is invoked as the per-episode callback and prints periodic progress.
# NOTE(review): the env was built with steps=128, so the 200-timestep cap is
# presumably never the binding limit; confirm.
runner.run(
    episodes=90000, max_timesteps=200, episode_finished=episode_finished)