In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# numeric
import quandl
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm

# logging
import logging
# Module-level logger; `logger` and `log` are two names bound to the same object.
logger = log = logging.getLogger(__name__)
# log.setLevel(logging.INFO)
# basicConfig() with no arguments installs a default stream handler on the root logger.
logging.basicConfig()
# NOTE(review): with the setLevel call above commented out, this logger presumably
# inherits the root logger's default WARNING level, so this INFO record is likely
# dropped and never displayed - confirm, and re-enable setLevel if it should show.
log.info('%s logger started.', __name__)

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding

In [3]:
import os
import sys

# Make the notebook's working directory importable (project-local packages are
# imported relative to it). Uses the public `sys` module rather than the
# undocumented `os.sys` alias, and is guarded so that re-running this cell does
# not keep appending duplicate entries to sys.path.
_project_root = os.path.abspath('.')
if _project_root not in sys.path:
    sys.path.append(_project_root)
# %reload_ext autoreload
# %autoreload 2

In [4]:
from src.environments.portfolio import PortfolioEnv

In [5]:
class EnvWrapper(PortfolioEnv):
    """PortfolioEnv adapter that normalises agent actions before stepping.

    The agent may hand over its action wrapped in a list and/or as a dict of
    per-action values. This wrapper unwraps the list, turns a dict into an
    array of non-negative weights that sum to one, and forwards the result to
    ``PortfolioEnv.step``. Construction is inherited unchanged from the parent.
    """

    def step(self, action):
        """Normalise `action` and delegate to the parent environment's ``step``.

        Args:
            action: a weight array, a dict mapping action names to values, or a
                list whose first element is one of those.

        Returns:
            Whatever ``PortfolioEnv.step`` returns for the normalised action.
        """
        # The action sometimes arrives wrapped in a list; take its first element.
        if isinstance(action, list):
            action = action[0]

        # The policy output is not a softmax, so normalise it here: take absolute
        # values and rescale so that the weights sum to one.
        # NOTE(review): the weight order follows the dict's iteration order -
        # presumably this matches the environment's asset order; verify.
        if isinstance(action, dict):
            # Force a float array: in-place true division on an integer array raises.
            weights = np.abs(np.asarray(list(action.values()), dtype=float))
            total = weights.sum()
            if weights.size and total == 0:
                # An all-zero action would divide to NaN weights; use equal weights instead.
                action = np.full(weights.shape, 1.0 / weights.size)
            else:
                action = weights / total

        return super().step(action)

In [6]:
# Training environment, built from the 'train' key of the HDF store.
# NOTE(review): `augument` is the keyword spelling this cell has always passed to
# the environment; presumably it controls data augmentation - confirm in PortfolioEnv.
df_train = pd.read_hdf('./data/poliniex_30m.hf', key='train')
env = EnvWrapper(
    df=df_train,
    steps=128,
    scale=True,
    augument=0.0005,
)
# Seed by calling the method. The previous `env.seed = 0` replaced the `seed`
# method with an integer and seeded nothing (assumes PortfolioEnv follows the
# gym.Env API, where `seed` is a method - confirm).
env.seed(0)

# Evaluation environment: the 'test' key of the same store, with `augument` set to zero.
df_test = pd.read_hdf('./data/poliniex_30m.hf', key='test')
env_test = EnvWrapper(
    df=df_test,
    steps=128,
    scale=True,
    augument=0.00,
)
# Trailing semicolon keeps the notebook from echoing the return value of seed().
env_test.seed(0);

In [7]:
from tensorforce.environments.openai_gym import OpenAIGym
# Build a tensorforce OpenAIGym wrapper from a registered gym id, then replace the
# gym instance it holds with the portfolio training environment `env`.
# NOTE(review): presumably a workaround for the wrapper only accepting a gym id
# string; anything the wrapper derived from CartPole at construction time would
# still describe CartPole rather than `env` - verify.
environment = OpenAIGym('CartPole-v0')
environment.gym = env

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-15 23:05:23,255] Making new env: CartPole-v0


In [29]:
# Same wrapper construction as for training, but holding the evaluation
# environment `env_test`: build from a registered gym id, then swap the gym instance.
# NOTE(review): anything the wrapper derived from CartPole at construction time
# would still describe CartPole rather than `env_test` - verify.
environment_test = OpenAIGym('CartPole-v0')
environment_test.gym = env_test

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-16 06:10:10,494] Making new env: CartPole-v0


In [8]:
from tensorforce import Configuration
from tensorforce.agents import VPGAgent
from tensorforce.core.networks import layered_network_builder

In [9]:
# Policy network: flatten the observation, then three identical 32-unit dense
# layers with SELU activation and a small L2 penalty.
# Layer options are listed in:
# https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py
layers = [dict(type='flatten')]
layers.extend(
    dict(type='dense', size=32, l2_regularization=1e-8, activation='selu')
    for _ in range(3)
)
# The agent adds its own output head on top of this network, so a softmax
# cannot be appended as the final layer here.
network = layered_network_builder(layers_config=layers)


In [17]:

config = Configuration(   
    # Each agent requires the following ``Configuration`` parameters:
    network=network,
    # Observation shape is taken from the training environment.
    states=dict(shape=tuple(env.observation_space.shape), type='float'),
    # One continuous action per entry of the env's action space, keyed 'action0', 'action1', ...
    actions={'action' + str(n): dict(continuous=True) for n in range(env.action_space.shape[0])},
    preprocessing = None,# dict or list containing state preprocessing configuration.
    exploration = dict(
        type='EpsilonDecay',
        kwargs=dict(epsilon=1, epsilon_final=0.01, epsilon_timesteps=1e4)
    ),

    # The `BatchAgent` class additionally requires the following parameters:
    batch_size = 32,# integer of the batch size.

    # A Policy Gradient Model expects the following additional configuration parameters:
    sample_actions= True,# boolean of whether to sample actions.
#     baseline='mlp' ,# string indicating the baseline value function (currently 'linear' or 'mlp').
    # NOTE(review): `baseline` itself is commented out above, so these baseline
    # arguments are presumably unused - confirm.
    baseline_args=dict(size=100, repeat_update=100) ,# list of arguments for the baseline value function.
    override_line_search=False,
    
#     baseline_kwargs= ,# dict of keyword arguments for the baseline value function.
    generalized_advantage_estimation= True ,# boolean indicating whether to use GAE.
    gae_lambda= 0.97,# float of the Generalized Advantage Estimation lambda.
    normalize_advantage= False,# boolean indicating whether to normalize the advantage or not.
    # NOTE(review): the conjugate-gradient / KL / line-search settings below look like
    # trust-region (TRPO) options; presumably VPGAgent ignores them - confirm.
    cg_iterations=20,
    max_kl_divergence=0.005,
    cg_damping=0.001,
    line_search_steps=20,
    loglevel="info",
)

# Create the agent. This is a VPGAgent (vanilla policy gradient), not a Trust
# Region Policy Optimization agent as this comment previously claimed.
agent = VPGAgent(config=config)

# for some reason these are not set?
# Both attributes are bound to the same (initially empty) list object.
agent.next_internal = agent.current_internal = []

In [24]:
from tensorforce.execution import Runner
# Training runner: pairs the agent with the wrapper that holds the training environment.
runner = Runner(agent=agent, environment=environment)

In [70]:
# Callback function printing episode statistics
class EpisodeFinished(object):
    """Runner callback that records final portfolio values and prints periodic stats.

    Pass the bound method ``episode_finished`` as the ``episode_finished``
    argument of ``Runner.run``.

    Args:
        log_intv: print a summary every `log_intv` episodes; the summary covers
            the trailing `log_intv` episode rewards and portfolio values.

    Attributes:
        portfolio_values: the last recorded ``'portfolio_value'`` of each
            finished episode that produced at least one info record.
    """

    def __init__(self, log_intv):
        self.log_intv = log_intv
        self.portfolio_values = []

    @staticmethod
    def _window_stats(values):
        """Return (mean, min, max) of `values`, or three NaNs when `values` is empty."""
        # np.min / np.max raise ValueError on an empty sequence, so guard it.
        if len(values) == 0:
            return np.nan, np.nan, np.nan
        return np.mean(values), np.min(values), np.max(values)

    def episode_finished(self, r):
        """Record the episode's final portfolio value and log every `log_intv` episodes.

        Args:
            r: the runner invoking the callback. Reads ``r.episode``,
                ``r.timestep``, ``r.episode_rewards`` and
                ``r.environment.gym.sim.infos``.

        Returns:
            Always True.
        """
        # Read the infos from the runner that invoked this callback. The previous
        # code looked at the global `runner_test`, which is undefined on a fresh
        # kernel and is the wrong environment when called from the training runner.
        infos = r.environment.gym.sim.infos
        if len(infos):
            self.portfolio_values.append(infos[-1]['portfolio_value'])

        if r.episode % self.log_intv == 0:
            reward, rewards_min, rewards_max = self._window_stats(
                r.episode_rewards[-self.log_intv:])
            value, value_min, value_max = self._window_stats(
                self.portfolio_values[-self.log_intv:])
            print(
                "Finished episode {ep} after {ts} timesteps "
                "(reward: {reward: 2.4f} [{rewards_min: 2.4f}, {rewards_max: 2.4f}]) "
                "portfolio_value: {portfolio_value: 2.4f} "
                "[{portfolio_value_min: 2.4f}, {portfolio_value_max: 2.4f}]".format(
                    ep=r.episode,
                    ts=r.timestep,
                    reward=reward,
                    rewards_min=rewards_min,
                    rewards_max=rewards_max,
                    portfolio_value=value,
                    portfolio_value_min=value_min,
                    portfolio_value_max=value_max,
                )
            )
        return True

In [28]:
# Train the agent on the training environment.
# NOTE(review): `episode_finished` is not defined in any cell visible in this
# notebook, so this call fails on Restart & Run All. Presumably it came from an
# earlier version of the callback cell; something like
# `EpisodeFinished(100).episode_finished` may be intended - confirm.
runner.run(
    episodes=900000, max_timesteps=200, episode_finished=episode_finished)

Finished episode 100 after 128 timesteps (reward: -0.0277 [-0.3963,  6.4502]) portfolio_value:  0.8600
Finished episode 200 after 128 timesteps (reward: -0.1461 [-0.2775,  0.2785]) portfolio_value:  0.8632
Finished episode 300 after 128 timesteps (reward: -0.1395 [-0.2846,  0.5663]) portfolio_value:  0.8751
Finished episode 400 after 128 timesteps (reward: -0.1587 [-0.3149,  0.0365]) portfolio_value:  0.8685
Finished episode 500 after 128 timesteps (reward: -0.1402 [-0.2792,  0.2533]) portfolio_value:  0.8543
Finished episode 600 after 128 timesteps (reward: -0.0491 [-0.2891,  10.2761]) portfolio_value:  0.8590
Finished episode 700 after 128 timesteps (reward: -0.1251 [-0.2834,  0.9922]) portfolio_value:  0.9553
Finished episode 800 after 128 timesteps (reward: -0.0755 [-0.3038,  3.1840]) portfolio_value:  11.6895
Finished episode 900 after 128 timesteps (reward:  0.1300 [-0.3031,  8.8457]) portfolio_value:  0.9336
Finished episode 1000 after 128 timesteps (reward: -0.1311 [-0.3085,  0

Finished episode 8000 after 128 timesteps (reward:  0.0940 [-0.2359,  9.9235]) portfolio_value:  0.8973
Finished episode 8100 after 128 timesteps (reward:  0.0445 [-0.2746,  9.1373]) portfolio_value:  0.9688
Finished episode 8200 after 128 timesteps (reward:  0.0336 [-0.2681,  9.3756]) portfolio_value:  0.8286
Finished episode 8300 after 128 timesteps (reward: -0.0329 [-0.3058,  6.2936]) portfolio_value:  0.8534
Finished episode 8400 after 128 timesteps (reward: -0.0831 [-0.4355,  3.3074]) portfolio_value:  1.0300
Finished episode 8500 after 128 timesteps (reward: -0.0926 [-0.2438,  1.5461]) portfolio_value:  0.8139
Finished episode 8600 after 128 timesteps (reward: -0.1100 [-0.2115,  0.2249]) portfolio_value:  0.8125
Finished episode 8700 after 128 timesteps (reward: -0.0809 [-0.3003,  1.3677]) portfolio_value:  0.9518
Finished episode 8800 after 128 timesteps (reward: -0.0171 [-0.2636,  9.0493]) portfolio_value:  0.9517
Finished episode 8900 after 128 timesteps (reward: -0.0712 [-0.2

Finished episode 15900 after 128 timesteps (reward: -0.0584 [-0.2113,  0.2950]) portfolio_value:  0.9158
Finished episode 16000 after 128 timesteps (reward:  0.0079 [-0.2693,  5.7978]) portfolio_value:  0.9216
Finished episode 16100 after 128 timesteps (reward: -0.0215 [-0.2171,  2.9220]) portfolio_value:  0.8921
Finished episode 16200 after 128 timesteps (reward:  0.0325 [-0.2619,  9.9790]) portfolio_value:  0.9671
Finished episode 16300 after 128 timesteps (reward: -0.0655 [-0.2279,  0.1814]) portfolio_value:  0.9155
Finished episode 16400 after 128 timesteps (reward: -0.0402 [-0.1723,  1.7292]) portfolio_value:  1.0363
Finished episode 16500 after 128 timesteps (reward: -0.0646 [-0.2595,  0.4686]) portfolio_value:  0.9354
Finished episode 16600 after 128 timesteps (reward: -0.0464 [-0.1752,  2.5830]) portfolio_value:  0.8681
Finished episode 16700 after 128 timesteps (reward: -0.0503 [-0.2720,  1.6566]) portfolio_value:  1.0262
Finished episode 16800 after 128 timesteps (reward:  0.

Finished episode 23700 after 128 timesteps (reward: -0.0037 [-0.2301,  2.9174]) portfolio_value:  0.9773
Finished episode 23800 after 128 timesteps (reward:  0.0787 [-0.2581,  10.6802]) portfolio_value:  0.9820
Finished episode 23900 after 128 timesteps (reward: -0.0643 [-0.2303,  0.2397]) portfolio_value:  0.9214
Finished episode 24000 after 128 timesteps (reward:  0.1306 [-0.1972,  9.7999]) portfolio_value:  1.0619
Finished episode 24100 after 128 timesteps (reward:  0.1119 [-0.1805,  9.7937]) portfolio_value:  1.0212
Finished episode 24200 after 128 timesteps (reward: -0.0678 [-0.2066,  0.2152]) portfolio_value:  0.9856
Finished episode 24300 after 128 timesteps (reward:  0.0591 [-0.2310,  10.3067]) portfolio_value:  0.9749
Finished episode 24400 after 128 timesteps (reward:  0.0963 [-0.2124,  10.6423]) portfolio_value:  0.9247
Finished episode 24500 after 128 timesteps (reward: -0.0663 [-0.2367,  0.3095]) portfolio_value:  0.9777
Finished episode 24600 after 128 timesteps (reward: 

Finished episode 31500 after 128 timesteps (reward: -0.0169 [-0.2201,  2.1880]) portfolio_value:  0.9304
Finished episode 31600 after 128 timesteps (reward:  0.1392 [-0.1843,  10.0764]) portfolio_value:  0.9312
Finished episode 31700 after 128 timesteps (reward: -0.0606 [-0.2043,  0.3868]) portfolio_value:  0.9486
Finished episode 31800 after 128 timesteps (reward: -0.0518 [-0.2449,  0.3028]) portfolio_value:  0.9872
Finished episode 31900 after 128 timesteps (reward: -0.0610 [-0.2224,  0.1424]) portfolio_value:  0.9633
Finished episode 32000 after 128 timesteps (reward:  0.0508 [-0.1997,  7.8416]) portfolio_value:  0.9180
Finished episode 32100 after 128 timesteps (reward:  0.1824 [-0.2841,  10.2649]) portfolio_value:  1.0461
Finished episode 32200 after 128 timesteps (reward: -0.0242 [-0.1888,  2.6837]) portfolio_value:  0.9509
Finished episode 32300 after 128 timesteps (reward: -0.0343 [-0.1424,  0.5637]) portfolio_value:  0.8910
Finished episode 32400 after 128 timesteps (reward: -

Finished episode 39300 after 128 timesteps (reward: -0.0313 [-0.2140,  2.9956]) portfolio_value:  0.9097
Finished episode 39400 after 128 timesteps (reward:  0.0210 [-0.1701,  5.9492]) portfolio_value:  1.0764
Finished episode 39500 after 128 timesteps (reward:  0.0620 [-0.2022,  6.0985]) portfolio_value:  0.9509
Finished episode 39600 after 128 timesteps (reward: -0.0539 [-0.2670,  0.1816]) portfolio_value:  0.9869
Finished episode 39700 after 128 timesteps (reward: -0.0577 [-0.1985,  0.1957]) portfolio_value:  0.9258
Finished episode 39800 after 128 timesteps (reward:  0.2214 [-0.2267,  11.2122]) portfolio_value:  0.9048
Finished episode 39900 after 128 timesteps (reward:  0.0637 [-0.2336,  6.1916]) portfolio_value:  0.9032
Finished episode 40000 after 128 timesteps (reward: -0.0633 [-0.1765,  0.2602]) portfolio_value:  0.9510
Finished episode 40100 after 128 timesteps (reward:  0.1268 [-0.2109,  10.3349]) portfolio_value:  0.8718
Finished episode 40200 after 128 timesteps (reward:  

KeyboardInterrupt: 

In [71]:
# Evaluation: run the same agent against the wrapper that holds the test
# environment, printing a summary every 10 episodes.
runner_test = Runner(environment=environment_test, agent=agent)
runner_test.run(
    episode_finished=EpisodeFinished(10).episode_finished,
    max_timesteps=128,
    episodes=100,
)

Finished episode 10 after 128 timesteps (reward: -0.0305 [-0.1367,  0.0823]) portfolio_value:  0.9717 [ 0.8722  1.0857]
Finished episode 20 after 128 timesteps (reward:  0.0000 [-0.1393,  0.1881]) portfolio_value:  1.0055 [ 0.8699  1.2070]
Finished episode 30 after 128 timesteps (reward: -0.0473 [-0.1285, -0.0007]) portfolio_value:  0.9544 [ 0.8795  0.9993]
Finished episode 40 after 128 timesteps (reward: -0.0712 [-0.2553,  0.0207]) portfolio_value:  0.9333 [ 0.7746  1.0209]
Finished episode 50 after 128 timesteps (reward: -0.0003 [-0.1229,  0.2287]) portfolio_value:  1.0046 [ 0.8843  1.2570]
Finished episode 60 after 128 timesteps (reward: -0.0200 [-0.0804,  0.1019]) portfolio_value:  0.9815 [ 0.9227  1.1073]
Finished episode 70 after 128 timesteps (reward: -0.0592 [-0.1267,  0.0460]) portfolio_value:  0.9436 [ 0.8810  1.0471]
Finished episode 80 after 128 timesteps (reward: -0.0313 [-0.1400,  0.1044]) portfolio_value:  0.9709 [ 0.8693  1.1100]
Finished episode 90 after 128 timesteps 

In [None]:

# # Poll new state from client
# state = environment.reset()

# for i in range(10):
#     # Get prediction from agent, execute
#     action = agent.act(state=state)

#     state, reward, done = environment.execute(action)

#     # Add experience, agent automatically updates model according to batch size
#     agent.observe(reward=reward, terminal=False)
    
#     a=np.array(list(action.values()))
#     print(a, a.sum(),a.min(),a.max())

In [65]:
# NOTE(review): leftover post-mortem debugging magic; it opens an interactive
# debugger and should be removed before the notebook is shared or re-run end to end.
%debug

> [0;32m/home/isisilon/.pyenv/versions/3.6.0/envs/jupyter3/lib/python3.6/site-packages/numpy/core/_methods.py[0m(29)[0;36m_amin[0;34m()[0m
[0;32m     27 [0;31m[0;34m[0m[0m
[0m[0;32m     28 [0;31m[0;32mdef[0m [0m_amin[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mkeepdims[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m---> 29 [0;31m    [0;32mreturn[0m [0mumr_minimum[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m,[0m [0;32mNone[0m[0;34m,[0m [0mout[0m[0;34m,[0m [0mkeepdims[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     30 [0;31m[0;34m[0m[0m
[0m[0;32m     31 [0;31m[0;32mdef[0m [0m_sum[0m[0;34m([0m[0ma[0m[0;34m,[0m [0maxis[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mdtype[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mout[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mkeepdims[0m[0;34m=[0m[0;32mF

In [None]:
# NOTE(review): stray scratch expression. `self` is not defined at cell scope, so
# this raises NameError when the notebook is run top to bottom - remove.
self.portfolio_value