In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

# numeric
import quandl
import numpy as np
from numpy import random
import pandas as pd

# util
from collections import Counter
import pdb
import time
import tempfile
import itertools
from tqdm import tqdm_notebook as tqdm

# logging
import logging

# Install the default root handler first, then fetch this notebook's logger.
logging.basicConfig()
log = logging.getLogger(__name__)
logger = log  # both names point at the same logger object

# Uncomment to let INFO records from this logger through:
# log.setLevel(logging.INFO)

# NOTE(review): with no level set here, this INFO record is presumably
# filtered out by the default WARNING threshold — confirm if it is expected.
log.info('%s logger started.', __name__)

In [2]:
import gym
from gym import error, spaces, utils
from gym.utils import seeding

In [3]:
import os
import sys

# Put the current working directory on the import path so that local packages
# can be imported.  ``sys`` is used directly instead of the undocumented
# ``os.sys`` alias, and the membership check stops re-runs of this cell from
# appending the same entry to sys.path over and over.
_cwd_path = os.path.abspath('.')
if _cwd_path not in sys.path:
    sys.path.append(_cwd_path)
%reload_ext autoreload
%autoreload 2

In [4]:
from src.environments.portfolio import PortfolioEnv

In [223]:
class EnvWrapper(PortfolioEnv):
    """PortfolioEnv variant that accepts the action formats the agent emits.

    Construction is inherited unchanged from ``PortfolioEnv``; only ``step``
    is overridden, to unwrap and normalise the action before delegating.
    """

    def step(self, action):
        """Normalise ``action`` and forward it to ``PortfolioEnv.step``.

        Parameters
        ----------
        action : list, dict or other
            * list  -- only its first element is used.
            * dict  -- its values are made non-negative and rescaled so they
              sum to 1 (the original author notes that a softmax on the agent
              side did not work, hence the normalisation here).
            * anything else is passed through untouched.

        Returns
        -------
        Whatever ``PortfolioEnv.step`` returns for the normalised action.
        """
        # The action may arrive wrapped in a list; take the single payload.
        if isinstance(action, list):
            action = action[0]

        if isinstance(action, dict):
            # NOTE(review): relies on the dict yielding its values in the
            # order the environment expects — confirm against the agent.
            weights = np.abs(list(action.values()))

            # An integer array cannot hold the result of a true division, so
            # promote it; float inputs keep their original dtype.
            if not np.issubdtype(weights.dtype, np.floating):
                weights = weights.astype(float)

            total = weights.sum()
            if total == 0:
                # All-zero action: dividing would yield NaN/inf weights, so
                # fall back to an equal split across all entries.
                weights = np.full(weights.shape, 1.0 / max(weights.size, 1))
            else:
                weights = weights / total
            action = weights

        return super().step(action)

In [224]:
def _seed_env(target, seed):
    """Seed ``target`` without clobbering a ``seed()`` method it may define.

    The plain assignment ``target.seed = 0`` replaces any ``seed`` method with
    an integer and seeds nothing.  If ``target.seed`` is callable it is called
    with ``seed``; otherwise the attribute is simply set, as before.
    """
    seeder = getattr(target, 'seed', None)
    if callable(seeder):
        seeder(seed)
    else:
        target.seed = seed


# Training environment, built from the 'train' table of the HDF store.
# NOTE(review): `augument` (sic) is the keyword as this cell passes it;
# presumably a data-augmentation noise level — confirm against PortfolioEnv.
df_train = pd.read_hdf('./data/poliniex_30m.hf', key='train')
env = EnvWrapper(
    df=df_train,
    steps=128,
    scale=True,
    augument=0.0005
)
_seed_env(env, 0)

# Test environment: same settings, 'test' table, `augument` set to zero.
df_test = pd.read_hdf('./data/poliniex_30m.hf', key='test')
env_test = EnvWrapper(
    df=df_test,
    steps=128,
    scale=True,
    augument=0.00)
_seed_env(env_test, 0)

In [226]:
from tensorforce.environments.openai_gym import OpenAIGym
# Build the tensorforce wrapper around a registered placeholder env, then swap
# the wrapped gym instance for the portfolio environment `env`.
# NOTE(review): anything the wrapper derived from CartPole at construction
# time would not be refreshed by this assignment — confirm against OpenAIGym.
environment = OpenAIGym('CartPole-v0')
environment.gym = env

INFO:gym.envs.registration:Making new env: CartPole-v0
[2017-07-15 15:52:20,523] Making new env: CartPole-v0


In [227]:
from tensorforce import Configuration
from tensorforce.agents import VPGAgent
from tensorforce.core.networks import layered_network_builder

In [228]:
# Ordered layer specs handed to tensorforce's layered network builder.
# Layer reference:
# https://github.com/reinforceio/tensorforce/blob/0d07fadec03f76537a2431e17c51cd759d53b5e9/tensorforce/core/networks/layers.py
layers = [
    {'type': 'flatten'},
    {'type': 'dense', 'size': 32},
    {'type': 'dense', 'size': 32},
    {'type': 'nonlinearity', 'name': 'sigmoid'},
    # NOTE(review): if these names map to the standard functions, a relu
    # directly after a sigmoid is a no-op (sigmoid output is already positive).
    {'type': 'nonlinearity', 'name': 'relu'},
]
network = layered_network_builder(layers_config=layers)


In [229]:

# State spec: one float input shaped like the environment's observation space.
_state_spec = dict(shape=tuple(env.observation_space.shape), type='float')

# Action spec: one continuous action per entry along the first axis of the
# action space, named action0, action1, ...
# Format reference:
# https://github.com/reinforceio/tensorforce/blob/master/tensorforce/environments/openai_gym.py#L84
_action_spec = {
    'action' + str(idx): dict(continuous=True)
    for idx in range(env.action_space.shape[0])
}

# Exploration is currently left unset; a previously tried setting was
#     exploration=dict(type='EpsilonDecay',
#                      kwargs=dict(epsilon=1, epsilon_final=0.01, epsilon_timesteps=1e4))
config = Configuration(
    batch_size=32,
    states=_state_spec,
    actions=_action_spec,
    network=network,
    generalized_advantage_estimation=False,
    normalize_advantage=False,
    sample_actions=True
)

# Create the VPG agent from the configuration above.
agent = VPGAgent(config=config)

# The agent's internal-state attributes are not populated at this point, so
# both are pointed at one shared empty list.
# NOTE(review): presumably agent.reset() would normally initialise these —
# confirm against the tensorforce version in use.
_empty_internals = []
agent.next_internal = _empty_internals
agent.current_internal = _empty_internals

In [235]:
# Open question: why does a softmax output not work here, and how can the
# model be inspected?  For now this cell just displays the agent's model object.
agent.model

<tensorforce.models.vpg_model.VPGModel at 0x7f71e0c00f28>

In [232]:

# Manual interaction loop: step the agent against the environment by hand.
# Poll the initial state from the environment.
state = environment.reset()

for _ in range(100):
    # Get a prediction from the agent and execute it.
    action = agent.act(state=state)
    state, reward, done = environment.execute(action)

    # Add the experience; the agent updates its model according to batch size.
    # The real terminal flag is forwarded (it used to be hard-coded to False)
    # so episode boundaries are visible to the agent.
    agent.observe(reward=reward, terminal=done)

    if done:
        # Start a fresh episode rather than stepping a finished environment.
        state = environment.reset()


-0.00438812733755
-0.00437489420782
-0.000807419859218
0.00328737897099
-0.00327510418118
-0.000199139228671
-0.00146120935013
0.00510737838663
-0.00378710474651
-0.00524470238275
0.00430609927006
-0.0015289046655
-0.00323449487063
-0.0054726359256
-0.000352332110157
-0.00408013979925
0.00551566769843
-0.0084226191255
-0.00160485939118
0.00105002635982
-0.0056811879153
0.00385485701753
-0.000928409106663
0.00499031513451
-0.00512698083541
0.000862944651578
-0.00036530181874
0.00559280263081
-0.000231807997225
0.000179197844571
-0.00923303106192
0.00473967973338
-0.00290978863007
-0.00605129762295
-0.00242724681271
-0.00492676786765
-0.0059147862221
0.000651893696436
-0.00294846142033
-0.00419780257275
-0.00440700486449
3.92402988733e-05
0.00776443210555
-0.010186845306
-0.013979203807
-2.14082403378e-06
-0.00968769598724
-0.00152467765548
0.0037114337242
0.0090361527741
-0.00210604388839
-0.000353805128936
-0.0012306018666
-0.0050666996567
-0.00814795538018
-0.00133065396143
0.01159944

In [237]:
from tensorforce.execution import Runner
# Pair the agent with the wrapped environment; the runner's run() method is
# what drives the episodes in the training cell.
runner = Runner(agent=agent, environment=environment)

In [None]:
# Per-episode callback: prints a progress line on even-numbered episodes.
def episode_finished(r):
    """Report episode statistics for every second episode.

    Parameters
    ----------
    r : object exposing ``episode``, ``timestep`` and ``episode_rewards``
        (the runner passes itself to this callback).

    Reads the global ``env`` and averages the last (up to) ten
    ``portfolio_value`` entries of ``env.sim.infos`` for the printed line.

    Returns
    -------
    bool
        Always True.  NOTE(review): presumably this tells the runner to keep
        going — confirm against the tensorforce Runner.
    """
    # Odd-numbered episodes are skipped silently.
    if r.episode % 2 != 0:
        return True

    history = pd.DataFrame(env.sim.infos)
    recent_value = history.portfolio_value.iloc[-10:].mean()
    template = "Finished episode {ep} after {ts} timesteps (reward: {reward:2.4f}) portfolio_value: {portfolio_value:2.4f}"
    print(template.format(
        ep=r.episode,
        ts=r.timestep,
        reward=r.episode_rewards[-1],
        portfolio_value=recent_value,
    ))
    return True
# Launch training: up to 90000 episodes, with `episode_finished` invoked as the
# per-episode callback.  max_timesteps=200 is above the 128 timesteps per
# episode reported in the output below.
runner.run(episodes=90000, max_timesteps=200, episode_finished=episode_finished)

Finished episode 2 after 128 timesteps (reward: -0.2520) portfolio_value: 0.7863
Finished episode 4 after 128 timesteps (reward: -0.1582) portfolio_value: 0.8575
Finished episode 6 after 128 timesteps (reward: -0.2620) portfolio_value: 0.7717
Finished episode 8 after 128 timesteps (reward: -0.2728) portfolio_value: 0.7714
Finished episode 10 after 128 timesteps (reward: -0.3500) portfolio_value: 0.7079
Finished episode 12 after 128 timesteps (reward: -0.1362) portfolio_value: 0.8819
Finished episode 14 after 128 timesteps (reward: -0.1603) portfolio_value: 0.8622
Finished episode 16 after 128 timesteps (reward: -0.2177) portfolio_value: 0.8139
Finished episode 18 after 128 timesteps (reward: -0.1636) portfolio_value: 0.8601
Finished episode 20 after 128 timesteps (reward: -0.2167) portfolio_value: 0.8276
Finished episode 22 after 128 timesteps (reward: -0.2418) portfolio_value: 0.7707
Finished episode 24 after 128 timesteps (reward: -0.2219) portfolio_value: 0.8071
Finished episode 26 

Finished episode 202 after 128 timesteps (reward: -0.1997) portfolio_value: 0.8215
Finished episode 204 after 128 timesteps (reward: -0.2797) portfolio_value: 0.7639
