Pytorch is easier to debug, I like it.

TODO:
- [x] prioritised experience replay, need to grab loss for each sample, and sample based on loss
- [ ] check it on my data: can it overfit? does the normalisation make sense?
- [x] better metrics
- [ ] do cnn model
- [x] read papers
- [ ] check I'm prioritising by the right things; this should lead to the lowest loss
- [ ] test on cartpole

Refs: 
- implementations:
    - PPO
        - **pytorch implementation https://github.com/alexis-jacq/Pytorch-DPPO/blob/master/ppo.py**
        - tensorflow implementation https://github.com/reinforceio/tensorforce/blob/master/tensorforce/models/ppo_model.py
    - Prioritised memory
    - Other
        - http://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html#training
        - https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py
- papers:
    - DPPO https://arxiv.org/pdf/1707.02286.pdf
    - PPO 
        - https://arxiv.org/abs/1707.06347
        - https://blog.openai.com/openai-baselines-ppo/
    - TRPO https://arxiv.org/abs/1502.05477

In [1]:
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

# numeric
import numpy as np
from numpy import random
import pandas as pd

# utils
from tqdm import tqdm_notebook as tqdm
from collections import Counter
import tempfile
import logging
import time
import datetime
import random

from collections import OrderedDict
from IPython.display import display
from pprint import pprint

# logging: one module-level logger, reachable as both `logger` and `log`
logger = logging.getLogger(__name__)
log = logger
logger.setLevel(logging.INFO)
logging.basicConfig()
logger.info('%s logger started.', __name__)

INFO:__main__:__main__ logger started.


In [2]:
import argparse
import os
import sys
import gym
from gym import wrappers
import random
import numpy as np

import torch
import torch.optim as optim
import torch.multiprocessing as mp
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [3]:
import os
# Put the notebook's working directory on the import path so that local
# packages (e.g. `src`, imported further down) can be found.
os.sys.path.append(os.path.abspath('.'))
# autoreload mode 2: re-import edited modules automatically before running a cell.
%reload_ext autoreload
%autoreload 2

In [4]:
class Params():
    """Hyper-parameters for the environments and the PPO training loop.

    Every attribute has the default listed in ``__init__``; any of them can
    be overridden by keyword, e.g. ``Params(lr=1e-3, num_epoch=10)``.
    ``Params()`` with no arguments yields exactly the defaults.

    Raises:
        TypeError: if an override names an attribute that does not exist
            (guards against silently ignored typos).
    """
    def __init__(self, **overrides):
        # env
        self.window_length = 50  # passed to the environments as `window_length`
        # Model
        self.batch_size = 250  # transitions sampled from memory per optimisation step
        self.lr = 3e-4  # Adam learning rate
        self.gamma = 0.00  # discount factor used in the TD error / GAE recursion
        self.gae_param = 0.95  # GAE lambda
        self.clip = 0.2 # epsilon from eq 7, default 0.2
        self.ent_coeff = 0.  # weight of the entropy term in the total loss
        self.num_epoch = 50  # optimisation steps per batch of collected data
        self.num_steps = 2048*4  # replay capacity and max rollout length
        self.time_horizon = 2000000  # total environment steps before training stops
        self.max_episode_length = 10000  # episodes are cut off at this many steps
        self.seed = 1  # seed for torch and the environments

        # Apply overrides last, rejecting names that are not known parameters.
        unknown = sorted(set(overrides) - set(vars(self)))
        if unknown:
            raise TypeError('unknown parameter(s): {}'.format(', '.join(unknown)))
        vars(self).update(overrides)

params = Params()

# Where the model is written at the end of every optimisation round.
save_path= 'outputs/agent_portfolio-ddpo/{}_seperate_weights.pickle'.format('2017-07-21')
# exist_ok=True tolerates an already-existing directory but, unlike a blanket
# `except OSError: pass`, still surfaces real failures (permissions, a file
# in the way, read-only filesystem) instead of failing later at save time.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
save_path

'outputs/agent_portfolio-ddpo/2017-07-21_seperate_weights.pickle'

# Memory
Refs:
- https://github.com/openai/baselines/blob/master/baselines/deepq/replay_buffer.py
- https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py

In [5]:
class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of transitions with uniform random sampling.

    ``memory`` is a plain list ordered oldest-first. Callers read it directly
    (e.g. ``len(memory.memory)``), so it is deliberately kept as a list.
    """
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    def push(self, events):
        """Append transitions, evicting the oldest ones beyond ``capacity``.

        Args:
            events: sequence of equally long per-field sequences, e.g.
                ``[states, actions, returns, advantages]``. They are zipped,
                so each stored item is a tuple holding the i-th entry of
                every field.
        """
        self.memory.extend(zip(*events))
        # Trim once with a single slice deletion. Deleting index 0 after each
        # append shifts the whole list every time and makes a large push
        # quadratic; the final contents are the same.
        overflow = len(self.memory) - self.capacity
        if overflow > 0:
            del self.memory[:overflow]

    def clear(self):
        """Drop every stored transition."""
        self.memory = []

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns:
            A tuple with one tensor per field, each the concatenation along
            dim 0 of that field over the sampled transitions. It is built
            eagerly so it can be unpacked, indexed or iterated repeatedly.

        Raises:
            ValueError: from ``random.sample`` when ``batch_size`` exceeds
                the number of stored transitions.
        """
        fields = zip(*random.sample(self.memory, batch_size))
        return tuple(torch.cat(field, 0) for field in fields)

# Environment

In [6]:
from src.environments.portfolio import PortfolioEnv, sharpe, max_drawdown

# we want to permute the observation channels a little

class PermutedPortfolioEnv(PortfolioEnv):
    """PortfolioEnv whose ``step`` observations have their axes reordered.

    ``step`` moves the last observation axis to the front (axes ``(2, 0, 1)``).
    ``reset`` applies the identity permutation ``(0, 1, 2)``, i.e. it returns
    the parent's observation with its axis order unchanged.
    NOTE(review): presumably the parent ``reset`` already yields a permuted
    observation (e.g. by going through ``step``) - confirm against PortfolioEnv.
    """
    def reset(self, *args, **kwargs):
        initial_obs = super().reset(*args, **kwargs)
        return np.transpose(initial_obs, (0, 1, 2))

    def step(self, *args, **kwargs):
        raw_obs, reward, done, info = super().step(*args, **kwargs)
        # last axis first
        return np.transpose(raw_obs, (2, 0, 1)), reward, done, info


# Training environment, built from the 'train' key of the HDF file.
df_train = pd.read_hdf('./data/poloniex_30m.hf',key='train')
env = PermutedPortfolioEnv(
    df=df_train,
    steps=128, 
    scale=True, 
    augment=0.0025, # NOTE(review): non-zero although the aim is to "just overfit first" - confirm intended
    trading_cost=0, #0.0025, # let's just overfit first (costs disabled)
    window_length = params.window_length,   
)
env.seed(params.seed)
# Not the last expression of the cell, so the shape is not displayed; the call still resets the env.
env.reset().shape

# Test environment: 'test' key of the same file, 1280 steps, no `augment` argument passed.
df_test = pd.read_hdf('./data/poloniex_30m.hf',key='test')
env_test = PermutedPortfolioEnv(
    df=df_test,
    steps=1280, 
    scale=True, 
    trading_cost=0, #0.0025, # let's just overfit first (costs disabled)
    window_length = params.window_length,   
)
env_test.seed(params.seed)
# Displayed as the cell output: shape of one test observation.
env_test.reset().shape

(3, 5, 50)

# Model

In [7]:
# Sanity check: shape of a freshly reset training observation.
env.reset().shape
# 20*50-2  (NOTE(review): leftover scratch arithmetic, meaning unclear - confirm or remove)

(3, 5, 50)

In [8]:
import torch.nn.init

class GenericSharedModel(nn.Module):
    """Actor-critic network: a shared conv block feeding separate actor and critic MLPs.

    Args:
        inputs: observation-space shape. ``inputs[0]`` sizes the flattened
            conv output (``20 * inputs[0]``) and ``inputs[1]`` sizes the second
            conv kernel (``inputs[1] - 2``). The first conv layer is hard-coded
            to 3 input channels.
        outputs: action-space shape; its product is the number of action
            dimensions.

    ``forward`` therefore expects a batch shaped
    ``(batch, 3, inputs[0], inputs[1])``.

    The output size is derived from the ``outputs`` argument rather than from
    a module-level ``env`` global, so the class can be built on its own.
    """
    def __init__(self, inputs, outputs):
        super(GenericSharedModel, self).__init__()
        num_outputs = int(np.prod(outputs))

        # hidden layer sizes
        h_size_1 = 100
        h_size_2 = 64

        # shared conv block
        self.conv1 = nn.Conv2d(3, 2, (1, 3))
        self.bn_conv1   = nn.BatchNorm2d(2)
        # kernel spans what is left of the last axis after conv1 (width - 2)
        self.conv2 = nn.Conv2d(2, 20, (1, inputs[1] - 2))
        self.bn_conv2   = nn.BatchNorm2d(20)

        # Actor mean
        self.fc1 = nn.Linear(20*inputs[0], h_size_1)
        self.bn_fc1   = nn.BatchNorm1d(h_size_1)
        self.fc2 = nn.Linear(h_size_1, h_size_2)
        self.bn_fc2   = nn.BatchNorm1d(h_size_2)
        self.mu = nn.Linear(h_size_2, num_outputs)

        # Actor spread: one state-independent learnable value per action dimension
        self.log_std = nn.Parameter(torch.zeros(num_outputs))

        # Critic
        self.fc1b = nn.Linear(20*inputs[0], h_size_1)
        self.bn_fcb1   = nn.BatchNorm1d(h_size_1)
        self.fc2b = nn.Linear(h_size_1, h_size_2)
        self.bn_fcb2   = nn.BatchNorm1d(h_size_2)
        self.v = nn.Linear(h_size_2,1)

        # Prefer the in-place initialiser where this torch version has it,
        # falling back to the older name otherwise.
        xavier_init = (getattr(torch.nn.init, 'xavier_uniform_', None)
                       or torch.nn.init.xavier_uniform)
        for name, p in self.named_parameters():
            # init parameters like in keras: zero biases, Xavier-uniform conv kernels
            if 'bias' in name:
                p.data.fill_(0)
            if ('weight' in name) and ('conv' in name):
                # 1-D weights (the batch-norm scales named bn_conv*) keep their default init
                if len(p.size())>1:
                    xavier_init(p)

        # mode
        self.train()

    def forward(self, inputs):
        """Return ``(mu, sigma_sq, v)`` for a batch of observations.

        Returns:
            mu: action means, one row per sample.
            sigma_sq: ``exp(self.log_std)`` broadcast to the shape of ``mu``.
                Despite the parameter's name this is not a log value; callers
                in this notebook use it as the variance (they take ``.sqrt()``
                to sample and pass it to ``normal`` as ``sigma_sq``).
            v: critic state-value estimate, one column per sample.
        """
        # shared conv block
        x = F.elu(self.bn_conv1(self.conv1(inputs)))
        x = F.elu(self.bn_conv2(self.conv2(x)))
        # flatten
        h = x.view(x.size(0),-1)

        # the action mean
        x = F.elu(self.bn_fc1(self.fc1(h)))
        x = F.elu(self.bn_fc2(self.fc2(x)))
        mu = self.mu(x)

        # the spread of the action distribution, same for every sample in the batch
        sigma_sq = torch.exp(self.log_std).unsqueeze(0).expand_as(mu)

        # critic
        x = F.elu(self.bn_fcb1(self.fc1b(h)))
        x = F.elu(self.bn_fcb2(self.fc2b(x)))
        v = self.v(x)
        return mu, sigma_sq, v

# Train

In [9]:
def mkdir(base, name):
    """Create the directory ``base/name`` (plus missing parents) and return its path.

    Calling it again for an existing directory is a no-op. ``exist_ok=True``
    replaces the check-then-create sequence, which could fail if the directory
    appeared between the check and the creation.

    Raises:
        OSError: if the directory cannot be created, including when the path
            already exists but is not a directory.
    """
    path = os.path.join(base, name)
    os.makedirs(path, exist_ok=True)
    return path

In [10]:
class Shared_obs_stats():
    """Like batchnorm for input data.

    Keeps a running per-feature mean and variance of the observations seen so
    far, in tensors placed in shared memory, and uses them to standardise
    inputs. All updates are done in place so the shared tensors stay the ones
    that were allocated in ``__init__``.
    """
    def __init__(self, num_inputs):
        self.n = torch.zeros(num_inputs).share_memory_()
        self.mean = torch.zeros(num_inputs).share_memory_()
        self.mean_diff = torch.zeros(num_inputs).share_memory_()
        self.var = torch.zeros(num_inputs).share_memory_()

    def observes(self, obs):
        """Fold one observation into the running mean and variance.

        Args:
            obs: object exposing ``.data``; the data is squeezed before use.
        """
        # observation mean var updates
        x = obs.data.squeeze()
        self.n += 1.
        last_mean = self.mean.clone()
        self.mean += (x-self.mean)/self.n
        self.mean_diff += (x-last_mean)*(x-self.mean)
        # Write into the existing tensor. Rebinding `self.var` to the result of
        # clamp would replace the shared-memory tensor with a fresh private one.
        # The variance is floored at 1e-2 so normalize() never divides by ~0.
        self.var.copy_(torch.clamp(self.mean_diff/self.n, min=1e-2))

    def normalize(self, inputs):
        """Standardise ``inputs`` with the running statistics, clipped to [-5, 5]."""
        obs_mean = Variable(self.mean.unsqueeze(0).expand_as(inputs))
        obs_std = Variable(torch.sqrt(self.var).unsqueeze(0).expand_as(inputs))
        return torch.clamp((inputs-obs_mean)/obs_std, -5., 5.)

In [11]:
def normal(x, mu, sigma_sq):
    """Element-wise Gaussian density of ``x`` for mean ``mu`` and variance ``sigma_sq``.

    ``sigma_sq`` must support ``.sqrt()`` after scaling (e.g. a tensor);
    the result has the broadcast shape of the three arguments.
    """
    exponent = -1*(x-mu).pow(2)/(2*sigma_sq)
    norm_const = 1/(2*sigma_sq*np.pi).sqrt()
    return exponent.exp()*norm_const

In [12]:

cuda = False
# Seed torch before the model is built so weight initialisation is repeatable.
# NOTE(review): only torch is seeded here; the stdlib `random` module used by
# ReplayMemory.sample is not - confirm whether full reproducibility is wanted.
torch.manual_seed(params.seed)
work_dir = mkdir('exp', 'ppo')
monitor_dir = mkdir(work_dir, 'monitor')

# Leftovers from the gym-based version this was adapted from (kept for reference):
# env = gym.make(params.env_name)
#env = wrappers.Monitor(env, monitor_dir, force=True)

# First dimension only; both names are reassigned to the full products in a later cell.
num_inputs = env.observation_space.shape[0]
num_outputs = env.action_space.shape[0]


#initialize network and optimizer
Model = GenericSharedModel
model = Model(env.observation_space.shape, env.action_space.shape)
if cuda: model.cuda()

# Input normalisation is currently disabled:
# shared_obs_stats = Shared_obs_stats(num_inputs)
optimizer = optim.Adam(model.parameters(), lr=params.lr)
# Last expression: displays the module structure as the cell output.
model

GenericSharedModel (
  (conv1): Conv2d(3, 2, kernel_size=(1, 3), stride=(1, 1))
  (bn_conv1): BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True)
  (conv2): Conv2d(2, 20, kernel_size=(1, 48), stride=(1, 1))
  (bn_conv2): BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True)
  (fc1): Linear (100 -> 100)
  (bn_fc1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
  (fc2): Linear (100 -> 64)
  (bn_fc2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
  (mu): Linear (64 -> 6)
  (fc1b): Linear (100 -> 100)
  (bn_fcb1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
  (fc2b): Linear (100 -> 64)
  (bn_fcb2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
  (v): Linear (64 -> 1)
)

In [13]:
from torch.nn.modules.module import _addindent
import torch
import numpy as np
def torch_summarize(model, show_weights=False, show_parameters=True):
    """Summarizes torch model by showing trainable parameters and weights

    Produces one line per direct child module: its name, optionally the
    number of parameter elements it holds, its repr, and optionally the
    shapes of its parameter tensors. Plain containers are expanded
    recursively with the same options.

    Args:
        model: module whose direct children are listed.
        show_weights: append ``weights=(...)`` with each parameter's shape.
        show_parameters: include the ``[count]`` column (default, as before).

    Returns:
        The multi-line summary string.
    """
    containers = torch.nn.modules.container
    # `Container` may not exist in every torch release, so look both up defensively.
    container_types = tuple(
        cls for cls in (getattr(containers, 'Container', None),
                        getattr(containers, 'Sequential', None))
        if cls is not None
    )

    lines = [model.__class__.__name__ + ' (']
    for key, module in model._modules.items():
        # if it contains layers, call recursively (forwarding the display
        # options) to get the params and weights of the nested layers
        if type(module) in container_types:
            modstr = torch_summarize(module,
                                     show_weights=show_weights,
                                     show_parameters=show_parameters)
        else:
            modstr = module.__repr__()
        modstr = _addindent(modstr, 2)

        entry = '  {:15.15} '.format(key)
        if show_parameters:
            n_params = sum(int(np.prod(p.size())) for p in module.parameters())
            entry += '[{: 7.7g}]: '.format(n_params)
        entry += modstr
        if show_weights:
            shapes = tuple(tuple(p.size()) for p in module.parameters())
            entry += ', weights={}'.format(shapes)
        lines.append(entry)

    lines.append(')')
    return '\n'.join(lines)

# Print the per-layer parameter counts of the model.
print(torch_summarize(model))

GenericSharedModel (
  conv1           [     20]: Conv2d(3, 2, kernel_size=(1, 3), stride=(1, 1))
  bn_conv1        [      4]: BatchNorm2d(2, eps=1e-05, momentum=0.1, affine=True)
  conv2           [   1940]: Conv2d(2, 20, kernel_size=(1, 48), stride=(1, 1))
  bn_conv2        [     40]: BatchNorm2d(20, eps=1e-05, momentum=0.1, affine=True)
  fc1             [  10100]: Linear (100 -> 100)
  bn_fc1          [    200]: BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
  fc2             [   6464]: Linear (100 -> 64)
  bn_fc2          [    128]: BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
  mu              [    390]: Linear (64 -> 6)
  fc1b            [  10100]: Linear (100 -> 100)
  bn_fcb1         [    200]: BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
  fc2b            [   6464]: Linear (100 -> 64)
  bn_fcb2         [    128]: BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True)
  v               [     65]: Linear (64 -> 1)
)


In [None]:
# Replay buffer sized to hold one full round of collected transitions.
memory = ReplayMemory(params.num_steps)
# memory = PrioritisedReplayMemory(params.num_steps)

# Total number of observation / action elements (product over all dimensions).
num_inputs = int(np.prod(env.observation_space.shape))
num_outputs = int(np.prod(env.action_space.shape))

# Initial observation, wrapped with a leading batch dimension of 1 for the model.
state = env.reset()
state = Variable(torch.Tensor(state).unsqueeze(0))
done = True
episode_length = 0
# One summary dict per optimisation round, appended by the training loop.
reports = []

In [None]:


# PPO training: alternate between (1) collecting rollouts with the current
# policy until the replay memory is full and (2) several optimisation steps on
# mini-batches sampled from that memory, then report, clear memory and save.
# NOTE(review): uses the PyTorch API this notebook was written against
# (`Variable`, `.data[0]`, `backward(retain_variables=...)`) - left untouched.
with tqdm(total=params.time_horizon, mininterval=2, unit='steps') as p:
    episode = -1    
    steps = 0
    # horizon loop
    while steps < params.time_horizon:
        infos = []
        episode_length = 0
        # Sample data from the policy
        while (len(memory.memory) < params.num_steps):
            states = []
            actions = []
            rewards = []
            values = []
            returns = []
            advantages = []
            av_reward = 0
            cum_reward = 0
            cum_done = 0
            # n steps loops: one rollout, ended early when an episode finishes
            for step in range(params.num_steps):
                #                 shared_obs_stats.observes(state)
                #                 state = shared_obs_stats.normalize(state)
                states.append(state)
                
                # sample an action: mean plus noise scaled by sqrt of the model's second output
                mu, sigma_sq, v = model(state)
                eps = torch.randn(mu.size())
                action = (mu + sigma_sq.sqrt() * Variable(eps))
                env_action = action.data.squeeze().numpy()
                state, reward, done, info = env.step(env_action)
                # Count the step so the max_episode_length cut-off can actually
                # trigger (the counter was previously reset but never advanced).
                episode_length += 1
                done = (done or episode_length >= params.max_episode_length)
                
                cum_reward += reward
                # the stored reward is clipped to [-1, 1]; cum_reward keeps the raw value
                reward = max(min(reward, 1), -1)
                rewards.append(reward)
                actions.append(action)
                values.append(v)
                
                steps+=1  
                p.update(1)
                if done:
                    episode += 1
                    cum_done += 1
                    av_reward += cum_reward
                    p.desc='av_reward={: 2.8f}'.format(av_reward / float(cum_done))
                    cum_reward = 0
                    episode_length = 0
                    infos.append(info)
                    state = env.reset()
                
                # re-wrap the next observation with a batch dimension of 1
                state = Variable(torch.Tensor(state).unsqueeze(0))
                
                if done:
                    break
            
            # one last step: bootstrap from the critic unless the episode ended
            R = torch.zeros(1, 1)
            if not done:
                _, _, v = model(state)
                R = v.data
            
            # compute returns and GAE(lambda) advantages, walking backwards in time:
            values.append(Variable(R))
            R = Variable(R)
            A = Variable(torch.zeros(1, 1))
            for i in reversed(range(len(rewards))):
                # TD error: r_t + gamma * V(s_{t+1}) - V(s_t)
                td = rewards[i] + params.gamma*values[i+1].data[0,0] - values[i].data[0,0]
                A = float(td) + params.gamma * params.gae_param * A
                advantages.insert(0, A)
                # return target = advantage + value estimate
                R = A + values[i]
                returns.insert(0, R)
            
            # store useful info:
            memory.push([states, actions, returns, advantages])
            

        # perform several epochs of optimization on the sampled data
        # model_old is a frozen copy used to compute the "old" probabilities
        model_old = Model(env.observation_space.shape,
                             env.action_space.shape)
        model_old.load_state_dict(model.state_dict())
        if cuda: model_old.cuda()
        av_loss = 0
        for k in range(params.num_epoch):
            # cf https://github.com/openai/baselines/blob/master/baselines/pposgd/pposgd_simple.py
            batch_states, batch_actions, batch_returns, batch_advantages = memory.sample(
                params.batch_size)
            
            # old probas
            mu_old, sigma_sq_old, v_pred_old = model_old(batch_states.detach())
            probs_old = normal(batch_actions, mu_old, sigma_sq_old)
            
            # new probas
            mu, sigma_sq, v_pred = model(batch_states)
            probs = normal(batch_actions, mu, sigma_sq)
            
            # ratio (the small constant guards against division by zero)
            ratio = probs / (1e-15 + probs_old)
            
            # surrogate clip loss; the advantage column is repeated once per action dimension
            surr1 = ratio * torch.cat([batch_advantages]*num_outputs,1) # surrogate from conservative policy iteration
            surr2 = ratio.clamp(1-params.clip, 1+params.clip) * torch.cat([batch_advantages]*num_outputs,1)
            loss_clip = -torch.mean(torch.min(surr1, surr2))
            # should this be a mean along axis 0?
            
            # state-value function loss, do we even need this if they don't share params?
            vfloss1 = (v_pred - batch_returns)**2
            v_pred_clipped = v_pred_old + (v_pred - v_pred_old).clamp(-params.clip, params.clip)
            vfloss2 = (v_pred_clipped - batch_returns)**2
            loss_value = 0.5 * torch.mean(torch.max(vfloss1, vfloss2))
            # should this be a mean along axis 0?
            
            # loss on entropy bonus to ensure sufficient exploration
            # NOTE(review): -coeff * mean(p*log p) added to a minimised loss looks like it
            # penalises entropy rather than rewarding it; ent_coeff is 0 so it has no
            # effect at present - verify the sign before enabling.
            loss_ent = -params.ent_coeff*torch.mean(probs*torch.log(probs+1e-5))
            
            # total
            total_loss = (loss_clip + loss_value + loss_ent)
#             total_loss = (loss_clip - loss_value + loss_ent)
            # only the value loss is averaged into the reported `loss`
            av_loss += loss_value.data[0] / float(params.num_epoch)
            
            # before step, update old_model:
            # (so model_old is never more than one optimiser step behind model)
            model_old.load_state_dict(model.state_dict())
            
            # step
            optimizer.zero_grad()
            total_loss.backward(retain_variables=True)
            optimizer.step()
        
        # t finish, print:
        df_infos = pd.DataFrame(infos)
        
        report=OrderedDict(
            episode=episode,
#             reward=av_reward / float(cum_done),
            loss=av_loss,
            cash_bias=df_infos.cash_bias.mean(),
            market_value=df_infos.market_value.mean(),
            portfolio_value=df_infos.portfolio_value.mean(),
            reward=df_infos.reward.mean()
        )
        
        s = ', '.join(['{}={:2.4g}'.format(key,value) for key,value in report.items()])
        print(s)
        
        reports.append(report)
        
        memory.clear()
        # NOTE(review): this saves model_old, which was synced before the last
        # optimiser step and so lags `model` by one update - confirm intended.
        torch.save(model_old, save_path)


Widget Javascript not detected.  It may not be installed or enabled properly.


episode=63, loss=0.004864, market_value=1.048, cash_bias=0.1248, reward=-3.866e-06, portfolio_value=1.041


  "type " + obj.__name__ + ". It won't be checked "


episode=127, loss=0.00192, market_value=1.038, cash_bias=0.1695, reward=2.815e-06, portfolio_value=1.041
16384/|/av_reward= 0.00000326  1%|| 16384/2000000 [02:30<4:03:17, 135.89steps/s]episode=191, loss=0.002165, market_value=1.028, cash_bias=0.2029, reward=7.439e-06, portfolio_value=1.037
episode=255, loss=0.002224, market_value=1.034, cash_bias=0.2108, reward=4.079e-06, portfolio_value=1.052
40960/|/av_reward= 0.00057724  2%|| 40960/2000000 [06:20<4:13:53, 128.60steps/s]episode=319, loss=0.002281, market_value=1.042, cash_bias=0.2069, reward=2.316e-06, portfolio_value=1.032
49152/|/av_reward=-0.00002268  2%|| 49152/2000000 [07:40<3:59:28, 135.77steps/s]episode=383, loss=0.002313, market_value=1.054, cash_bias=0.2293, reward=6.876e-06, portfolio_value=1.047
episode=447, loss=0.003132, market_value=1.04, cash_bias=0.1413, reward=1.032e-05, portfolio_value=1.027
65536/|/av_reward=-0.00140124  3%|| 65536/2000000 [10:10<4:00:17, 134.17steps/s]episode=511, loss=0.00504, market_value=1.041,

In [None]:
# show progress: value loss per optimisation round against the episode count,
# with a fitted regression line (kind="reg")
df=pd.DataFrame(reports)
# NOTE(review): `size=` is the figure-size argument name of the seaborn version
# this was written for; newer releases may expect a different keyword - confirm.
g = sns.jointplot(x="episode", y="loss", data=df, kind="reg", size=10)
plt.show()

In [None]:
# env_test = env

# Test: run 10 rollouts of up to 250 steps on the test environment, plotting each.
# Evaluation mode is set once up front; it does not change between runs.
model.train(False)
for i in range(10):
    state = env_test.reset()
    # `t` indexes steps inside a run; the inner loop used to reuse `i` and
    # overwrite the run counter.
    for t in range(250):
        state = Variable(torch.Tensor(state).unsqueeze(0))
        mu, sigma_sq, v = model(state)
        # actions are still sampled (mean + noise), exactly as during training
        eps = torch.randn(mu.size())
        action = (mu + sigma_sq.sqrt() * Variable(eps))
        env_action = action.data.squeeze().numpy()
        state, reward, done, info = env_test.step(env_action)
        if done:
            break

    env_test.plot()