In [1]:
import gym
import numpy as np
import pandas as pd
from collections import deque

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import envs.TradingEnv

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

def running_mean(x, N):
    cumsum = np.cumsum(np.insert(x, 0, 0)) 
    return (cumsum[N:] - cumsum[:-N]) / N

In [3]:
class QNetwork(nn.Module):
    def __init__(self, gamma=0.99, learning_rate=0.01, state_size=9, 
                 action_size=3, hidden_size=10, batch_size=20,
                 name='QNetwork'):
        nn.Module.__init__(self)
        self.bn1 = nn.BatchNorm1d(state_size)
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.bn2 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.bn4 = nn.BatchNorm1d(hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        self.bn5 = nn.BatchNorm1d(hidden_size)
        self.fc5 = nn.Linear(hidden_size, hidden_size)
        self.output = nn.Linear(hidden_size, action_size)
        
    def forward(self, x):
        x = self.bn1(x)
        x = F.relu(self.fc1(x))
        x = self.bn2(x)
        x = F.relu(self.fc2(x))
        x = self.bn3(x)
        x = F.relu(self.fc3(x))
        x = self.bn4(x)
        x = F.relu(self.fc4(x))
        x = self.bn5(x)
        x = F.relu(self.fc5(x))
        x = self.output(x)
        return x

In [4]:
from collections import deque
class Memory():
    def __init__(self, max_size = 1000):
        self.buffer = deque(maxlen=max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
            
    def sample(self, batch_size):
        idx = np.random.choice(np.arange(len(self.buffer)), 
                               size=batch_size, 
                               replace=False)
        return [self.buffer[ii] for ii in idx]
    
    def length(self):
        return len(self.buffer)

In [16]:
train_episodes = 100000          # max number of episodes to learn from
max_steps = 365                # max steps in an episode
gamma = 0.99                   # future reward discount

action_size = 3

# Exploration parameters
explore_start = 0.01            # exploration probability at start
explore_stop = 0.0            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
hidden_size = 32               # number of units in each Q-network hidden layer
learning_rate = 0.00001         # Q-network learning rate

# Memory parameters
memory_size = 200000            # memory capacity
batch_size = 512                # experience mini-batch size
pretrain_length = batch_size   # number experiences to pretrain the memory

transaction_cost_ratio = 0.02
target_update = 100

In [6]:
mainQN = QNetwork(name='main', hidden_size=hidden_size, gamma=gamma, learning_rate=learning_rate,batch_size=batch_size)
targetQN = QNetwork(name='main', hidden_size=hidden_size, gamma=gamma, learning_rate=learning_rate,batch_size=batch_size)
env = envs.TradingEnv.FxEnv(scenario_length=max_steps, transaction_cost_ratio=transaction_cost_ratio)
env.seed(1234)
np.random.seed(1234)

In [7]:
"""
def make_state(observation, means, stds):
    return np.array([observation[0], observation[1], observation[2],
                    (observation[3]-means[3])/stds[3],
                    (observation[4]-means[4])/stds[4],
                    (observation[5]-means[5])/stds[5],
                    (observation[6]-means[6])/stds[6],
                    (observation[7]-means[7])/stds[7]])
"""
def make_state(observation, means, stds):
    return np.array(observation)

In [8]:
"""
env.reset()
values = []
for i in range(100000):
    observation, reward, done = env.step(env.action_space.sample())
    values.append(observation)
    if done:
        env.reset()

values_array = np.array(values)
means = values_array.mean(axis=0)
stds = values_array.std(axis=0)
print(means)
print(stds)
"""
means = [0.332350000, 0.334510000, 0.333140000, 0.000368700000, 0.000294200000, 0.546393900, 0.562338400, 0.493165397]
stds = [0.47105571, 0.47181888, 0.47133612, 0.67677779, 0.97742333, 0.97325276, 0.95412702, 0.28551883]

In [9]:
"""
memory = Memory(max_size=memory_size)
env.reset()
observation, reward, done, info = env.step(env.action_space.sample())
state = make_state(observation, means, stds)
for i in range(20000):
    memory.add((state))
    observation, reward, done, info = env.step(env.action_space.sample())
    state = make_state(observation, means, stds)
    if done:
        env.reset()
"""

'\nmemory = Memory(max_size=memory_size)\nenv.reset()\nobservation, reward, done, info = env.step(env.action_space.sample())\nstate = make_state(observation, means, stds)\nfor i in range(20000):\n    memory.add((state))\n    observation, reward, done, info = env.step(env.action_space.sample())\n    state = make_state(observation, means, stds)\n    if done:\n        env.reset()\n'

In [10]:
[val[0:3] for val in memory.sample(3)]

NameError: name 'memory' is not defined

In [11]:
"""
opt = optim.Adam(mainQN.parameters(), 0.00001)
i = 0
while(True):
    batch = memory.sample(512)
    batch_state = np.array(batch)
    batch_q = torch.FloatTensor(np.array([val[0:3] for val in batch]))
    
    current_q_values = mainQN(Variable(torch.FloatTensor(batch_state)))
    loss = torch.nn.MSELoss()(current_q_values, batch_q)
    # backpropagation of loss to NN
    # 勾配を初期化
    opt.zero_grad()
    loss.backward()
    opt.step()
    
    if i % 100 == 0:
        print(loss.data.numpy())
    
    if loss.data.numpy() < 0.0001:
        break
        
    i += 1
torch.save(mainQN.state_dict(), 'KeepPolicy.pth')
"""

"\nopt = optim.Adam(mainQN.parameters(), 0.00001)\ni = 0\nwhile(True):\n    batch = memory.sample(512)\n    batch_state = np.array(batch)\n    batch_q = torch.FloatTensor(np.array([val[0:3] for val in batch]))\n    \n    current_q_values = mainQN(Variable(torch.FloatTensor(batch_state)))\n    loss = torch.nn.MSELoss()(current_q_values, batch_q)\n    # backpropagation of loss to NN\n    # 勾配を初期化\n    opt.zero_grad()\n    loss.backward()\n    opt.step()\n    \n    if i % 100 == 0:\n        print(loss.data.numpy())\n    \n    if loss.data.numpy() < 0.0001:\n        break\n        \n    i += 1\ntorch.save(mainQN.state_dict(), 'KeepPolicy.pth')\n"

In [12]:
#mainQN.load_state_dict(torch.load('KeepPolicy.pth'))
targetQN.load_state_dict(mainQN.state_dict())

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [13]:
def optimize(memory, opt, batch_size):
    # Sample mini-batch from memory
    batch = memory.sample(batch_size)
    #memory.buffer.clear()
    states = np.array([each[0] for each in batch])
    ### ポイント！！！
    # actionはスカラーなのでベクトルにする
    # actionsはベクトルでなく、statesと同じ行列
    actions = np.array([[each[1]] for each in batch])
    ### ポイント終わり
    rewards = np.array([each[2] for each in batch])
    next_states = np.array([each[3] for each in batch])
    # doneなら0, not doneなら1
    # 終端状態のQ値はその後の報酬が存在しないためゼロとする
    dones = np.array([1-float(each[4]) for each in batch])

    # Train network
    #non_final_mask = torch.tensor(tuple(map(lambda s: s==False, dones)), dtype=torch.uint8)
    
    max_actions = mainQN(Variable(torch.FloatTensor(next_states))).max(1)[1].view(-1,1).detach()
    target_maxQs = targetQN(Variable(torch.FloatTensor(next_states))).gather(1, max_actions).squeeze().detach()

    #tutorial way
    targets = (torch.FloatTensor(rewards) + torch.FloatTensor(dones) * gamma * target_maxQs).unsqueeze(1)

    for i in range(1):
        current_q_values = mainQN(Variable(torch.FloatTensor(states))).gather(1, torch.LongTensor(actions))
        loss = torch.nn.SmoothL1Loss()(current_q_values, targets)
        # backpropagation of loss to NN
        # 勾配を初期化
        opt.zero_grad()
        loss.backward()
        opt.step()
        
    return loss

In [14]:
# Initialize the simulation
env.reset()
# Take one random step to get the pole and cart moving
observation, reward, done, _ = env.step(env.action_space.sample())
state = make_state(observation, means, stds)
last_action = 0
current_position_reward = 0
memory = Memory(max_size=memory_size)
t = 0
keep_list = []

# Make a bunch of random actions and store the experiences
while(len(memory.buffer)<batch_size):
    # Uncomment the line below to watch the simulation
    # env.render()

    # Make a random action
    if 0.1 > np.random.rand():
        # Make a random action
        action = env.action_space.sample()
    else:
        action = last_action
    next_observation, reward, done, _ = env.step(action)
    next_state = make_state(next_observation, means, stds)
    
    current_position_reward += reward
    
    if last_action != action:
        if last_action != 0:
            clipped_reward = current_position_reward/(t-entry_t)
            memory.add((entry_state, last_action, clipped_reward, next_state, done))
            memory.add((state, action, clipped_reward, next_state, done))
            if current_position_reward > 0:
                for val in keep_list:
                    memory.add((val[0], val[1], clipped_reward, val[2], done))
            keep_list = []
            

        entry_state = state
        entry_t = t
        current_position_reward = 0.0
    else:
        keep_list.append([state, action, next_state])

    last_action = action
    state = next_state
    t += 1

    if done:
        # Start new episode
        env.reset()
        last_action = 0
        current_position_reward = 0.0
        t = 0
        entry_t = 0
        observation, reward, done, _ = env.step(env.action_space.sample())
        state = make_state(observation, means, stds)

In [17]:
rewards_list = []
opt = optim.Adam(mainQN.parameters(), learning_rate)

outputs = np.empty([1,11])
episode_reward_deque = deque(maxlen=100)

count_stop = 0
for ep in range(train_episodes):
    total_reward = 0
    current_position_reward = 0
    last_action = 0
    initial_action = 0
    t = 0
    # Start new episode
    observation = env.reset()
    state = make_state(observation, means, stds)
            
    for t in range(max_steps):
        # Explore or Exploit
        explore_p = explore_stop + (explore_start - explore_stop)*np.exp(-decay_rate*(ep*max_steps+t))
        if ep > train_episodes:
            explore_p = 0.0
        if explore_p > np.random.rand():
            # Make a random action
            action = env.action_space.sample()
        else:
            # Get action from Q-network
            mainQN.eval()
            Qs = mainQN(Variable(torch.FloatTensor([state]))).data.numpy()
            action = np.argmax(Qs)
            
        #result = np.hstack((state, mainQN(Variable(torch.FloatTensor([state]))).data.numpy()))
        #outputs = np.vstack((outputs, result))

        # Take action, get new state and reward
        last_current = observation[3]
        observation, reward, done, info = env.step(action)
        next_state = make_state(observation, means, stds)

        total_reward += reward
        current_position_reward += reward
        
        if last_action != action or done:
            if last_action != 0:
                clipped_reward = current_position_reward/(t-entry_t)
                memory.add((entry_state, last_action, clipped_reward, next_state, done))
                memory.add((state, action, clipped_reward, next_state, done))
                if current_position_reward > 0:
                    for val in keep_list:
                        memory.add((val[0], val[1], clipped_reward, val[2], done))
                keep_list = []
                print('entry_state_action={}, last_action={}, action={}, position reward={:.4f}'.format(np.argmax(entry_state[0:3]), last_action, action, current_position_reward))
                print('entry_point={:.4f} exit_point={:.4f} max_point={:.4f} min_point={:.4f}'.format(entry_point, info, max_point, min_point))
                
            entry_state = state
            initial_action = last_action
            entry_t = t
            current_position_reward = 0.0
            max_point = info
            min_point = info
            entry_point = last_current
        else:
            keep_list.append([state, action, next_state])
            max_point = info if info > max_point else max_point
            min_point = info if info < min_point else min_point
            
        last_action = action
        state = next_state

        mainQN.train()
        loss = optimize(memory, opt, batch_size)
        
        
        if done:
            episode_reward_deque.append(total_reward)
            # the episode ends so no next state
            print('Episode: {}'.format(ep),
                  'Total reward: {:.4f}'.format(total_reward),
                  'Training loss: {:.4f}'.format(loss.data.numpy()),
                  'Explore P: {:.4f}'.format(explore_p),
                  'Reward mean: {:.4f}'.format(np.array(list(episode_reward_deque)).mean()),
                  'Reward std: {:.4f}'.format(np.array(list(episode_reward_deque)).std()))
            rewards_list.append(total_reward)
            
            if ep % target_update == 0:
                targetQN.load_state_dict(mainQN.state_dict())
            
            break
#df = pd.DataFrame(outputs)
#df.to_csv('result.csv')

entry_state_action=0, last_action=2, action=0, position reward=-2.5284
entry_point=119.2600 exit_point=118.8200 max_point=119.1000 min_point=118.7700
entry_state_action=0, last_action=2, action=0, position reward=-1.7780
entry_point=121.2300 exit_point=120.5800 max_point=125.5000 min_point=118.9000
entry_state_action=0, last_action=2, action=0, position reward=14.4408
entry_point=123.2000 exit_point=107.3900 max_point=123.5600 min_point=106.4600
entry_state_action=0, last_action=2, action=0, position reward=1.8250
entry_point=106.0900 exit_point=101.6300 max_point=106.1200 min_point=102.2500
entry_state_action=0, last_action=2, action=0, position reward=-4.8844
entry_point=102.9600 exit_point=106.5100 max_point=107.2800 min_point=100.8300
Episode: 0 Total reward: -4.0498 Training loss: 0.0173 Explore P: 0.0096 Reward mean: -4.0498 Reward std: 0.0000
entry_state_action=0, last_action=2, action=0, position reward=-2.8004
entry_point=108.8100 exit_point=109.4900 max_point=114.5000 min_poi

entry_state_action=0, last_action=2, action=0, position reward=-0.4446
entry_point=109.0800 exit_point=108.4900 max_point=114.4200 min_point=104.9400
entry_state_action=0, last_action=1, action=1, position reward=1.1400
entry_point=109.3200 exit_point=110.3500 max_point=112.0100 min_point=108.9100
Episode: 11 Total reward: -5.1892 Training loss: 0.0228 Explore P: 0.0065 Reward mean: -10.4602 Reward std: 12.2570
entry_state_action=0, last_action=2, action=1, position reward=1.9960
entry_point=124.2900 exit_point=120.3700 max_point=124.4500 min_point=118.9000
entry_state_action=2, last_action=1, action=0, position reward=-9.3066
entry_point=120.2000 exit_point=114.6600 max_point=123.5600 min_point=112.4100
entry_state_action=0, last_action=2, action=1, position reward=0.4996
entry_point=114.6600 exit_point=110.4200 max_point=114.3400 min_point=111.0200
entry_state_action=2, last_action=1, action=0, position reward=-1.3166
entry_point=111.0200 exit_point=111.0300 max_point=111.3300 min_po

entry_state_action=2, last_action=1, action=2, position reward=-3.8236
entry_point=117.9700 exit_point=116.7800 max_point=121.7900 min_point=114.9100
entry_state_action=1, last_action=2, action=2, position reward=-0.8300
entry_point=117.1800 exit_point=117.6100 max_point=123.9500 min_point=116.7600
Episode: 24 Total reward: -9.5172 Training loss: 0.0287 Explore P: 0.0040 Reward mean: -10.6354 Reward std: 10.6125
entry_state_action=0, last_action=2, action=1, position reward=-5.5406
entry_point=113.0100 exit_point=114.3300 max_point=118.1400 min_point=99.8100
entry_state_action=2, last_action=1, action=0, position reward=-2.8648
entry_point=115.0300 exit_point=113.9500 max_point=115.1600 min_point=111.7700
entry_state_action=0, last_action=1, action=0, position reward=-3.9102
entry_point=112.4000 exit_point=109.4800 max_point=114.2400 min_point=108.3200
entry_state_action=0, last_action=2, action=2, position reward=0.5700
entry_point=109.1400 exit_point=109.1700 max_point=109.7400 min_p

entry_state_action=0, last_action=2, action=0, position reward=-15.7560
entry_point=81.1700 exit_point=94.6300 max_point=103.5000 min_point=77.6100
entry_state_action=0, last_action=1, action=0, position reward=-1.0456
entry_point=98.0900 exit_point=98.7600 max_point=100.3300 min_point=96.2400
entry_state_action=0, last_action=2, action=1, position reward=-1.0756
entry_point=97.3000 exit_point=97.1600 max_point=97.2600 min_point=96.7800
entry_state_action=2, last_action=1, action=1, position reward=0.4500
entry_point=96.7800 exit_point=97.6100 max_point=97.1600 min_point=97.1600
Episode: 40 Total reward: -23.4384 Training loss: 0.0243 Explore P: 0.0022 Reward mean: -8.8024 Reward std: 11.9886
entry_state_action=0, last_action=2, action=0, position reward=1.6970
entry_point=121.9500 exit_point=117.4000 max_point=125.1000 min_point=115.6000
entry_state_action=0, last_action=2, action=0, position reward=5.0210
entry_point=116.2500 exit_point=108.4000 max_point=116.1500 min_point=108.9000


entry_state_action=0, last_action=2, action=0, position reward=19.0492
entry_point=117.7600 exit_point=99.7200 max_point=123.9500 min_point=97.0400
entry_state_action=0, last_action=2, action=1, position reward=-0.3766
entry_point=99.5600 exit_point=99.4000 max_point=100.7200 min_point=98.8300
entry_state_action=2, last_action=1, action=0, position reward=0.5012
entry_point=98.8300 exit_point=101.4600 max_point=102.6400 min_point=99.4000
entry_state_action=0, last_action=2, action=2, position reward=-0.0500
entry_point=101.1500 exit_point=101.9300 max_point=101.8800 min_point=101.8800
Episode: 57 Total reward: 11.4544 Training loss: 0.0189 Explore P: 0.0012 Reward mean: -6.2201 Reward std: 13.0816
entry_state_action=0, last_action=1, action=1, position reward=3.6500
entry_point=114.0100 exit_point=117.8800 max_point=121.7900 min_point=109.6200
Episode: 58 Total reward: 1.5898 Training loss: 0.0200 Explore P: 0.0012 Reward mean: -6.0877 Reward std: 13.0094
entry_state_action=0, last_act

entry_state_action=0, last_action=2, action=2, position reward=19.4000
entry_point=128.4500 exit_point=108.4000 max_point=127.8000 min_point=108.9000
Episode: 75 Total reward: 17.4810 Training loss: 0.0213 Explore P: 0.0006 Reward mean: -4.5731 Reward std: 13.1029
entry_state_action=0, last_action=2, action=0, position reward=2.0970
entry_point=124.2500 exit_point=120.0000 max_point=125.1000 min_point=116.9000
entry_state_action=0, last_action=2, action=2, position reward=5.6300
entry_point=117.0000 exit_point=111.7700 max_point=120.5500 min_point=103.9500
Episode: 76 Total reward: 2.1020 Training loss: 0.0162 Explore P: 0.0006 Reward mean: -4.4864 Reward std: 13.0395
entry_state_action=0, last_action=1, action=0, position reward=1.8852
entry_point=77.1500 exit_point=81.1800 max_point=84.0000 min_point=75.7600
entry_state_action=0, last_action=1, action=0, position reward=-0.8574
entry_point=82.4500 exit_point=83.7100 max_point=83.3700 min_point=82.5600
entry_state_action=0, last_actio

entry_state_action=0, last_action=2, action=1, position reward=-5.9298
entry_point=99.1600 exit_point=102.0600 max_point=105.4000 min_point=96.2400
entry_state_action=2, last_action=1, action=1, position reward=-0.2300
entry_point=101.9900 exit_point=101.8300 max_point=102.6600 min_point=101.4400
Episode: 93 Total reward: 7.7326 Training loss: 0.0222 Explore P: 0.0003 Reward mean: -3.9332 Reward std: 12.4754
entry_state_action=0, last_action=1, action=0, position reward=7.7206
entry_point=109.8100 exit_point=119.8800 max_point=121.1200 min_point=102.1700
Episode: 94 Total reward: 5.5644 Training loss: 0.0181 Explore P: 0.0003 Reward mean: -3.8333 Reward std: 12.4474
entry_state_action=0, last_action=2, action=1, position reward=-3.7852
entry_point=111.3500 exit_point=112.4500 max_point=114.4200 min_point=104.9400
entry_state_action=2, last_action=1, action=1, position reward=0.3500
entry_point=112.7600 exit_point=112.8000 max_point=113.2800 min_point=111.9500
Episode: 95 Total reward: 

entry_state_action=0, last_action=2, action=0, position reward=-21.3994
entry_point=104.3900 exit_point=123.4700 max_point=125.5000 min_point=101.2400
entry_state_action=0, last_action=1, action=1, position reward=0.1500
entry_point=123.0000 exit_point=122.7200 max_point=122.5700 min_point=122.5700
Episode: 110 Total reward: -25.8672 Training loss: 0.0246 Explore P: 0.0002 Reward mean: -3.1345 Reward std: 11.8154
entry_state_action=0, last_action=2, action=1, position reward=-2.1674
entry_point=110.8000 exit_point=110.1900 max_point=114.3800 min_point=104.9400
entry_state_action=2, last_action=1, action=0, position reward=-1.4394
entry_point=109.8700 exit_point=111.4600 max_point=111.7900 min_point=110.1900
Episode: 111 Total reward: -6.3186 Training loss: 0.0195 Explore P: 0.0002 Reward mean: -3.1458 Reward std: 11.8179
entry_state_action=0, last_action=1, action=0, position reward=-5.8620
entry_point=82.0400 exit_point=78.1200 max_point=85.4700 min_point=77.1800
entry_state_action=0,

entry_state_action=0, last_action=2, action=0, position reward=-6.7782
entry_point=90.0700 exit_point=95.4900 max_point=100.7800 min_point=88.9000
entry_state_action=0, last_action=2, action=1, position reward=-3.7192
entry_point=94.1600 exit_point=94.5700 max_point=98.6100 min_point=92.9100
entry_state_action=2, last_action=1, action=1, position reward=-3.4900
entry_point=95.4600 exit_point=91.0800 max_point=94.7500 min_point=88.4800
Episode: 125 Total reward: -22.0478 Training loss: 0.0116 Explore P: 0.0001 Reward mean: -3.2467 Reward std: 12.2046
entry_state_action=0, last_action=2, action=2, position reward=1.2000
entry_point=110.2600 exit_point=108.3100 max_point=114.4200 min_point=104.9400
Episode: 126 Total reward: -0.2552 Training loss: 0.0169 Explore P: 0.0001 Reward mean: -3.1892 Reward std: 12.2050
entry_state_action=0, last_action=1, action=2, position reward=-12.2808
entry_point=91.1400 exit_point=83.3800 max_point=94.4200 min_point=80.6100
entry_state_action=1, last_actio

entry_state_action=1, last_action=2, action=0, position reward=-0.2752
entry_point=80.1900 exit_point=79.1500 max_point=81.3100 min_point=78.7600
entry_state_action=0, last_action=2, action=2, position reward=2.1700
entry_point=79.1500 exit_point=76.8900 max_point=79.2700 min_point=75.7600
Episode: 138 Total reward: -10.3360 Training loss: 0.0110 Explore P: 0.0001 Reward mean: -3.1618 Reward std: 12.0677
entry_state_action=0, last_action=1, action=0, position reward=-2.3212
entry_point=118.8500 exit_point=119.7200 max_point=121.1200 min_point=109.6200
entry_state_action=0, last_action=2, action=0, position reward=-4.7784
entry_point=115.6800 exit_point=118.8800 max_point=121.7900 min_point=114.9100
entry_state_action=0, last_action=1, action=1, position reward=1.0500
entry_point=118.4100 exit_point=119.5900 max_point=119.7800 min_point=118.5400
Episode: 139 Total reward: -13.1584 Training loss: 0.0187 Explore P: 0.0001 Reward mean: -3.3887 Reward std: 12.0402
entry_state_action=0, last

entry_state_action=0, last_action=2, action=2, position reward=-13.5200
entry_point=100.4300 exit_point=114.1400 max_point=118.1400 min_point=100.1300
Episode: 153 Total reward: -3.0634 Training loss: 0.0106 Explore P: 0.0000 Reward mean: -2.0636 Reward std: 12.3691
entry_state_action=0, last_action=2, action=0, position reward=-10.4638
entry_point=109.1500 exit_point=115.7300 max_point=121.1200 min_point=102.1700
Episode: 154 Total reward: -13.0768 Training loss: 0.0155 Explore P: 0.0000 Reward mean: -2.2775 Reward std: 12.3727
entry_state_action=0, last_action=2, action=0, position reward=-6.1284
entry_point=115.9300 exit_point=118.8800 max_point=121.7900 min_point=109.6200
entry_state_action=0, last_action=1, action=1, position reward=5.1400
entry_point=118.4100 exit_point=123.6800 max_point=123.7200 min_point=118.5400
Episode: 155 Total reward: -4.2752 Training loss: 0.0091 Explore P: 0.0000 Reward mean: -2.4668 Reward std: 12.2565
entry_state_action=0, last_action=2, action=2, pos

entry_state_action=2, last_action=1, action=1, position reward=-1.2000
entry_point=87.8900 exit_point=86.6900 max_point=88.9900 min_point=86.6500
Episode: 175 Total reward: 7.2852 Training loss: 0.0169 Explore P: 0.0000 Reward mean: -3.0256 Reward std: 11.9283
entry_state_action=0, last_action=2, action=0, position reward=0.0448
entry_point=114.4000 exit_point=113.5700 max_point=123.9500 min_point=112.7600
entry_state_action=0, last_action=2, action=0, position reward=-0.8560
entry_point=111.5100 exit_point=108.9100 max_point=114.4800 min_point=107.3700
Episode: 176 Total reward: -5.1094 Training loss: 0.0126 Explore P: 0.0000 Reward mean: -3.0977 Reward std: 11.9189
entry_state_action=0, last_action=2, action=0, position reward=-5.3556
entry_point=115.6100 exit_point=115.6400 max_point=121.7900 min_point=113.9900
entry_state_action=0, last_action=2, action=0, position reward=-5.4494
entry_point=115.6400 exit_point=118.5400 max_point=123.9500 min_point=115.8600
entry_state_action=0, la

entry_state_action=0, last_action=2, action=0, position reward=-14.7948
entry_point=91.0300 exit_point=102.9500 max_point=105.4000 min_point=90.5100
Episode: 192 Total reward: -16.0954 Training loss: 0.0109 Explore P: 0.0000 Reward mean: -3.5608 Reward std: 11.8396
entry_state_action=0, last_action=2, action=0, position reward=11.2522
entry_point=121.1400 exit_point=107.4600 max_point=123.5600 min_point=106.4600
entry_state_action=0, last_action=2, action=0, position reward=5.7670
entry_point=108.3700 exit_point=101.2500 max_point=110.9400 min_point=100.8300
Episode: 193 Total reward: 12.2090 Training loss: 0.0198 Explore P: 0.0000 Reward mean: -3.5160 Reward std: 11.8905
entry_state_action=0, last_action=2, action=0, position reward=11.2522
entry_point=121.1400 exit_point=107.4600 max_point=123.5600 min_point=106.4600
entry_state_action=0, last_action=2, action=0, position reward=5.7670
entry_point=108.3700 exit_point=101.2500 max_point=110.9400 min_point=100.8300
Episode: 194 Total r

entry_state_action=1, last_action=2, action=0, position reward=0.5818
entry_point=98.5900 exit_point=95.4900 max_point=100.7800 min_point=87.4700
entry_state_action=0, last_action=2, action=0, position reward=0.0710
entry_point=96.0300 exit_point=93.0600 max_point=98.6100 min_point=92.9500
entry_state_action=0, last_action=2, action=0, position reward=5.0738
entry_point=92.9100 exit_point=86.7600 max_point=97.2100 min_point=86.3100
entry_state_action=0, last_action=1, action=1, position reward=3.7100
entry_point=90.1600 exit_point=92.8900 max_point=94.4200 min_point=88.2600
Episode: 209 Total reward: 5.4466 Training loss: 0.0199 Explore P: 0.0000 Reward mean: -3.6274 Reward std: 12.0323
entry_state_action=0, last_action=2, action=0, position reward=3.3640
entry_point=117.5500 exit_point=112.0500 max_point=121.2500 min_point=111.8000
Episode: 210 Total reward: 1.1630 Training loss: 0.0098 Explore P: 0.0000 Reward mean: -3.3571 Reward std: 11.8316
entry_state_action=0, last_action=2, act

entry_state_action=0, last_action=1, action=2, position reward=-10.1150
entry_point=110.7300 exit_point=101.6300 max_point=110.9400 min_point=102.2500
entry_state_action=1, last_action=2, action=0, position reward=-11.6614
entry_point=102.2500 exit_point=111.3500 max_point=118.1400 min_point=99.8100
Episode: 225 Total reward: -23.7810 Training loss: 0.0150 Explore P: 0.0000 Reward mean: -2.6407 Reward std: 11.8405
entry_state_action=0, last_action=2, action=0, position reward=0.3806
entry_point=119.9200 exit_point=115.3600 max_point=125.5000 min_point=116.9700
entry_state_action=0, last_action=2, action=0, position reward=5.3808
entry_point=114.3400 exit_point=107.3900 max_point=113.9700 min_point=106.4600
Episode: 226 Total reward: 1.6762 Training loss: 0.0148 Explore P: 0.0000 Reward mean: -2.6214 Reward std: 11.8459
entry_state_action=0, last_action=1, action=2, position reward=-4.6100
entry_point=96.2400 exit_point=92.8200 max_point=98.6100 min_point=86.3100
entry_state_action=1, l

Episode: 242 Total reward: 3.4242 Training loss: 0.0108 Explore P: 0.0000 Reward mean: -2.7636 Reward std: 10.7405
entry_state_action=0, last_action=1, action=2, position reward=22.7754
entry_point=78.3600 exit_point=101.1800 max_point=103.5000 min_point=77.6100
entry_state_action=1, last_action=2, action=0, position reward=0.4040
entry_point=102.2300 exit_point=96.4400 max_point=101.1800 min_point=97.4400
Episode: 243 Total reward: 21.7122 Training loss: 0.0116 Explore P: 0.0000 Reward mean: -2.5625 Reward std: 11.0054
entry_state_action=0, last_action=2, action=1, position reward=-8.6946
entry_point=105.9200 exit_point=114.6900 max_point=113.7900 min_point=106.3500
entry_state_action=2, last_action=1, action=0, position reward=1.8222
entry_point=113.7300 exit_point=118.6100 max_point=121.6000 min_point=114.4000
entry_state_action=0, last_action=2, action=0, position reward=-3.7118
entry_point=118.6100 exit_point=120.0700 max_point=125.5000 min_point=116.4400
Episode: 244 Total reward

KeyboardInterrupt: 

In [None]:
rews = np.array(rewards_list).T
eps = range(len(rews))
smoothed_rews = running_mean(rews, 10)
plt.plot(eps[-len(smoothed_rews):], smoothed_rews)
plt.plot(eps, rews, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Total Reward')

In [21]:
np.array(rewards_list[1990:2000]).mean()

999.7061111111111

In [22]:
rewards_list[1990:2000]

[(1991, 18.950000000000003),
 (1992, 20.279999999999987),
 (1993, 5.310000000000002),
 (1994, 3.039999999999992),
 (1995, -26.14),
 (1996, 0),
 (1997, 0),
 (1998, 18.269999999999996),
 (1999, 0)]