In [30]:
import gym
import gym_anytrading

from gym_anytrading.envs import TradingEnv, ForexEnv, StocksEnv, Actions, Positions 
from gym_anytrading.datasets import FOREX_EURUSD_1H_ASK, STOCKS_GOOGL
import matplotlib.pyplot as plt
from keras import applications
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Dropout, Input, Concatenate, Conv2D
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.agents.ddpg import DDPGAgent
from rl.policy import BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.random import OrnsteinUhlenbeckProcess
from rl.callbacks import FileLogger, ModelIntervalCheckpoint, TrainIntervalLogger

from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

import json
import numpy as np

In [31]:
class EnvProcessor(Processor):
    """Observation pre-processor plugged into the keras-rl agent.

    Each observation is handled as a 2-D table: every row contributes its
    element 0 (called ``prices`` below) and element 1 (called ``diff``).
    Rewards and state batches are passed through unchanged.
    """

    def process_observation(self, observation):
        """Check the observation rank and apply the active scaling scheme.

        ``process_obs_1`` is the scheme currently in use; ``process_obs_2``
        is an alternative that is not called from here.
        """
        # Observations must be 2-D (rows x columns).
        assert observation.ndim == 2
        return self.process_obs_1(observation)

    @staticmethod
    def _unit_l2(values):
        """Scale ``values`` as a single sample with sklearn's L2 ``normalize``."""
        scaled = preprocessing.normalize([values], norm='l2', axis=1, copy=True, return_norm=False)
        return scaled[0]

    def process_obs_1(self, observation):
        """Return a two-column array with both columns L2-normalised.

        Column 0 and column 1 of the observation are normalised independently
        of each other and stacked side by side again.
        """
        pairs = [(row[0], row[1]) for row in observation]
        prices = self._unit_l2([price for price, _ in pairs])
        diff = self._unit_l2([delta for _, delta in pairs])
        return np.column_stack((prices, diff))

    def process_obs_2(self, observation):
        """Return L2-normalised prices next to the diff as a percentage of price.

        The second column is ``row[1] / row[0] * 100`` and is deliberately left
        un-normalised.
        """
        pairs = [(row[0], row[1] / row[0] * 100) for row in observation]
        prices = self._unit_l2([price for price, _ in pairs])
        pct_diff = [pct for _, pct in pairs]
        return np.column_stack((prices, pct_diff))

    def process_state_batch(self, batch):
        """Hand the state batch back untouched."""
        return batch

    def process_reward(self, reward):
        """Hand the reward back untouched."""
        return reward

class DDPGEnvProcessor(EnvProcessor):
    """EnvProcessor variant that collapses an action vector to a single index."""

    def process_action(self, action):
        """Return the position of the largest entry of ``action``."""
        best_index = np.argmax(action)
        return best_index
    
class CustomEpsGreedyQPolicy(EpsGreedyQPolicy):
    """Epsilon-greedy policy whose epsilon shrinks as actions are selected.

    After every ``update_interval`` selections epsilon is reset to
    ``init_eps / k``, where ``k`` is the number of completed intervals, and the
    new value is printed.
    """

    def __init__(self, eps=0.1, update_interval=100):
        """Store the decay settings.

        eps             -- starting exploration probability.
        update_interval -- number of selections between epsilon updates.
        """
        EpsGreedyQPolicy.__init__(self, eps)
        self.count = 0
        self.update_interval = update_interval
        # Remember the starting epsilon; the decay is always computed from it.
        self.init_eps = self.eps

    def select_action(self, q_values):
        """Pick an action for a 1-D array of Q-values and advance the decay.

        Returns a uniformly random action index with probability ``eps``,
        otherwise the index of the largest Q-value.
        """
        assert q_values.ndim == 1
        n_choices = q_values.shape[0]

        explore = np.random.uniform() < self.eps
        action = np.random.randint(0, n_choices) if explore else np.argmax(q_values)

        self.count += 1
        if (self.count % self.update_interval) != 0:
            return action

        # An interval just completed: shrink epsilon and report it.
        self.eps = self.init_eps / (self.count / self.update_interval)
        print(self.eps)
        return action

In [32]:
# Forex trading environment; the network's output width is the number of
# discrete actions the environment exposes.
env = gym.make('forex-v0', frame_bound=(50, 4000), window_size=10)
nb_actions = env.action_space.n

# Q-network: four convolutions followed by three dense layers and a linear
# head with one output per action.
# The leading (1,) in input_shape presumably mirrors window_length=1 of the
# replay memory configured for the agent -- confirm if either value changes.
# NOTE(review): the Conv2D layers are created without an activation, so the
# four stacked convolutions have no non-linearity between them. Left as is so
# the trained behaviour and saved weights stay comparable.
model = Sequential()
model.add(Conv2D(input_shape=(1,) + env.observation_space.shape, filters=32, kernel_size=(4, 2), padding='same'))
model.add(Conv2D(filters=64, kernel_size=(4, 2), padding='same'))
model.add(Conv2D(filters=128, kernel_size=(4, 2), padding='same'))
model.add(Conv2D(filters=128, kernel_size=(4, 2), padding='same'))
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(1024, activation='relu'))
model.add(Dense(nb_actions, activation='linear'))
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 1, 10, 32)         544       
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 1, 10, 64)         16448     
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 1, 10, 128)        65664     
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 1, 10, 128)        131200    
_________________________________________________________________
flatten_7 (Flatten)          (None, 1280)              0         
_________________________________________________________________
dense_29 (Dense)             (None, 1024)              1311744   
_________________________________________________________________
dense_30 (Dense)             (None, 1024)              1049600   
__________

In [33]:
# Experiment log -- one line per training run, keyed by train_no:
# 3 = enable_double_dqn=True, nb_steps=1000000, EpsGreedyQPolicy **
# 4 = enable_double_dqn=True, nb_steps=100000, EpsGreedyQPolicy
# 5 = enable_double_dqn=True, nb_steps=100000, BoltzmannQPolicy **
# 6 = enable_double_dqn=True, nb_steps=1000000, CustomEpsGreedyQPolicy, process_obs_2
# 7 = enable_double_dqn=True, nb_steps=1000000, EpsGreedyQPolicy, process_obs_2 **
# 8 = enable_double_dqn=True, nb_steps=1000000, CustomEpsGreedyQPolicy **
# 9 = enable_double_dqn=True, nb_steps=1000000, CustomEpsGreedyQPolicy(update_interval=2500), train_interval=256, batch_size=512
# 10 = enable_double_dqn=True, nb_steps=1000000, BoltzmannQPolicy, train_interval=256, batch_size=512 **
# 11 = DDPG, train_interval=256, batch_size=512 ***
# 12 = enable_double_dqn=True, nb_steps=1000000, BoltzmannQPolicy, train_interval=32, batch_size=64
# 13 = DDPG, train_interval=128, batch_size=64 ***
# 14 = DDPG, train_interval=128, batch_size=256, lr = 0.01, target_model_update=0.01, gamma=0.9
# 15 = DDPG, train_interval=128, batch_size=256, lr = 0.0001, target_model_update=0.0001, gamma=0.9
train_no = 14

# Output file names for this run; '{step}' is left in the checkpoint name as a
# placeholder for the checkpoint callback.
weights_filename = 'dqn_weights_{}.h5f'.format(train_no)
checkpoint_weights_filename = 'dqn_weights_{step}_' + '{}.h5f'.format(train_no)
log_filename = 'dqn_log_{}.csv'.format(train_no)

# Training callbacks: periodic weight checkpoints plus interval progress logs.
callbacks = [
    ModelIntervalCheckpoint(checkpoint_weights_filename, interval=680),
    TrainIntervalLogger(interval=100),
]

In [34]:
# Building blocks of the agent: replay memory, exploration policy and the
# observation processor.
memory = SequentialMemory(limit=100000, window_length=1)
# Alternative exploration policy tried in earlier runs:
# policy = CustomEpsGreedyQPolicy(update_interval = 2500, eps = 1.0)
policy = BoltzmannQPolicy()
processor = EnvProcessor()

# Double-DQN agent wrapped around the Q-network built earlier.
agent = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    policy=policy,
    processor=processor,
    nb_steps_warmup=20,
    train_interval=32,
    batch_size=64,
    target_model_update=1e-2,
    enable_double_dqn=True,
)
agent.compile(Adam(lr=1e-3), metrics=['mae'])


In [35]:
# Train the agent on the environment with the callbacks configured for this run.
agent.fit(env, nb_steps=4000, visualize=False, verbose=2, callbacks=callbacks)

# After training is done, save the final weights under the run-specific name
# (keyed by train_no, like the interval checkpoints), so different experiments
# no longer overwrite one shared, hard-coded file.
agent.save_weights(weights_filename, overwrite=True)

Training for 4000 steps ...
Training for 4000 steps ...
Interval 1 (0 steps performed)
Interval 2 (100 steps performed)
Interval 3 (200 steps performed)
Interval 4 (300 steps performed)
Interval 5 (400 steps performed)
Interval 6 (500 steps performed)
Interval 7 (600 steps performed)
Interval 8 (700 steps performed)
Interval 9 (800 steps performed)
Interval 10 (900 steps performed)
Interval 11 (1000 steps performed)
Interval 12 (1100 steps performed)
Interval 13 (1200 steps performed)
Interval 14 (1300 steps performed)
Interval 15 (1400 steps performed)
Interval 16 (1500 steps performed)
Interval 17 (1600 steps performed)
Interval 18 (1700 steps performed)
Interval 19 (1800 steps performed)
Interval 20 (1900 steps performed)
Interval 21 (2000 steps performed)
Interval 22 (2100 steps performed)
Interval 23 (2200 steps performed)
Interval 24 (2300 steps performed)
Interval 25 (2400 steps performed)
Interval 26 (2500 steps performed)
Interval 27 (2600 steps performed)
Interval 28 (2700 st

In [36]:
# Read the totals from the innermost environment: gym.make may return a
# wrapper, and wrappers in some gym versions do not forward attributes whose
# name starts with an underscore.
print("total reward: {}, total profit: {}".format(env.unwrapped._total_reward, env.unwrapped._total_profit))

total reward: -37.199999999999456, total profit: 0.9890048874492001


In [8]:
# Scratch check: prepending a length-1 axis to a (10, 2) shape tuple.
(1,) + (10, 2)

(1, 10, 2)