In [None]:
%matplotlib inline
from trading_gym.env import TradeEnv
from datetime import datetime
import random
import matplotlib
import matplotlib.pylab as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
#matplotlib.use('tkAgg')

# Torch device used by every network and tensor below: the first CUDA GPU
# when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Uncomment to force CPU execution:
#device =  torch.device("cpu")

def custom_obs_features_func(history, info):
    """Build the observation vector from the closing prices in ``history``.

    Args:
        history: Object exposing ``obs_list``, a sequence whose items each
            have a ``close`` attribute.
        info: Unused; accepted because the environment passes it to the
            feature callback.

    Returns:
        A new list holding ``close`` of every item in ``history.obs_list``,
        in the same order (empty when ``obs_list`` is empty).
    """
    # A comprehension avoids shadowing the builtin ``list`` and the
    # index-based ``range(len(...))`` loop.
    return [obs.close for obs in history.obs_list]

def custom_reward_func(exchange):
    """Compute the step reward from the exchange's floating profit.

    The reward is ``floating_profit * (info["index"] - 50) * 0.01``, so the
    same floating profit is weighted more heavily the later it occurs.

    An earlier variant (0.001 scale, falling back to ``exchange.profit``
    when the floating profit was not positive) used to be computed first and
    was then unconditionally overwritten; that dead computation is removed.

    Example of ``exchange.info``::

        {'index': 56, 'date': '2010-01-01 01:04', 'nav': 50000,
         'amount': 250000, 'avg_price': 1.4325899999999998,
         'profit': {'total': -282.0124161115024,
                    'fixed': -272.23990618194,
                    'floating': -9.7725099295624},
         'buy_at': 52, 'latest_price': 1.43231}

    Args:
        exchange: Object exposing ``floating_profit`` (number) and ``info``
            (mapping with an ``"index"`` entry).

    Returns:
        The scalar reward for the current step.
    """
    # NOTE(review): 50 is presumably the number of warm-up bars before the
    # first tradable step (N_STATES is 51) -- confirm against the env.
    steps_elapsed = exchange.info["index"] - 50
    return exchange.floating_profit * steps_elapsed * 0.01

# Hyper Parameters
BATCH_SIZE = 32             # transitions sampled per learning step
LR = 0.01                   # learning rate
EPSILON = 0.9               # probability of taking the greedy action (otherwise random)
GAMMA = 0.9                 # reward discount
TARGET_REPLACE_ITER = 100   # target update frequency
MEMORY_CAPACITY = 5000      # number of rows in the replay memory
# Trading environment: observations come from custom_obs_features_func and
# rewards from custom_reward_func, both defined above.
env = TradeEnv(data_path='eurusd_patterns_10_test2_slope_trend_pro.lite1-3.csv',
               ops_shape=[],
               get_obs_features_func=custom_obs_features_func,
               get_reward_func=custom_reward_func,
               nav=5000, 
               data_kwargs={'use_ta': False}
              )

# Earlier environment configurations, kept for reference:
#nv = TradeEnv(data_path='eurusd_patterns_10_test2_slope_trend_pro.lite1-6.csv',ops_shape=[],get_obs_features_func=custom_obs_features_func,nav=1000,get_reward_func=custom_reward_func, data_kwargs={'use_ta': False})
#env = TradeEnv(data_path='eurusd_patterns_10_test2_slope_trend_pro.lite1-6.csv',nav=1000, data_kwargs={'use_ta': False})
env = env.unwrapped
N_ACTIONS = 3               # size of the discrete action set sampled by the agent
# Input width of the Q-network; presumably must equal the length of the list
# returned by custom_obs_features_func -- TODO confirm against the env.
N_STATES = 51
ENV_A_SHAPE = 0             # 0 means actions are returned as scalars, not reshaped
n_episodes = 1000 #10000
PATH = "./training_game_01.h5"  # checkpoint path; only referenced by a commented-out torch.save


class Net(nn.Module):
    """Three-layer fully connected Q-network.

    Maps a state vector of width ``N_STATES`` to one Q-value per action
    (``N_ACTIONS`` outputs) through two hidden ReLU layers of 50 units.
    All weights are drawn from N(0, 0.1) and the layers live on ``device``.
    """

    def __init__(self, ):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(N_STATES, 50).to(device)
        self.fc1.weight.data.normal_(0, 0.1)   # initialization
        self.fc2 = nn.Linear(50, 50).to(device)
        self.fc2.weight.data.normal_(0, 0.1)   # initialization
        # The head must emit exactly one value per action. It used to be
        # 50 wide, so the greedy argmax could return indices up to 49 that
        # are not valid actions.
        self.out = nn.Linear(50, N_ACTIONS).to(device)
        self.out.weight.data.normal_(0, 0.1)   # initialization

    def forward(self, x):
        """Return the Q-values for ``x``.

        Args:
            x: Tensor whose last dimension is ``N_STATES``; it is moved to
                ``device`` before use.

        Returns:
            Tensor of Q-values whose last dimension is ``N_ACTIONS``.
        """
        x = x.to(device)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        actions_value = self.out(x)
        return actions_value


class DQN(object):
    """Deep Q-learning agent with a target network and a ring-buffer replay memory.

    Each replay row is laid out as ``[s (N_STATES), a, r, s_ (N_STATES)]``.

    NOTE(review): no terminal flag is stored, so ``learn`` bootstraps from
    the next state even on the last transition of an episode.
    """

    def __init__(self):
        self.eval_net, self.target_net = Net().to(device), Net().to(device)

        self.learn_step_counter = 0                                     # for target updating
        self.memory_counter = 0                                         # for storing memory
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))     # initialize memory
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
        self.loss_func = nn.MSELoss()

    def choose_action(self, x):
        """Pick an action for a single state with an epsilon-greedy policy.

        With probability ``EPSILON`` the action with the highest Q-value of
        the evaluation network is chosen, otherwise a uniformly random one.

        Args:
            x: One state, convertible by ``torch.FloatTensor``.

        Returns:
            A scalar action index when ``ENV_A_SHAPE == 0``, otherwise an
            array reshaped to ``ENV_A_SHAPE``.
        """
        # Add a batch dimension: the network receives a single sample.
        x = torch.unsqueeze(torch.FloatTensor(x), 0).to(device)
        if np.random.uniform() < EPSILON:   # greedy
            # Inference only: skip building an autograd graph.
            with torch.no_grad():
                actions_value = self.eval_net(x)
            action = torch.max(actions_value, 1)[1].cpu().numpy()
            action = action[0] if ENV_A_SHAPE == 0 else action.reshape(ENV_A_SHAPE)  # return the argmax index
        else:   # random
            action = np.random.randint(0, N_ACTIONS)
            if ENV_A_SHAPE != 0:
                # randint returns a plain int, which has no reshape();
                # wrap it in an array first.
                action = np.asarray(action).reshape(ENV_A_SHAPE)

        return action

    def store_transition(self, s, a, r, s_):
        """Write the transition ``(s, a, r, s_)`` into the replay memory.

        Once the memory is full the oldest row is overwritten.
        """
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % MEMORY_CAPACITY
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step of Q-learning on a sampled mini-batch.

        Every ``TARGET_REPLACE_ITER`` calls the target network is first
        synchronised with the evaluation network. Does nothing while the
        replay memory is still empty.
        """
        # Rows are filled sequentially, so only the first `stored` rows hold
        # real transitions; the rest are still the zeros from __init__.
        stored = min(self.memory_counter, MEMORY_CAPACITY)
        if stored == 0:
            return

        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # sample batch transitions (with replacement) from the filled rows
        sample_index = np.random.choice(stored, BATCH_SIZE)
        b_memory = self.memory[sample_index, :]
        b_s = torch.FloatTensor(b_memory[:, :N_STATES]).to(device)
        b_a = torch.LongTensor(b_memory[:, N_STATES:N_STATES+1].astype(int)).to(device)
        b_r = torch.FloatTensor(b_memory[:, N_STATES+1:N_STATES+2]).to(device)
        b_s_ = torch.FloatTensor(b_memory[:, -N_STATES:]).to(device)

        # q_eval w.r.t the action in experience
        q_eval = self.eval_net(b_s).gather(1, b_a)  # shape (batch, 1)
        q_next = self.target_net(b_s_).detach()     # detach from graph, don't backpropagate
        q_target = b_r + GAMMA * q_next.max(1)[0].view(BATCH_SIZE, 1)   # shape (batch, 1)
        loss = self.loss_func(q_eval, q_target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
    

# Training script: run n_episodes episodes, feeding selected transitions to
# the agent's replay memory and training once the memory counter passes
# MEMORY_CAPACITY.
dqn = DQN()


print('\nCollecting experience...')
for i_episode in range(n_episodes):
    t = 0           # step index within the current episode
    rewards = 0     # cumulative episode reward (accumulated but not read in this cell)
    obs0 = env.reset()
    while True:
        #env.render()
        action = dqn.choose_action(obs0)
        observation, reward, done, info = env.step(action)

        #action 0 sell, 1 hold, 2 buy

        # Store the experience, but only for steps whose floating profit
        # exceeds 20.
        # NOTE(review): this filter biases the replay memory toward
        # profitable states -- confirm it is intended.
        if(info["profit"]["floating"] > 20):
            dqn.store_transition(obs0, action, reward, observation)
        
        # Accumulate the reward.
        rewards += reward

        # Train once enough experience has been collected.
        if dqn.memory_counter > MEMORY_CAPACITY:
            dqn.learn()
            #print(dqn.memory_counter)

        # Move on to the next state.
        obs0 = observation

        if(done):
            #torch.save(dqn, PATH)
            print('({}) finished after {} timesteps, total profit: {}, memery: {}'.format(i_episode+1, t+1, info["profit"]["total"],dqn.memory_counter))
            break

        t += 1

env.close()


Collecting experience...
(1) finished after 1550 timesteps, total profit: -378.10766847989504, memery: 0
(2) finished after 1191 timesteps, total profit: -354.85142172681975, memery: 0
(3) finished after 34281 timesteps, total profit: -366.41246353579146, memery: 5987
(4) finished after 1394 timesteps, total profit: -391.2549554443455, memery: 5991
(5) finished after 1228 timesteps, total profit: -414.93931079121955, memery: 6003
(6) finished after 1394 timesteps, total profit: -371.296534852914, memery: 6003
(7) finished after 1367 timesteps, total profit: -351.2323108470363, memery: 6006
(8) finished after 1218 timesteps, total profit: -416.001367591529, memery: 6007
(9) finished after 1325 timesteps, total profit: -361.46909099535907, memery: 6007
(10) finished after 1209 timesteps, total profit: -364.9035138400277, memery: 6007
(11) finished after 875 timesteps, total profit: -350.61460520403375, memery: 6008
(12) finished after 1425 timesteps, total profit: -362.7774257489806, me