In [1]:
from Env.UpbitEnvironment import UpbitSimpleSimulator

import random
import math
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple, deque
from itertools import count
import sys

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# Single simulator instance shared by every cell below.
env = UpbitSimpleSimulator()

# True when matplotlib reports a backend whose name contains 'inline'; later
# cells use this flag to decide whether to refresh output through IPython's
# display helpers.
b_ipython = 'inline' in matplotlib.get_backend()
if b_ipython:
    from IPython import display
    
# Turn on matplotlib interactive mode.
plt.ion()

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [2]:
# One stored environment step. push() fills the fields positionally, so the
# field order (state, action, next_state, reward) is part of the contract.
Transition = namedtuple('Transition', 'state action next_state reward')


class ReplayMemory:
    """Bounded FIFO buffer of Transition records.

    Once `capacity` records are stored, appending a new one drops the oldest.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most `capacity` transitions."""
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Wrap the positional fields in a Transition and append it."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return a list of `batch_size` distinct stored transitions picked at random.

        Raises:
            ValueError: propagated from random.sample when `batch_size`
                exceeds the number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return how many transitions are currently stored."""
        return len(self.memory)

In [3]:
class DQN(nn.Module):
    """Fully connected network that maps an observation vector to one value per action.

    Architecture: Linear -> ReLU -> Linear -> ReLU -> Linear, with two hidden
    layers of 128 units each.
    """

    def __init__(self, n_observations, n_actions):
        """Build the three linear layers.

        Args:
            n_observations: width of the input vector.
            n_actions: number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 128
        self.layer1 = nn.Linear(n_observations, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return the output of the last linear layer (no activation applied to it)."""
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        action_values = self.layer3(hidden)
        return action_values

In [4]:
# Number of transitions sampled per optimize_model() call; optimization is
# skipped until the replay memory holds at least this many.
BATCH_SIZE = 128
# Factor multiplying the next-state value when building the training target.
GAMMA = 0.99
# Epsilon-greedy schedule used by select_action():
#   threshold = EPS_END + (EPS_START - EPS_END) * exp(-steps_done / EPS_DECAY)
# so the threshold starts at EPS_START and decays toward EPS_END; a larger
# EPS_DECAY makes the decay slower.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 1000
# Weight of the policy network in the soft update of the target network.
TAU = 0.005
# Learning rate passed to the AdamW optimizer.
LR = 1e-4

# Number of actions exposed by the environment's action space.
n_action = env.action_space.n
state, info = env.reset()
# Number of consecutive observations concatenated into one network input.
n_input_state = 10
# Network input width: one observation's length times the window size.
n_observation = len(state) * n_input_state

# Two networks with identical architecture; the target network starts as an
# exact copy of the policy network's weights.
policy_net = DQN(n_observations=n_observation, n_actions=n_action).to(device=device)
target_net = DQN(n_observations=n_observation, n_actions=n_action).to(device=device)
target_net.load_state_dict(policy_net.state_dict())

# Only the policy network's parameters are handed to the optimizer.
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)
# Replay buffer holding at most 10000 transitions.
memory = ReplayMemory(10000)

# Count of select_action() calls so far; drives the epsilon decay.
steps_done = 0

def select_action(state):
    """Pick an action for `state` with an epsilon-greedy rule.

    A uniform random draw is compared with a threshold that decays
    exponentially from EPS_START toward EPS_END as the global `steps_done`
    counter grows; the counter is incremented on every call.

    Args:
        state: input passed unchanged to `policy_net` on the greedy branch.

    Returns:
        A (1, 1) tensor holding the chosen action index: the arg-max over
        dimension 1 of the policy network output when the draw exceeds the
        threshold, otherwise a sample from `env.action_space` as a long
        tensor on `device`.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore: the draw did not clear the threshold.
    if draw <= eps_threshold:
        return torch.tensor([[env.action_space.sample()]], device=device, dtype=torch.long)

    # Exploit: no gradient is needed just to read off the best action.
    with torch.no_grad():
        return policy_net(state).max(1).indices.view(1, 1)

# Per-episode values drawn by plot_durations(). The training loop appends each
# episode's cumulative reward here, so the axis labelled 'Duration' shows reward.
episode_durations = []


def plot_durations(show_result=False):
    """Draw the values in `episode_durations` on matplotlib figure 1.

    Args:
        show_result: when False (the default) the figure is cleared first and
            titled 'Training'; when True it is titled 'Result' and not
            cleared.

    Once at least 100 values exist, a 100-value rolling mean is plotted on
    top, left-padded with 99 zeros so it lines up with the episode axis.
    When running with an inline backend the figure is also pushed through
    IPython's display, and the cell output is cleared (deferred until the
    next output) unless `show_result` is set.
    """
    plt.figure(1)
    series = torch.tensor(episode_durations, dtype=torch.float)
    if not show_result:
        plt.clf()
    plt.title('Result' if show_result else 'Training')
    plt.xlabel('Episode')
    plt.ylabel('Duration')
    plt.plot(series.numpy())

    window = 100
    if len(series) >= window:
        rolling = series.unfold(0, window, 1).mean(1).view(-1)
        padded = torch.cat((torch.zeros(window - 1), rolling))
        plt.plot(padded.numpy())

    # Short pause so the figure gets a chance to refresh.
    plt.pause(0.001)
    if not b_ipython:
        return
    display.display(plt.gcf())
    if not show_result:
        display.clear_output(wait=True)
            
    

In [5]:
def optimize_model():
    """Run one gradient step on `policy_net` from a replay-memory batch.

    Returns immediately (None) while `memory` holds fewer than BATCH_SIZE
    transitions. Otherwise it samples BATCH_SIZE transitions, computes
    Q(s_t, a_t) with `policy_net`, builds the target
    r + GAMMA * max_a Q_target(s_{t+1}, a) (the next-state term is 0 for
    transitions whose `next_state` is None), and minimises the Huber
    (SmoothL1) loss between the two with `optimizer`.

    Robustness notes:
      * A batch made up entirely of terminal transitions is handled; the
        target network is simply not evaluated.
      * The target is cast to the dtype of the policy network output, so
        rewards stored as integer or float64 tensors do not cause a dtype
        mismatch in the loss.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(batch_size=BATCH_SIZE)

    # Turn a list of Transition records into one Transition of per-field tuples.
    batch = Transition(*zip(*transitions))

    # True for every transition that has a successor state.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)

    # Compute Q(s_t, a): the model evaluates Q(s_t) and gather() picks the
    # column of the action that was actually taken for each batch entry.
    state_action_values = policy_net(state_batch).gather(1, action_batch)
    value_dtype = state_action_values.dtype

    # Rewards may have been stored with differing dtypes (e.g. an integer 0);
    # normalise each one before concatenating.
    reward_batch = torch.cat([r.to(value_dtype) for r in batch.reward])

    # max_a Q_target(s_{t+1}, a) for non-final successors, 0 elsewhere.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_next_states:
        # torch.cat rejects an empty list, hence the guard.
        with torch.no_grad():
            next_q = target_net(torch.cat(non_final_next_states)).max(1).values
            next_state_values[non_final_mask] = next_q

    # Expected Q values (the training target).
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    expected_state_action_values = expected_state_action_values.to(value_dtype)

    # Huber loss.
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimise the model.
    optimizer.zero_grad()
    loss.backward()
    # In-place gradient clipping.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
    optimizer.step()
    
    

In [6]:
# Sanity check: show the flattened network input width and the layer layout
# of the policy network.
print(n_observation)
print(policy_net)

900
DQN(
  (layer1): Linear(in_features=900, out_features=128, bias=True)
  (layer2): Linear(in_features=128, out_features=128, bias=True)
  (layer3): Linear(in_features=128, out_features=3, bias=True)
)


In [7]:
# Convert the current `state` to a float32 tensor on `device` and show its shape.
# NOTE(review): this rebinds `state`, so running the cell a second time passes
# a tensor back into torch.tensor(); a later cell calls env.reset() and
# replaces `state` again, so nothing downstream appears to rely on this value.
state = torch.tensor(state, dtype=torch.float32, device=device)
print(state.shape)

torch.Size([90])


In [8]:
# Start a fresh episode and fill the observation window with n_input_state
# references to the reset observation, then show one observation's length.
state, info = env.reset()
state_deque = deque([state] * n_input_state, maxlen=n_input_state)
print(len(state_deque[0]))

90


In [9]:
# Concatenate the windowed observations end to end, add a leading batch
# dimension, and move the result to `device`; then inspect it.
state = torch.cat(tuple(torch.tensor(obs) for obs in state_deque))
state = state.unsqueeze(0).to(device)
print(state.shape)
print(state[0])

torch.Size([1, 900])
tensor([ 9.1999e+07,  9.1150e+07,  9.1648e+07,  9.1610e+07,  9.1259e+07,
         9.1259e+07,  9.1610e+07,  9.1434e+07,  2.6864e+03,  2.4621e+11,
         9.1999e+07,  9.1150e+07, -3.5100e+05, -3.8315e-03,  2.1905e-04,
         1.1904e+11,  2.4621e+11,  1.2998e+03,  2.6864e+03,  1.0500e+08,
         3.8820e+07,  9.1259e+07,  5.8813e-02,  9.1260e+07,  1.2568e-01,
         9.1277e+07,  1.3969e-02,  9.1279e+07,  5.4777e-03,  9.1280e+07,
         2.8664e-04,  9.1281e+07,  8.0093e-01,  9.1302e+07,  1.0000e-02,
         9.1303e+07,  5.2589e-02,  9.1313e+07,  2.1900e-02,  9.1314e+07,
         1.0951e-03,  9.1320e+07,  2.9922e-02,  9.1322e+07,  4.1360e-02,
         9.1332e+07,  2.4071e-03,  9.1343e+07,  1.2011e-03,  9.1347e+07,
         2.4015e-02,  9.1258e+07,  4.8497e-02,  9.1252e+07,  1.5341e-03,
         9.1251e+07,  1.0200e-03,  9.1250e+07,  2.3715e-01,  9.1249e+07,
         2.0000e-02,  9.1245e+07,  4.4074e-01,  9.1244e+07,  1.1190e-01,
         9.1242e+07,  5.6553e-

In [11]:
def _stack_frames(frames):
    """Concatenate the observations in `frames` into one float32 row tensor on `device`.

    float32 is requested explicitly so the input matches the default dtype of
    the networks' Linear layers regardless of the numeric type the
    environment returns.
    # assumes each observation is a flat numeric sequence — confirm against the simulator
    """
    stacked = torch.cat([torch.tensor(frame, dtype=torch.float32) for frame in frames])
    return stacked.unsqueeze(0).to(device)


# Episode count and per-episode step cap: longer run on GPU, short on CPU.
if torch.cuda.is_available():
    num_episodes = 600
    n_iter = 5
else:
    num_episodes = 50
    n_iter = 50

target_coin = 'BTC'

for i_episode in range(num_episodes):
    # Reset the environment and the state.
    state, info = env.reset(target=target_coin, end_condition=0.01)

    # Fill the observation window by stepping the environment with action 0.
    # NOTE(review): the terminated/truncated flags of these warm-up steps are
    # ignored, as in the original code — confirm the simulator cannot end an
    # episode here.
    state_deque = deque(maxlen=n_input_state)
    for _ in range(n_input_state):
        warmup_observation = env.step(0)[0]
        state_deque.append(warmup_observation)
    state = _stack_frames(state_deque)

    cumulative_reward = 0
    for t in range(n_iter):
        action = select_action(state)
        observation, reward, terminated, truncated, info = env.step(action.item())
        cumulative_reward += reward
        # Force float32: the environment can return integer rewards (e.g. 0),
        # which would otherwise be stored as integer tensors in replay memory.
        reward = torch.tensor([reward], dtype=torch.float32, device=device)
        done = terminated or truncated

        if terminated:
            next_state = None
        else:
            state_deque.append(observation)
            next_state = _stack_frames(state_deque)

        # Store the transition in memory.
        memory.push(state, action, next_state, reward)

        # Move to the next state.
        state = next_state

        # Perform one optimisation step (on the policy network).
        optimize_model()

        # Soft update of the target network's weights:
        # θ′ ← τ θ + (1 − τ) θ′
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_net.load_state_dict(target_net_state_dict)

        # Per-step status report.
        if b_ipython:
            display.clear_output()
        status_lines = [
            f"\rEpisode {i_episode + 1} - {t + 1}\n",
            f"Action: {action[0][0]:>30}\n",
            f'Reward: {reward[0]:>30}\n',
            f"Current {target_coin} Price: {info['current_price']:>30}\n",
            f"Prev Asset Value:{info['prev_asset_value']:>30}\n",
            f"Current Asset Value:{info['curr_asset_value']:>30}\n",
            f"Free KRW: {info['free_krw']:>30}\n",
            f"Used KRW: {info['used_krw']:>30}\n",
            f"Free Coin: {info['free_coin']:>30}\n",
            f"Used Coin: {info['used_coin']:>30}\n",
            f"Buy Order Amount: {info['buy_order_amount']:>30}\n",
            f"Buy Order Price: {info['buy_order_price']:>30}\n",
            f"Sell Order Amount: {info['sell_order_amount']:>30}\n",
            f"Sell Order Price: {info['sell_order_price']:>10}\n",
        ]
        for line in status_lines:
            sys.stdout.write(line)

        if not b_ipython:
            # In a plain terminal, move the cursor back up over the report so
            # the next one overwrites it.
            sys.stdout.write("\033[F" * len(status_lines))
        sys.stdout.flush()

        if done:
            break

    # Record every episode, including those that hit the n_iter step cap
    # without the environment signalling termination; previously such
    # episodes were never added to the plot.
    episode_durations.append(cumulative_reward)
    plot_durations()


print('Complete')
plot_durations(show_result=True)
plt.ioff()
plt.show()

Episode 2 - 5
Action:                              0
Reward:                              0
Current BTC Price:                     91252000.0
Prev Asset Value:            100006.57519835181
Current Asset Value:            100006.57519835181
Free KRW:             100006.57519835181
Used KRW:                              0
Free Coin:                            0.0
Used Coin:                              0
Buy Order Amount:                              0
Buy Order Price:                              0
Sell Order Amount:                              0
Sell Order Price:          0


KeyboardInterrupt: 