In [213]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [214]:
import gym
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [215]:
class BitcoinTradingEnv(gym.Env):
    def __init__(self, initial_balance=1000000, transaction_cost=0.0005, window_size=96):
        super(BitcoinTradingEnv, self).__init__()
        
        self.initial_balance = initial_balance
        self.transaction_cost = transaction_cost
        self.window_size = window_size
        
        self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(self.window_size, 7), dtype=np.float32)
        
        self.action_space = gym.spaces.Box(low=np.array([-100]), high=np.array([100]), dtype=np.float32)

    def reset(self, data):
        self.data = data
        self.balance = self.initial_balance
        self.bitcoin_held = 0
        self.current_step = self.window_size
        self.done = False
        self.total_asset_value = self.balance
        self.trade_direction = 0
        
        return self._next_observation()

    def _next_observation(self):
        frame = self.data.iloc[self.current_step - self.window_size:self.current_step]
        
        # Pad the frame if it's smaller than window_size
        if len(frame) < self.window_size:
            padding = self.window_size - len(frame)
            frame = frame.reindex(range(-padding, len(frame)))
            frame.iloc[:padding] = frame.iloc[padding]
        
        obs = np.column_stack((
            frame['open'].values,
            frame['high'].values,
            frame['low'].values,
            frame['close'].values,
            frame['volume'].values,
            frame['value'].values,
            np.full(self.window_size, self.balance)
        )).astype(np.float32)
        
        return obs

    def step(self, action):
        current_price = self.data.iloc[self.current_step]['close']
        
        if action > 0:
            bitcoin_bought = (self.balance / current_price) * action
            self.bitcoin_held += bitcoin_bought
            self.balance -= self.balance * action * (1 + self.transaction_cost)
            
        elif action < 0:
            bitcoin_sold = self.bitcoin_held * (-action)
            self.balance += current_price * bitcoin_sold * (1 - self.transaction_cost)
            self.bitcoin_held -= bitcoin_sold

        self.balance = float(self.balance)
        self.current_step += 1
        if self.current_step >= len(self.data) - 1:
            self.done = True

        self.total_asset_value = self.balance + self.bitcoin_held * current_price
        reward = float(self.total_asset_value - self.initial_balance)
        next_state = self._next_observation()
        self.trade_direction += action

        return next_state, reward, self.done, {}

    def render(self):
        profit = self.total_asset_value - self.initial_balance
        print(f"Total Asset Value: {self.total_asset_value:.2f}, Profit: {profit:.2f}, Trade Direction: {self.trade_direction}")

In [216]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        mean = torch.tanh(self.fc3(x))
        return Normal(mean, torch.ones_like(mean) * 0.1)

class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        value = self.fc3(x)
        return value

In [217]:
class PPOAgent:
    def __init__(self, state_dim, action_dim, hidden_dim=64, lr=1e-4, gamma=0.99, clip_epsilon=0.2):
        self.actor = Actor(state_dim, action_dim, hidden_dim)
        self.critic = Critic(state_dim, hidden_dim)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr)
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon

    def select_action(self, state):
        state = torch.FloatTensor(state).reshape(1, -1)  # Reshape to (1, state_dim)
        dist = self.actor(state)
        action = dist.sample()
        action_clipped = torch.clamp(action, -1, 1)
        log_prob = dist.log_prob(action)
        return action_clipped.detach().numpy()[0], log_prob.detach()

    def compute_advantage(self, rewards, values, next_values, dones):
        advantages = []
        returns = []
        advantage = 0
        for step in reversed(range(len(rewards))):
            td_error = rewards[step] + self.gamma * next_values[step] * (1 - dones[step]) - values[step]
            advantage = td_error + self.gamma * advantage * (1 - dones[step])
            advantages.insert(0, advantage)
            returns.insert(0, advantage + values[step])
        return advantages, returns

    def update(self, states, actions, log_probs, returns, advantages):
        advantages = torch.FloatTensor(advantages)
        returns = torch.FloatTensor(returns)
        states = torch.FloatTensor(np.array(states))  # Convert list to numpy array first
        actions = torch.FloatTensor(actions)
        log_probs_old = torch.FloatTensor(log_probs)

        # Reshape states to (batch_size, window_size * 7)
        states = states.reshape(states.shape[0], -1)

        for _ in range(10):
            dist = self.actor(states)
            log_probs_new = dist.log_prob(actions)
            ratio = torch.exp(log_probs_new - log_probs_old)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

        values = self.critic(states).squeeze()
        critic_loss = (returns - values).pow(2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

In [218]:
data = pd.read_csv('../../data/dataset/KRW-BTC_recent.csv')
chunk_size = 10000
data_chunks = [data.iloc[i:i+chunk_size] for i in range(0, len(data), chunk_size)]

env = BitcoinTradingEnv(window_size=96)
state_dim = env.observation_space.shape
action_dim = env.action_space.shape[0]

agent = PPOAgent(state_dim[0] * state_dim[1], action_dim, hidden_dim=64)

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [219]:
def plot_rewards(all_rewards):
    plt.figure(figsize=(10, 6))
    plt.plot(all_rewards, label='Rewards per Episode')
    plt.xlabel('Episode')
    plt.ylabel('Total Reward')
    plt.title('Total Rewards Over Episodes')
    plt.legend()
    plt.grid(True)
    plt.show()

In [220]:
epochs = 1000
update_timesteps=2000
all_rewards = []

for epoch in tqdm(range(epochs)):
    for index, chunk in enumerate(data_chunks):
        obs = env.reset(chunk)
        episode_reward = 0
        log_probs = []
        values = []
        rewards = []
        states = []
        actions = []
        dones = []

        for t in range(update_timesteps):
            action, log_prob = agent.select_action(obs)
            value = agent.critic(torch.FloatTensor(obs).unsqueeze(0)).item()

            next_obs, reward, done, _ = env.step(action)
            episode_reward += reward

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            states.append(obs)
            actions.append(action)
            dones.append(done)

            obs = next_obs

            if done:
                break

        next_value = agent.critic(torch.FloatTensor(obs).unsqueeze(0)).item()
        values.append(next_value)
        advantages, returns = agent.compute_advantage(rewards, values[:-1], values[1:], dones)

        agent.update(states, actions, log_probs, returns, advantages)
        all_rewards.append(episode_reward)
        
    if not epoch == 0 and epoch % 10 == 0:
        print(f'Epoch {epoch}/{epochs}, Reward: {np.average(all_rewards[-100:])}')
        
plot_rewards(all_rewards)


  self.balance = float(self.balance)
  reward = float(self.total_asset_value - self.initial_balance)
Training...:   0%|          | 0/1000 [00:05<?, ?it/s]


ValueError: Expected parameter loc (Tensor of shape (1, 1)) of distribution Normal(loc: tensor([[nan]], grad_fn=<TanhBackward0>), scale: tensor([[0.1000]])) to satisfy the constraint Real(), but found invalid values:
tensor([[nan]], grad_fn=<TanhBackward0>)