In [3]:
%pip install -q -r requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

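* **action**: a number in the range -100 to 100. A positive value spends that percentage of the current cash balance on BTC, a negative value sells that percentage of the BTC currently held, and 0 means hold.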


In [3]:
class BitcoinEnvironment:
    """Single-asset trading simulator driven by a 1-D series of prices.

    The observation is the z-normalised window of the last ``window_size``
    prices followed by two portfolio features: cash / initial balance and
    BTC value / initial balance.

    Actions are percentages in [-100, 100]: a positive action spends that
    share of the cash balance on BTC, a negative action sells that share of
    the BTC held, and 0 holds.
    """

    def __init__(self, initial_balance=1000000, commission=0.00, window_size=96, data=None):
        """Store the configuration, move the prices to the device and reset.

        Args:
            initial_balance: Starting cash.
            commission: Fee charged on every trade, as a fraction of its value.
            window_size: Number of past prices included in each observation.
            data: 1-D sequence of prices (anything ``torch.as_tensor`` accepts).

        Raises:
            ValueError: If ``data`` is missing, not 1-D, or not longer than
                ``window_size``.
        """
        if data is None:
            raise ValueError("data must be a 1-D sequence of prices, got None")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.initial_balance = initial_balance
        self.commission = commission
        self.window_size = window_size
        # float32 so the observation has a single floating dtype regardless
        # of how the prices were loaded.
        self.data = torch.as_tensor(data, dtype=torch.float32).to(self.device)

        if self.data.dim() != 1:
            raise ValueError(f"data must be 1-D, got shape {tuple(self.data.shape)}")
        if len(self.data) <= window_size:
            raise ValueError(
                f"data needs more than window_size={window_size} prices, got {len(self.data)}"
            )

        self.reset()

    def reset(self):
        """Return to the first tradable step with the initial cash and no BTC."""
        self.current_step = self.window_size
        self.initial_price = self.data[self.window_size].item()
        self.balance = self.initial_balance
        self.btc_held = 0.0
        # With exactly window_size + 1 prices there is nothing left to step into.
        self.done = self.current_step >= len(self.data) - 1
        return self.get_state()

    def step(self, action):
        """Execute one trade and advance the clock by one price.

        Args:
            action: Number or one-element tensor. Clamped to [-100, 100];
                NaN is treated as 0 (hold).

        Returns:
            Tuple ``(state, reward, done, info, portfolio_value, lazy_profit)``.
            ``reward`` is the change in portfolio value over the step and
            ``lazy_profit`` is the value of having put the whole initial
            balance into BTC at the first tradable price, both evaluated at
            the price of the new step. The tuple has the same six fields
            when the episode is already finished.
        """
        if self.done:
            price = self.data[self.current_step].item()
            portfolio_value = self.balance + self.btc_held * price
            lazy_profit = self.initial_balance * price / self.initial_price
            return self.get_state(), 0, True, {}, portfolio_value, lazy_profit

        # Work in plain Python floats: a tensor action would otherwise turn
        # balance / btc_held / reward into device tensors.
        action = float(action)
        if action != action:  # NaN never compares equal to itself
            action = 0.0
        # Clamping prevents selling more BTC than is held.
        action = max(-100.0, min(100.0, action))

        current_price = self.data[self.current_step].item()
        old_portfolio_value = self.balance + self.btc_held * current_price

        if action > 0 and current_price > 0:
            # Derive the cost from the balance directly; going through
            # btc * price and back can exceed the balance by a rounding error
            # and wrongly reject a 100% buy.
            cost = self.balance * (action / 100)
            commission = cost * self.commission
            # The order is skipped entirely when cost plus fee exceeds the cash.
            if cost + commission <= self.balance:
                self.balance -= (cost + commission)
                self.btc_held += cost / current_price
        elif action < 0:
            btc_to_sell = self.btc_held * (abs(action) / 100)
            revenue = btc_to_sell * current_price
            commission = revenue * self.commission
            self.balance += (revenue - commission)
            self.btc_held -= btc_to_sell

        self.current_step += 1
        new_price = self.data[self.current_step].item()

        portfolio_value = self.balance + self.btc_held * new_price
        reward = portfolio_value - old_portfolio_value

        if self.current_step >= len(self.data) - 1:
            self.done = True

        # Use the same price as portfolio_value so both curves refer to the
        # same point in time.
        lazy_profit = self.initial_balance * new_price / self.initial_price

        return self.get_state(), reward, self.done, {}, portfolio_value, lazy_profit

    def get_state(self):
        """Build the observation for the current step.

        Returns:
            1-D tensor of length ``window_size + 2``.
        """
        window_data = self.data[self.current_step - self.window_size:self.current_step]

        # A flat price window has zero std; clamp to avoid NaN/inf features.
        scale = window_data.std().clamp_min(1e-8)
        normalized_data = (window_data - window_data.mean()) / scale

        current_price = self.data[self.current_step].item()
        portfolio_state = torch.tensor(
            [
                self.balance / self.initial_balance,
                self.btc_held * current_price / self.initial_balance,
            ],
            dtype=normalized_data.dtype,
            device=self.device,
        )

        return torch.cat([normalized_data, portfolio_state])

    def render(self):
        """Print the cash balance, BTC held and total portfolio value."""
        print(f"Cash Balance: ₩{self.balance:,.2f} / BTC Held: {self.btc_held:,.8f}")
        print(f"Portfolio Value: ₩{(self.balance + self.btc_held * self.data[self.current_step].item()):,.2f}")
        print("--------------------")

In [4]:
class PPOAgent:
    """PPO agent with a Gaussian policy over a single continuous action.

    The actor maps a state to two numbers, interpreted as the mean and the
    log standard deviation of a Normal distribution. The critic maps a state
    to a scalar value estimate. Both are trained by one Adam optimizer.
    """

    def __init__(self, state_dim, hidden_dim=64, lr=3e-4, gamma=0.99, epsilon=0.2, value_coef=0.5, entropy_coef=0.01, num_epochs=10):
        """Build the actor/critic networks and the shared optimizer.

        Args:
            state_dim: Length of the state vector.
            hidden_dim: Width of the two hidden layers of each network.
            lr: Adam learning rate.
            gamma: Discount factor.
            epsilon: Clipping range of the probability ratio.
            value_coef: Weight of the value loss in the total loss.
            entropy_coef: Weight of the entropy bonus in the total loss.
            num_epochs: Gradient steps taken per call to ``update``.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.actor = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        ).to(self.device)

        self.critic = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        ).to(self.device)

        self.optimizer = optim.Adam(list(self.actor.parameters()) + list(self.critic.parameters()), lr=lr)

        self.gamma = gamma
        self.epsilon = epsilon
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        self.num_epochs = num_epochs

    def get_action(self, state):
        """Sample an action for a single state.

        Args:
            state: 1-D state tensor.

        Returns:
            Tuple ``(action, log_prob)`` of one-element tensors. The action is
            clamped to [-100, 100] and the log-probability is evaluated at
            the clamped value.
        """
        state = state.unsqueeze(0).to(self.device)
        with torch.no_grad():
            mean, log_std = self.actor(state).squeeze(0).chunk(2)
        std = log_std.exp()
        dist = Normal(mean, std)
        action = dist.sample()
        action = torch.clamp(action, min=-100, max=100)
        log_prob = dist.log_prob(action)
        return action, log_prob

    def _to_flat_tensor(self, values):
        """Convert numbers, bools and/or tensors into one 1-D float32 tensor on the device."""
        if isinstance(values, torch.Tensor):
            return values.detach().to(device=self.device, dtype=torch.float32).reshape(-1)
        if any(isinstance(v, torch.Tensor) for v in values):
            # Rollout entries may be one-element (possibly CUDA) tensors mixed
            # with plain numbers; flatten each one and concatenate.
            parts = [
                torch.as_tensor(v).detach().to(device=self.device, dtype=torch.float32).reshape(-1)
                for v in values
            ]
            return torch.cat(parts)
        return torch.tensor(values, dtype=torch.float32, device=self.device)

    def update(self, states, actions, rewards, next_states, dones, old_log_probs):
        """Run ``num_epochs`` clipped-PPO gradient steps on one rollout.

        Args:
            states: Sequence of T state tensors.
            actions: T actions (numbers or one-element tensors).
            rewards: T rewards (numbers or one-element tensors).
            next_states: Sequence of T successor state tensors.
            dones: T episode-termination flags.
            old_log_probs: T log-probabilities recorded when acting.

        Returns:
            The total loss of the last epoch as a float (0.0 if
            ``num_epochs`` is 0).
        """
        states = torch.stack(states).to(self.device)
        next_states = torch.stack(next_states).to(self.device)
        # Everything per-step is kept at shape (T,). Mixing (T,) with the
        # actor's (T, 1) output would silently broadcast to (T, T).
        actions = self._to_flat_tensor(actions)
        rewards = self._to_flat_tensor(rewards)
        dones = self._to_flat_tensor(dones)
        old_log_probs = self._to_flat_tensor(old_log_probs)

        with torch.no_grad():
            # squeeze(-1) rather than squeeze(): a one-step rollout must stay 1-D.
            values = self.critic(states).squeeze(-1)
            next_values = self.critic(next_states).squeeze(-1)
            # One-step TD target; fixed for all epochs of this update.
            value_targets = rewards + self.gamma * next_values * (1 - dones)
            deltas = value_targets - values
            advantages = self._compute_gae(deltas, dones=dones)

        # std() of a single sample is NaN, so only normalise real batches.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        last_loss = None
        for _ in range(self.num_epochs):
            mean, log_std = self.actor(states).chunk(2, dim=-1)
            dist = Normal(mean.squeeze(-1), log_std.exp().squeeze(-1))
            new_log_probs = dist.log_prob(actions)

            ratio = torch.exp(new_log_probs - old_log_probs)
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()

            value_pred = self.critic(states).squeeze(-1)
            value_loss = nn.functional.mse_loss(value_pred, value_targets)

            entropy = dist.entropy().mean()

            loss = actor_loss + self.value_coef * value_loss - self.entropy_coef * entropy

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            last_loss = loss

        return last_loss.item() if last_loss is not None else 0.0

    def _compute_gae(self, deltas, lambda_=0.95, dones=None):
        """Accumulate TD residuals backwards into GAE advantages.

        Args:
            deltas: 1-D tensor of TD residuals, oldest first.
            lambda_: GAE decay factor.
            dones: Optional per-step termination flags. Where a flag is 1 the
                running sum is not carried across that step. When omitted,
                the sum runs over the whole sequence.

        Returns:
            1-D tensor of advantages with the same length as ``deltas``.
        """
        advantages = []
        gae = 0
        for t in reversed(range(len(deltas))):
            carry = 1.0 if dones is None else 1.0 - dones[t]
            gae = deltas[t] + self.gamma * lambda_ * carry * gae
            advantages.append(gae)
        # Built newest-first with O(1) appends, then flipped once.
        advantages.reverse()
        return torch.stack(advantages)

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): despite its name this is the path of a single CSV file, and it
# is an absolute Google Drive mount path, so the notebook only runs where that
# mount exists. Consider moving it to a configurable location.
data_dir = '/content/drive/MyDrive/Bitcoin Trade/models/PPO/KRW-BTC_recent.csv'
data_df = pd.read_csv(data_dir)

# Fail loudly on unusable prices: a NaN or non-positive close would otherwise
# flow into the environment's divisions and normalisation unnoticed.
if 'close' not in data_df.columns:
    raise ValueError(f"expected a 'close' column in {data_dir}, found {list(data_df.columns)}")
close_prices = pd.to_numeric(data_df['close'])
if close_prices.empty:
    raise ValueError(f"no rows found in {data_dir}")
if close_prices.isna().any():
    raise ValueError(f"'close' contains {int(close_prices.isna().sum())} missing value(s)")
if (close_prices <= 0).any():
    raise ValueError("'close' contains non-positive prices")

coin_price_data = torch.tensor(close_prices.values, dtype=torch.float32).to(device)

In [6]:
def graph(agent_profit, lazy_profit):
    """Draw the agent's series and the buy-and-hold series on one chart.

    Args:
        agent_profit: Sequence of values plotted as 'Profit Agent'.
        lazy_profit: Sequence of values plotted as 'Profit Lazy'.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    curves = (
        (agent_profit, 'Profit Agent'),
        (lazy_profit, 'Profit Lazy'),
    )
    for series, legend_name in curves:
        ax.plot(series, label=legend_name, linewidth=1)

    ax.set_xlabel('Time Period')
    ax.set_ylabel('Profit')
    ax.set_title('Profit Comparison: Agent vs Lazy')
    ax.legend()

    def won_label(value, _tick_position):
        # Whole won with thousands separators, e.g. ₩1,000,000.
        return f'₩{int(value):,}'

    ax.yaxis.set_major_formatter(FuncFormatter(won_label))

    plt.show()

In [7]:
def train_ppo_agent(env, agent, num_episodes=100, max_steps=1000, log_every=100):
    """Train ``agent`` on ``env``: one rollout and one PPO update per episode.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info,
            portfolio_value, lazy_profit)``.
        agent: Object exposing ``device``, ``get_action(state)`` and
            ``update(states, actions, rewards, next_states, dones, log_probs)``.
        num_episodes: Number of episodes to run.
        max_steps: Maximum number of environment steps per episode.
        log_every: Plot and print a report every this many episodes. The
            final episode is always reported, so a run shorter than the
            interval still produces output. Use 0 to report only the last.

    Returns:
        The trained ``agent``.
    """
    device = agent.device

    for episode in tqdm(range(num_episodes)):
        state = env.reset()
        states, actions, rewards, log_probs, next_states, dones = [], [], [], [], [], []
        agent_profits, lazy_profits = [], []

        for _ in range(max_steps):
            if not isinstance(state, torch.Tensor):
                state = torch.FloatTensor(state)
            state = state.to(device)
            action, log_prob = agent.get_action(state)
            next_state, reward, done, _, agent_profit_instance, lazy_profit_instance = env.step(action)

            # Store plain Python numbers: one-element device tensors cannot
            # be formatted with ':,.2f' and CUDA tensors cannot be plotted.
            reward = float(reward)
            agent_profits.append(float(agent_profit_instance))
            lazy_profits.append(float(lazy_profit_instance))

            states.append(state)
            actions.append(float(action))
            rewards.append(reward)
            log_probs.append(float(log_prob))
            next_states.append(next_state)
            dones.append(bool(done))

            state = next_state

            if done:
                break

        # An empty rollout (max_steps == 0) has nothing to learn from.
        if states:
            agent.update(states, actions, rewards, next_states, dones, log_probs)

        is_last_episode = episode == num_episodes - 1
        at_interval = bool(log_every) and episode > 0 and episode % log_every == 0
        if at_interval or is_last_episode:
            graph(agent_profits, lazy_profits)
            print(f"Episode {episode}, Total Reward: {sum(rewards):,.2f}")
            env.render()

    return agent

In [None]:
# Seed before the networks are created so weight initialisation and action
# sampling are repeatable from a fresh kernel.
torch.manual_seed(42)

env = BitcoinEnvironment(data=coin_price_data)
state_dim = env.get_state().shape[0]
agent = PPOAgent(state_dim)

# Profile CUDA activity only when a GPU is present; otherwise profile on CPU
# and sort by CPU time so the cell also runs on CPU-only machines.
profile_on_cuda = torch.cuda.is_available()
with torch.autograd.profiler.profile(use_device='cuda' if profile_on_cuda else None) as prof:
    train_ppo_agent(env, agent)
print(prof.key_averages().table(sort_by="cuda_time_total" if profile_on_cuda else "cpu_time_total"))

  0%|          | 0/100 [00:00<?, ?it/s]