<a href="https://colab.research.google.com/github/gitHubAndyLee2020/ExchangeAgent/blob/main/stock_exchange_decision_transformer_agent_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Stock Exchange Decision Transformer Agent v1

In [None]:
!pip install datasets
!pip install gym-anytrading
!pip install accelerate -U

### Handling Training Data

In [None]:
import gym_anytrading

# Work on a copy of the GOOGL table that ships with gym_anytrading so the
# library's own object is never modified by later cells.
df = gym_anytrading.datasets.STOCKS_GOOGL.copy()

In [None]:
# Preview the first 20 rows of the price table.
df.head(20)

In [None]:
# Preview the last 20 rows of the price table.
df.tail(20)

In [None]:
import os
import random
from datasets import load_dataset
import numpy as np

In [None]:
# Set the WANDB_DISABLED flag before any training code runs — presumably so the
# Hugging Face Trainer does not try to log to Weights & Biases (TODO confirm
# against the installed transformers version).
os.environ["WANDB_DISABLED"] = "true"
# Download the trajectory dataset from the Hugging Face Hub.
dataset = load_dataset("techandy42/ppo-200K-collected-dataset-steps-1")

In [None]:
# Display the object returned by load_dataset.
dataset

### Before Reshaping

In [None]:
# Take the first sample from the dataset (observation not yet reshaped)
first_sample = dataset['train'][0]

# Shallow copy of the sample's fields (field name -> value)
first_sample_values = {key: value for key, value in first_sample.items()}

# Print one "field: value" line per field
for key, value in first_sample_values.items():
    print(f"{key}: {value}")

In [None]:
# Take the last sample from the dataset (observation not yet reshaped)
last_sample = dataset['train'][-1]

# Shallow copy of the sample's fields (field name -> value)
last_sample_values = {key: value for key, value in last_sample.items()}

# Print one "field: value" line per field
for key, value in last_sample_values.items():
    print(f"{key}: {value}")

In [None]:
# Take a random sample from the dataset (observation not yet reshaped).
# The pick is unseeded, so a different sample is shown on every run.
random_sample = random.choice(dataset['train'])

# Shallow copy of the sample's fields (field name -> value)
random_sample_values = {key: value for key, value in random_sample.items()}

# Print one "field: value" line per field
for key, value in random_sample_values.items():
    print(f"{key}: {value}")

In [None]:
# Define a function to reshape the observation
def reshape_observation(example):
    """Collapse an example's observation to a single entry.

    The observation is replaced, in place, by the last item of its first
    element (``observation[0][-1]``); every other field is left untouched.
    Presumably the stored observation is a (1, window, features) nest and only
    the most recent window row is wanted — TODO confirm against the dataset.

    Args:
        example: Mapping with an ``'observation'`` entry indexable as
            ``[0][-1]``.

    Returns:
        The same ``example`` object, mutated.
    """
    leading_entry = example['observation'][0]
    example['observation'] = leading_entry[-1]
    return example

# Apply the function to each item in the dataset.
# NOTE(review): the result replaces `dataset`, so running this cell a second
# time would apply the reshape to already-reshaped observations — reload the
# dataset before re-running.
dataset = dataset.map(reshape_observation)

### After Reshaping

In [None]:
# Take the first sample from the dataset (after the observation reshape)
first_sample = dataset['train'][0]

# Shallow copy of the sample's fields (field name -> value)
first_sample_values = {key: value for key, value in first_sample.items()}

# Print one "field: value" line per field
for key, value in first_sample_values.items():
    print(f"{key}: {value}")

In [None]:
# Take the last sample from the dataset (after the observation reshape)
last_sample = dataset['train'][-1]

# Shallow copy of the sample's fields (field name -> value)
last_sample_values = {key: value for key, value in last_sample.items()}

# Print one "field: value" line per field
for key, value in last_sample_values.items():
    print(f"{key}: {value}")

In [None]:
# Take a random sample from the dataset (after the observation reshape).
# The pick is unseeded, so a different sample is shown on every run.
random_sample = random.choice(dataset['train'])

# Shallow copy of the sample's fields (field name -> value)
random_sample_values = {key: value for key, value in random_sample.items()}

# Print one "field: value" line per field
for key, value in random_sample_values.items():
    print(f"{key}: {value}")

### Action Type Count

In [None]:
# Dataset Action Type Analysis
# Running tally of how many examples fall into each action category.
action_type_count = dict.fromkeys(("short_no_reward", "short_reward", "long"), 0)


def count_action_type(example):
    """Classify one example by its first action/reward and bump the tally.

    An example whose first action equals 0 counts as "short_no_reward" when
    its first reward equals 0 and as "short_reward" otherwise; any other first
    action counts as "long". The module-level ``action_type_count`` dict is
    updated in place.

    Args:
        example: Mapping with indexable ``'action'`` and ``'reward'`` entries.

    Returns:
        The unchanged ``example`` (so the function can be used with ``map``).
    """
    if example['action'][0] != 0:
        category = "long"
    elif example['reward'][0] == 0:
        category = "short_no_reward"
    else:
        category = "short_reward"
    action_type_count[category] += 1
    return example

# The tally is a side effect of calling count_action_type on every example.
# `map` may otherwise reuse a cached result without calling the function at
# all, which would leave every counter at zero — so force a real pass.
dataset = dataset.map(count_action_type, load_from_cache_file=False)

In [None]:
# Report how many dataset examples fell into each action category.
for key, value in action_type_count.items():
    print(f"{key}: {value}")

### Define DataCollator

In [None]:
import random
import torch
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from transformers import DecisionTransformerConfig, DecisionTransformerModel, Trainer, TrainingArguments

In [None]:
@dataclass
class DecisionTransformerGymDataCollator:
    """Build padded, normalized training batches for a Decision Transformer.

    The collator keeps a reference to the whole trajectory dataset. On every
    call it ignores the *content* of the incoming ``features`` (only their
    count is used) and instead re-samples trajectories with probability
    proportional to their length, cuts a random window of at most ``max_len``
    steps from each, left-pads that window to ``max_len`` and returns tensors.

    The hand-written ``__init__`` takes the place of a dataclass-generated one,
    so the fields below act as fixed defaults that can be overridden by
    assigning to the instance after construction.

    Args:
        dataset: Object supporting ``dataset["observation"]`` (an iterable of
            per-trajectory observation sequences) and ``dataset[i]`` (one row
            as a mapping with ``"observation"``, ``"action"`` and ``"reward"``
            sequences).
    """

    return_tensors: str = "pt"
    max_len: int = 10  # subsets of the episode we use for training
    state_dim: int = 2  # size of state space
    act_dim: int = 1  # size of action space
    max_ep_len: int = 2324  # max episode length in the dataset
    scale: float = 1.0  # normalization of rewards/returns
    state_mean: np.ndarray = None  # to store state means
    state_std: np.ndarray = None  # to store state stds
    p_sample: np.ndarray = None  # a distribution to take account trajectory lengths
    n_traj: int = 0  # to store the number of trajectories in the dataset

    def __init__(self, dataset) -> None:
        self.dataset = dataset
        # calculate dataset stats for normalization of states
        states = []
        traj_lens = []
        for obs in dataset["observation"]:
            states.extend(obs)
            traj_lens.append(len(obs))
        self.n_traj = len(traj_lens)
        states = np.vstack(states)
        self.state_mean = np.mean(states, axis=0)
        # the epsilon keeps the later division finite for constant features
        self.state_std = np.std(states, axis=0) + 1e-6

        # longer trajectories are sampled proportionally more often
        traj_lens = np.array(traj_lens)
        self.p_sample = traj_lens / traj_lens.sum()

    def _discount_cumsum(self, x, gamma):
        """Return discounted suffix sums ``out[t] = x[t] + gamma * out[t + 1]``.

        Non-floating input is promoted to float64 first; otherwise a fractional
        ``gamma`` applied to integer rewards would be truncated when written
        back into an integer array.
        """
        x = np.asarray(x)
        if not np.issubdtype(x.dtype, np.floating):
            x = x.astype(np.float64)
        discount_cumsum = np.zeros_like(x)
        discount_cumsum[-1] = x[-1]
        for t in reversed(range(x.shape[0] - 1)):
            discount_cumsum[t] = x[t] + gamma * discount_cumsum[t + 1]
        return discount_cumsum

    def __call__(self, features):
        """Sample ``len(features)`` windows and collate them into tensors.

        Args:
            features: The batch handed over by the data loader; only its
                length is used.

        Returns:
            dict with ``states``, ``actions``, ``rewards``, ``returns_to_go``
            (float tensors), ``timesteps`` (long tensor) and
            ``attention_mask`` (float tensor, 1 for real steps and 0 for the
            left padding).
        """
        batch_size = len(features)
        # this is a bit of a hack to be able to sample of a non-uniform distribution
        batch_inds = np.random.choice(
            np.arange(self.n_traj),
            size=batch_size,
            replace=True,
            p=self.p_sample,  # reweights so we sample according to timesteps
        )
        # a batch of dataset features
        s, a, r, rtg, timesteps, mask = [], [], [], [], [], []

        for ind in batch_inds:
            feature = self.dataset[int(ind)]
            rewards = feature["reward"]
            # random window start inside the trajectory
            si = random.randint(0, len(rewards) - 1)
            end = si + self.max_len

            # get sequences from dataset
            states_i = np.array(feature["observation"][si:end]).reshape(1, -1, self.state_dim)
            actions_i = np.array(feature["action"][si:end]).reshape(1, -1, self.act_dim)
            rewards_i = np.array(rewards[si:end]).reshape(1, -1, 1)
            tlen = states_i.shape[1]
            pad = self.max_len - tlen

            # absolute step indices, clamped to the largest supported index
            timesteps_i = np.minimum(np.arange(si, si + tlen), self.max_ep_len - 1).reshape(1, -1)

            # undiscounted return-to-go from each step of the window
            rtg_i = self._discount_cumsum(np.array(rewards[si:]), gamma=1.0)[:tlen].reshape(1, -1, 1)
            missing = tlen - rtg_i.shape[1]
            if missing > 0:
                # fewer rewards than states: the return after the last reward is 0
                rtg_i = np.concatenate([rtg_i, np.zeros((1, missing, 1))], axis=1)

            # left-padding and state + reward normalization; states are
            # normalized after padding, so padded rows hold -mean/std (they
            # are excluded through the attention mask)
            states_i = np.concatenate([np.zeros((1, pad, self.state_dim)), states_i], axis=1)
            s.append((states_i - self.state_mean) / self.state_std)
            a.append(np.concatenate([np.full((1, pad, self.act_dim), -10.0), actions_i], axis=1))
            r.append(np.concatenate([np.zeros((1, pad, 1)), rewards_i], axis=1))
            rtg.append(np.concatenate([np.zeros((1, pad, 1)), rtg_i], axis=1) / self.scale)
            timesteps.append(np.concatenate([np.zeros((1, pad)), timesteps_i], axis=1))
            mask.append(np.concatenate([np.zeros((1, pad)), np.ones((1, tlen))], axis=1))

        s = torch.from_numpy(np.concatenate(s, axis=0)).float()
        a = torch.from_numpy(np.concatenate(a, axis=0)).float()
        r = torch.from_numpy(np.concatenate(r, axis=0)).float()
        rtg = torch.from_numpy(np.concatenate(rtg, axis=0)).float()
        timesteps = torch.from_numpy(np.concatenate(timesteps, axis=0)).long()
        mask = torch.from_numpy(np.concatenate(mask, axis=0)).float()

        return {
            "states": s,
            "actions": a,
            "rewards": r,
            "returns_to_go": rtg,
            "timesteps": timesteps,
            "attention_mask": mask,
        }

### Define Model

In [None]:
class TrainableDT(DecisionTransformerModel):
    """Decision Transformer whose ``forward`` returns a binary-action loss.

    ``forward`` runs the parent model, keeps only the action predictions at
    positions where ``attention_mask`` is positive, and returns the binary
    cross-entropy between those predictions (treated as logits) and the target
    actions. ``original_forward`` exposes the parent's unmodified forward for
    inference.
    """

    def __init__(self, config):
        super().__init__(config)
        # One tensor of predicted action probabilities per forward call, kept
        # only for later inspection.
        self.action_probs_list = []

    def forward(self, **kwargs):
        """Compute the training loss.

        Args:
            **kwargs: Forwarded unchanged to the parent ``forward``; must
                contain ``"actions"`` (targets) and ``"attention_mask"``.

        Returns:
            dict with a single ``"loss"`` entry.
        """
        output = super().forward(**kwargs)

        # Binary action prediction
        action_preds = output[1]
        action_targets = kwargs["actions"]
        attention_mask = kwargs["attention_mask"]
        act_dim = action_preds.shape[2]

        # Drop padded positions so they do not contribute to the loss
        valid = attention_mask.reshape(-1) > 0
        action_preds = action_preds.reshape(-1, act_dim)[valid]
        action_targets = action_targets.reshape(-1, act_dim)[valid]

        # Record probabilities detached and on the CPU: storing the live tensor
        # would keep every step's autograd graph (and device memory) alive for
        # the whole training run.
        self.action_probs_list.append(torch.sigmoid(action_preds).detach().cpu())

        # Sigmoid + binary cross-entropy in a single numerically stable call
        loss = F.binary_cross_entropy_with_logits(action_preds, action_targets)

        return {"loss": loss}

    def original_forward(self, **kwargs):
        """Run the parent model's forward pass without the loss wrapper."""
        return super().forward(**kwargs)

### Train Model

In [None]:
# The collator computes the state normalization statistics from the training
# split and carries the state/action sizes used to configure the model.
collator = DecisionTransformerGymDataCollator(dataset["train"])

# NOTE(review): all other config values are left at the library defaults.
# Presumably the default `action_tanh` squashes action outputs to (-1, 1)
# before TrainableDT applies its sigmoid, which would cap the predicted
# probabilities — TODO confirm and consider `action_tanh=False`.
config = DecisionTransformerConfig(state_dim=collator.state_dim, act_dim=collator.act_dim)
model = TrainableDT(config)

In [None]:
# Hyper-parameters for the training run; checkpoints are written to output/.
training_args = TrainingArguments(
    output_dir="output/",
    remove_unused_columns=False,  # hand the rows to the collator with all their columns
    num_train_epochs=140,
    per_device_train_batch_size=64,
    learning_rate=1e-4,
    weight_decay=1e-4,
    warmup_ratio=0.1,
    optim="adamw_torch",
    max_grad_norm=0.25,
)

# The collator only uses the size of each batch it receives and draws its own
# samples from the dataset it was constructed with, so `train_dataset` mainly
# determines how many batches make up an epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=collator,
)

trainer.train()

In [None]:
# Examine action_probs values across training steps:
# print every 100th tensor recorded by TrainableDT.forward
for i in range(0, len(model.action_probs_list), 100):
  print(model.action_probs_list[i])

### Test Model

In [None]:
# Function that gets an action from the model using autoregressive prediction
# with a window of the previous `model.config.max_length` timesteps.
def get_action(model, states, actions, rewards, returns_to_go, timesteps):
    """Predict the next binary action from the episode history.

    Each history tensor is reshaped to a batch of one, truncated to the last
    ``model.config.max_length`` steps, left-padded with zeros to exactly that
    length, and passed to ``model.original_forward`` together with an
    attention mask that is 0 on the padding and 1 on real steps.

    Args:
        model: Object exposing ``config`` (``state_dim``, ``act_dim``,
            ``max_length``), ``device`` and ``original_forward``.
        states: State history, reshaped to ``(1, -1, state_dim)``.
        actions: Action history, reshaped to ``(1, -1, act_dim)``.
        rewards: Unused; kept so the signature matches the caller.
        returns_to_go: Return-to-go history, reshaped to ``(1, -1, 1)``.
        timesteps: Step indices, reshaped to ``(1, -1)``.

    Returns:
        Float tensor of shape ``(act_dim,)`` holding 1.0 where the sigmoid of
        the last step's action prediction exceeds 0.5 and 0.0 elsewhere.
    """
    config = model.config
    device = model.device  # Get the device from the model
    max_length = config.max_length

    # Reshape inputs and keep only the most recent `max_length` steps
    states = states.reshape(1, -1, config.state_dim)[:, -max_length:]
    actions = actions.reshape(1, -1, config.act_dim)[:, -max_length:]
    returns_to_go = returns_to_go.reshape(1, -1, 1)[:, -max_length:]
    timesteps = timesteps.reshape(1, -1)[:, -max_length:]

    seq_len = states.shape[1]
    padding = max_length - seq_len

    # 0 marks left padding, 1 marks real steps
    attention_mask = torch.cat(
        (torch.zeros(padding, device=device), torch.ones(seq_len, device=device)),
        dim=0,
    )
    attention_mask = attention_mask.to(dtype=torch.long).reshape(1, -1)

    # Pad all tokens to sequence length; every piece is created on / moved to
    # the model's device before concatenation
    states = torch.cat(
        (torch.zeros((1, padding, config.state_dim), device=device), states.to(device)),
        dim=1,
    ).float()
    actions = torch.cat(
        (torch.zeros((1, padding, config.act_dim), device=device), actions.to(device)),
        dim=1,
    ).float()
    returns_to_go = torch.cat(
        (torch.zeros((1, padding, 1), device=device), returns_to_go.to(device)),
        dim=1,
    ).float()
    timesteps = torch.cat(
        (torch.zeros((1, padding), dtype=torch.long, device=device), timesteps.to(device)),
        dim=1,
    )

    # Inference only: skip building the autograd graph
    with torch.no_grad():
        _, action_preds, _ = model.original_forward(
            states=states,
            actions=actions,
            returns_to_go=returns_to_go,
            timesteps=timesteps,
            attention_mask=attention_mask,
            return_dict=False,
        )

    # Apply sigmoid to the last timestep's action prediction and threshold to get a binary action
    last_action_pred = torch.sigmoid(action_preds[0, -1])
    return (last_action_pred > 0.5).float()

In [None]:
import numpy as np
import gymnasium as gym
import gym_anytrading

In [None]:
# Build the trading environment the trained model is evaluated in.
env_id = "stocks-v0"

df = gym_anytrading.datasets.STOCKS_GOOGL.copy()

# Start right after the first full window so every observation has
# `window_size` rows of history, and run to the end of the table.
window_size = 10
start_index = window_size
end_index = len(df)

env = gym.make(
    env_id,
    df=df,
    window_size=window_size,
    frame_bound=(start_index, end_index)
)

print("observation_space:", env.observation_space)

In [None]:
max_ep_len = 2324
# Use the GPU when one is present; fall back to the CPU instead of failing on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
scale = 1.0  # normalization for rewards/returns
TARGET_RETURN = 2000 / scale  # evaluation is conditioned on a return of 2000, scaled accordingly

# Reuse the normalization statistics computed on the training data
state_mean = collator.state_mean.astype(np.float32)
state_std = collator.state_std.astype(np.float32)
print(state_mean)

# Size of one flattened environment observation as used by the evaluation
# loop. NOTE(review): this differs from the model's configured state_dim
# (collator.state_dim); get_action re-splits the flattened vector into
# model-sized states — confirm this is intended.
state_dim = 20
act_dim = 1

state_mean = torch.from_numpy(state_mean).to(device=device)
state_std = torch.from_numpy(state_std).to(device=device)

In [None]:
# Interact with the environment
# Evaluation only: switch dropout off so the predictions are deterministic
# (the model is still in training mode after trainer.train()).
model.eval()

episode_return, episode_length = 0, 0
state, _ = env.reset()
state = state.flatten()
# Histories grow by one entry per step; get_action windows them itself
target_return = torch.tensor(TARGET_RETURN, device=device, dtype=torch.float32).reshape(1, 1)
states = torch.from_numpy(state).reshape(1, state_dim).to(device=device, dtype=torch.float32)
actions = torch.zeros((0, act_dim), device=device, dtype=torch.float32)
rewards = torch.zeros(0, device=device, dtype=torch.float32)
model_action_type_count = {
    "short_no_reward": 0,
    "short_reward": 0,
    "long": 0,
}

timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)
for t in range(max_ep_len):
    # Placeholder slots for the step about to be taken
    actions = torch.cat([actions, torch.zeros((1, act_dim), device=device)], dim=0)
    rewards = torch.cat([rewards, torch.zeros(1, device=device)])

    with torch.no_grad():
        action = get_action(
            model,
            (states - state_mean) / state_std,
            actions,
            rewards,
            target_return,
            timesteps,
        )
    actions[-1] = action
    action = action.detach().cpu().numpy()
    # Choose random action for each step
    # action = np.array([random.choice([0, 1])])

    # The environment reports termination and truncation separately; the
    # episode is over when either flag is set.
    state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    state = state.flatten()

    if action[0] == 0:
        if reward == 0:
            model_action_type_count["short_no_reward"] += 1
        else:
            model_action_type_count["short_reward"] += 1
    elif action[0] == 1:
        model_action_type_count["long"] += 1

    cur_state = torch.from_numpy(state).to(device=device).reshape(1, state_dim)
    states = torch.cat([states, cur_state], dim=0)
    rewards[-1] = reward

    # The return still to be achieved shrinks by the reward just received
    pred_return = target_return[0, -1] - (reward / scale)
    target_return = torch.cat([target_return, pred_return.reshape(1, 1)], dim=1)
    timesteps = torch.cat([timesteps, torch.ones((1, 1), device=device, dtype=torch.long) * (t + 1)], dim=1)

    episode_return += reward
    episode_length += 1

    if done:
        break

In [None]:
# Report how many evaluation steps fell into each action category.
for key, value in model_action_type_count.items():
    print(f"{key}: {value}")

In [None]:
# Summarize the evaluation episode: summed reward and number of steps taken.
print(f"Episode return: {episode_return}")
print(f"Episode length: {episode_length}")