# GP2 Trading Model

### Imports

In [None]:
from data_processor import DataProcessor
from plot import backtest_stats, backtest_plot, get_baseline, get_daily_return, drop_dup_dates
import pickle
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import itertools

In [None]:
from config_private import ALPACA_API_KEY, ALPACA_API_SECRET
# Base URL of the Alpaca paper-trading endpoint (the host is "paper-api").
API_BASE_URL = 'https://paper-api.alpaca.markets'
from config_tickers import GP2_TICKER
from config import INDICATORS
from config import CDL

### Environment

In [None]:
import gymnasium as gym
import numpy as np
from numpy import random as rd


class StockTradingEnv(gym.Env):
    """Multi-stock trading environment exposing the Gymnasium ``Env`` API.

    Each step advances one row in the price/indicator arrays. The agent emits
    one value in ``[-1, 1]`` per stock; the value is scaled to a share count
    and executed as a sell (negative) or buy (positive). When the turbulence
    flag for the current row is set, every position is liquidated instead.

    The observation is the concatenation of scaled cash, scaled prices,
    scaled holdings, the per-stock cool-down counters and the scaled
    technical indicators of the current row (see ``get_state``).
    """

    def __init__(
        self,
        config,
        gamma=0.99,
        turbulence_thresh=99,
        max_stock=None,
        min_stock_rate=0.1,
        initial_capital=1e5,
        reward_scaling=2**-11,
        initial_stocks=None,
    ):
        """Build the environment from pre-computed market arrays.

        Args:
            config: Mapping with the keys ``"price_array"``, ``"tech_array"``,
                ``"turbulence_array"``, ``"date_array"`` and ``"if_train"``.
                ``price_array`` is indexed ``[step, stock]``; ``tech_array``
                and ``turbulence_array`` are indexed by step.
            gamma: Discount applied to the running ``gamma_reward``.
            turbulence_thresh: Rows whose turbulence exceeds this value
                trigger a full liquidation in ``step``.
            max_stock: Stored as-is. Note that ``step`` overwrites
                ``self.max_stock`` on every call, so this value is only
                visible before the first step.
            min_stock_rate: Fraction of ``max_stock`` an action must exceed
                (in absolute value) before a trade is executed.
            initial_capital: Starting cash.
            reward_scaling: Multiplier applied to the per-step asset change.
            initial_stocks: Starting holdings per stock (any array-like);
                defaults to zero holdings.
        """
        price_ary = config["price_array"]
        tech_ary = config["tech_array"]
        turbulence_ary = config["turbulence_array"]
        date_ary = config["date_array"]
        if_train = config["if_train"]

        self.price_ary = price_ary.astype(np.float32)
        # Indicators are down-scaled by 2**-7 once, up front.
        self.tech_ary = tech_ary.astype(np.float32) * 2**-7
        self.date_ary = date_ary  # stored only; not read by this class

        # 1.0 where turbulence exceeds the threshold, else 0.0.
        self.turbulence_bool = (turbulence_ary > turbulence_thresh).astype(np.float32)
        # Squashed and scaled turbulence; kept as an attribute but not part
        # of the observation built by get_state().
        self.turbulence_ary = (
            self.sigmoid_sign(turbulence_ary, turbulence_thresh) * 2**-5
        ).astype(np.float32)

        stock_dim = self.price_ary.shape[1]
        self.gamma = gamma
        self.max_stock = max_stock
        self.min_stock_rate = min_stock_rate
        self.reward_scaling = reward_scaling
        self.initial_capital = initial_capital
        # Coerce to an ndarray so that reset() can rely on .shape / .astype
        # even when the caller passes a plain list or tuple.
        self.initial_stocks = (
            np.zeros(stock_dim, dtype=np.float32)
            if initial_stocks is None
            else np.asarray(initial_stocks, dtype=np.float32)
        )

        # Per-episode state, populated by reset().
        self.current_step = None
        self.num_trades = None
        self.cash = None
        self.stocks = None
        self.stocks_cool_down = None
        self.total_assets = None
        self.gamma_reward = None
        self.initial_total_assets = None

        # Environment information.
        self.env_name = "StockEnv"
        # Length of the get_state() vector:
        #   cash (1) + price, holdings, cool-down (3 * stock_dim)
        #   + one row of technical indicators.
        tech_dim = int(np.prod(self.tech_ary.shape[1:]))
        self.state_dim = 1 + 3 * stock_dim + tech_dim
        self.action_dim = stock_dim
        self.max_step = self.price_ary.shape[0] - 1
        self.if_train = if_train
        self.if_discrete = False
        self.episode_return = 0.0
        self.observation_space = gym.spaces.Box(
            low=-5000, high=5000, shape=(self.state_dim,), dtype=np.float32
        )
        self.action_space = gym.spaces.Box(
            low=-1, high=1, shape=(self.action_dim,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Start a new episode at the first row of the data.

        In training mode the starting holdings get a random integer in
        ``[0, 17)`` added per stock and the starting capital is jittered by a
        factor in ``[0.95, 1.05)``; the cost of the random holdings is
        deducted from cash. In evaluation mode the configured holdings and
        capital are used unchanged.

        Args:
            seed: Forwarded to ``gym.Env.reset``.
            options: Accepted for API compatibility; unused.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.current_step = 0
        self.num_trades = 0
        price = self.price_ary[self.current_step]

        if self.if_train:
            # NOTE(review): these draws use NumPy's global RNG (``rd``), so
            # the ``seed`` argument does not make them reproducible.
            self.stocks = (
                self.initial_stocks + rd.randint(0, 17, size=self.initial_stocks.shape)
            ).astype(np.float32)
            self.stocks_cool_down = np.zeros_like(self.stocks)
            self.cash = (
                self.initial_capital * rd.uniform(0.95, 1.05)
                - (self.stocks * price).sum()
            )
        else:
            # astype() copies, so trading never mutates initial_stocks.
            self.stocks = self.initial_stocks.astype(np.float32)
            self.stocks_cool_down = np.zeros_like(self.stocks)
            self.cash = self.initial_capital

        self.total_assets = self.cash + (self.stocks * price).sum()
        self.initial_total_assets = self.total_assets
        self.gamma_reward = 0.0
        observation = self.get_state(price)
        info = {}
        return observation, info

    def step(self, action):
        """Advance one row and execute the requested trades.

        Args:
            action: Array with one value per stock; each value is multiplied
                by that stock's ``max_stock`` and rounded to a share count.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``reward`` is the scaled change in total assets, except on the
            final step where it is replaced by the accumulated
            ``gamma_reward``. ``truncated`` is always ``False`` and ``info``
            is an empty dict.
        """
        self.current_step += 1
        price = self.price_ary[self.current_step]
        # Per-stock position cap: the number of whole shares 100,000 buys at
        # the current price. This overwrites the constructor's max_stock.
        self.max_stock = np.round(np.floor(100_000 / price)).astype(int)
        action = np.round((action * self.max_stock)).astype(int)
        # Actions whose magnitude does not exceed this are ignored.
        min_action = np.round((self.max_stock * self.min_stock_rate)).astype(int)
        # NOTE(review): the counter is incremented before it is checked
        # below, so the ``> 0`` test appears to hold for every stock on every
        # step — confirm whether a longer cool-down was intended.
        self.stocks_cool_down += 1

        if self.turbulence_bool[self.current_step] == 0:

            # Sell logic: never sell more than is currently held.
            for index in np.where((action < -min_action) & (self.stocks_cool_down > 0))[0]:
                if price[index] > 0:
                    sell_num_shares = min(self.stocks[index], -action[index])
                    sell_value = price[index] * sell_num_shares
                    self.stocks[index] -= sell_num_shares
                    self.cash += sell_value
                    self.stocks_cool_down[index] = 0
                    self.num_trades += 1

            # Buy logic: never buy more than the remaining cash affords.
            # Stocks are processed in index order, so earlier indices get
            # first call on the available cash.
            for index in np.where((action > min_action) & (self.stocks_cool_down > 0))[0]:
                if price[index] > 0:
                    buy_num_shares = min(self.cash // price[index], action[index])
                    buy_value = price[index] * buy_num_shares
                    self.stocks[index] += buy_num_shares
                    self.cash -= buy_value
                    self.stocks_cool_down[index] = 0
                    self.num_trades += 1

        # Turbulence logic: liquidate every position at the current price.
        else:
            self.cash += (self.stocks * price).sum()
            self.num_trades += np.count_nonzero(self.stocks)
            self.stocks[:] = 0
            self.stocks_cool_down[:] = 0

        # Reward calculations.
        observation = self.get_state(price)
        total_assets = self.cash + (self.stocks * price).sum()
        reward = (total_assets - self.total_assets) * self.reward_scaling
        self.total_assets = total_assets
        self.gamma_reward = self.gamma_reward * self.gamma + reward
        terminated = self.current_step == self.max_step
        truncated = False
        info = {}
        if terminated:
            # The final step reports the accumulated discounted reward.
            reward = self.gamma_reward
            self.episode_return = total_assets / self.initial_total_assets

        return observation, reward, terminated, truncated, info

    def get_state(self, price):
        """Assemble the observation vector for the current step.

        Args:
            price: Price row for the current step (one value per stock).

        Returns:
            1-D array laid out as ``[cash * 2**-12, price * 2**-6,
            stocks * 2**-6, stocks_cool_down, tech_ary[current_step]]``.
        """
        cash = np.array(self.cash * (2**-12), dtype=np.float32)
        scale = np.array(2**-6, dtype=np.float32)
        observation = np.hstack(
            (
                cash,
                price * scale,
                self.stocks * scale,
                self.stocks_cool_down,
                self.tech_ary[self.current_step],
            )
        )
        return observation

    @staticmethod
    def sigmoid_sign(ary, thresh):
        """Squash ``ary`` with a zero-centred sigmoid scaled by ``thresh``.

        Computes ``s(ary / thresh) * thresh`` where
        ``s(x) = 1 / (1 + exp(-x * e)) - 0.5``.
        """
        def sigmoid(x):
            return 1 / (1 + np.exp(-x * np.e)) - 0.5

        return sigmoid(ary / thresh) * thresh

In [None]:
import ray
import ray.rllib.algorithms.ppo as ppo
from ray.tune.schedulers.pb2 import PB2 # Dependencies: pip install GPy sklearn
from ray.tune import sample_from

import datetime
%matplotlib inline

import YahooDownloader
import FeatureEngineer, data_split



import time


### DRL Agent Class

Note: 16 workers, 64 vectorized environment instances per worker.

In [None]:
import ray

from pprint import pprint

from ray import tune
import ray.rllib.algorithms.ppo as ppo
from ray.tune.schedulers.pb2 import PB2 # Dependencies: pip install GPy sklearn
from ray.rllib.algorithms import Algorithm
from ray.tune import register_env

from ray.air import RunConfig, FailureConfig
from ray.tune.tune_config import TuneConfig
from ray.air.config import CheckpointConfig
from ray.tune.callback import Callback

from typing import Dict, Optional, Any, List, Union


class DRLlibv2:
    """
    It instantiates an RLlib model with Ray Tune functionality
    Params
    -------------------------------------
    trainable:
        Any Trainable class that takes config as parameter, or the name of a
        registered trainable. A string is passed to Ray Tune verbatim.
    params: dict
        hyperparameters dictionary. It is copied; the caller's dict is not
        modified.
    train_env:
        Training environment instance
    train_env_name: str
        Name of the training environment
    run_name: str
        tune run name
    framework: str
        "torch" or "tf" for tensorflow
    local_dir: str
         to save the results and tensorboard plots
    num_workers: int
        number of workers
    num_samples: int
         Number of samples of hyperparameters config to run
    scheduler:
        Stopping suboptimal trials
    log_level: str = "WARN",
        Verbosity: "DEBUG"
    num_gpus: Union[float, int] = 1
        GPUs for trial; also passed to ray.init
    num_cpus: Union[float, int] = 20
        CPUs passed to ray.init
    dataframe_save: str
        Saving the tune results
    metric: str
        Metric for hyperparameter optimization in Bayesian Methods
    mode: str
        Maximize or Minimize the metric
    max_failures: int
        Number of failures to TuneError
    timeout: int
        Number of seconds to run the experiment
    checkpoint_num_to_keep: int
        Number of checkpoints to keep
    checkpoint_freq: int
        Checkpoint freq wrt training iterations
    reuse_actors:bool
        Reuse actors for tuning
    callbacks:
        callbacks integration for ray tune

    It has the following methods:
    Methods
    -------------------------------------
        train_tune_model: It takes in the params dictionary and fits in sklearn style to our trainable class
        restore_agent: It restores previously errored or stopped trials or experiments
        infer_results: It returns the results dataframe and trial informations
        get_test_agent: It returns the testing agent for inference

    Example
    ---------------------------------------
    def sample_ppo_params():
        return {
            "entropy_coeff": tune.loguniform(0.00000001, 0.1),
            "lr": tune.loguniform(5e-5, 0.001),
            "sgd_minibatch_size": tune.choice([ 32, 64, 128, 256, 512]),
            "lambda": tune.choice([0.1,0.3,0.5,0.7,0.9,1.0]),
        }
    drl_agent = DRLlibv2(
        trainable="PPO",
        train_env=env(train_env_config),
        train_env_name="StockTrading_train",
        framework="torch",
        log_level="DEBUG",
        run_name = 'test',
        local_dir = "test",
        params = sample_ppo_params(),
        num_samples = 16,
        timeout=3600,
        checkpoint_freq=5
    )
    #Tune or train the model
    res = drl_agent.train_tune_model()

    #Get the tune results
    results_df, best_result = drl_agent.infer_results()

    #Get the best testing agent
    test_agent = drl_agent.get_test_agent(test_env_instance,'StockTrading_testenv')
    """

    def __init__(
        self,
        trainable: Union[str, Any],
        params: dict,
        train_env=None,
        train_env_name: str='',
        run_name: str = "tune_run",
        framework: str = "torch",
        local_dir: str = "tune_results",
        num_workers: int = 16,
        num_samples: int = 0,
        scheduler=None,
        log_level: str = "WARN",
        num_gpus: Union[float, int] = 1,
        num_cpus: Union[float, int] = 20,
        dataframe_save: str = "tune.csv",
        metric: str = "episode_reward_mean",
        mode: Union[str, List[str]] = "max",
        max_failures: int = 0,
        timeout: int = 3600,
        checkpoint_num_to_keep: Union[None, int] = None,
        checkpoint_freq: int = 0,
        reuse_actors: bool = True,
        callbacks:Optional[List["Callback"]]=None
    ):
        """
        Store the tuning configuration and register the training environment
        (only when `train_env` is given).
        """
        if train_env is not None:
            # The creator returns the one instance that was passed in.
            register_env(train_env_name, lambda config: train_env)

        # Work on a shallow copy so that the caller's dict is not modified and
        # two agents built from the same dict do not share settings.
        self.params = dict(params)
        self.params["framework"] = framework
        self.params["log_level"] = log_level
        self.params["num_gpus"] = num_gpus
        self.params["num_workers"] = num_workers
        self.params["env"] = train_env_name

        self.run_name = run_name
        self.local_dir = local_dir
        self.scheduler = scheduler
        self.num_samples = num_samples
        # A string name is kept exactly as given. Upper-casing it would break
        # mixed-case registered names, so no case normalisation is applied.
        self.trainable = trainable
        self.num_cpus = num_cpus
        self.num_gpus = num_gpus
        self.dataframe_save = dataframe_save
        self.metric = metric
        self.mode = mode
        self.max_failures = max_failures
        self.timeout = timeout
        self.checkpoint_freq = checkpoint_freq
        self.checkpoint_num_to_keep = checkpoint_num_to_keep
        self.reuse_actors = reuse_actors
        self.callbacks = callbacks

        # Set by train_tune_model() / restore_agent(); read by the other methods.
        self.results = None

    def train_tune_model(self):
        """
        Tuning and training the model
        Returns the results object
        """
        ray.init(
            num_cpus=self.num_cpus, num_gpus=self.num_gpus, ignore_reinit_error=True
        )

        tuner = tune.Tuner(
            self.trainable,
            param_space=self.params,
            tune_config=TuneConfig(
                num_samples=self.num_samples,
                metric=self.metric,
                mode=self.mode,
                time_budget_s=self.timeout,
                reuse_actors=self.reuse_actors,
            ),
            run_config=RunConfig(
                name=self.run_name,
                local_dir=self.local_dir,
                callbacks=self.callbacks,
                failure_config=FailureConfig(
                    max_failures=self.max_failures, fail_fast=False
                ),
                checkpoint_config=CheckpointConfig(
                    num_to_keep=self.checkpoint_num_to_keep,
                    checkpoint_score_attribute=self.metric,
                    checkpoint_score_order=self.mode,
                    checkpoint_frequency=self.checkpoint_freq,
                    checkpoint_at_end=True,
                ),
                verbose=3,
            ),
        )

        self.results = tuner.fit()

        return self.results

    def infer_results(self, to_dataframe: Optional[str] = None, mode: str = "a"):
        """
        Get tune results in a dataframe and best results object

        to_dataframe: CSV path to write; defaults to `dataframe_save`
        mode: file mode passed to DataFrame.to_csv ("a" appends)
        Requires train_tune_model() or restore_agent() to have run first.
        """
        results_df = self.results.get_dataframe()

        if to_dataframe is None:
            to_dataframe = self.dataframe_save

        results_df.to_csv(to_dataframe, mode=mode)

        best_result = self.results.get_best_result()

        return results_df, best_result

    def restore_agent(
        self,
        checkpoint_path: str = "",
        resume_unfinished: bool = True,
        resume_errored: bool = False,
        restart_errored: bool = False,
    ):
        """
        Restore errored or stopped trials

        When `checkpoint_path` is empty, the path of the best result's
        checkpoint from the last run is used.
        """
        if checkpoint_path == "":
            # NOTE(review): `_local_path` is a private attribute of the Ray
            # checkpoint object — confirm it exists in the installed version.
            checkpoint_path = self.results.get_best_result().checkpoint._local_path

        restored_agent = tune.Tuner.restore(
            checkpoint_path,
            restart_errored=restart_errored,
            resume_unfinished=resume_unfinished,
            resume_errored=resume_errored,
        )
        print(restored_agent)
        self.results = restored_agent.fit()

        return self.results

    def get_test_agent(self, test_env, test_env_name: str, checkpoint=None):
        """
        Get test agent

        Registers `test_env` under `test_env_name` (when given) and loads the
        algorithm from `checkpoint`, defaulting to the best result's
        checkpoint from the last run.
        """
        if test_env is not None:
            register_env(test_env_name, lambda config: test_env)

        if checkpoint is None:
            checkpoint = self.results.get_best_result().checkpoint

        testing_agent = Algorithm.from_checkpoint(checkpoint)

        return testing_agent

### Preprocessing

In [None]:
# Universe of tickers to trade; the agent emits one action per ticker.
ticker_list = GP2_TICKER
action_dim = len(ticker_list)

In [None]:
# Show the ticker universe for a quick visual check.
print(ticker_list)

In [None]:
# Show the INDICATORS setting imported from config.
print(INDICATORS)

In [None]:
# Show the CDL setting imported from config.
print(CDL)

In [None]:
# NOTE(review): presumably the length of the observation vector built by
# StockTradingEnv.get_state() — confirm it matches the environment's state_dim
# whenever the ticker list or indicator set changes.
state_dim = 109