## Submission Requirements/Details

- Load train.csv into a pandas dataframe
- Train an RL agent using our custom gym environment*
- Save model to disk 
- Edit main.py to use model in step function (see sample main.py for details)
- Zip main.py and your model together and submit on [tamudatathon.com/koth](https://tamudatathon.com/koth)
- Note your score and try again!

*The custom env is provided in this notebook and in util.py. Feel free to modify the env implementation (such as the reward function) to improve performance.

---

## About This Notebook

This notebook does several things:
- **Creates** a custom gym environment to make RL agent training easy
- **Validates** and tests the custom gym environment
- **Downloads** sample data (not the stock actually used for the challenge) and cleans it for use
- **Trains** a basic agent to play the trading game 
- **Tests** the agent to see how much money it makes!

You can download this notebook and run it locally on the training dataset so you can train a model for your real submission.

One last note: this custom gym environment only accepts a basic BUY, SELL, or HOLD action, not a tuple containing both an action and a fraction. You'll have to modify your final implementation to make use of the fraction feature (or don't, and simply set fraction = 1).


Good luck!

## Custom Gym Environment

In [None]:
import numpy as np
import gym
from gym import spaces
import pandas as pd

In [None]:
class DeepStockTraderEnv(gym.Env):
    """
    Custom Environment that follows gym interface.

    This environment enables agents to make a decision at every timestep in
    a historical stock environment. Each row of the supplied data is one
    timestep, and the full row is handed to the agent as its observation.

    The reward function is defined by how much money the bot made in a
    particular timestep: account value at close minus account value at open.
    (This is 0 in cases where no shares are held.)

    Actions are all-or-nothing:
      * BUY  converts all cash into shares at the timestep's open price
      * SELL converts all shares into cash at the timestep's open price
      * HOLD leaves the position untouched
    """

    metadata = {'render.modes': ['console']}

    BUY = 0
    SELL = 1
    HOLD = 2

    def __init__(self, pd_data, starting_cash=1000):
        """
        Build the environment from a table of historical data.

        Args:
            pd_data: pandas DataFrame with one row per timestep. Column names
                are lower-cased for lookup; "open" and "close" columns are
                required by `total` and `step`.
            starting_cash: cash the account holds at the start of every
                episode (defaults to 1000).
        """
        super().__init__()

        self.data = pd_data.values
        self.columns_map = {c.lower(): i for i, c in enumerate(pd_data.columns)}
        self.row_size = len(self.columns_map)
        self.starting_cash = starting_cash

        # Every feature shares the same bounds: the global min / max of the
        # whole table.
        self.observation_space = spaces.Box(
            low=np.full(self.row_size, np.min(self.data)),
            high=np.full(self.row_size, np.max(self.data)),
            shape=(self.row_size,),
            dtype=np.float64,
        )

        self.action_space = spaces.Discrete(3)

        # Variables that track the bot's current state
        self._clear_account()
        self.timestep = 0  # cur index of row/timestep in dataset

    def _clear_account(self):
        """Reset cash, holdings, action counters and the value history."""
        self.n_shares = 0  # num of shares currently held
        self.cash = self.starting_cash
        self.n_buys = 0    # num of buys
        self.n_sells = 0   # num of sells
        self.n_holds = 0   # num of holds
        self.account_vals = []  # list tracking the account performance over time

    def reset(self):
        """
        Start a new episode.

        Returns:
            A copy of the first data row, which is the first observation.
        """
        self._clear_account()
        # Row 0 is handed out as the first observation, so the first action
        # is executed against row 1.
        self.timestep = 1

        return np.copy(self.data[0])

    def total(self, timestep=-1, open=True):
        """
        Return the account value (cash + shares * price) at `timestep`.

        Args:
            timestep: row index whose price is used (default -1, the last row).
            open: price the shares with the "open" column when True, with the
                "close" column otherwise. (The name shadows the builtin but is
                kept because callers pass it by keyword.)
        """
        price_column = self.columns_map["open" if open else "close"]
        return self.cash + self.n_shares * self.data[timestep, price_column]

    def step(self, action):
        """
        Execute `action` at the current timestep and advance by one row.

        Args:
            action: one of `BUY`, `SELL` or `HOLD`.

        Returns:
            Tuple `(observation, reward, done, info)` where `observation` is a
            copy of the next data row, `reward` is the change in account value
            between open and close of the current row, `done` is True once the
            returned observation is the last row, and `info` holds the action
            counters plus current cash and share count.

        Raises:
            ValueError: if `action` is not a known action.
            IndexError: if there is no row left to return as the next
                observation (the episode is over; call `reset()`).
        """
        # Fail before touching the account so an exhausted episode does not
        # leave half-updated state behind.
        if self.timestep >= len(self.data) - 1:
            raise IndexError(
                f"No data left to step at timestep {self.timestep} "
                f"(dataset has {len(self.data)} rows); call reset()"
            )

        # ********************** EXECUTE ACTION **********************
        open_price = self.data[self.timestep, self.columns_map["open"]]
        if action == self.BUY:
            self.n_shares += self.cash / open_price
            self.cash = 0
            self.n_buys += 1
        elif action == self.SELL:
            self.cash += self.n_shares * open_price
            self.n_shares = 0
            self.n_sells += 1
        elif action == self.HOLD:
            self.n_holds += 1
        else:
            raise ValueError(f"Illegal Action value: {action}")

        self.account_vals.append(self.total(self.timestep))
        # ************************************************************

        # IMPORTANT
        # We define reward to be (total account value at close) - (total account value at open)
        # Basically your reward is the amount gained over the course of the day
        reward = self.total(self.timestep, open=False) - self.total(self.timestep)

        # The episode ends when the observation about to be returned is the
        # last row: there would be no further row to observe after acting on it.
        done = self.timestep + 1 >= len(self.data) - 1
        info = {
            "n_buys": self.n_buys,
            "n_sells": self.n_sells,
            "n_holds": self.n_holds,
            "cash": self.cash,
            "n_shares": self.n_shares
        }

        self.timestep += 1

        return np.copy(self.data[self.timestep]), reward, done, info

    def render(self, mode='console'):
        """
        Print the current account state.

        Raises:
            NotImplementedError: for any mode other than 'console'.
        """
        if mode != 'console':
            raise NotImplementedError()

        print(f"------------Step {self.timestep}------------")
        print(f'total:   \t{self.total(self.timestep)}')
        print(f'cash:    \t{self.cash}')
        print(f'n_shares:\t{self.n_shares}')
        print(f'n_buys:  \t{self.n_buys}')
        print(f'n_sells:\t{self.n_sells}')
        print(f'n_holds:\t{self.n_holds}')

## Data Collection and Cleaning

In [None]:
# Notebook magic: select TensorFlow 1.x for this runtime
# (presumably because stable-baselines 2.x targets TF1 -- confirm against its install docs).
%tensorflow_version 1.x
# Install the pinned RL library plus the packages imported below as
# `yfinance` (stock data download) and `pandas_ta` (dataframe `.ta` accessor).
!pip install stable-baselines[mpi]==2.10.0
!pip install yfinance
!pip install pandas-ta

In [None]:
from stable_baselines.common.env_checker import check_env

In [None]:
import pandas_ta as pdt
import yfinance as yf
from datetime import datetime, timedelta

# GET STOCK DATA
# Snapshot "now" once so both ends of the window are measured from the same
# instant.
window_end = datetime.now()
stonk = yf.Ticker('CANF')
df = stonk.history(start=window_end - timedelta(days=2000), end=window_end)
# Run every pandas-ta indicator; the result is not assigned, so the code below
# relies on the new columns being added to df itself.
df.ta.strategy("all")

# Clean data
# Percentage (0-100) of missing values in each column.
percent_missing = df.isnull().sum() * 100 / len(df)
# Kept as a small report table for inspection in the notebook.
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})
# Drop every column with more than 0.1 % missing values in one vectorized
# call, then drop whatever rows still contain a gap.
sparse_columns = percent_missing.index[percent_missing > 0.1]
df = df.drop(columns=sparse_columns)
df = df.dropna()
df

## Env Validation and Testing

In [None]:
# Build the environment from the cleaned dataframe
env = DeepStockTraderEnv(df)
# If the environment doesn't follow the gym interface, an error will be thrown
check_env(env, warn=True)

In [None]:
import random
# Action codes, with the same values as DeepStockTraderEnv.BUY / SELL / HOLD
BUY, SELL, HOLD = 0, 1, 2

obs = env.reset()
env.render()

# Show what the agent observes and which actions it may take
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

# Smoke test: drive the env with uniformly random actions for a few steps
n_steps = 20
for step in range(n_steps):
  print(f"Step {step + 1}")
  random_action = random.randint(0, 2)
  obs, reward, done, info = env.step(random_action)
  env.render()
  if not done:
    continue
  print("Goal reached!", "reward=", reward)
  break

# Leave the env in a fresh state (trailing semicolon hides the cell output)
env.reset();

## Sample Training Loop

*See trainer.py for a PyTorch example built by Seth Hamilton*

In [None]:
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env

# Instantiate and wrap the env.
# The factory builds the environment itself instead of closing over the name
# `env`, which this very statement rebinds to the wrapped env; it also means
# each call to the factory yields its own DeepStockTraderEnv instance.
env = make_vec_env(lambda: DeepStockTraderEnv(df), n_envs=1)

In [None]:
# Build a DQN agent with an MLP policy on the wrapped env, then train it
# for 10000 environment steps.
model = DQN(policy='MlpPolicy', env=env, verbose=1)
model = model.learn(total_timesteps=10000)

In [None]:
# Test the trained agent: replay one full episode with the greedy policy
obs = env.reset()
timestep = 1  # number of steps taken so far in this episode
while True:
  action, _ = model.predict(obs, deterministic=True)
  obs, reward, done, info = env.step(action)

  env.render(mode='console')
  if done:
    print("Goal reached!", "reward=", reward)
    # NOTE: stable-baselines vectorized envs reset themselves as soon as an
    # episode ends, so render() now shows the fresh episode. The account state
    # of the finished episode is only available through `info`.
    final_info = info[0] if isinstance(info, (list, tuple)) else info
    print(f"Episode finished after {timestep} steps; final account state: {final_info}")
    break

  timestep += 1
env.render(mode='console')

In [None]:
# Annualized growth factor: (end / start) ** (1 / years), evaluated via logs.
# NOTE(review): 1908.20 is presumably the final account value of a run that
# started with 1000 over the 2000-day window downloaded above -- confirm.
end_value = 1908.20
start_value = 1000
n_days = 2000
years = n_days / 365
big_gain = np.exp(np.log(end_value / start_value) / years)
big_gain