<a href="https://colab.research.google.com/github/zcappai/AI-Stock-Trading/blob/main/AI_Trading_Bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
Pseudocode for Deep Q-Learning
State = (current share price, number of shares, remaining balance)
Actions = (buy, sell, hold)
Reward = Profit/loss from buying/selling/holding (change in portfolio value)
Goal = Maximise the change of the portfolio value (maximise profits)
epsilon = probability of taking a random action (initial value = 0.1, but can start at 1 with epsilon decay)
gamma = discount rate (0 <= gamma <= 1): closer to 1 weights future rewards more strongly; closer to 0 focuses on immediate rewards
alpha = learning rate (step-size) (0 <= alpha <= 1): closer to 1 means it learns quickly; closer to 0 means it learns slowly

The state could also be tested with additional values from the environment. e.g. the change in the stock price from the previous state.

-- Process --
State of stock price => buy/sell/hold => profit/loss from action

I shall start with training the model on the data of one company and expand to many companies

For each episode (year):
  Set the initial state
  For each step (day) in the episode (year):
    Choose an action from the learned action-value function Q (with epsilon probability of being random)
    Take the action and observe the returned reward and new state
    New Q-value = current Q-value + alpha [reward + gamma * maximum expected future reward − current Q-value]
    current state = new state
  Until the state is terminal (end of the year)

Should I factor in the number of shares bought/sold?

Different numbers of layers in the neural network need to be experimented with,
as do different numbers of units per layer.
"""

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import Huber
from keras.optimizers import Adam
import numpy as np
from datetime import datetime
from collections import deque
from enum import Enum
import random
from google.colab import drive
import pandas as pd

class Company(Enum):
  """Companies the agent can be trained on, keyed to their ticker symbol."""

  # The value is the ticker string; the script passes it to Agent as `company`.
  Apple = "AAPL"
 
class Action(Enum):
  """Trading actions available to the agent.

  Each value doubles as the index of that action's Q-value in the network
  output, so the values must stay contiguous and start at 0.
  """

  HOLD = 0  # leave the portfolio unchanged
  BUY = 1   # buy shares
  SELL = 2  # sell shares

# Mount Google Drive at /content/gdrive, forcing a remount if it is already
# mounted. Agent.get_stock_data reads its price CSVs from
# /content/gdrive/MyDrive/StockData/ underneath this mount point.
drive.mount('/content/gdrive', force_remount=True)

class Agent:
  """Deep Q-learning agent that trades the shares of a single company.

  The agent observes a three-value state (share price, shares held, cash
  balance), picks one of the ``Action`` values with an epsilon-greedy policy
  and learns action values with an online network plus a separate target
  network, training on mini-batches drawn from an experience-replay buffer.
  """

  # Order in which a state dictionary is flattened into network input.
  _STATE_KEYS = ("currentSharePrice", "numOfShares", "startingBalance")

  def __init__(self, currentSharePrice, startingBalance, company):
    """Set the hyper-parameters and starting portfolio, then build the networks.

    Args:
      currentSharePrice: Price of one share at the start of training.
      startingBalance: Cash available at the start of training.
      company: Ticker symbol; used to locate the company's price CSV.
    """
    # NOTE(review): the formatted name contains "/" separators, so it cannot
    # be used as a single file name as-is.
    self.modelName = datetime.now().strftime("D%d/%m/%Y_T%H/%M/%S_model")
    self.numOfActions = len(Action) # 0 = hold, 1 = buy, 2 = sell
    self.epsilon = 0.1 # probability of taking a random action
    self.gamma = 0.95 # focused on long term rewards
    self.alpha = 0.001 # learns more slowly
    self.initialState = {
        "currentSharePrice": currentSharePrice,
        "numOfShares": 0,
        "startingBalance": startingBalance,
    }
    self.currentSharePrice = currentSharePrice
    self.currentShares = 0
    self.balance = startingBalance
    self.actionQuantity = 10 # buy/sell 10 shares at a time
    self.experienceReplay = deque(maxlen=5000)
    self.miniBatchSize = 30 # transitions sampled per training update
    self.updateWeights = 30 # steps between target-network refreshes
    self.company = company
    self.generate_models()

  def get_stock_data(self):
    """Load the company's price history and return its closing prices.

    Reads ``/content/gdrive/MyDrive/StockData/<company>.csv``.

    Returns:
      The values of the CSV's "Close" column as a list, in file order.
    """
    stockData = pd.read_csv("/content/gdrive/MyDrive/StockData/{}.csv".format(self.company))
    return stockData["Close"].tolist()

  def _build_network(self):
    """Return a new, uncompiled Q-network: state in, one Q-value per action out."""
    network = Sequential()
    network.add(Dense(100, activation="relu", input_dim=len(self._STATE_KEYS)))
    for units in (200, 300, 200, 100):
      network.add(Dense(units, activation="relu"))
    # Linear output layer: Q-values are unbounded.
    network.add(Dense(self.numOfActions))
    return network

  def generate_models(self):
    """Create the online network and an independent target network.

    The two networks must be distinct objects: if the target shared the online
    network's weights it would move with every training step and could not
    provide the slowly-changing targets it exists for. Only the online network
    is compiled because only it is ever fitted.
    """
    self.model = self._build_network()
    self.model.compile(optimizer=Adam(learning_rate=self.alpha), loss=Huber(delta=1.35))
    self.target_model = self._build_network()
    # Start both networks from the same weights.
    self.equalise_weights()

  @classmethod
  def _state_vector(cls, state):
    """Return ``state`` (a dict) as a 1 x 3 array in ``_STATE_KEYS`` order."""
    return np.array([[state[key] for key in cls._STATE_KEYS]])

  def _choose_action(self, stateVector):
    """Pick an action index epsilon-greedily for ``stateVector``."""
    if np.random.uniform() < self.epsilon:
      return int(np.random.randint(0, self.numOfActions))
    qValues = self.model.predict(stateVector, verbose=0)
    return int(np.argmax(qValues[0]))

  def _apply_action(self, action):
    """Apply ``action`` to the portfolio and return ``(reward, newState)``.

    NOTE(review): BUY and SELL are both rewarded with the cash value of the
    trade, whereas the design notes at the top of the file describe the reward
    as the change in portfolio value. There is also no check that the balance
    covers a purchase or that enough shares are held to sell. Both are kept
    as-is here; confirm the intended behaviour.
    """
    tradeValue = self.actionQuantity * self.currentSharePrice
    if action == Action.BUY.value:
      self.balance -= tradeValue
      self.currentShares += self.actionQuantity
      reward = tradeValue
    elif action == Action.SELL.value:
      self.balance += tradeValue
      self.currentShares -= self.actionQuantity
      reward = tradeValue
    else:
      reward = 0

    newState = {
        "currentSharePrice": self.currentSharePrice,
        "numOfShares": self.currentShares,
        "startingBalance": self.balance,
    }
    return reward, newState

  def _replay(self):
    """Fit the online network on one mini-batch sampled from the replay buffer.

    The whole mini-batch is predicted and fitted in a single call each, rather
    than sample by sample, so one gradient update is made per environment step.
    """
    miniBatch = random.sample(self.experienceReplay, self.miniBatchSize)
    states, actions, rewards, newStates = zip(*miniBatch)
    states = np.vstack(states)
    newStates = np.vstack(newStates)
    actions = np.asarray(actions)
    rewards = np.asarray(rewards, dtype=float)

    # Start from the current predictions so that only the Q-value of the
    # action actually taken contributes to the loss.
    targets = np.array(self.model.predict(states, verbose=0))
    newStateMaxQ = self.target_model.predict(newStates, verbose=0).max(axis=1)
    targets[np.arange(len(miniBatch)), actions] = rewards + self.gamma * newStateMaxQ

    self.model.fit(states, targets, epochs=1, batch_size=self.miniBatchSize, verbose=2)

  def train(self, numOfSteps=100):
    """Run the trading loop for ``numOfSteps`` steps, learning as it goes.

    Args:
      numOfSteps: Number of environment steps to take (default 100).
    """
    # Begin every run from the initial portfolio so that the state fed to the
    # network and the bookkeeping attributes agree.
    self.currentSharePrice = self.initialState["currentSharePrice"]
    self.currentShares = self.initialState["numOfShares"]
    self.balance = self.initialState["startingBalance"]
    self.state = dict(self.initialState)

    for step in range(numOfSteps):
      if step % self.updateWeights == 0:
        self.equalise_weights()

      stateVector = self._state_vector(self.state)
      chosenAction = self._choose_action(stateVector)
      reward, newState = self._apply_action(chosenAction)
      self.experienceReplay.append(
          (stateVector, chosenAction, reward, self._state_vector(newState)))

      # Advance on every step, including the warm-up steps taken before the
      # buffer holds a full mini-batch.
      self.state = newState

      if len(self.experienceReplay) >= self.miniBatchSize:
        self._replay()

  def equalise_weights(self):
    """Copy the online network's weights into the target network."""
    self.target_model.set_weights(self.model.get_weights())

# Script entry point: configure one agent and train it.
# All monetary values are in Pound sterling (£)
currentSharePrice = 10  # price of one share handed to the agent
startingBalance = 1000  # cash the agent starts with
trainingCompany = Company.Apple.value  # ticker string "AAPL"
# Constructing the agent also builds its networks; train() then runs the loop.
agent = Agent(currentSharePrice, startingBalance, trainingCompany)
agent.train()