# 1. Libraries & Sample Data
The first step is to load our Python libraries and download the sample data. The dataset represents Apple stock prices (1d bars) for the year 2010.

In [None]:
# Load Python Libraries
import math
import keras
import random
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import deque
from tqdm.notebook import tqdm
from IPython.display import display, HTML

# Show every row when a DataFrame is rendered (no truncation).
pd.set_option("display.max_rows", None)


def display_df(df):
    """Render *df* as an HTML table inside a fixed-height, scrollable box.

    The wrapper is sized to its content (``width: fit-content``) so the
    scrollbar sits directly next to the table.
    """
    table_html = df.to_html()
    wrapped = f"<div style='height: 200px; overflow: auto; width: fit-content'>{table_html}</div>"
    display(HTML(wrapped))


In [None]:
# Download Sample Data (cleaned, not normalized, without features)
# The path is relative, so the CSV is read from the current working directory.
# The split step further down selects the 'Date' and 'Close' columns from it.
data = pd.read_csv('aapl_2010_3m_CLEAN.csv')

# 2. Train / Test Split
Now that we have our cleaned price dataset, we are ready to feed the data into our model. With this in mind, we select Close as our single training feature and split the data into train and test sets (80/20 split).

In [None]:
# Keep only the date and closing price, then cut the rows roughly 80% / 20%
# into a train set and a test set.
dataset = data[['Date', 'Close']]

training_rows = int(0.8 * len(dataset))
# .loc slices by label and includes the end label, so the test slice starts
# one row later to keep the two sets disjoint.
train_df, test_df = (
    part.set_index("Date")
    for part in (dataset.loc[:training_rows], dataset.loc[training_rows + 1:])
)

In [None]:
# Render both splits so they can be checked visually for overlap.
for frame in (train_df, test_df):
    display_df(frame)

In [None]:
# Turn both splits into float NumPy arrays for the model.
X_train, X_test = (
    frame.to_numpy().astype(float) for frame in (train_df, test_df)
)
# Echo the training array's shape as a reminder of how many examples and
# features the dataset has.
X_train.shape

# 3. Define the Agent
Now that our data is ready to use, we can define the Reinforcement Learning Agent.

### Define the DQN Model
The first step in defining our agent is the Deep Q-Network model definition. For this exercise, we are creating a sequential model with three layers. The first two layers have output shapes of 32 and 8, respectively, and a ReLU activation. The output layer has an output shape equal to the size of our action space (buy, sell, hold) and a linear activation. Our loss function is Mean Squared Error, and our optimizer is Adam with a learning rate of 0.001. Use Keras to build this model.

In [None]:
@keras.saving.register_keras_serializable()
class DQN(keras.Model):
    """Deep Q-Network mapping a state vector to one Q-value per action.

    The compiled network is exposed as ``self.model``:
    Dense(32, relu) -> Dense(8, relu) -> Dense(action_size, linear),
    trained with mean-squared-error loss and Adam (learning rate 0.001).

    Args:
        state_size: Number of input features per state.
        action_size: Number of actions, i.e. the width of the output layer.
    """

    def __init__(self, state_size, action_size):
        # A keras.Model subclass must initialise its base class before any
        # attribute (here the wrapped Sequential model) is assigned to it.
        super().__init__()

        model = keras.models.Sequential()
        # Explicit input spec; replaces the deprecated `input_dim` kwarg.
        model.add(keras.Input(shape=(state_size,)))
        # First hidden layer
        model.add(keras.layers.Dense(units=32, activation="relu"))
        # Second hidden layer
        model.add(keras.layers.Dense(units=8, activation="relu"))
        # Output layer: one linear Q-value per action
        model.add(keras.layers.Dense(action_size, activation="linear"))
        model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=0.001))

        self.model = model


### Define Agent Class
Now that we have defined our underlying DQN model, we must define our Reinforcement Learning Agent. The agent initialization is provided for you; you must define an act function and an experience replay function. As a reminder, the act function defines how our model will act (buy, hold, or sell) given a certain state. The experience replay function tackles catastrophic forgetting in our training process by maintaining a memory buffer that allows training on independent / randomized minibatches of previous states.

In [None]:
class Agent:
    """Epsilon-greedy trading agent backed by a DQN.

    Args:
        window_size: Number of features in a state; also used as the model's
            input width.
        is_eval: If true, load a saved model from ``model_name`` and always
            act greedily; otherwise build a fresh DQN and explore.
        model_name: Path of the saved model to load when ``is_eval`` is true.
    """

    def __init__(self, window_size, is_eval=False, model_name=""):
        # The state size is equal to the window size (n previous days).
        self.window_size = window_size
        self.state_size = window_size
        self.action_size = 3  # sit, buy, sell
        # Replay buffer; the deque drops the oldest entry beyond 1000.
        self.memory = deque(maxlen=1000)
        # Prices of the positions currently held.
        self.inventory = []
        self.model_name = model_name
        self.is_eval = is_eval

        self.gamma = 0.95  # discount applied to the best next-state Q-value
        self.epsilon = 1.0  # current probability of a random action
        self.epsilon_min = 0.01  # decay stops once epsilon is at or below this
        self.epsilon_decay = 0.995  # factor applied after each replay

        self.model = keras.models.load_model(model_name) if is_eval else self._model()

    def _model(self):
        """Build the compiled Q-network that maps a state to Q-values."""
        return DQN(self.state_size, self.action_size).model

    def _as_batch(self, state):
        """Reshape a state into the (1, window_size) batch the model expects."""
        return np.asarray(state).reshape(1, self.window_size)

    def act(self, state):
        """Choose an action index for ``state``.

        While training, a random action is returned with probability
        ``epsilon``; otherwise (and always in evaluation mode) the action
        with the highest predicted Q-value is returned.
        """
        if not self.is_eval and random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(self._as_batch(state), verbose=0)
        return int(np.argmax(q_values[0]))

    def expReplay(self, batch_size):
        """Train on a random mini-batch from memory, then decay epsilon.

        Args:
            batch_size: Number of remembered transitions to train on; capped
                at the number of transitions currently in memory.
        """
        # Uniform sampling without replacement gives a randomized mini-batch
        # of exactly batch_size transitions instead of only the newest ones.
        sample_size = min(batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, sample_size)

        for state, action, reward, next_state, done in mini_batch:
            # Terminal transitions use the raw reward as the target.
            target = reward
            if not done:
                next_q = self.model.predict(self._as_batch(next_state), verbose=0)
                # Reward plus the discounted best Q-value of the next state.
                target = reward + self.gamma * np.amax(next_q)

            inputs = self._as_batch(state)
            # Start from the current predictions so that only the Q-value of
            # the action actually taken is moved towards the target.
            target_f = self.model.predict(inputs, verbose=0)
            target_f[0][action] = target
            self.model.fit(inputs, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# 4. Train the Agent
Now that our agent is defined, we are ready to train it. 

### Helper Functions
Before we define the training loop, we will write some helper functions: one for printing price data, one to define the sigmoid function, one to grab the state representation, and one to plot the output of our trained model. The printing, sigmoid, and plotting functions are defined for you. You must define the function which gets the state representation.

In [None]:
# formats a price as a dollar string
def formatPrice(n):
    """Return *n* as a dollar amount with two decimals, e.g. ``-$3.50``."""
    sign = "-$" if n < 0 else "$"
    return f"{sign}{abs(n):.2f}"

# returns the sigmoid
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of a real number.

    The result lies in [0, 1]. The computation is arranged so that it never
    raises ``OverflowError``, however large the magnitude of ``x``.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For negative x, exp(-x) can exceed the float range; exp(x) is at most 1
    # here, so the algebraically equal form below is safe.
    z = math.exp(x)
    return z / (1 + z)

# returns an n-day state representation ending at time t
def getState(data, t, n):
    """Build the state for time ``t`` from the last ``n`` rows of ``data``.

    The state is the sigmoid of the row-to-row differences inside the
    window, computed per feature column. This matches applying the scalar
    ``sigmoid`` helper to each difference.

    Args:
        data: Array of shape (rows, features).
        t: Index of the last row in the window.
        n: Number of rows in the window.

    Returns:
        Array of shape (1, n - 1, features) with values in [0, 1].
    """
    data = np.asarray(data, dtype=float)
    start = t - n + 1
    if start >= 0:
        block = data[start:t + 1]
    else:
        # Not enough history yet: repeat the first row in front of the rows
        # that do exist, so the window still has n rows and keeps real data.
        padding = np.repeat(data[:1], -start, axis=0)
        block = np.concatenate([padding, data[:t + 1]], axis=0)

    deltas = np.diff(block, axis=0)
    # Logistic sigmoid written via tanh: same function, but it cannot
    # overflow for large-magnitude differences.
    squashed = 0.5 * (1.0 + np.tanh(0.5 * deltas))
    return squashed[np.newaxis, ...]

# Plots the behavior of the output
def plot_behavior(data_input, states_buy, states_sell, profit, train=True):
    """Plot the price series with buy/sell markers and the total gain as title.

    Args:
        data_input: Price series to draw.
        states_buy: Positions passed as ``markevery`` for the buy markers.
        states_sell: Positions passed as ``markevery`` for the sell markers.
        profit: Total gain shown in the figure title.
        train: If true, x tick labels come from the global ``train_df``
            index; otherwise from the global ``test_df`` index.
    """
    plt.figure(figsize=(15, 5))
    plt.plot(data_input, color='k', lw=2., label='Close Price')

    # (marker symbol, colour, legend label, marked positions)
    marker_specs = (
        ('^', 'r', 'Buying signal', states_buy),
        ('v', 'g', 'Selling signal', states_sell),
    )
    for symbol, colour, caption, positions in marker_specs:
        plt.plot(data_input, symbol, markersize=10, color=colour, label=caption, markevery=positions)

    plt.title('Total gains: %f' % profit)
    plt.legend()

    # One tick per row, labelled with that row's index value.
    tick_labels = (train_df if train else test_df).index.values
    plt.xticks(range(len(tick_labels)), tick_labels, rotation=45)

    plt.show()

### Training Loop

In [None]:
# display the shape of your training data to remind yourself how many features and examples there are in your training set
X_train.shape

In [None]:
# Turn off Keras' interactive (progress-bar) logging.
keras.utils.disable_interactive_logging()
from tqdm.notebook import tqdm
# Import Image directly: `from IPython import display` would rebind the
# `display` function that display_df relies on to the module object.
from IPython.display import Image

window_size = 1
agent = Agent(window_size)

# Write the network architecture to model.png and show it inline.
dot = keras.utils.model_to_dot(
    agent.model,
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
)
dot.write("model.png", format='png')

Image('model.png')

In [None]:
import time
keras.config.disable_traceback_filtering()

# number of steps per episode: one per row of X_train except the last, which
# is only used to build the final next_state
l = X_train[:,0].shape[0] - 1

# mini-batch size passed to expReplay; replay runs on every step once the
# agent's memory holds more than this many transitions
batch_size = 32

# An episode represents a complete pass over the data.
# NOTE: the loop below runs episodes 0..episode_count inclusive.
episode_count = 2

st = time.time()

for e in range(episode_count + 1):
    state = getState(X_train, 0, window_size + 1)
    total_profit = 0
    agent.inventory = []
    states_sell = []
    states_buy = []
    for t in tqdm(range(l), desc="Running episode " + str(e) + "/" + str(episode_count)):
        action = agent.act(state)
        next_state = getState(X_train, t + 1, window_size + 1)
        # reward stays 0 for sit, for buy, and for a sell with nothing held
        reward = 0

        if action == 1: # buy
            # buy at the price stored for step t
            buy_price = X_train[t].item()
            agent.inventory.append(buy_price)
            states_buy.append(t)
            print("Buy: " + formatPrice(buy_price))

        elif action == 2 and len(agent.inventory) > 0: # sell
            # sell the oldest held position (first in, first out)
            bought_price = agent.inventory.pop(0)
            sell_price = X_train[t].item()

            # losses are clipped to 0 for the reward but counted in full in
            # the running profit
            reward = max(sell_price - bought_price, 0)
            total_profit += sell_price - bought_price
            states_sell.append(t)
            print("Sell: " + formatPrice(sell_price) + " | Profit: " + formatPrice(sell_price - bought_price))

        done = t == l - 1
        # store the transition so expReplay can train on it later
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

        if done:
            print("--------------------------------")
            print("Total Profit: " + formatPrice(total_profit))
            print("--------------------------------")
            plot_behavior(X_train, states_buy, states_sell, total_profit)

        if len(agent.memory) > batch_size:
            agent.expReplay(batch_size)

    # Checkpoint every second episode, and always after the final one, so
    # that "model_ep<episode_count>.keras" exists even for an odd
    # episode_count.
    if e % 2 == 0 or e == episode_count:
        agent.model.save("model_ep" + str(e) + ".keras")

print("TOTAL TRAINING TIME", time.time()-st)

# 5. Test the trained model 
Finally, we get to test our trained model to see how well it performs in our test set. Using the training loop above, define a method to run our trained model on our X_test dataset. 

## Define Parameters
Some test parameters are defined for you below. Fill out the missing data. If you need a hint, look up at the training loop. 

In [None]:
# Evaluation bookkeeping: zero profit, nothing bought or sold yet.
total_profit = 0
done = False
states_buy_test = []
states_sell_test = []

# One step per row of X_test except the last, which is only used to build the
# final next_state.
l_test = len(X_test) - 1
state = getState(X_test, 0, window_size + 1)

# Load the checkpoint named after the final training episode, in eval mode.
agent = Agent(window_size, is_eval=True, model_name=f"model_ep{episode_count}.keras")
agent.inventory = []

In [None]:
for t in range(l_test):
    action = agent.act(state)
    next_state = getState(X_test, t + 1, window_size + 1)
    reward = 0  # stays 0 unless a sale is made

    if action == 1:  # buy
        # record a purchase at the price stored for step t
        buy_price = X_test[t].item()
        agent.inventory.append(buy_price)
        states_buy_test.append(t)
        print(f"Buy: {formatPrice(buy_price)}")

    elif action == 2 and agent.inventory:  # sell
        # sell the oldest held position (first in, first out)
        bought_price = agent.inventory.pop(0)
        sell_price = X_test[t].item()
        gain = sell_price - bought_price

        # losses are clipped to 0 for the reward but counted in full in the
        # running profit
        reward = max(gain, 0)
        total_profit += gain
        states_sell_test.append(t)
        print(f"Sell: {formatPrice(sell_price)} | Profit: {formatPrice(gain)}")

    # flag the final step; `done` is never reset to False inside this loop
    if t + 1 == l_test:
        done = True

    agent.memory.append((state, action, reward, next_state, done))
    state = next_state

    if done:
        print("------------------------------------------")
        print("Total Profit: " + formatPrice(total_profit))
        print("------------------------------------------")

plot_behavior(X_test, states_buy_test, states_sell_test, total_profit, train=False)
