# 0. Behind the Scenes Setup

In [None]:
# Env Setup:
# conda create -n project-env python=3.11

# pip install scikit-learn
# pip install yfinance
# pip install matplotlib
# pip install seaborn
# pip install tensorflow-cpu
# pip install keras
# pip install tqdm
# pip install ipywidgets
# pip install jupyter
# pip install pydot
# pip install pydot-ng
# pip install graphviz

# python -m ipykernel install --user --name project-env --display-name "Python (project-env)"



### Check that kernel installed correctly

In [None]:
import sys
# As the last expression of the cell, sys.executable is echoed: it is the path
# of the Python interpreter backing the running kernel.
sys.executable
# Desired output = '/home/<user>/anaconda3/envs/project-env/bin/python'

### Load Libraries

In [None]:
# Load libraries
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
import datetime
import math
import random
from sklearn.preprocessing import StandardScaler
from IPython.core.debugger import set_trace
from collections import deque

### Load Data

In [None]:
# Download Apple Ticker Data
data = yf.download("AAPL", start="2010-01-01", end="2010-03-01", interval="1d")
# Recent yfinance releases can return (field, ticker) MultiIndex columns even
# for a single ticker. Keep only the field level so the CSV gets a single
# header row and later `set_index('Date')` / `data['Close']` lookups work.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)
# Blank out roughly every fifth row (starting at row 4) so the notebook has
# missing values to clean later. The step is clamped to 1 because a frame with
# fewer than five rows would otherwise hand range() a zero step (ValueError).
nan_step = max(1, len(data.index) // 5)
for i in range(4, len(data.index), nan_step):
    data.iloc[i] = np.nan
# NOTE: the file name says 2010-2020 although only 2010-01-01..2010-03-01 is
# downloaded above; the name is kept because later cells read this exact path.
data.to_csv('aapl_2010_2020_1d.csv')
data = pd.read_csv('aapl_2010_2020_1d.csv')

# 1. Libraries & Sample Data
The first step is to load our Python libraries and load the sample data. The dataset represents Apple (AAPL) daily stock prices (1d bars) for roughly the first two months of 2010 (download window 2010-01-01 to 2010-03-01).

In [None]:
# Load Python Libraries
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import keras
import datetime
import math
import random
from sklearn.preprocessing import StandardScaler
from IPython.core.debugger import set_trace
from collections import deque
from IPython.display import display, HTML

# for dataframe display: never truncate rows when rendering
pd.set_option("display.max_rows", None)
def display_df(df):
    """Show *df* as an HTML table inside a 200px-high box that scrolls on overflow."""
    # Puts the scrollbar next to the DataFrame
    container_open = "<div style='height: 200px; overflow: auto; width: fit-content'>"
    table_markup = df.to_html()
    display(HTML(container_open + table_markup + "</div>"))


In [None]:
# Load Sample Data from the CSV file in the working directory
# (no download happens in this cell; the file must already exist)
data = pd.read_csv('aapl_2010_2020_1d.csv')

# 2. Exploratory Data Analysis
Next, we want to analyze our data. Display the data as a dataframe, and plot some relevant data so you can get an idea of what our dataset looks like.

In [None]:
# Display as Dataframe (scrollable HTML view via the display_df helper)
display_df(data)

In [None]:
# Index data by Date
# NOTE(review): inplace set_index moves 'Date' out of the columns, so re-running
# this cell on the same frame presumably raises KeyError — confirm before rerun.
data.set_index('Date', inplace=True)
display_df(data)

In [None]:
# Plot some Relevant Data: the 'Close' column plotted against the frame's index
data['Close'].plot()

# 3. Data Cleaning
Next, we need to clean our data for training our model. This requires removal of NaN values.

In [None]:
# Check for null values
# isnull().sum() produces one count per column, so the label is followed by a
# per-column listing rather than a single number.
print('Number of Null Values =', data.isnull().sum())

In [None]:
# forward fill missing values: each NaN takes the last valid value above it
# NOTE(review): NaNs that precede the first valid row would stay NaN — confirm
# none exist for this dataset.
data=data.ffill()
display_df(data)

In [None]:
# Check for null values again (per-column counts; expected to be zero once the
# missing values have been filled)
print('Number of Null Values =', data.isnull().sum())

# 4. Feature Selection
Now that we have cleaned our stock data, we need to select which features to train our model on. For this project, we will be training with Close data and a 10-day Moving Average of Close. 

In [None]:
# Calculate 10-day MA Close
# A 10-row rolling window has no value until 10 observations are available,
# so the first rows of 'MA' are NaN.
data['MA'] = data['Close'].rolling(window=10).mean()
display_df(data)

In [None]:
# Remove rows with MA=NaN
# dropna(axis=0) drops every row that has a NaN in any column, not only 'MA'.
data = data.dropna(axis=0)
display_df(data)

In [None]:
# Define new dataframe with only the training features
# Column order matters: 'Close' is column 0 and 'MA' is column 1.
dataset = data[['Close', 'MA']]
display_df(dataset)

# 5. Normalization
Now that we have cleaned our data, created our indicators of interest, and selected our features, we must normalize our data. For this project, we use the sklearn StandardScaler, which centers the data and normalizes to unit variance. 

In [None]:
# Display & Plot Un-normalized Dataset
display_df(dataset)
# Both feature columns are plotted from the same cell for visual comparison.
dataset['Close'].plot()
dataset['MA'].plot()

In [None]:
# Normalize Dataset with StandardScaler
# One scaler is fitted per column and stored in `normlist` in column order, so
# each feature can be mapped back to its original scale later.
# NOTE(review): the scalers are fitted on the full dataset before any
# train/test split, so test-period statistics influence the scaling.
normlist = []
normed_dataset = pd.DataFrame(index=dataset.index)
for col in dataset.columns:
    column_data = dataset[[col]]
    normalizer = StandardScaler().fit(column_data)
    normed_dataset[col] = normalizer.transform(column_data).ravel()
    normlist.append(normalizer)


In [None]:
# Display & Plot Normalized Dataset
display_df(normed_dataset)
# Both normalized feature columns are plotted from the same cell.
normed_dataset['Close'].plot()
normed_dataset['MA'].plot()

# 4. Train / Test Split
Now that our data is cleaned, features are selected, and the dataset is normalized, we are ready to feed the data into our model. With this in mind, we split the data into train and test data (80/20 split).

In [None]:
# split dataset df into train (80%) and test (20%) datasets
normed_dataset_integer_index = normed_dataset.reset_index(drop=False)
training_rows = int(len(normed_dataset.index)*0.8)
# Positional slicing (iloc) is end-exclusive: train gets exactly
# `training_rows` rows and test gets the remainder, with no gap or overlap.
# Label-based .loc[:training_rows] is end-inclusive and would put one extra
# row into the training set.
train_df = normed_dataset_integer_index.iloc[:training_rows].set_index("Date")
test_df = normed_dataset_integer_index.iloc[training_rows:].set_index("Date")

In [None]:
# display train and test dfs (ensure no overlap: the last train date must come
# before the first test date)
display_df(train_df)
display_df(test_df)

In [None]:
# convert train and test dfs to np arrays with dtype=float
X_train = train_df.values.astype(float)
X_test = test_df.values.astype(float)
# print the shape of X_train to remind yourself how many examples and features are in the dataset
# NOTE(review): this is not the last expression of the cell, so a notebook
# will not echo it — move it to the end or wrap it in print() to see it.
X_train.shape
# track index to remember which feature is which (column positions in X_train/X_test)
idx_close = 0
idx_ma = 1

# 5. Define the Agent
Now that our data is ready to use, we can define the Reinforcement Learning Agent.

### Define the DQN Model
The first step in defining our agent is the Deep Q-Network model definition. For this project, we are creating a sequential model with four layers. The first three layers have output shapes of 64, 32, and 8, respectively, and a ReLU activation. The output layer has an output shape equal to the size of our action space (buy, sell, hold), and a linear activation. Our loss function is Mean Squared Error, and our optimizer is Adam with a learning rate of 0.001. Use Keras to build this model.

In [None]:
# Define DQN Model Architecture
class DQN():
    """Fully connected Q-network mapping a flat state vector to one value per action.

    The compiled Keras model is exposed as ``self.model``.
    """

    def __init__(self, state_size, action_size):
        """Build and compile the network.

        Args:
            state_size: Length of the flattened state vector fed to the model.
            action_size: Number of output units (one Q-value per action).
        """
        model = keras.models.Sequential()
        # Input Layer: declared explicitly. Passing `input_dim` to a layer is
        # deprecated in Keras 3 and emits a warning.
        model.add(keras.Input(shape=(state_size,)))
        model.add(keras.layers.Dense(units=64, activation="relu"))
        # Hidden Layers
        model.add(keras.layers.Dense(units=32, activation="relu"))
        model.add(keras.layers.Dense(units=8, activation="relu"))
        # Output Layer: linear activation so outputs are unbounded Q-values
        model.add(keras.layers.Dense(action_size, activation="linear"))
        model.compile(loss="mse", optimizer=keras.optimizers.Adam(learning_rate=0.001))

        self.model = model


### Define Agent Class
Now that we have defined our underlying DQN Model, we must define our Reinforcement Learning Agent. The agent initialization is provided for you; you must define an act function and an experience replay function. As a reminder, the act function defines how our model will act (buy, hold, or sell) given a certain state. The Experience Replay function tackles catastrophic forgetting in our training process by maintaining a memory buffer to allow training on independent / randomized minibatches of previous states.

In [None]:
class Agent:
    """Epsilon-greedy DQN agent with a bounded replay memory.

    Actions are integers in ``range(action_size)``: sit, buy, sell.
    """

    def __init__(self, window_size, num_features, is_eval=False, model_name=""):
        """Create the agent and its Q-network.

        Args:
            window_size: Number of time steps contained in one state.
            num_features: Number of features per time step.
            is_eval: When True, load a saved model from ``model_name`` and
                always act greedily (no random exploration).
            model_name: Path passed to ``keras.models.load_model`` when
                ``is_eval`` is True.
        """
        # State size is window_size (n previous days) times features per day
        self.window_size = window_size
        self.num_features = num_features
        self.state_size = window_size * num_features
        self.action_size = 3  # sit, buy, sell
        # Replay buffer; the oldest transitions are dropped beyond 1000 entries
        self.memory = deque(maxlen=1000)
        # inventory of close prices
        self.inventory = []
        self.model_name = model_name
        self.is_eval = is_eval

        self.gamma = 0.95  # discount factor applied to the next state's best Q-value
        self.epsilon = 1.0  # exploration rate, starts fully random
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995  # multiplicative decay applied per expReplay call

        self.model = keras.models.load_model(model_name) if is_eval else self._model()

    def _model(self):
        """Return a freshly built Q-network sized for this agent."""
        return DQN(self.state_size, self.action_size).model

    def _as_model_input(self, state):
        """Flatten *state* into a single-row batch of shape (1, state_size).

        The network input is a flat vector of ``state_size`` values, so the
        whole window must form one row. Reshaping to
        ``(window_size, num_features)`` only matches when ``window_size == 1``.
        """
        return np.asarray(state).reshape(1, self.state_size)

    def act(self, state):
        """Choose an action for *state*.

        While training (``is_eval`` False), a random action is returned with
        probability ``epsilon``; otherwise the action with the highest
        predicted Q-value is returned.
        """
        if not self.is_eval and random.random() <= self.epsilon:
            return random.randrange(self.action_size)
        options = self.model.predict(self._as_model_input(state))
        # action is based on the action that has the highest value from the q-value function.
        return np.argmax(options[0])

    def expReplay(self, batch_size):
        """Train the model on a random minibatch drawn from the replay memory.

        Up to ``batch_size`` transitions are sampled without replacement
        (fewer if the memory holds fewer). Each one updates the Q-value of the
        taken action towards ``reward + gamma * max Q(next_state)``, or just
        ``reward`` for a terminal transition. ``epsilon`` is then decayed once.
        """
        sample_size = min(batch_size, len(self.memory))
        mini_batch = random.sample(list(self.memory), sample_size)

        for state, action, reward, next_state, done in mini_batch:
            target = reward  # terminal transitions use the raw reward
            if not done:
                # bootstrap from the best predicted Q-value of the next state
                next_q = self.model.predict(self._as_model_input(next_state))
                target = reward + self.gamma * np.amax(next_q)
            model_input = self._as_model_input(state)
            # current Q-values; only the entry of the taken action is replaced
            target_f = self.model.predict(model_input)
            target_f[0][action] = target
            # train and fit the model where state is X and target_f is Y
            self.model.fit(model_input, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# 6. Train the Agent
Now that our data is ready and our agent is defined, we are ready to train the agent. 

### Helper Functions
Before we define the training loop, we will write some helper functions: one for printing price data, one to define the sigmoid function, one to grab the current state, and one to plot the output of our trained model.

In [None]:
# returns a formatted price string
def formatPrice(n):
    """Return *n* as a dollar string with two decimals, e.g. ``$12.50`` or ``-$3.14``."""
    sign = "-$" if n < 0 else "$"
    return f"{sign}{abs(n):.2f}"

# returns the sigmoid
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + e**-x)`` of the scalar *x*.

    For very negative *x*, ``math.exp(-x)`` overflows; the true value is then
    indistinguishable from zero, so 0.0 is returned instead of raising.
    """
    try:
        return 1 / (1 + math.exp(-x))
    except OverflowError:
        return 0.0

# returns an n-day state representation ending at time t

def getState(data, t, n):
    """Build the state for time *t* from the last *n* rows of *data*.

    Args:
        data: 2-D numpy array, one row per time step and one column per feature.
        t: Index of the last row included in the window.
        n: Number of rows in the window.

    Returns:
        Array of shape ``(1, n - 1, number_of_features)`` holding the sigmoid
        of each feature's difference between consecutive rows of the window.
    """
    d = t - n + 1
    if d >= 0:
        block = data[d:t + 1]
    else:
        # Not enough history: left-pad with copies of the first row so the
        # window always has n rows. A fixed two-row pad only worked for n == 2.
        block = np.array([data[0]] * -d + list(data[:t + 1]))
    res = [
        [sigmoid(block[i + 1, feature] - block[i, feature])
         for feature in range(data.shape[1])]
        for i in range(n - 1)
    ]
    return np.array([res])

# Plots the behavior of the output
def plot_behavior(data_input, states_buy, states_sell, profit, tick_labels=None):
    """Plot a price series with buy and sell markers and the total profit.

    Args:
        data_input: Series of values to draw as the price line.
        states_buy: Indices into ``data_input`` where a buy happened.
        states_sell: Indices into ``data_input`` where a sell happened.
        profit: Total profit, shown in the plot title.
        tick_labels: Optional x-axis labels, one per position. Defaults to the
            index values of the notebook-level ``train_df``, so existing calls
            behave as before; pass explicit labels to plot other data.
    """
    if tick_labels is None:
        tick_labels = train_df.index.values
    plt.figure(figsize = (15,5))
    plt.plot(data_input, color='r', lw=2.)
    # markevery restricts the marker to the listed indices
    plt.plot(data_input, '^', markersize=10, color='m', label = 'Buying signal', markevery = states_buy)
    plt.plot(data_input, 'v', markersize=10, color='k', label = 'Selling signal', markevery = states_sell)
    plt.title('Total gains: %f'%(profit))
    plt.legend()
    plt.xticks(range(len(tick_labels)), tick_labels, rotation=45) # location, labels
    #plt.savefig('output/'+name+'.png')
    plt.show()

### Training Loop

In [None]:
# Echo the training array's shape; its second entry is used as the number of features
X_train.shape

In [None]:
# Silence Keras progress output for the many single-sample predict/fit calls
keras.utils.disable_interactive_logging()
from tqdm.notebook import tqdm
# Import Image directly. `from IPython import display` would rebind the name
# `display` to the module and break display_df(), which calls display(...).
from IPython.display import Image

window_size = 1
agent = Agent(window_size, num_features=X_train.shape[1])
dot = keras.utils.model_to_dot(
    agent.model,
    show_shapes=True,
    show_dtype=True,
    show_layer_names=True,
)
# The output path is used as given, so it must carry the .png extension that
# the Image call below reads.
dot.write("model.png", format='png')

Image('model.png')

In [None]:
# Show full stack traces from Keras instead of the filtered ones
keras.config.disable_traceback_filtering()

# number of steps per episode: one less than the number of rows, because every
# step also needs the following row (t + 1) to build next_state
l = X_train[:,0].shape[0] - 1

# batch size passed to expReplay; replay starts once the memory holds more
# transitions than this and then runs after every step
batch_size = 32

#An episode represents a complete pass over the data.
# NOTE(review): the loop below uses range(episode_count + 1), so
# episode_count + 1 episodes are run (numbered 0..episode_count).
episode_count = 2

# per-feature scalers, used to map normalized values back to dollars
normalizer_close = normlist[idx_close]
normalizer_ma = normlist[idx_ma]

# un-normalized series; X_train_true_price is what gets plotted
# NOTE(review): X_train_true_ma is not used anywhere in this cell.
X_train_true_price = normalizer_close.inverse_transform(X_train[:, idx_close].reshape(-1, 1))
X_train_true_ma = normalizer_ma.inverse_transform(X_train[:, idx_ma].reshape(-1, 1))


for e in range(episode_count + 1):
    # print()
    # initial state of the episode, built from the first row
    state = getState(X_train, 0, window_size + 1)
    #set_trace()
    # reset per-episode bookkeeping
    total_profit = 0
    agent.inventory = []
    states_sell = []
    states_buy = []
    for t in tqdm(range(l), desc="Running episode " + str(e) + "/" + str(episode_count)):
        action = agent.act(state)
        # sit (default): no trade, reward stays 0
        next_state = getState(X_train, t + 1, window_size + 1)
        reward = 0

        if action == 1: # buy
            # inverse transform to get true buy price in dollars
            # NOTE(review): presumably equal to X_train_true_price[t, 0] — confirm
            buy_price = normalizer_close.inverse_transform([[X_train[t, idx_close]]])[0][0]
            agent.inventory.append(buy_price)
            # print('inventory', agent.inventory)
            states_buy.append(t)
            print("Buy: " + formatPrice(buy_price))

        # a sell with an empty inventory falls through as a no-op with reward 0
        elif action == 2 and len(agent.inventory) > 0: # sell
            # oldest purchase is sold first (FIFO)
            bought_price = agent.inventory.pop(0)
            # print('inventory', agent.inventory)
            # inverse transform to get true sell price in dollars
            sell_price = normalizer_close.inverse_transform([[X_train[t, idx_close]]])[0][0]

            # reward is the profit (close price at time of sell - close price at
            # time of buy) floored at 0; losses still reduce total_profit
            reward = max(sell_price - bought_price, 0)
            total_profit += sell_price - bought_price
            states_sell.append(t)
            print("Sell: " + formatPrice(sell_price) + " | Profit: " + formatPrice(sell_price - bought_price))

        # the last step of the episode is flagged as terminal
        done = True if t == l - 1 else False
        #appends the details of the state action etc in the memory, which is used further by the expReplay function
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state

        if done:
            print("--------------------------------")
            print("Total Profit: " + formatPrice(total_profit))
            print("--------------------------------")
            #set_trace()
            #pd.DataFrame(np.array(agent.memory)).to_csv("Agent"+str(e)+".csv")
            #Chart to show how the model performs with the stock going up and down for each episode
            # plot_behavior(X_train ,states_buy, states_sell, total_profit)
            plot_behavior(X_train_true_price ,states_buy, states_sell, total_profit)

        # train on replayed transitions once enough have been collected
        if len(agent.memory) > batch_size:
            agent.expReplay(batch_size)


    # checkpoint the model after every even-numbered episode
    if e % 2 == 0:
        agent.model.save("model_ep" + str(e) + ".keras")