In [None]:
# PYTORCH IMPORTS
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# OTHER USEFUL PYTHON MODULES AND PACKAGES
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# BASE PYTHON IMPORTS
import random
from collections import deque, namedtuple
from itertools import count
from time import time
from pathlib import Path

# SET UP MATPLOTLIB
# Detect an inline notebook backend by looking for 'inline' in the backend
# name; IPython's display helper is imported only in that case.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn on pyplot's interactive mode.
plt.ion()

from data_simulation import process_price_traces

In [None]:
class ENV_DQN:
    """Minimal single-asset trading environment used by the DQN agent.

    The environment only tracks an inventory of purchase prices (FIFO) and
    the cumulative realised profit of the current episode.

    Actions:
        0 -- hold: nothing happens.
        1 -- buy:  the latest price of ``state`` is appended to the inventory.
        2 -- sell: the oldest purchase is closed at the latest price of
                   ``state``; ignored when the inventory is empty.

    Attributes:
        state_size:   number of prices in a state window.
        action_size:  number of discrete actions (3).
        inventory:    list of purchase prices (floats), oldest first.
        total_profit: sum of realised profits and losses so far.
    """

    def __init__(self, state_size):
        self.state_size = state_size
        self.action_size = 3  # hold, buy, sell
        self.inventory = []
        self.total_profit = 0

    @staticmethod
    def _latest_price(window):
        """Return the most recent price of a window as a Python float.

        Accepts flat sequences as well as batch-shaped NumPy arrays or torch
        tensors such as (1, n) or (1, 1, n); the window is flattened so the
        last element is always a scalar price rather than a whole row.
        """
        if hasattr(window, "detach"):
            # torch tensor: drop autograd history and move to host memory
            window = window.detach().cpu().numpy()
        return float(np.asarray(window, dtype=float).reshape(-1)[-1])

    def step(self, state, observation, action):
        """Apply ``action`` at the current price and return the reward.

        Args:
            state: current price window (sequence, ndarray or tensor of any
                batch shape); its last element is the execution price.
            observation: next price window. Kept for interface compatibility;
                it is not used, so that the reward and ``total_profit`` are
                both computed from the same sale price.
            action: 0 (hold), 1 (buy) or 2 (sell).

        Returns:
            float: the realised profit of a sale clipped at zero, otherwise 0.0.
        """
        reward = 0.0
        if action == 1:  # buy
            self.inventory.append(self._latest_price(state))
        elif action == 2 and self.inventory:  # sell
            bought_price = self.inventory.pop(0)
            profit = self._latest_price(state) - bought_price
            # positive income from the transaction or 0
            reward = max(profit, 0.0)
            # cumulative profit for the episode (losses included)
            self.total_profit += profit
        return reward

In [None]:
class DQN(nn.Module):
    """Three-layer fully connected Q-network.

    Maps a batch of state vectors with ``features`` entries to ``output``
    action values through two ReLU-activated hidden layers of width
    ``hidden``.
    """

    def __init__(self, features, output, hidden):
        super().__init__()

        # Layers are created in input-to-output order.
        self.linear_layer1 = nn.Linear(in_features=features, out_features=hidden)
        self.linear_layer2 = nn.Linear(in_features=hidden, out_features=hidden)
        self.linear_layer3 = nn.Linear(in_features=hidden, out_features=output)

    def forward(self, x):
        """Return the raw (un-activated) action values for ``x``."""
        first_hidden = F.relu(self.linear_layer1(x))
        second_hidden = F.relu(self.linear_layer2(first_hidden))
        return self.linear_layer3(second_hidden)

In [None]:
# One stored environment step: (state, action, next_state, reward).
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)


class ReplayMemory(object):
    """
    Fixed-capacity buffer of Transition records.

    Once ``capacity`` records are stored, appending a new one discards the
    oldest.
    """

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        return len(self.memory)

    def push(self, *args):
        """Store one transition built from (state, action, next_state, reward)."""
        record = Transition(*args)
        self.memory.append(record)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        return random.sample(self.memory, batch_size)

In [None]:
def getState(data, t, n):
    """Return the window of the ``n`` most recent prices ending at index ``t``.

    When fewer than ``n`` prices are available (``t < n - 1``) the window is
    left-padded with copies of the first price ``data[0]``.

    Args:
        data: indexable, sliceable sequence of prices (list or 1-D array).
        t: index of the most recent price to include.
        n: window length.

    Returns:
        numpy.ndarray of shape (1, n).

    Raises:
        ValueError: if a window of exactly ``n`` prices cannot be built,
            e.g. because ``t`` lies beyond the end of ``data``.
    """
    # The padding is derived from the window size n, not a fixed constant,
    # so the function works for any window length.
    pad = max(n - 1 - t, 0)
    start = max(t - n + 1, 0)
    window = [data[0]] * pad + list(data[start:t + 1])

    if len(window) != n:
        raise ValueError(
            f"cannot build a window of {n} prices ending at t={t} "
            f"from a series of length {len(data)}"
        )

    return np.array(window).reshape((1, n))

In [None]:
# Settings handed to the price simulator.
start_price = 100
alpha = 1.0
time_steps = 100
num_traces = 500

# Simulate the price traces once; `data` and `process_traces` refer to the
# same object. NOTE(review): the layout of the returned object is defined in
# data_simulation -- later cells index it as data[i], presumably one trace
# per index; confirm.
data = process_traces = process_price_traces(
    start_price=start_price,
    alpha=alpha,
    time_steps=time_steps,
    num_traces=num_traces,
)

In [None]:
# Number of prices in one state window (passed as `n` to getState and as
# `state_size` to ENV_DQN).
window_size = 5

In [None]:
# HYPERPARAMETERS
BATCH_SIZE = 128    # transitions sampled from the replay memory per optimisation step
GAMMA = 0.99        # discount factor applied to the next-state value
EPS_START = 0.9     # initial exploration probability
EPS_END = 0.05      # asymptotic exploration probability
EPS_DECAY = 1000    # time constant (in transitions) of the exponential epsilon decay
TAU = 0.005         # soft-update rate of the target network
LR = 1e-4           # optimizer learning rate
HIDDEN = 128        # width of the hidden layers
MEMORY = 300        # replay memory capacity (must be >= BATCH_SIZE for training to start)
NUM_EPISODES = 100

# GET THE NUMBER OF ACTIONS FROM THE ENVIRONMENT
NUM_ACTIONS = 3  # hold, buy, sell

# RESET THE ENVIRONMENT
state = getState(data[0], 0, window_size)

# GET THE NUMBER OF FEATURES IN A STATE
# getState returns an array of shape (1, window_size); the feature count is
# the size of the last axis. len(state) would be the batch dimension (1).
NUM_FEATURES = state.shape[-1]

# DEFINE THE POLICY AND TARGET NET AS INSTANCES OF THE DQN MODEL
policy_net = DQN(features=NUM_FEATURES, hidden=HIDDEN, output=NUM_ACTIONS)
target_net = DQN(features=NUM_FEATURES, hidden=HIDDEN, output=NUM_ACTIONS)

# SYNC THE WEIGHTS OF THE TARGET AND POLICY NETS
# w/ load_state_dict() and state_dict() INHERITED METHODS
# Both networks start from the target net's initial weights.
policy_net.load_state_dict(target_net.state_dict())

# DEFINE THE OPTIMIZER - USE AdamW with amsgrad=True
optimizer = optim.AdamW(policy_net.parameters(), lr=LR, amsgrad=True)

# DEFINE THE REPLAY MEMORY AS INSTANCE OF ReplayMemory
REPLAY_MEMORY = ReplayMemory(MEMORY)

In [None]:
def eps_decay(steps_done):
    """Return the exploration probability after ``steps_done`` transitions.

    The value decays exponentially from EPS_START towards EPS_END with time
    constant EPS_DECAY.
    """
    decay_factor = np.exp(-1. * steps_done / EPS_DECAY)
    return EPS_END + (EPS_START - EPS_END) * decay_factor

In [None]:
def select_action(state, eps, net="policy"):
    """Choose an action for a single state with an epsilon-greedy rule.

    Args:
        state: float tensor holding one state, e.g. shape (1, NUM_FEATURES).
            A leading extra singleton axis, (1, 1, NUM_FEATURES), is also
            handled because the greedy action is taken over the last axis.
        eps: probability of picking a uniformly random action.
        net: "policy" to act greedily w.r.t. ``policy_net``; any other value
            uses ``target_net``.

    Returns:
        torch.LongTensor of shape (1, 1) containing the chosen action index.
    """
    if random.random() > eps:
        q_network = policy_net if net == "policy" else target_net
        with torch.no_grad():
            # argmax over the action axis (last axis) -> index of the best action
            return q_network(state).argmax(dim=-1).view(1, 1)

    # Explore: uniform draw over the NUM_ACTIONS discrete actions.
    # (A NumPy array has no .sample() method, so the draw uses `random`.)
    random_action = random.randrange(NUM_ACTIONS)
    return torch.tensor([[random_action]], device="cpu", dtype=torch.long)

In [None]:
def optimize_model():
    """Run one optimisation step of the policy network on a replay batch.

    Samples BATCH_SIZE transitions from REPLAY_MEMORY, builds the one-step
    TD target ``reward + GAMMA * max_a Q_target(next_state, a)`` (the
    bootstrap term is 0 for transitions whose ``next_state`` is None) and
    minimises the Smooth-L1 loss between that target and
    ``Q_policy(state, action)``.

    Returns:
        int: 0 if the memory holds fewer than BATCH_SIZE transitions and no
        update was made, 1 after a completed update.
    """
    if len(REPLAY_MEMORY) < BATCH_SIZE:
        return 0

    transitions = REPLAY_MEMORY.sample(BATCH_SIZE)

    # Transpose the list of Transitions into a Transition of tuples.
    batch = Transition(*zip(*transitions))

    # FIND THE INDICES OF TRANSITIONS THAT ARE NON-TERMINAL STATES
    non_final_mask = torch.tensor(
        [s is not None for s in batch.next_state],
        device="cpu",
        dtype=torch.bool,
    )

    # Flatten everything after the batch axis so stored states of shape
    # (1, n) and (1, 1, n) both yield a (BATCH_SIZE, n) input.
    state_batch = torch.cat(batch.state).flatten(start_dim=1)
    action_batch = torch.cat(batch.action)

    # Q(s, a) for the actions that were actually taken: shape (BATCH_SIZE, 1)
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Align the reward dtype with the network output.
    reward_batch = torch.cat(batch.reward).to(state_action_values.dtype)

    next_state_values = torch.zeros(len(state_batch), dtype=state_action_values.dtype)

    # torch.cat fails on an empty list, so skip the bootstrap when every
    # sampled transition is terminal.
    if non_final_mask.any():
        # FIND THE STATES THAT ARE NON-TERMINAL STATES
        non_final_next_states = torch.cat(
            [s for s in batch.next_state if s is not None]
        ).flatten(start_dim=1)

        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0]

    expected_state_action_values = GAMMA * next_state_values + reward_batch

    criterion = nn.SmoothL1Loss()

    # unsqueeze(1) gives the target the same (BATCH_SIZE, 1) shape as the
    # prediction; a (BATCH_SIZE,) target would broadcast to (B, B).
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()

    # In-place gradient clipping.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)

    optimizer.step()

    return 1

In [None]:
episode_durations = []   # number of transitions per episode
episode_rewards = []     # cumulative reward per episode
total_transitions = 0    # global step counter driving the epsilon schedule
t_p = []                 # env.total_profit recorded after every transition
start_time = time()

num_episodes = 300

for i_episode in range(num_episodes):

    # generate new data for each episode
    process_traces = process_price_traces(
        start_price=start_price,
        alpha=alpha,
        time_steps=time_steps,
        num_traces=num_traces)

    # Use the freshly generated traces; the initial state and every later
    # observation come from the same trace. The modulo keeps the index valid
    # if there are more episodes than traces.
    trace = process_traces[i_episode % len(process_traces)]

    # Every step needs the price at t + 1, so an episode has one transition
    # fewer than the trace has prices.
    steps_in_episode = max(len(trace) - 1, 0)

    # START THE ENVIRONMENT AND OBSERVE THE STATE
    env = ENV_DQN(window_size)
    window = getState(trace, 0, window_size)
    # getState already returns shape (1, window_size), i.e. a batch of one,
    # so no extra unsqueeze is needed.
    state = torch.tensor(window, dtype=torch.float32, device="cpu")
    cum_reward = 0

    # FOR EVERY TRANSITION IN THE EPISODE (ITERATES UNTIL TERMINAL STATE)
    for t in range(steps_in_episode):

        total_transitions += 1

        eps = eps_decay(total_transitions)

        # NOTE(review): actions are chosen with the target net; standard DQN
        # acts with the policy net -- confirm this is intentional.
        action = select_action(state, eps, net="target")

        observation = getState(trace, t + 1, window_size)
        # Pass flat 1-D windows so the environment indexes scalar prices.
        reward = env.step(window[0], observation[0], action.item())
        cum_reward += reward
        reward = torch.tensor([reward], dtype=torch.float32, device="cpu")
        t_p.append(env.total_profit)

        # The last transition of the trace is terminal: store None so that
        # optimize_model does not bootstrap from it.
        done = (t == steps_in_episode - 1)
        if done:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device="cpu")

        REPLAY_MEMORY.push(state, action, next_state, reward)

        state = next_state
        window = observation

        optimize_model()

        # Soft update of the target network:
        # target <- TAU * policy + (1 - TAU) * target
        target_net_state_dict = target_net.state_dict()
        policy_net_state_dict = policy_net.state_dict()

        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)

        target_net.load_state_dict(target_net_state_dict)

    episode_durations.append(steps_in_episode)
    episode_rewards.append(cum_reward)
