In [None]:
!pip install ptan
!pip install tensorboardX



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import sys
import gym
import csv
import math
import glob
import time
import ptan
import enum
import torch
import collections
import numpy as np
import torch.nn as nn
from gym import wrappers
import matplotlib as mpl
import torch.optim as optim
from gym.utils import seeding
import torch.nn.functional as F
import matplotlib.pyplot as plt
from ptan.agent import TargetNet, DQNAgent
from ptan.actions import EpsilonGreedyActionSelector
from ptan.experience import ExperienceSourceFirstLast, ExperienceReplayBuffer

import warnings
warnings.filterwarnings('ignore')

In [None]:
from google.colab import drive

drive.mount('/content/drive')

!ls "/content/drive/My Drive/RL_Project"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
mean_val_-0.017+ENV_B+DuelingDQN.data  mean_val_0.15+ENV_B+DQN.data
mean_val_-0.040+ENV_A+DuelingDQN.data  YNDX_150101_151231.csv
mean_val_0.052+ENV_A+DQN.data	       YNDX_160101_161231.csv


In [None]:
# Training / validation price files. The names match the files actually present
# in the Drive folder (YNDX_<from>_<to>.csv); the previous YNDX_2016.csv /
# YNDX_2015.csv names do not exist in that folder's listing.
data_path = '/content/drive/My Drive/RL_Project/YNDX_160101_161231.csv'
val_data_path = "/content/drive/My Drive/RL_Project/YNDX_150101_151231.csv"

In [None]:
# Number of past bars encoded in each observation; the environments build
# observations of length 3*retention+2 and start every episode at bar `retention`.
retention = 10
# Amount subtracted from the reward each time a position is opened or closed.
commission=0.0
# NOTE(review): not referenced by any code visible in this notebook section —
# presumably an evaluation-time exploration rate; confirm before relying on it.
EPSILON = 0.02

## Data

In [None]:
# Names of the per-bar fields kept for every price record.
data_fields=['open', 'high', 'low', 'close', 'volume']
# CSV column headings, listed in the same order as data_fields
# (read_csv pairs the two lists positionally).
data_headings=['<OPEN>', '<HIGH>', '<LOW>', '<CLOSE>', '<VOL>']
# Container with one entry per field; create_records fills it with NumPy arrays.
Records = collections.namedtuple('Records', field_names=data_fields)

In [None]:
def create_records(data):
    """Convert raw price columns into a ``Records`` tuple of NumPy arrays.

    ``open`` and ``volume`` are kept as-is, while ``high``, ``low`` and
    ``close`` are expressed relative to the open price of the same bar:
    ``(value - open) / open``.

    Args:
        data: mapping with the keys ``'open'``, ``'high'``, ``'low'``,
            ``'close'`` and ``'volume'``, each holding a sequence of numbers
            of equal length.  The mapping is not modified (the previous
            implementation overwrote its values with arrays in place).

    Returns:
        Records: one NumPy array per field.
    """
    # Build private arrays instead of mutating the caller's dict.
    columns = {
        field: np.array(data[field])
        for field in ('open', 'high', 'low', 'close', 'volume')
    }
    open_prices = columns['open']

    def relative_to_open(values):
        # Fractional distance of a price from the bar's open price.
        return (values - open_prices) / open_prices

    return Records(open=open_prices,
                   high=relative_to_open(columns['high']),
                   low=relative_to_open(columns['low']),
                   close=relative_to_open(columns['close']),
                   volume=columns['volume'])

def read_csv(path):
    """Read a price CSV file and return it as relative-price ``Records``.

    The first row must contain the column headings.  Only the columns named
    in ``data_headings`` are read; each is stored under the field name at the
    same position in ``data_fields``.  Blank rows are skipped.

    Args:
        path: path of a comma-separated, UTF-8 encoded price file.

    Returns:
        The result of ``create_records`` applied to the parsed columns.

    Raises:
        ValueError: if a required heading is missing or a cell is not a number.
    """
    print("Dataset File", path)
    with open(path, 'rt', encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=',')
        headings = next(reader)

        # Resolve each field's column position once instead of once per row.
        columns = [(field, headings.index(heading))
                   for field, heading in zip(data_fields, data_headings)]
        data = {field: [] for field in data_fields}

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            for field, index in columns:
                data[field].append(float(row[index]))
    return create_records(data)

In [None]:
def price_files(dir_name):
    """Return the paths of all ``*.csv`` files located directly in ``dir_name``."""
    pattern = os.path.join(dir_name, "*.csv")
    return list(glob.glob(pattern))

## Environment

In [None]:
# Action name -> discrete action code understood by the environments.
stock_actions = dict(Skip=0, Buy=1, Close=2)
# Inverse lookup: discrete action code -> action name.
stock_actions_rev = {code: name for name, code in stock_actions.items()}

In [None]:
class StockExchangeEnvironmentA(gym.Env):
    """Single-instrument trading environment with percentage rewards.

    The agent holds at most one position.  Actions are the codes (or names)
    of ``stock_actions``: Skip, Buy and Close.  While a position is open,
    each step is rewarded with the percentage change of the close price from
    the current bar to the next one; ``commission`` is subtracted whenever a
    position is opened or closed.

    An observation is a float32 vector of length ``3 * retention + 2``: the
    relative high, low and close of the last ``retention`` bars, followed by
    a position flag and the unrealised profit of the open position relative
    to its entry price.

    Price data comes from the module-level ``records``, which is read on
    construction and again on every ``reset``.
    """

    def __init__(self):
        self._records = records
        self._actions = stock_actions
        self._num_actions = len(self._actions)
        self._shape = (3 * retention + 2, )
        self._action_space = gym.spaces.Discrete(n=self._num_actions)
        self._observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=self._shape, dtype=np.float32)
        # Public names used by the gym.Env contract (wrappers and generic tooling).
        self.action_space = self._action_space
        self.observation_space = self._observation_space

    def reset(self):
        """Start a new episode at bar ``retention`` with no open position.

        Returns:
            The first observation.
        """
        self.open_price = 0
        self._count = retention
        self._records = records
        self.bought = False

        # Dummy prices: no position is open, so no reward is produced here.
        results, _, __ = self.step_results(0, 1, 1)

        return results

    def _action_code(self, action_type):
        """Translate an action name or any integer-like code into its code.

        Raises:
            KeyError: if the action is unknown.
        """
        if isinstance(action_type, str):
            return self._actions[action_type]
        # Accept Python ints and every NumPy integer width, not only np.int64.
        if isinstance(action_type, (int, np.integer)) and int(action_type) in self._actions.values():
            return int(action_type)
        raise KeyError(action_type)

    def _close_price(self, index):
        """Absolute close price of a bar (close is stored relative to open)."""
        return self._records.open[index] * (self._records.close[index] + 1)

    def step(self, action_type):
        """Apply an action and advance to the next bar.

        Args:
            action_type: an action name from ``stock_actions`` or its integer code.

        Returns:
            ``(observation, reward, terminated, info)`` where ``info`` is an
            empty dict.
        """
        reward = 0
        current_action = self._action_code(action_type)
        if self.bought:
            # Holding a position: only Close has an effect.
            if current_action == self._actions['Close']:
                reward -= commission
                self.bought = False
                self.open_price = 0.0
        elif current_action == self._actions['Buy']:
            # Flat: only Buy has an effect; enter at the current close price.
            self.bought = True
            self.open_price = self._close_price(self._count)
            reward -= commission

        close_initial = self._close_price(self._count)
        self._count += 1
        close_final = self._close_price(self._count)

        results, reward, terminated = self.step_results(reward, close_initial, close_final)

        return results, reward, terminated, dict()

    def step_results(self, reward, close_initial, close_final):
        """Finish a step: add the holding reward and build the observation.

        Args:
            reward: reward accumulated so far in this step (commission).
            close_initial: close price of the bar the action was taken on.
            close_final: close price of the bar the environment moved to.

        Returns:
            ``(observation, reward, terminated)``.
        """
        if self.bought:
            # Percentage price change while the position is held.
            reward += ((close_final - close_initial) / close_initial) * 100

        # The episode ends on the last bar, because step() reads bar count + 1.
        terminated = bool(self._count >= self._records.close.shape[0] - 1)

        itr = 0
        results = np.zeros(shape=self._shape, dtype=np.float32)
        for offset in range(1 - retention, 1):
            bar = self._count + offset
            # Fix: the three slots previously all received `high`.  Networks
            # trained on the old observations must be retrained.
            results[itr] = self._records.high[bar]
            results[itr + 1] = self._records.low[bar]
            results[itr + 2] = self._records.close[bar]
            itr += 3

        results[itr] = float(self.bought)
        if not self.bought:
            results[itr + 1] = 0.0
        else:
            # Unrealised profit relative to the entry price.
            results[itr + 1] = (self._close_price(self._count) - self.open_price) / self.open_price

        return results, reward, terminated

    def get_environment_space(self):
        """Return ``(observation_space, action_space)``."""
        return self._observation_space, self._action_space

In [None]:
class StockExchangeEnvironmentB(gym.Env):
    """Single-instrument trading environment with absolute-price rewards.

    The agent holds at most one position.  Actions are the codes (or names)
    of ``stock_actions``: Skip, Buy and Close.  While a position is open,
    each step is rewarded with the absolute change of the close price from
    the current bar to the next one; ``commission`` is subtracted whenever a
    position is opened or closed.

    An observation is a float32 vector of length ``3 * retention + 2``: the
    relative high, low and close of the last ``retention`` bars, followed by
    a position flag and the unrealised profit of the open position in price
    units.

    Price data comes from the module-level ``records``, which is read on
    construction and again on every ``reset``.
    """

    def __init__(self):
        self._records = records
        self._actions = stock_actions
        self._num_actions = len(self._actions)
        self._shape = (3 * retention + 2, )
        self._action_space = gym.spaces.Discrete(n=self._num_actions)
        self._observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=self._shape, dtype=np.float32)
        # Public names used by the gym.Env contract (wrappers and generic tooling).
        self.action_space = self._action_space
        self.observation_space = self._observation_space

    def reset(self):
        """Start a new episode at bar ``retention`` with no open position.

        Returns:
            The first observation.
        """
        self.open_price = 0
        self._count = retention
        self._records = records
        self.bought = False

        # Dummy prices: no position is open, so no reward is produced here.
        results, _, __ = self.step_results(0, 1, 1)

        return results

    def _action_code(self, action_type):
        """Translate an action name or any integer-like code into its code.

        Raises:
            KeyError: if the action is unknown.
        """
        if isinstance(action_type, str):
            return self._actions[action_type]
        # Accept Python ints and every NumPy integer width, not only np.int64.
        if isinstance(action_type, (int, np.integer)) and int(action_type) in self._actions.values():
            return int(action_type)
        raise KeyError(action_type)

    def _close_price(self, index):
        """Absolute close price of a bar (close is stored relative to open)."""
        return self._records.open[index] * (self._records.close[index] + 1)

    def step(self, action_type):
        """Apply an action and advance to the next bar.

        Args:
            action_type: an action name from ``stock_actions`` or its integer code.

        Returns:
            ``(observation, reward, terminated, info)`` where ``info`` is an
            empty dict.
        """
        reward = 0
        current_action = self._action_code(action_type)
        if self.bought:
            # Holding a position: only Close has an effect.
            if current_action == self._actions['Close']:
                reward -= commission
                self.bought = False
                self.open_price = 0.0
        elif current_action == self._actions['Buy']:
            # Flat: only Buy has an effect; enter at the current close price.
            self.bought = True
            self.open_price = self._close_price(self._count)
            reward -= commission

        close_initial = self._close_price(self._count)
        self._count += 1
        close_final = self._close_price(self._count)

        results, reward, terminated = self.step_results(reward, close_initial, close_final)

        return results, reward, terminated, dict()

    def step_results(self, reward, close_initial, close_final):
        """Finish a step: add the holding reward and build the observation.

        Args:
            reward: reward accumulated so far in this step (commission).
            close_initial: close price of the bar the action was taken on.
            close_final: close price of the bar the environment moved to.

        Returns:
            ``(observation, reward, terminated)``.
        """
        if self.bought:
            # Absolute price change while the position is held.
            reward += (close_final - close_initial)

        # The episode ends on the last bar, because step() reads bar count + 1.
        terminated = bool(self._count >= self._records.close.shape[0] - 1)

        itr = 0
        results = np.zeros(shape=self._shape, dtype=np.float32)
        for offset in range(1 - retention, 1):
            bar = self._count + offset
            # Fix: the three slots previously all received `high`.  Networks
            # trained on the old observations must be retrained.
            results[itr] = self._records.high[bar]
            results[itr + 1] = self._records.low[bar]
            results[itr + 2] = self._records.close[bar]
            itr += 3

        results[itr] = float(self.bought)
        if not self.bought:
            results[itr + 1] = 0.0
        else:
            # Unrealised profit in price units.
            results[itr + 1] = self._close_price(self._count) - self.open_price

        return results, reward, terminated

    def get_environment_space(self):
        """Return ``(observation_space, action_space)``."""
        return self._observation_space, self._action_space

## Models

In [None]:
class DQN(nn.Module):
    """Fully connected Q-network: observation -> 20 -> 10 -> 3 -> one value per action.

    Args:
        observations: object whose ``shape[0]`` is the observation length.
        actions: object whose ``n`` is the number of discrete actions.
    """

    def __init__(self, observations, actions):
        super().__init__()

        widths = [observations.shape[0], 20, 10, 3]
        layers = []
        # Hidden stack: every Linear layer is followed by a ReLU.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # Output head: raw Q-values, no activation.
        layers.append(nn.Linear(widths[-1], actions.n))
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """Return the Q-values computed for ``x``."""
        q_values = self.fc(x)
        return q_values


class DuelingDQN(nn.Module):
    """Dueling Q-network with separate state-value and advantage streams.

    Both streams are observation -> 512 -> 512 MLPs; the value stream ends in
    a single output and the advantage stream in one output per action.  The
    attribute names ``value_network`` / ``surplus_network`` are kept so that
    existing state dicts still load.

    Args:
        observations: object whose ``shape[0]`` is the observation length.
        actions: object whose ``n`` is the number of discrete actions.
    """

    def __init__(self, observations, actions):
        super().__init__()

        self.value_network = nn.Sequential(
            nn.Linear(observations.shape[0], 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1)
        )

        # Advantage stream (one output per action).
        self.surplus_network = nn.Sequential(
            nn.Linear(observations.shape[0], 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, actions.n)
        )

    def forward(self, x):
        """Return Q-values ``V + A - mean(A)`` for ``x``.

        ``x`` may be a batch ``(N, obs)`` or a single observation ``(obs,)``;
        the advantage mean is taken over the last (action) dimension so both
        work.  The previous ``dim=1`` raised on an unbatched observation.
        """
        state_value = self.value_network(x)
        advantages = self.surplus_network(x)
        return state_value + advantages - advantages.mean(dim=-1, keepdim=True)

## Training

In [None]:
def get_state_values(states, net, splits=64):
    """Estimate the mean best-action value of ``net`` over a set of states.

    The states are evaluated in at most ``splits`` batches; for each batch
    the maximum Q-value of every state is averaged, and the batch averages
    are averaged again.

    Args:
        states: array of observations, one row per state.
        net: network mapping a batch of observations to per-action values.
        splits: number of batches to split ``states`` into.

    Returns:
        The mean of the per-batch mean maximum Q-values, or NaN when
        ``states`` is empty.
    """
    batch_means = []
    # Pure evaluation: no autograd graph is needed.
    with torch.no_grad():
        for batch in np.array_split(states, splits):
            if len(batch) == 0:
                # array_split pads with empty chunks when there are fewer
                # states than splits; their mean is NaN and used to poison
                # the overall result.
                continue
            observation_tensor = torch.tensor(batch).to(device)
            q_values = net(observation_tensor)
            batch_means.append(q_values.max(dim=1)[0].mean().item())
    if not batch_means:
        return np.float64(np.nan)
    return np.mean(batch_means)

def loss(batch, net, tgt_net, gamma):
    """Compute the double-DQN mean-squared TD error for a batch of transitions.

    The next action is chosen with ``net`` and evaluated with ``tgt_net``;
    transitions whose ``last_state`` is ``None`` are terminal and contribute
    no bootstrap value.

    Args:
        batch: iterable of transitions exposing ``state``, ``action``,
            ``reward`` and ``last_state`` (``None`` at episode end).
        net: network being trained.
        tgt_net: target network used to evaluate the next-state action.
        gamma: discount applied to the next-state value.

    Returns:
        Scalar tensor holding the MSE between ``Q(s, a)`` and
        ``reward + gamma * Q_target(s', argmax_a Q(s', a))``.
    """
    states, actions, rewards, dones, next_states = [], [], [], [], []
    for experience in batch:
        is_terminal = experience.last_state is None
        states.append(np.asarray(experience.state))
        actions.append(experience.action)
        rewards.append(experience.reward)
        dones.append(is_terminal)
        # Terminal transitions have no next state; reuse the current state as
        # a placeholder — its value is masked to zero below.
        next_states.append(np.asarray(experience.state if is_terminal else experience.last_state))

    # np.asarray instead of np.array(..., copy=False): under NumPy 2 the
    # latter raises whenever a copy is unavoidable (e.g. stacking a list).
    states_v = torch.tensor(np.asarray(states)).to(device)
    # gather() needs int64 indices regardless of the platform's default int.
    actions_v = torch.tensor(np.asarray(actions, dtype=np.int64)).to(device)
    rewards_v = torch.tensor(np.asarray(rewards, dtype=np.float32)).to(device)
    # Boolean mask: indexing with uint8 tensors is deprecated in PyTorch.
    done_mask = torch.tensor(np.asarray(dones, dtype=bool)).to(device)
    next_states_v = torch.tensor(np.asarray(next_states)).to(device)

    state_action_values = net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target is a constant with respect to the optimised parameters.
    with torch.no_grad():
        next_state_actions = net(next_states_v).max(1)[1]
        next_state_values = tgt_net(next_states_v).gather(1, next_state_actions.unsqueeze(-1)).squeeze(-1)
        next_state_values[done_mask] = 0.0

    expected_state_action_values = next_state_values * gamma + rewards_v
    return nn.MSELoss()(state_action_values, expected_state_action_values)

In [None]:
# Device on which the networks live and onto which get_state_values / loss move their tensors.
device = torch.device("cpu")

In [None]:
# Capacity passed to the experience replay buffer.
REPLAY_SIZE = 100000
# Minimum number of buffered transitions before optimisation starts.
REPLAY_INITIAL = 10000

# Save a checkpoint whenever the iteration counter is a multiple of this value.
CHECKPOINT_EVERY_STEP = 99999
# NOTE(review): not referenced by any code visible in this notebook section.
VALIDATION_EVERY_STEP = 100000

# Synchronise the target network every this many iterations.
TARGET_NET_SYNC = 1000
# Per-step discount factor.
GAMMA = 0.99

# Number of transitions sampled per optimisation step.
BATCH_SIZE = 32

# Number of environment steps folded into one stored transition (n-step return);
# the loss discounts the bootstrap value by GAMMA ** REWARD_STEPS.
REWARD_STEPS = 2

# LEARNING_RATE = 0.0001
# Learning rate given to the Adam optimiser.
LEARNING_RATE = 0.01

# Number of buffered states held out to track the mean predicted value.
STATES_TO_EVALUATE = 1000
# Evaluate the held-out states every this many iterations.
EVAL_EVERY_STEP = 1000

# Linear epsilon-greedy schedule: epsilon = max(EPSILON_STOP, EPSILON_START - iteration / EPSILON_STEPS).
EPSILON_START = 1.0
EPSILON_STOP = 0.1
EPSILON_STEPS = 1000

In [None]:
def train_stock(environment, records, network, agent, tag='', train_steps=10000,
                save_dir="/content/drive/My Drive/RL_Project/"):
    """Train ``network`` on ``environment`` with (double) DQN and experience replay.

    The replay buffer is first filled until it holds ``REPLAY_INITIAL``
    transitions, then ``train_steps`` optimisation steps are performed, each
    preceded by one new environment transition.  Every ``EVAL_EVERY_STEP``
    iterations the mean predicted value over a fixed sample of states is
    computed, and the network weights are saved whenever that value improves.

    Args:
        environment: environment the agent interacts with.
        records: unused; kept so existing calls keep working.
        network: Q-network to optimise (also the network the agent acts with).
        agent: agent exposing an ``action_selector`` with an ``epsilon`` attribute.
        tag: label embedded in the names of the saved model files.
        train_steps: number of optimisation steps performed after the replay
            warm-up.  The previous loop stopped after 10000 iterations in
            total, which equals ``REPLAY_INITIAL`` and therefore allowed
            exactly one optimisation step.
        save_dir: directory receiving best-model and checkpoint files.
    """
    target_network = TargetNet(network)
    experience_source = ExperienceSourceFirstLast(environment, agent, GAMMA, steps_count=REWARD_STEPS)
    buffer = ExperienceReplayBuffer(experience_source, REPLAY_SIZE)
    optimizer = optim.Adam(network.parameters(), lr=LEARNING_RATE)

    itr = 0        # every iteration, including replay warm-up
    trained = 0    # optimisation steps actually performed
    states = None
    optimal_mean = None
    while trained < train_steps:
        itr += 1
        buffer.populate(1)
        # Anneal the exploration rate of the agent's own selector rather than
        # relying on a notebook-level `selector` variable.
        agent.action_selector.epsilon = max(EPSILON_STOP, EPSILON_START - itr / EPSILON_STEPS)

        if len(buffer) < REPLAY_INITIAL:
            continue

        if states is None:
            # Fixed sample of states used to track progress across evaluations.
            states = buffer.sample(STATES_TO_EVALUATE)
            states = np.asarray([np.asarray(transition.state) for transition in states])

        if itr % EVAL_EVERY_STEP == 0:
            mean_val = get_state_values(states, network)
            if optimal_mean is None or optimal_mean < mean_val:
                optimal_mean = mean_val
                network_dictionary = network.state_dict()
                torch.save(network_dictionary, os.path.join(save_dir, "mean_val_"+str(round(mean_val,3))+'+'+tag+".data"))

        optimizer.zero_grad()
        batch = buffer.sample(BATCH_SIZE)
        value_loss = loss(batch, network, target_network.target_model, GAMMA ** REWARD_STEPS)
        value_loss.backward()
        optimizer.step()
        trained += 1

        if itr % TARGET_NET_SYNC == 0:
            target_network.sync()
        if itr % CHECKPOINT_EVERY_STEP == 0:
            # Fixes: parenthesise the index (% and // share precedence, so the
            # string was being floor-divided), save the wrapped target model's
            # weights, and write next to the best-model files instead of the
            # undefined `saves_path`.
            checkpoint_name = "checkpoint-%3d.data" % (itr // CHECKPOINT_EVERY_STEP)
            torch.save(target_network.target_model.state_dict(), os.path.join(save_dir, checkpoint_name))
    print('Training Complete!')

# Load the training prices; the environment classes read this module-level `records`.
records = read_csv(data_path)
# Run 1: environment A with the plain DQN network.
environment = StockExchangeEnvironmentA()
# Cap every episode at 1000 steps.
environment = gym.wrappers.TimeLimit(environment, max_episode_steps=1000)
observation_space, action_space = environment.get_environment_space()
network = DQN(observation_space, action_space).to(device)
# Epsilon-greedy selector; this same object is reused by the later training cells.
selector = EpsilonGreedyActionSelector(EPSILON_START)
agent = DQNAgent(network, selector, device=device)
train_stock(environment, records, network, agent, tag='ENV_A+DQN')

Dataset File /content/drive/My Drive/RL_Project/YNDX_160101_161231.csv
Training Complete!


In [None]:
# Run 2: environment B with the plain DQN network.
# Reuses `records` and `selector` created in the first training cell.
environment = StockExchangeEnvironmentB()
environment = gym.wrappers.TimeLimit(environment, max_episode_steps=1000)
observation_space, action_space = environment.get_environment_space()
network = DQN(observation_space, action_space).to(device)
agent = DQNAgent(network, selector, device=device)
train_stock(environment, records, network, agent, tag='ENV_B+DQN')

Training Complete!


In [None]:
# Run 3: environment A with the dueling network.
# Reuses `records` and `selector` created in the first training cell.
environment = StockExchangeEnvironmentA()
environment = gym.wrappers.TimeLimit(environment, max_episode_steps=1000)
observation_space, action_space = environment.get_environment_space()
network = DuelingDQN(observation_space, action_space).to(device)
agent = DQNAgent(network, selector, device=device)
train_stock(environment, records, network, agent, tag='ENV_A+DuelingDQN')

Training Complete!


In [None]:
# Run 4: environment B with the dueling network.
# Reuses `records` and `selector` created in the first training cell.
environment = StockExchangeEnvironmentB()
environment = gym.wrappers.TimeLimit(environment, max_episode_steps=1000)
observation_space, action_space = environment.get_environment_space()
network = DuelingDQN(observation_space, action_space).to(device)
agent = DQNAgent(network, selector, device=device)
train_stock(environment, records, network, agent, tag='ENV_B+DuelingDQN')

Training Complete!
