In [1]:
# Put the parent directory on the import path so that the `UCLSE` package
# can be imported from inside this notebook.
import os
import sys

module_path = os.path.abspath('..')
if module_path not in sys.path:
    sys.path.append(module_path)


from UCLSE.rl_trader import RLTrader
from UCLSE.rl_env import RLEnv
from UCLSE.custom_timer import CustomTimer
from UCLSE.message_environment import yamlLoad
from UCLSE.messenger import Messenger
from UCLSE.plotting_utilities import display_func
import os
import numpy as np
import pandas as pd

In this notebook we will see how a Reinforcement Learning experiment can easily be set up in BUCLSE.

Firstly we define a dictionary sufficient to define a trading environment as before.

In [2]:
# Settings describing the trading environment; passed to RLEnv further down.
environ_dic = {
    'start_time': 0.0,
    'end_time': 600.0,  # how long the experiment goes on for
    'supply_price_low': 95,
    'supply_price_high': 95,
    'demand_price_low': 105,
    'demand_price_high': 105,
    'interval': 30,
    'timemode': 'drip-poisson',
    # how many of each type of trader we want
    'buyers_spec': {'GVWY': 10, 'SHVR': 10, 'ZIC': 10, 'ZIP': 10},
    'sellers_spec': {'GVWY': 10, 'SHVR': 10, 'ZIC': 10, 'ZIP': 10},
    'verbose': False,
    'trade_file': 'avg_balance.csv',      # profit and loss summary by trader type
    'trade_record': 'transactions.csv',   # list of all transactions
    'dump_each_trade': True,              # record each trade
}


def geometric_q():
    """Draw a single sample from a geometric distribution with p = 0.6.

    The draw uses NumPy's global random state (``np.random``), so results
    are only reproducible if that state has been seeded beforehand.
    The function is registered below as the environment's ``quantity_f``.
    """
    success_probability = 0.6
    sample = np.random.geometric(success_probability)
    return sample

# One timer and one messenger are created here and shared by the environment
# settings and (in the next cell) the RL trader.
timer = CustomTimer(start=0, end=600, step=1/80)
messenger = Messenger()

# Register the quantity sampler and the shared objects, in the same key order
# as three separate assignments would produce.
environ_dic.update(
    quantity_f=geometric_q,
    timer=timer,
    messenger=messenger,
)

environ_dic

{'start_time': 0.0,
 'end_time': 600.0,
 'supply_price_low': 95,
 'supply_price_high': 95,
 'demand_price_low': 105,
 'demand_price_high': 105,
 'interval': 30,
 'timemode': 'drip-poisson',
 'buyers_spec': {'GVWY': 10, 'SHVR': 10, 'ZIC': 10, 'ZIP': 10},
 'sellers_spec': {'GVWY': 10, 'SHVR': 10, 'ZIC': 10, 'ZIP': 10},
 'verbose': False,
 'trade_file': 'avg_balance.csv',
 'trade_record': 'transactions.csv',
 'dump_each_trade': True,
 'quantity_f': <function __main__.geometric_q()>,
 'timer': time: 0 time left: 48000.0 start: 0 end: 600 step: 0.0125,
 'messenger': <UCLSE.messenger.Messenger at 0x1e537b86898>}

Next we define an RL trader. It inherits from the parent trader type, so it shares all of the methods needed to interact with BUCLSE. We will give it some initial inventory at a certain average cost.

In [3]:
# The RL trader is built with the same timer and messenger objects that were
# stored in environ_dic.
rl_trader = RLTrader(
    ttype='RL',
    tid='RL',
    n_quote_limit=100,
    timer=timer,
    messenger=messenger,
)

adding exchange to RL trader  RL


Next we instantiate the RL environment. This is a subclass of the OpenAI Gym `gym.Env` class.
This means it has render and step methods.

The RL environment will set up a Market_session object, (and through that associated exchange, traders and supply_demand objects).

The RL environment will actually iterate through a number of steps until the order book has depth = thresh on both sides

In [4]:
# Build the RL environment around the trader and the environment settings.
# thresh=4 is the order-book depth threshold described in the text above.
lobenv=RLEnv(RL_trader=rl_trader,environ_dic=environ_dic,thresh=4)


using timer start time=0, end time=600, instead
overwriting timer step size from: 0.01 to 0.0125
adding exchange to RL trader  RL


Check what time it is. The experiment will only begin once the LOB has reached a depth greater than the defined threshold.

In [5]:
# Current time on the shared timer, after the environment has been constructed.
timer.time

6.475

The render method returns the lob

In [6]:
# Display the current limit order book (LOB).
lobenv.render()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,tid,tid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,otype,Ask,Bid
price,time,qid,qty,Unnamed: 4_level_2,Unnamed: 5_level_2
78.0,5.5875,54,3,,B33
80.0,4.7125,40,1,,B39
91.0,5.8375,56,1,,B37
92.0,5.9375,58,1,,B14
95.0,5.2625,47,1,S11,
95.0,5.3375,51,1,S38,
95.0,5.4,52,1,S18,
95.0,5.8625,57,3,S09,
98.0,4.75,41,1,S31,
105.0,3.425,24,1,S39,


Give the RL trader inventory priced at mid point

In [6]:
# Give the trader a 'Buy' position of quantity 1, priced at the environment's current mid price.
rl_trader.setup_initial_inventory('Buy',1,lobenv.mid_price)
# Show the trade manager summary (inventory, average cost, direction, cash).
rl_trader.trade_manager

oid gen RL_6.475_1


inventory: 1, avg cost 94.0, direction Long, cash -94.0,

The RL trader at this point can now submit actions, see results and receive rewards through the step method.

In [6]:
# Show the available actions: each key is an action tuple, each value describes what it does.
lobenv.action_dic

{(0, 0, 0): Do nothing,
 (1, 0, 0): cancel Bid,
 (1, 1, -1): Cross bid-ask spread and fill Bid quantity 1 at best,
 (1, 1, 0):  submit or replace Bid with spread 0 and quantity 1,
 (1, 1, 1):  submit or replace Bid with spread 1 and quantity 1,
 (1, 1, 2):  submit or replace Bid with spread 2 and quantity 1,
 (1, 1, 3):  submit or replace Bid with spread 3 and quantity 1,
 (1, 1, 4):  submit or replace Bid with spread 4 and quantity 1,
 (1, 1, 5):  submit or replace Bid with spread 5 and quantity 1,
 (-1, 0, 0): cancel Ask,
 (-1, -1, -1):  submit or replace Ask with spread 0 and quantity 1,
 (-1, -1, 0):  submit or replace Ask with spread 1 and quantity 1,
 (-1, -1, 1):  submit or replace Ask with spread 1 and quantity 1,
 (-1, -1, 2):  submit or replace Ask with spread 2 and quantity 1,
 (-1, -1, 3):  submit or replace Ask with spread 3 and quantity 1,
 (-1, -1, 4):  submit or replace Ask with spread 4 and quantity 1,
 (-1, -1, 5):  submit or replace Ask with spread 5 and quantity 1}

Demonstrate how to submit an action with the standard step method. 

After the order is submitted to the exchange, the timer is advanced by one step, and the other traders in the environment have the opportunity to respond.

Rewards and stopping criteria are calculated.

The step function returns the state of the lob, reward and a boolean stopping variable.

In [7]:
lobenv.sess.process_verbose=True #turn on the verbal messages from the exchange for exposition
print('LOB before')
print(lobenv.render())

# Action (1,1,0) is listed in lobenv.action_dic as
# "submit or replace Bid with spread 0 and quantity 1"; the message below
# names the same tuple that is actually passed to step().
print('RL trader does action (1,1,0) which is to add a bid order at 0 spread to best bid for quantity 1')
# step() returns four values; the fourth is not used here.
observation,reward,done,_=lobenv.step((1,1,0))
print('LOB after')
print(lobenv.render())
print(lobenv.trader.trade_manager)

LOB before
                       tid     
otype                  Ask  Bid
price time    qid qty          
84.0  10.9875 80  3    NaN  B26
90.0  9.9625  64  1    NaN  B39
92.0  9.7375  60  2    NaN  B38
98.0  11.6375 90  2    NaN  B28
100.0 8.6500  45  1    NaN  B27
104.0 11.4250 88  1    NaN  B25
      12.2000 98  1    NaN  B36
105.0 10.0875 66  2    NaN  B10
      11.7875 93  1    NaN  B11
      12.2875 99  2    NaN  B16
106.0 10.7500 78  1    NaN  B18
      10.9125 79  2    NaN  B15
109.0 11.3125 84  2    S37  NaN
111.0 11.8625 96  1    S31  NaN
117.0 12.3250 100 1    S32  NaN
127.0 12.1500 97  2    S30  NaN
RL trader does action (1,0,1) which is to add a bid order at 0 spread to best bid for quantity 1
oid gen RL_12.325_1
add_order < response=Proceed (time,oid,qid) None
LOB after
                       tid     
otype                  Ask  Bid
price time    qid qty          
84.0  10.9875 80  3    NaN  B26
90.0  9.9625  64  1    NaN  B39
92.0  9.7375  60  2    NaN  B38
98.0  11.6375

observation is the state variable (here the public anonymous lob), reward is the reward received and done is a boolean informing whether the experiment has finished. 

To see what the reward and finishing functions are:

In [10]:
# Show RLEnv.stop_checker, the method that decides when the experiment is finished.
# NOTE(review): display_func presumably prints the method's source - confirm.
display_func('UCLSE.rl_env.RLEnv','stop_checker')

This will end when the inventory is greater than a limit, time has expired or the trader has no inventory and no open orders at exchange.

In [11]:
# Show RLEnv.reward_get, the method that computes the reward.
# NOTE(review): display_func presumably prints the method's source - confirm.
display_func('UCLSE.rl_env.RLEnv','reward_get')

Reward is positive if there is no inventory, no quotes and a positive cash balance, else reward is negative.

Check the last few changes happening at exchange

In [12]:
# Last five records on the exchange tape, tabulated as a DataFrame.
pd.DataFrame(lobenv.sess.exchange.tape).tail(5)

Unnamed: 0,oid,otype,p1_qid,p2_qid,party1,party2,price,qid,qty,tape_time,tid,time,type
112,-64,Ask,,,,,104.0,63.0,1,8.825,S22,8.825,New Order
113,-33,Bid,,,,,82.0,58.0,1,8.875,B31,8.7375,Cancel
114,-33,Bid,,,,,82.0,64.0,1,8.875,B31,8.875,New Order
115,-21,Bid,,,,,94.0,65.0,1,8.9,B19,8.9,New Order
116,RL_8.9_2,Bid,,,,,94.0,66.0,1,8.9,RL,8.9,New Order


Cancel the order with action (1,0,0) (cancel Bid). The RL trader is no longer on the LOB.

In [24]:
# (1,0,0) is listed in lobenv.action_dic as "cancel Bid"; the observation is discarded here.
_,reward,done,_=lobenv.step((1,0,0))
# Display the order book after the cancel.
lobenv.render()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,tid,tid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,otype,Ask,Bid
price,time,qid,qty,Unnamed: 4_level_2,Unnamed: 5_level_2
79.0,7.175,41.0,3,,B32
82.0,8.875,64.0,1,,B31
93.0,7.3625,42.0,1,,B20
94.0,8.9,65.0,1,,B19
95.0,5.1125,22.000001,3,S06,
95.0,7.575,45.0,1,S10,
95.0,7.8625,48.0,1,S02,
95.0,8.5875,55.0,2,S16,
95.0,8.75,59.0,3,S00,
95.0,8.7875,61.0,2,S15,


In [14]:
# Last five tape records again; the most recent one should now be the RL trader's Cancel.
pd.DataFrame(lobenv.sess.exchange.tape).tail(5)

Unnamed: 0,oid,otype,p1_qid,p2_qid,party1,party2,price,qid,qty,tape_time,tid,time,type
113,-33,Bid,,,,,82.0,58.0,1,8.875,B31,8.7375,Cancel
114,-33,Bid,,,,,82.0,64.0,1,8.875,B31,8.875,New Order
115,-21,Bid,,,,,94.0,65.0,1,8.9,B19,8.9,New Order
116,RL_8.9_2,Bid,,,,,94.0,66.0,1,8.9,RL,8.9,New Order
117,RL_8.9_2,Bid,,,,,94.0,66.0,1,8.9125,RL,8.9,Cancel


Buy at the best ask by crossing the bid-ask spread with action (1,1,-1). Note that this will end the experiment since the inventory limit is 1.

In [8]:
# (1,1,-1) is listed in lobenv.action_dic as
# "Cross bid-ask spread and fill Bid quantity 1 at best".
_,reward,done,_=lobenv.step((1,1,-1))

# Report the environment, the trader's position, and the step outcome.
print(lobenv)
print(lobenv.trader.trade_manager)
print(f'reward:{reward},finished: {done}')
# Display the order book after the trade.
lobenv.render()

oid gen RL_12.3375_2
add_order < response=Proceed (time,oid,qid) None
<RLEnv instance>
inventory: 1, avg cost 109.0, direction Long, cash -109,
reward:-1,finished: False


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,tid,tid
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,otype,Ask,Bid
price,time,qid,qty,Unnamed: 4_level_2,Unnamed: 5_level_2
84.0,10.9875,80.0,3,,B26
90.0,9.9625,64.0,1,,B39
92.0,9.7375,60.0,2,,B38
98.0,11.6375,90.0,2,,B28
100.0,8.65,45.0,1,,B27
104.0,11.425,88.0,1,,B25
104.0,12.35,104.0,1,,B36
105.0,10.0875,66.0,2,,B10
105.0,11.7875,93.0,1,,B11
105.0,12.2875,99.0,2,,B16


In [26]:
# Last five tape records, to inspect the Fill/Trade entries produced by crossing the spread.
pd.DataFrame(lobenv.sess.exchange.tape).tail(5)

Unnamed: 0,oid,otype,p1_qid,p2_qid,party1,party2,price,qid,qty,tape_time,tid,time,type
126,,Ask,,,,,95.0,45.0,1,8.9625,S10,7.575,Fill
127,,Bid,,,,,95.0,70.0,1,8.9625,RL,8.9625,Fill
128,,,45.0,70.0,S10,RL,95.0,,1,8.9625,,,Trade
129,-64.0,Ask,,,,,104.0,63.0,1,8.975,S22,8.825,Cancel
130,-64.0,Ask,,,,,104.0,71.0,1,8.975,S22,8.975,New Order


The reset command returns the trader back to starting inventory. The order book is maintained. 

In [11]:
# Reset the trader, then re-create the starting position: 'Buy' quantity 1 at the current mid price.
rl_trader.reset()
rl_trader.setup_initial_inventory('Buy',1,lobenv.mid_price)
# Show the trade manager summary after the reset.
rl_trader.trade_manager

oid gen RL_6.4875_3


inventory: 1, avg cost 92.0, direction Long, cash -92.0,

In [12]:

# Snapshot the order book before acting.
print('LOB before')
print(lobenv.render())


# Submit the Ask-side action (-1,-1,-1).
# NOTE(review): presumably the Ask counterpart of (1,1,-1), i.e. sell by crossing
# the spread - confirm against lobenv.action_dic.
observation,reward,done,_=lobenv.step((-1,-1,-1)) 
# Snapshot the order book and the trader's position after the step.
print('LOB after')
print(lobenv.render())
print(lobenv.trader.trade_manager)

LOB before
                            tid     
otype                       Ask  Bid
price time   qid       qty          
69.0  6.1375 29.000000 3    NaN  B36
88.0  6.1500 30.000000 1    NaN  B31
89.0  6.4750 34.000000 2    NaN  B24
95.0  5.2500 25.000002 2    S03  NaN
      5.9750 28.000000 1    S16  NaN
      6.1750 31.000000 4    S11  NaN
112.0 6.1875 32.000000 2    S33  NaN
114.0 2.7000 5.000000  1    S34  NaN
oid gen RL_6.4875_4
add_order < response=Proceed (time,oid,qid) None
LOB after
                            tid     
otype                       Ask  Bid
price time   qid       qty          
69.0  6.1375 29.000000 3    NaN  B36
88.0  6.1500 30.000000 1    NaN  B31
89.0  6.4750 34.000001 1    NaN  B24
95.0  5.9750 28.000000 1    S16  NaN
      6.1750 31.000000 4    S11  NaN
      6.5000 38.000000 2    S03  NaN
112.0 6.1875 32.000000 2    S33  NaN
114.0 2.7000 5.000000  1    S34  NaN
inventory: 0, avg cost 0, direction Long, cash -3.0,


In [13]:
# Show the trader's position summary (inventory, average cost, direction, cash).
rl_trader.trade_manager

inventory: 0, avg cost 0, direction Long, cash -3.0,

In [9]:
# Last five tape records, to inspect the exchange-side record of the RL trader's sale.
pd.DataFrame(lobenv.sess.exchange.tape).tail(5)

Unnamed: 0,oid,otype,p1_qid,p2_qid,party1,party2,price,qid,qty,tape_time,tid,time,type
66,-26,Bid,,,,,89.0,34.0,2,6.475,B24,6.475,New Order
67,RL_6.475_2,Ask,,,,,94.0,35.0,1,6.475,RL,6.475,New Order
68,,Bid,,,,,94.0,33.0,1,6.475,B23,6.35,Fill
69,,Ask,,,,,94.0,35.0,1,6.475,RL,6.475,Fill
70,,,33.0,35.0,B23,RL,94.0,,1,6.475,,,Trade


In [10]:
# Show the trader's blotter: per-order records keyed by order id (oid).
rl_trader.blotter

{'RL_6.475_2': [{'tid': 'RL',
   'otype': 'Ask',
   'client_price': 94,
   'order_qty': 1,
   'order_issue_time': 6.475,
   'accession_time': 6.475,
   'qid': 35.0,
   'oid': 'RL_6.475_2',
   'exec_time': 6.475,
   'exec_qty': 1,
   'exec_price': 94,
   'profit': 0.0,
   'improvement': 0,
   'BS': 'Sell',
   'status': 'complete'}]}