In [2]:
import random
from env import MarketSimulation
from utils import get_options
import numpy as np
import torch
from torch.distributions import Normal
from torch.nn import functional as F
from torch.optim import Adam
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

%load_ext autoreload
%autoreload 2

seed = 10

LAMBDA = 0.95
GAMMA = 0.99

ACTOR_LR = 2e-4
CRITIC_LR = 2e-4

CLIP = 0.2
BATCH_SIZE = 64

ITERATIONS = 10000

torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

pygame 2.1.0 (SDL 2.0.16, Python 3.10.9)
Hello from the pygame community. https://www.pygame.org/contribute.html


  File "C:\Software\anaconda3\lib\site-packages\gymnasium\envs\registration.py", line 602, in load_plugin_envs
    fn()
  File "C:\Software\anaconda3\lib\site-packages\shimmy\registration.py", line 263, in register_gymnasium_envs
    _register_atari_envs()
  File "C:\Software\anaconda3\lib\site-packages\shimmy\registration.py", line 207, in _register_atari_envs
    _register_atari_configs(
  File "C:\Software\anaconda3\lib\site-packages\shimmy\registration.py", line 131, in _register_atari_configs
    from ale_py.roms import utils as rom_utils
  File "C:\Software\anaconda3\lib\site-packages\ale_py\roms\__init__.py", line 94, in <module>
    _RESOLVED_ROMS = _resolve_roms()
  File "C:\Software\anaconda3\lib\site-packages\ale_py\roms\__init__.py", line 46, in _resolve_roms
    supported, unsupported = package.resolve()
  File "C:\Software\anaconda3\lib\site-packages\ale_py\roms\utils.py", line 60, in resolve
    lambda file: file.suffix == ".bin", resources.files(self.package).iterdir()


cuda


In [3]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()

        self.model = nn.Sequential(nn.BatchNorm1d(int(state_dim/2)),
                                   nn.Flatten(),
                                   nn.Linear(state_dim, 256),
                                   nn.ELU(),
                                   nn.Linear(256, 256),
                                   nn.ELU(),
                                   nn.Linear(256, action_dim)).to(device)
        
        self.sigma = nn.Parameter(torch.zeros((action_dim))).to(device)
        self.tanh = nn.Tanh()
        
    def compute_proba(self, state, action):
        _, _, distribution = self.act(state)
        return torch.exp(distribution.log_prob(action).sum(-1))

    def act(self, state):
        means = self.model(state)
        stds = torch.exp(self.sigma).expand_as(means)
        distribution = torch.distributions.Normal(means, stds)
        
        action = distribution.sample()
        tanh_action = self.tanh(action)
        return tanh_action, action, distribution

    
class Critic(nn.Module):
    def __init__(self, state_dim):
        super().__init__()
        self.model = nn.Sequential(nn.BatchNorm1d(int(state_dim/2)),
                                   nn.Flatten(),
                                   nn.Linear(state_dim, 256),
                                   nn.ELU(),
                                   nn.Linear(256, 256),
                                   nn.ELU(),
                                   nn.Linear(256, 1)).to(device)

    def get_value(self, state):
        return self.model(state)

In [4]:
class Agent:
    def __init__(self, agent_id, actives, money, state_dim, action_dim):
        self.agent_id = agent_id
        self.actives = actives
        self.money = money
        
        self.open = False
        self.request = None
        self.num_iterations = None
        
        self.actor = Actor(state_dim, action_dim).to(device)
        self.critic = Critic(state_dim).to(device)
        self.actor_optim = Adam(self.actor.parameters(), ACTOR_LR)
        self.critic_optim = Adam(self.critic.parameters(), CRITIC_LR)

    def update(self, trajectories):
        state, action, old_prob, target_value, advantage = zip(*trajectories)
        state = np.array(state)
        action = np.array(action)
        old_prob = np.array(old_prob)
        target_value = np.array(target_value)
        advantage = np.array(advantage)
        advnatage = (advantage - advantage.mean()) / (advantage.std() + 1e-8)
        
        s = torch.tensor(state).float().to(device)
        a = torch.tensor(action).float().to(device)
        op = torch.tensor(old_prob).float().to(device)
        v = torch.tensor(target_value).float().to(device)
        adv = torch.tensor(advantage).float().to(device)

        prob = self.actor.compute_proba(s, a)
        ratio = prob / op
        surr1 = ratio * adv
        surr2 = torch.clamp(ratio, 1 - CLIP, 1 + CLIP) * adv
        actor_loss = - torch.min(surr1, surr2).mean()
        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()   

        value = self.critic.get_value(s)
        target = v.unsqueeze(1)
        critic_loss = F.smooth_l1_loss(value, target)
        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

    def get_value(self, state):
        with torch.no_grad():
            state = torch.tensor(np.array([state])).float().to(device)
            value = self.critic.get_value(state)
        return value.cpu().item()

    def act(self, state):
        with torch.no_grad():
            state = torch.tensor(np.array([state])).float().to(device)
            action, pure_action, distr = self.actor.act(state)
            prob = torch.exp(distr.log_prob(pure_action).sum(-1))
        return action.cpu().numpy()[0], pure_action.cpu().numpy()[0], prob.cpu().item()
    
    def random_action(self):
        return self.agent_id, [random.uniform(-1, 1) for _ in range(3)]

    def save(self):
        torch.save(self.actor, f"agent_{self.agent_id}.pkl")

In [5]:
def compute_lambda_returns_and_gae(trajectory):
    lambda_returns = []
    gae = []
    last_lr = 0.
    last_v = 0.
    for _, _, r, _, v in reversed(trajectory):
        ret = r + GAMMA * (last_v * (1 - LAMBDA) + last_lr * LAMBDA)
        last_lr = ret
        last_v = v
        lambda_returns.append(last_lr)
        gae.append(last_lr - v)
    return [(s, a, p, v, adv) for (s, a, _, p, _), v, adv in zip(trajectory, reversed(lambda_returns), reversed(gae))]

In [6]:
def sample_episode(env, agents):
    observations = env.reset()
    
    dones = [False for _ in range(len(agents))]
    
    trajectories = [[] for _ in range(len(agents))]
    k = 0

    while not any(dones) and k < BATCH_SIZE:
    
        actions = []
        pure_actions = []
        probs = []
        values = []
        
        for i in range(len(agents)):
            a, pa, p = agents[i].act(observations[i])
            actions.append((agents[i].agent_id, a))
            pure_actions.append(pa)
            probs.append(p)
            
            v = agents[i].get_value(observations[i])
            values.append(v)

        next_observations, rewards, dones, _, _ = env.step(actions)
        
        for i in range(len(agents)):
            trajectories[i].append((observations[i], pure_actions[i], rewards[i], probs[i], values[i]))
        
        observations = next_observations
        k += 1
        
    return [compute_lambda_returns_and_gae(trajectory) for trajectory in trajectories]

In [7]:
def evaluate_policy(env, agents, num_iter):
    observations = env.reset()
    result = []
    dones = [False for _ in range(len(agents))]
    k = 0

    while not any(dones) and k < num_iter:
        actions = []        
        for i in range(len(agents)):
            a, _, _ = agents[i].act(observations[i])
            actions.append((agents[i].agent_id, a))
        next_observations, rewards, dones, _, _ = env.step(actions)  
        result.append(rewards)
        observations = next_observations
        k += 1
        
    return np.mean(result, axis=0)

In [33]:
from tqdm.notebook import trange, tqdm

min_actives = 0
max_actives = 10_000

min_money = 500_000
max_money = 1_000_000
glass_len = 3
state_dim = (glass_len * 2 + 3) * 2
action_dim = 3
num_agents = 10

agents = [Agent(agent_id, random.randint(min_actives, max_actives), random.randint(min_money, max_money), state_dim, action_dim) for agent_id in range(num_agents)]
env = MarketSimulation(agents, glass_len)

for i in trange(ITERATIONS):
        trajectories = sample_episode(env, agents)
        
        for agent, trajectory in zip(agents, trajectories):
            agent.update(trajectory)

        if (i + 1) % (ITERATIONS//100) == 0:
            rewards = evaluate_policy(env, agents, 5)
            print(f"Step: {i+1}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}")

  0%|          | 0/5000 [00:00<?, ?it/s]

  critic_loss = F.smooth_l1_loss(value, target)


Step: 50, Reward mean: 8.999979972839355, Reward std: 1.0191382898483425e-05
Step: 100, Reward mean: 8.994186401367188, Reward std: 1.4854071196168661e-05
Step: 150, Reward mean: 8.998918533325195, Reward std: 9.5367431640625e-07
Step: 200, Reward mean: 8.999616622924805, Reward std: 0.0
Step: 250, Reward mean: 8.999839782714844, Reward std: 0.0
Step: 300, Reward mean: 8.999931335449219, Reward std: 1.2063131862305454e-06
Step: 350, Reward mean: 8.999969482421875, Reward std: 0.0
Step: 400, Reward mean: 9.0, Reward std: 0.0
Step: 450, Reward mean: 9.0, Reward std: 0.0
Step: 500, Reward mean: 9.0, Reward std: 0.0
Step: 550, Reward mean: 8.999147415161133, Reward std: 2.523185003155959e-06
Step: 600, Reward mean: 8.999847412109375, Reward std: 0.0
Step: 650, Reward mean: 9.0, Reward std: 0.0
Step: 700, Reward mean: 9.0, Reward std: 0.0
Step: 750, Reward mean: 8.999728202819824, Reward std: 1.537753632874228e-06
Step: 800, Reward mean: 8.999947547912598, Reward std: 0.0
Step: 850, Reward 

In [20]:
from tqdm.notebook import trange, tqdm

min_actives = 0
max_actives = 10_000

min_money = 500_000
max_money = 1_000_000
glass_len = 3
state_dim = (glass_len * 2 + 3) * 2
action_dim = 3
num_agents = 10

agents = [Agent(agent_id, random.randint(min_actives, max_actives), random.randint(min_money, max_money), state_dim, action_dim) for agent_id in range(num_agents)]
env = MarketSimulation(agents, glass_len)

for i in trange(ITERATIONS):
        trajectories = sample_episode(env, agents)
        
        for agent, trajectory in zip(agents, trajectories):
            agent.update(trajectory)

        if (i + 1) % (ITERATIONS//100) == 0:
            rewards = evaluate_policy(env, agents, 5)            
            print(f"Step: {i+1}, Reward mean: {rewards}, Reward std: {np.std(rewards)}")
            
            for agent in agents:
                agent.save()

  0%|          | 0/5000 [00:00<?, ?it/s]

  critic_loss = F.smooth_l1_loss(value, target)


Step: 50, Reward mean: [[9.214831 ]
 [9.214831 ]
 [9.214831 ]
 [9.214831 ]
 [4.2148314]
 [9.214831 ]
 [9.214831 ]
 [9.214831 ]
 [9.214831 ]
 [9.214831 ]], Reward std: 1.5
Step: 100, Reward mean: [[9.998097]
 [9.998097]
 [9.998097]
 [9.998097]
 [4.998097]
 [9.998097]
 [9.998097]
 [9.998097]
 [9.998097]
 [9.998097]], Reward std: 1.5000001192092896
Step: 150, Reward mean: [[9.998495]
 [9.998495]
 [9.998495]
 [9.998495]
 [4.998495]
 [9.998495]
 [9.998495]
 [9.998495]
 [9.998495]
 [9.998495]], Reward std: 1.5
Step: 200, Reward mean: [[9.999522]
 [9.999522]
 [9.999522]
 [9.999522]
 [4.999522]
 [9.999522]
 [9.999522]
 [9.999522]
 [9.999522]
 [9.999522]], Reward std: 1.5
Step: 250, Reward mean: [[10.]
 [10.]
 [10.]
 [10.]
 [ 5.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]], Reward std: 1.5
Step: 300, Reward mean: [[10.]
 [10.]
 [10.]
 [10.]
 [ 5.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]], Reward std: 1.5
Step: 350, Reward mean: [[10.]
 [10.]
 [10.]
 [10.]
 [ 5.]
 [10.]
 [10.]
 [10.]
 [10.]
 [10.]], Reward s

Step: 2800, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 2850, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 2900, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 2950, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 3000, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e

Step: 4650, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [4.9999995e+00]
 [1.0000000e+01]
 [3.5846591e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 4700, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 4750, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [5.0000000e+00]
 [1.0000000e+01]
 [3.5884738e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 4800, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [4.9999995e+00]
 [1.0000000e+01]
 [3.5846591e-04]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]], Reward std: 3.2014670372009277
Step: 4850, Reward mean: [[1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e+01]
 [1.0000000e

In [7]:
from tqdm.notebook import trange, tqdm

min_actives = 0
max_actives = 10_000

min_money = 500_000
max_money = 1_000_000
glass_len = 4
state_dim = (glass_len * 2 + 3) * 2
action_dim = 3
num_agents = 8

agents = [Agent(agent_id, random.randint(min_actives, max_actives), random.randint(min_money, max_money), state_dim, action_dim) for agent_id in range(num_agents)]
env = MarketSimulation(agents, glass_len)

for i in trange(ITERATIONS):
        trajectories = sample_episode(env, agents)
        
        for agent, trajectory in zip(agents, trajectories):
            agent.update(trajectory)

        if (i + 1) % (ITERATIONS//100) == 0:
            rewards = evaluate_policy(env, agents, 5)
            print(f"Step: {i+1}, {rewards}, Reward mean: {np.mean(rewards)}, Reward std: {np.std(rewards)}")
            
            for agent in agents:
                agent.save()

  0%|          | 0/10000 [00:00<?, ?it/s]

  critic_loss = F.smooth_l1_loss(value, target)


Step: 100, [[11.952913 ]
 [10.352913 ]
 [ 9.952913 ]
 [ 9.952913 ]
 [10.352913 ]
 [12.7529125]
 [10.7529125]
 [ 7.180186 ]], Reward mean: 10.406322479248047, Reward std: 1.5347384214401245
Step: 200, [[ 7.8558607]
 [ 9.189194 ]
 [ 9.189194 ]
 [ 9.989194 ]
 [ 9.489194 ]
 [ 9.989194 ]
 [10.789194 ]
 [ 6.4164667]], Reward mean: 9.113436698913574, Reward std: 1.291795253753662
Step: 300, [[ 7.1046066]
 [ 9.993495 ]
 [ 9.993495 ]
 [ 9.993495 ]
 [ 7.8434954]
 [11.193495 ]
 [ 9.993495 ]
 [ 7.220768 ]], Reward mean: 9.167043685913086, Reward std: 1.4418740272521973
Step: 400, [[ 7.104673 ]
 [ 9.993563 ]
 [ 9.993563 ]
 [ 9.993563 ]
 [ 7.493562 ]
 [10.793562 ]
 [ 9.993563 ]
 [ 7.2208343]], Reward mean: 9.073360443115234, Reward std: 1.4208065271377563
Step: 500, [[ 7.0947685]
 [ 9.983658 ]
 [ 9.983658 ]
 [10.383657 ]
 [ 9.483658 ]
 [11.983658 ]
 [ 9.983658 ]
 [ 7.21093  ]], Reward mean: 9.513455390930176, Reward std: 1.5278652906417847
Step: 600, [[ 7.0936823]
 [ 9.982572 ]
 [ 9.982572 ]
 [ 9.98

Step: 4500, [[ 5.110299 ]
 [ 9.999188 ]
 [ 9.999188 ]
 [13.999188 ]
 [ 9.999188 ]
 [ 9.999188 ]
 [ 9.999188 ]
 [ 6.0264606]], Reward mean: 9.391486167907715, Reward std: 2.5673282146453857
Step: 4600, [[ 5.110895]
 [ 9.999784]
 [ 9.999784]
 [11.999784]
 [ 9.999784]
 [10.799784]
 [10.399784]
 [ 6.827056]], Reward mean: 9.392082214355469, Reward std: 2.115804433822632
Step: 4700, [[ 5.111091 ]
 [ 9.99998  ]
 [ 9.99998  ]
 [ 9.99998  ]
 [ 9.99998  ]
 [ 9.99998  ]
 [10.399981 ]
 [ 6.0272527]], Reward mean: 8.942277908325195, Reward std: 1.9651312828063965
Step: 4800, [[ 5.058069]
 [ 9.946958]
 [ 9.946958]
 [11.146957]
 [ 9.946958]
 [11.146957]
 [11.146957]
 [ 5.974231]], Reward mean: 9.289255142211914, Reward std: 2.251199960708618
Step: 4900, [[ 5.111061 ]
 [ 9.999949 ]
 [ 9.999949 ]
 [11.999949 ]
 [ 9.999949 ]
 [11.59995  ]
 [11.199949 ]
 [ 5.6272225]], Reward mean: 9.44224739074707, Reward std: 2.4630701541900635
Step: 5000, [[ 5.1111097]
 [ 9.999999 ]
 [ 9.999999 ]
 [ 9.999999 ]
 [ 9.9

Step: 8900, [[ 5.0043955]
 [11.999987 ]
 [ 9.999987 ]
 [ 9.999987 ]
 [ 9.999987 ]
 [ 9.999987 ]
 [ 9.999987 ]
 [ 5.227259 ]], Reward mean: 9.028946876525879, Reward std: 2.3503060340881348


KeyboardInterrupt: 

In [17]:
min_actives = 0
max_actives = 10_000

min_money = 500_000
max_money = 1_000_000
glass_len = 3
state_dim = (glass_len * 2 + 3) * 2
action_dim = 3

class DemoAgent:
    def __init__(self, agent_id):
        self.agent_id = agent_id
        self.actives = random.randint(min_actives, max_actives)
        self.money = random.randint(min_money, max_money)
        
        self.open = False
        self.request = None
        self.num_iterations = None
        
        self.model = torch.load(f"./agent_{agent_id}.pkl")
        
    def act(self, state):
        state = torch.tensor(np.array([state]), device=device).float()
        action = self.model.act(state)[0]
        return action.cpu().numpy()[0]

    def reset(self):
        pass

In [39]:
num_agents = 10

agents_demo = [DemoAgent(agent_id) for agent_id in range(num_agents)]
env_demo = MarketSimulation(agents_demo, glass_len, render_mode='human')

observations = env_demo.reset()

try:
    for ep in range(50):
        actions = []
        for i in range(len(agents_demo)):
            a = agents_demo[i].act(observations[i])
            actions.append((agents_demo[i].agent_id, a))

        next_observations, rewards, dones, _, _ = env_demo.step(actions)
        observations = next_observations if not any(dones) else env_demo.reset(options=get_options(dones))
        print(ep)
except KeyboardInterrupt:
    env_demo.close()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [12]:
import pandas as pd

file_name = './SBER.txt'
data = pd.read_csv(file_name)
data.head(10)

Unnamed: 0,DATE,NO,SECCODE,BUYSELL,TIME,ORDERNO,ACTION,PRICE,VOLUME,TRADENO,TRADEPRICE
0,20140901,2,SBER,S,100000000,2,1,74.18,1500,,
1,20140901,3,SBER,S,100000000,3,1,74.75,1500,,
2,20140901,33,SBER,B,100000000,33,1,73.16,500,,
3,20140901,57,SBER,B,100000000,57,1,71.33,6000,,
4,20140901,83,SBER,S,100000000,83,1,74.8,70,,
5,20140901,84,SBER,S,100000000,84,1,74.4,60,,
6,20140901,85,SBER,S,100000000,85,1,74.0,50,,
7,20140901,86,SBER,S,100000000,86,1,73.6,40,,
8,20140901,87,SBER,B,100000000,87,1,72.8,10,,
9,20140901,101,SBER,B,100000000,101,1,72.6,3300,,


In [50]:
slice_buy = data[['PRICE', 'VOLUME']].loc[data['BUYSELL'] == 'B']
slice_buy[:4407760].to_csv('./SBER_slice_buy.txt', index=False)
slice_buy.head(10)

Unnamed: 0,PRICE,VOLUME
2,73.16,500
3,71.33,6000
8,72.8,10
9,72.6,3300
12,73.03,500
13,72.53,40
24,72.93,500
26,70.21,200000
28,70.1,80
29,73.5,23000


In [51]:
slice_sell = data[['PRICE', 'VOLUME']].loc[data['BUYSELL'] == 'S']
slice_sell['VOLUME'] = -slice_sell['VOLUME']
slice_sell.to_csv('./SBER_slice_sell.txt', index=False)
slice_sell.head(10)

Unnamed: 0,PRICE,VOLUME
0,74.18,-1500
1,74.75,-1500
4,74.8,-70
5,74.4,-60
6,74.0,-50
7,73.6,-40
10,73.7,-10
11,73.58,-250
14,74.72,-210
15,74.79,-420


In [36]:
slice_sell.shape

(4407760, 2)

In [37]:
slice_buy.shape

(4573737, 2)