In [1]:
%config Completer.use_jedi = False

In [2]:
import numpy as np
import matplotlib.pyplot as plt

import sys
sys.path.append('/home/victorialena/rlkit')

In [3]:
import copy
import torch
import torch.nn as nn
import torch_geometric.nn as gnn # cuda issue?

from gym.spaces import Box, MultiDiscrete
from numpy.random import rand
from torch.optim import Adam
from torch_geometric.data import Data
from torch_geometric.utils import to_undirected

from replay_buffer import anyReplayBuffer
from rlkit.policies.base import Policy


from path_collector import MdpPathCollector
from replay_buffer import replayBuffer

In [4]:
from env.multiPong import *

In [5]:
env = multiPong()

  "Function `create_seed(a, max_bytes)` is marked as deprecated and will be removed in the future. "
  "Function `_bigint_from_bytes(bytes)` is marked as deprecated and will be removed in the future. "


In [6]:
seed = 42

In [7]:
torch.manual_seed(seed)
np.random.seed(seed)
s = env.reset(seed)

In [8]:
n_agent, c_in = s.shape # 4, 38 # input feature 2+2+30+2+2
c_out = 6 #4 # action space
d = 0. # 0.1

def get_model():
#     return gnn.Sequential('x, edge_index',
#                           [(gnn.GATv2Conv(c_in, 16, heads=4, concat=True, dropout=d), 'x, edge_index -> x'), 
#                             nn.ReLU(inplace=True),
#                             (gnn.GATv2Conv(64, 8, heads=4, concat=True, dropout=d), 'x, edge_index -> x'),
#                             nn.ReLU(inplace=True),
#                             nn.Linear(32, c_out)])
#     return gnn.Sequential('x, edge_index',
#                           [(gnn.GATv2Conv(c_in, 8, heads=4, concat=True, dropout=d), 'x, edge_index -> x'), 
#                             nn.ReLU(inplace=True),
#                             nn.Linear(32, c_out)])
    return nn.Sequential(nn.Linear(c_in, 32),
                         nn.ReLU(),
                         nn.Linear(32, c_out))

In [9]:
class epsilonGreedyPolicy(nn.Module, Policy):
    def __init__(self, qf, space, eps=0.1):
        super().__init__()
        self.qf = qf
        self.aspace = space
        self.eps = np.clip(eps, .0, 1.)

    def __call__(self, x, g):
        if rand() < self.eps:
            return torch.Tensor(self.aspace.sample()).to(torch.long)
        q_values = self.qf(x) #, g)
        return q_values.argmax(-1)
    
    def get_action(self, obs):
        return self(obs.x, obs.edge_index)

In [10]:
def rollout(env, agent, max_path_length): #=np.inf):        
    observations = []
    actions = []
    rewards = []
    terminals = []
    next_observations = []
    
    path_length = 0
    o = env.reset()
    while env.isterminal():
        o = env.reset()
        
    while path_length < max_path_length:
        a = agent(o, env.g)
        next_o, r, done, env_info = env.step(copy.deepcopy(a))
        
        observations.append(o)
        rewards.append(r)
        terminals.append(done)
        actions.append(a)
        next_observations.append(next_o)
        
        path_length += 1
        if done:
            break
        o = next_o
        
    return dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        next_observations=next_observations,
        terminals=terminals,
    )

```python
# o = env.reset()
for _ in range(1):
    a = expl_policy(o, env.g)
    print(a)
    next_o, r, done, env_info = env.step(a)
    print(r)
    
env.render()
```

In [11]:
from copy import deepcopy
import torch.nn as nn
import torch.nn.functional as F

scientific_notation =  lambda x:"{:.2e}".format(x)

In [12]:
qf = get_model()
expl_policy = epsilonGreedyPolicy(qf, MultiDiscrete([c_out]*4), 0.1)

target_qf = get_model()
eval_policy = epsilonGreedyPolicy(target_qf, MultiDiscrete([c_out]*4), 0.0)

expl_path_collector = MdpPathCollector(env, expl_policy, rollout_fn=rollout, parallelize=False)
eval_path_collector = MdpPathCollector(env, eval_policy, rollout_fn=rollout, parallelize=False)

replay_buffer_cap = 4000 #10000
replay_buffer = replayBuffer(replay_buffer_cap, prioritized=True)

In [13]:
# steps = expl_path_collector.collect_nsteps(200, 200, False)

```python
path = rollout(env, expl_policy, 2500)
path['terminals'][-1]
env.render()

replay_buffer.add_path(path, env.g)

replay_buffer.random_batch(50)
```

In [14]:
optimizer = Adam(qf.parameters(), lr=1e-4, weight_decay=0.01)

max_len = 200
n_samples = 1000 #min(32, replay_buffer_cap//max_len) 

loss = []
avg_r_train = []
avg_r_test = []
max_r_test = []
success_rate = []

In [15]:
n_epoch = 40
n_iter = 64
batch_size = 64
gamma = 0.95

In [16]:
qf_criterion = nn.MSELoss()

In [17]:
def mean_reward(paths):
    return np.hstack([torch.vstack(p['rewards']).sum(1).numpy() for p in paths]).mean()

In [18]:
for i in range(n_epoch):
    qf.train(False)
    paths = expl_path_collector.collect_nsteps(n_samples, max_len, False)
    train_r = mean_reward(paths)
    replay_buffer.add_paths(paths, env.g)

    qf.train(True)
    for _ in range(n_iter): 
        batch = replay_buffer.random_batch(batch_size)

        rewards = batch.r.to(torch.float)
        terminals = batch.t.to(torch.float).repeat_interleave(4)
        actions = batch.a
        states = batch.x
        next_s = batch.next_s

        out = target_qf(next_s) #, env.g)
        target_q_values = out.max(-1, keepdims=False).values
        y_target = rewards + (1. - terminals) * gamma * target_q_values 
        
        out = qf(states) #, env.g)
        actions_one_hot = F.one_hot(actions.to(torch.int64), c_out)
        y_pred = torch.sum(out * actions_one_hot, dim=-1, keepdim=False)
        qf_loss = qf_criterion(y_pred, y_target).to(torch.float)

        loss.append(qf_loss.item())
        avg_r_train.append(rewards.mean().item())

        optimizer.zero_grad() 
        qf_loss.backward()
        optimizer.step()

    target_qf.load_state_dict(deepcopy(qf.state_dict()))
    err = 8
    print("epoch", i+1, #"| lr:", scientific_notation(optimizer.param_groups[0]["lr"]) ,
          " -> loss:", round(np.mean(loss[-n_iter:]), err),
          "| rewards: (train)", round(train_r, err))
#           "| (test)", round(avg_r_test[-1], err), "| (max)", round(max_r_test[-1], err),
#           "| success rate:", round(success_rate[-1], err))

epoch 1  -> loss: 0.03621449 | rewards: (train) -0.2363
epoch 2  -> loss: 0.02284621 | rewards: (train) -0.17189997
epoch 3  -> loss: 0.02209072 | rewards: (train) -0.1302


  return np.stack([np.stack(np.where(masked_obs == c)).mean(1) for c in summed_cvalues])
  ret, rcount, out=ret, casting='unsafe', subok=False)


epoch 4  -> loss: 0.02406532 | rewards: (train) -0.191
epoch 5  -> loss: 0.02104258 | rewards: (train) -0.097
epoch 6  -> loss: 0.02522908 | rewards: (train) -0.17193606
epoch 7  -> loss: 0.03069505 | rewards: (train) -0.20631868
epoch 8  -> loss: 0.02810762 | rewards: (train) -0.1491
epoch 9  -> loss: 0.02967692 | rewards: (train) -0.11960001
epoch 10  -> loss: 0.03800928 | rewards: (train) -0.30898204
epoch 11  -> loss: 0.02692761 | rewards: (train) -0.14930001
epoch 12  -> loss: 0.02880292 | rewards: (train) -0.12200001
epoch 13  -> loss: 0.02750982 | rewards: (train) -0.1446
epoch 14  -> loss: 0.0205277 | rewards: (train) -0.0909
epoch 15  -> loss: 0.02474472 | rewards: (train) -0.1103
epoch 16  -> loss: 0.0232911 | rewards: (train) -0.11400001
epoch 17  -> loss: 0.01993873 | rewards: (train) -0.1517
epoch 18  -> loss: 0.02022458 | rewards: (train) -0.09090001
epoch 19  -> loss: 0.0215789 | rewards: (train) -0.1258
epoch 20  -> loss: 0.02048987 | rewards: (train) -0.1071
epoch 21  

#### notes:

1. reduce feature space
    b. reduce action space
2. try linear network
3. double check sample trajectories
4. check sampled batches
5. remove "grey" from reward structure