In [1]:
import sys

sys.path.append('../')

from gymenv_v2 import make_multiple_env
import numpy as np

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
class Policy(nn.Module):
    """Convolutional policy that returns a probability for every row of a candidate set.

    The state ``s`` is a 5-item sequence ``(a, b, c, d, e)``. ``a`` and ``d`` are
    2-D with ``num_inputs`` columns, ``b`` and ``e`` are 1-D with one entry per
    row of ``a`` and ``d`` respectively; the third item is ignored.
    (Presumably ``a, b`` are the current constraints and ``d, e`` the candidate
    cuts from ``gymenv_v2`` — confirm against the environment.)

    Each ``[matrix | vector]`` pair is embedded row-wise by a shared stack of 1-D
    convolutions; the embeddings are compared with a dot product and averaged
    over the rows of ``a`` to give one score per row of ``d``.
    """

    # Lower bound on (max - min) in the min-max scaling. Without it a constant
    # vector would yield 0/0 = NaN and poison the whole softmax output.
    _EPS = 1e-8

    def __init__(self, num_inputs):
        """Build the network.

        Args:
            num_inputs: number of columns in the matrices ``a`` and ``d``.
                Four un-padded kernel-3 convolutions shorten each row by 8, and
                a vector column is appended first, so ``num_inputs`` must be at
                least 8.
        """
        super().__init__()
        # One BatchNorm shared by both matrices (feature-wise normalisation).
        self.batchNormMatrix = nn.BatchNorm1d(num_features=num_inputs)

        # Shared row encoder: 1 -> 32 -> 64 -> 32 -> 1 channels.
        self.core = nn.Sequential(
            nn.Conv1d(1, 32, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(32, 64, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(64, 32, 3),
            nn.Dropout(.5),
            nn.ReLU(),
            nn.Conv1d(32, 1, 3),
            nn.Dropout(.5),
            nn.ReLU(),
        )

        # Not read or written by this class; presumably filled in by the
        # training agent during an episode — confirm against its definition.
        self.saved_log_probs = []
        self.rewards = []

    def forward(self, s):
        """Return a 1-D tensor of probabilities, one per row of ``d``.

        Args:
            s: sequence ``(a, b, c, d, e)`` of array-likes; ``c`` is unused.

        Returns:
            Softmax over the per-row scores (sums to 1).
        """
        a, b, _, d, e = self._preproc(s)

        # NOTE(review): in training mode BatchNorm1d rejects a single-row
        # input; presumably callers always pass more than one row — confirm.
        a = self.batchNormMatrix(a)
        d = self.batchNormMatrix(d)
        b = self._min_max_scale(b)
        e = self._min_max_scale(e)

        # Append each vector as an extra column: (rows, num_inputs + 1).
        X = torch.cat((a, b.unsqueeze(1)), 1)
        Y = torch.cat((d, e.unsqueeze(1)), 1)

        # Conv1d wants (batch, channels, length): add, then drop, a channel axis.
        H = self.core(X.unsqueeze(1)).squeeze(1)
        G = self.core(Y.unsqueeze(1)).squeeze(1)

        # Pairwise similarity, shape (rows of a, rows of d).
        S = H @ G.T

        # Average over the rows of ``a`` -> one score per row of ``d``.
        action_scores = S.mean(0)

        return F.softmax(action_scores, dim=-1)

    @staticmethod
    def _min_max_scale(x):
        """Scale ``x`` to [0, 1]; a constant ``x`` maps to all zeros, not NaN."""
        lo = x.min()
        span = (x.max() - lo).clamp_min(Policy._EPS)
        return (x - lo) / span

    def _preproc(self, s):
        """Convert every item of the state to a float32 tensor."""
        return [torch.as_tensor(item, dtype=torch.float32) for item in s]

In [4]:
# --- Run configuration ------------------------------------------------------
SEED = 0                 # fixed so Restart & Run All repeats the same run
NUM_ITERATIONS = 10      # number of agent.finish_episode() calls
EPISODES_PER_UPDATE = 1  # episodes run before each finish_episode()
NUM_INPUTS = 60          # presumably the "n60" in load_dir — confirm

# Seed the RNGs visible from this notebook (Dropout in Policy draws from torch).
# NOTE(review): if the environment or agent uses Python's `random`, seed that too.
torch.manual_seed(SEED)
np.random.seed(SEED)

training_config = {
    'lr': 1e-2,
    'gamma': .95,
    'num_revisit': 1,
    'entropy_coef': 1e-2,
}
env_config = {
    "load_dir": '../instances/train_100_n60_m60',
    # NOTE(review): range(99) loads indices 0..98 although the directory name
    # says 100 — confirm whether the last instance is held out on purpose.
    "idx_list": list(range(99)),
    "timelimit": 50,
    "reward_type": 'obj',
}

# NOTE(review): Observer and Agent are not defined in any cell visible here
# (execution counts jump from In [2] to In [4]); presumably they come from a
# cell that is missing from this export — the notebook will not run on a fresh
# kernel until that definition is restored.
model = Policy(NUM_INPUTS)
observer = Observer(env_config)
agent = Agent(training_config, observer, model)

for iteration in range(NUM_ITERATIONS):
    for _ in range(EPISODES_PER_UPDATE):
        agent.run_episode()
    reward = agent.finish_episode()
    print(f'iter: {iteration}, training reward: {reward}')

loading training instances, dir ../instances/train_100_n60_m60 idx 0
loading training instances, dir ../instances/train_100_n60_m60 idx 1
loading training instances, dir ../instances/train_100_n60_m60 idx 2
loading training instances, dir ../instances/train_100_n60_m60 idx 3
loading training instances, dir ../instances/train_100_n60_m60 idx 4
loading training instances, dir ../instances/train_100_n60_m60 idx 5
loading training instances, dir ../instances/train_100_n60_m60 idx 6
loading training instances, dir ../instances/train_100_n60_m60 idx 7
loading training instances, dir ../instances/train_100_n60_m60 idx 8
loading training instances, dir ../instances/train_100_n60_m60 idx 9
loading training instances, dir ../instances/train_100_n60_m60 idx 10
loading training instances, dir ../instances/train_100_n60_m60 idx 11
loading training instances, dir ../instances/train_100_n60_m60 idx 12
loading training instances, dir ../instances/train_100_n60_m60 idx 13
loading training instances, di