In [1]:
import sys,os
import numpy as np
from itertools import count
from collections import namedtuple
import pandas as pd
import matplotlib.pyplot as plt
import subprocess
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# env

In [2]:
class G4env():
    def __init__(self, batch_size = 32):

        self.batch_size = batch_size
        self.angle = np.zeros(self.batch_size)
        self.r = torch.empty((1,30,31,31),dtype=torch.float32)
        self.done = torch.zeros(self.batch_size).view(self.batch_size,1)

        self.ang_range = 85
        self.event_num = 1000
        self.angle_step = 5.0
        self.switch_counter = 0
        
        # Make evaluate array  
        self.ev_array = torch.zeros((30,31,31))
        self.ev_array[15:20, 0:5, 0:5] = 1
        
        # Make dict of dose distribution and angle 
        self.dose = dict()
        angles = [-90 + 5 * a for a in range(36)]
        for angle in angles:
            d = pd.read_csv("data/dose_t=" + str(angle) + ".csv")['dose']
            d = np.array(d).reshape(1,30,31,31) / self.event_num
            # d = torch.tensor(d)
            self.dose[angle] = d

    def reset(self):
        self.r = torch.zeros((1,30,31,31),dtype=torch.float32)
        self.switch_counter = 0
        
        self.angle = np.zeros(self.batch_size)
        self.done = torch.zeros(self.batch_size).view(self.batch_size,1).to(device)

        states = np.array([self.dose[a] for a in self.angle])
        states = torch.tensor(states, dtype=torch.float32).to(device)
        
        return states
    
    def GetReward(self, angle, switch, b):#報酬の計算用関数
        
        if (switch[b] == 1): #スイッチがONの場合のみ
            self.r = self.ev_array * self.dose[angle] #報酬を加算
            self.switch_counter += 1
        
        if (self.switch_counter > 20):
            self.done[b] = 1
        reward = self.r.to(torch.float32).sum().sum().view(-1)
        
        return reward
    
    def step(self, action, switch):

        rewards = torch.empty(0,dtype=torch.float32)
        
        for b in range(self.batch_size):    
            self.angle[b] += self.angle_step * (action[b] - 1)
            
            if (np.abs(self.angle[b]) >= self.ang_range):
                self.done[b] = 1
            
            reward = self.GetReward(self.angle[b], switch, b)      #Reward を計算
            rewards = torch.cat([rewards, reward],dim=0) #配列を組み合わせる(append, push_back) 
            
        rewards = rewards.view(self.batch_size, 1).to(device)
        
        states = np.array([self.dose[a] for a in self.angle])
        states = torch.from_numpy(states).float().to(device)
        return states, rewards, self.done
    


In [3]:
class SwitchNetWork(nn.Module):
    def __init__(self, nch_g=32):
        super(SwitchNetWork, self).__init__()

        self.batch_size = 32
        
        self.conv1 = nn.Conv3d(1        , nch_g    , 2)
        self.conv2 = nn.Conv3d(nch_g    , nch_g * 2, 2)
        self.conv3 = nn.Conv3d(nch_g * 2, nch_g * 4, 2)

        self.relu  = nn.ReLU()
        self.flat  = nn.Flatten()

        self.policy= nn.Linear(128 * 27 * 28 * 28, 3)
        self.value = nn.Linear(128 * 27 * 28 * 28, 1)
        self.switch= nn.Linear(128 * 27 * 28 * 28, 2)
        self.sfmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        #----------------
        x = self.conv1(x)
        x = self.relu(x)
        #----------------
        x = self.conv2(x)
        x = self.relu(x)
        #----------------
        x = self.conv3(x)
        x = self.relu(x)
        #----------------
        x = self.flat(x)
        #----------------
        probs = self.policy(x)
        probs = self.sfmax(probs)
        
        value  = self.value(x)
        
        switch = self.switch(x)
        switch = self.sfmax(switch)
        
        return probs, value, switch
        

In [4]:
class AgentSingleNet:
    def __init__(self):
        self.gamma = 0.98
        self.lr_pi = 0.0000002
        self.lr_v = 0.0000005
        self.action_size = 3

        self.net = SwitchNetWork().to(device)
        self.reward_max = torch.empty(1).to(device)
        self.optimizer_pi = optim.Adam(self.net.parameters(), lr=self.lr_pi)
        self.optimizer_v = optim.Adam(self.net.parameters(), lr=self.lr_v)
        
    def get_action(self, state):
        probs, _, switch_probs = self.net(state)
        
        actions = torch.multinomial(probs, 1)
        switchs = torch.multinomial(switch_probs, 1)
        
        selected_probs = torch.gather(probs, 1, actions)
        selected_switch_probs = torch.gather(switch_probs, 1, switchs)
        
        return actions, selected_probs, switchs, selected_switch_probs
        
    def update(self, states, actions_probs, rewards, next_states, dones):
        # ========== (1) Update V network ============
        _, v_next_states, _ = self.net(next_states)
        with torch.no_grad():
            targets = rewards + self.gamma * v_next_states * (1 - dones)
        _, v_states, s_probs = self.net(states)
        loss_v = F.mse_loss(v_states, targets)
        
        # ========== (2) Update pi network ===========
        with torch.no_grad():
            deltas = targets - v_states

        loss_pi = torch.mean(-torch.log(actions_probs) * deltas)
        
        # ==========(2.5) Update Switch network ======
        switch_targets = torch.empty(0,dtype=torch.float32)
        for reward in rewards:
            if (self.reward_max >= reward):
                self.reward_max = reward
                switch_target = torch.tensor([0,1],dtype=torch.float32).to(device)
                switch_targets = torch.cat([switch_targets, switch_target], dim=0)
            else:
                switch_target = torch.tensor([1,0],dtype=torch.float32).to(device)
                switch_targets = torch.cat([switch_targets, switch_target], dim=0)
        loss_s = sprobs - switch_targets
        
        # ========== (3) Calculate loss ===============
        loss = loss_pi + loss_v + loss_s
        
        self.optimizer_v.zero_grad()
        self.optimizer_pi.zero_grad()

        loss.backward(retain_graph=True)  

        self.optimizer_v.step()
        self.optimizer_pi.step()


In [5]:
episodes   = 5
batch_size = 64

env = G4env(batch_size)
agent = AgentSingleNet()

reward_lis = []
angles  = []

for episode in range(episodes):
    state = env.reset()
    total_reward = 0
    count = 0
    print(episode)
    while 1:
        action, prob, switch, switch_prob = agent.get_action(state)
        next_state, reward, done = env.step(action, switch)

        total_reward += reward.sum()
        count += 1

        agent.update(state, prob, reward, next_state, done)
        state = next_state
        angles.append(env.angle[0])

        if count > 100:
            break
        if done.sum() > 0:
            break
    
    reward_lis.append(total_reward.item() / batch_size)

    if episode % 10 == 0:
        # print("episode: ",episode,"step: ", count, "probs: ", prob, "actions: ", action, "angle: ", env.angle)
        print("reward: ",reward)
        # torch.save(agent, "model03/actor-critic" + str(episode))
        

0


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:1! (when checking arugment for argument tensors in method wrapper__cat)