In [33]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
from environment import *
from mix_state_env import MixStateEnv
from config import *
import copy
from MyGlobal import MyGlobals
from itertools import count
from torch.distributions import Categorical


In [34]:
# Computation is pinned to the CPU. To opt into a GPU when one is available,
# swap in the CUDA-aware variant instead:
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

# Input/output widths shared by the actor and critic networks. NUM_STATE and
# NUM_ACTION are not defined in this notebook (presumably they come from the
# star-import of `config` -- TODO confirm).
state_size, action_size = NUM_STATE, NUM_ACTION

# NOTE(review): `train` builds its Adam optimizers without an explicit learning
# rate, so this value is not consumed by the cells visible here.
lr = 1e-3


In [35]:
class Actor(nn.Module):
    """Policy network mapping a state vector to a distribution over actions.

    Architecture: state_size -> 128 -> 256 -> action_size, with ReLU after the
    two hidden layers.

    Args:
        state_size: Number of input features per state.
        action_size: Number of discrete actions (size of the output layer).
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear1 = nn.Linear(self.state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, self.action_size)

    def forward(self, state):
        """Return a ``Categorical`` over actions for ``state``.

        Args:
            state: Float tensor whose last dimension has ``state_size`` entries.

        Returns:
            torch.distributions.Categorical parameterized by the network's
            unnormalized scores (logits) along the last dimension.
        """
        output = F.relu(self.linear1(state))
        output = F.relu(self.linear2(output))
        logits = self.linear3(output)
        # Hand the raw scores to Categorical instead of softmax probabilities.
        # When built from probs, Categorical derives log-probabilities from
        # probabilities clamped to machine epsilon, so for a near-deterministic
        # policy log_prob saturates (about -16 in float32) and its gradient
        # vanishes. Logits are normalized with log-sum-exp, which stays exact.
        # `.probs` still yields softmax(logits), so sampling is unchanged.
        return Categorical(logits=logits)


class Critic(nn.Module):
    """State-value network: state_size -> 128 -> 256 -> 1 with ReLU hidden layers.

    Args:
        state_size: Number of input features per state.
        action_size: Accepted so the constructor mirrors ``Actor``; it is not
            used by this network.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        self.state_size = state_size
        # Layers are created in the same order as before so parameter
        # initialization consumes the random stream identically.
        self.linear1 = nn.Linear(state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, 1)

    def forward(self, state):
        """Return the value estimate for ``state`` (last dimension of size 1)."""
        hidden = state
        for layer in (self.linear1, self.linear2):
            hidden = F.relu(layer(hidden))
        return self.linear3(hidden)

In [36]:
def compute_returns(next_value, rewards, masks, gamma=0.99):
    """Compute discounted returns for a rollout, bootstrapped from ``next_value``.

    Walking the rollout backwards, each return is
    ``rewards[t] + gamma * R_next * masks[t]`` where ``R_next`` starts at
    ``next_value``; a mask of 0 therefore cuts off everything after step ``t``.

    Args:
        next_value: Bootstrap value used for the step after the last reward.
        rewards: Per-step rewards, oldest first.
        masks: Per-step multipliers indexed like ``rewards``.
        gamma: Discount factor.

    Returns:
        A list of returns in the same order as ``rewards`` (empty if
        ``rewards`` is empty).
    """
    discounted = []
    running = next_value
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running * masks[idx]
        discounted.append(running)
    # Collected newest-first; flip back to chronological order.
    discounted.reverse()
    return discounted



In [37]:
def train(actor, critic, num_iters, num_episodes, duration, gamma, env):
    """Train ``actor`` and ``critic`` on ``env`` with chunked advantage updates.

    For every episode the environment is stepped with actions sampled from the
    actor. Transitions are gathered in chunks: a chunk ends when the episode is
    done or once the step index within the chunk exceeds ``duration`` (i.e.
    after at most ``duration + 2`` steps). After each chunk, returns are
    bootstrapped from ``critic(next_state)`` via ``compute_returns`` and both
    networks take one Adam step:

    * actor loss  = ``-(log_prob * advantage.detach()).mean()``
    * critic loss = ``advantage.pow(2).mean()``

    Side effects:
        Writes ``RESULT_DIR + MyGlobals.folder_name + "exploit_rate.csv"``.
        The header is ``1,...,NUM_ACTION``; each following row belongs to one
        episode and counts how often the sampled action was the k-th most
        probable action under the policy. The same row is printed, along with
        the final step index and score of each finished episode.

    Args:
        actor: Module returning a distribution exposing ``sample``, ``probs``
            and ``log_prob``.
        critic: Module returning a value estimate for a state.
        num_iters: Number of outer iterations; ``env.replay()`` is called at
            the start of each one.
        num_episodes: Episodes per iteration.
        duration: Step-index threshold that ends a rollout chunk early.
        gamma: Discount factor forwarded to ``compute_returns``.
        env: Environment providing ``replay``, ``reset``, ``step`` and an
            ``old_avg_reward`` attribute (reported as the episode score).

    Returns:
        None. Training stops immediately, skipping the pending update and the
        current episode's CSV row, if an episode ends with
        ``env.old_avg_reward < -1500``.
    """
    optimizerA = optim.Adam(actor.parameters())
    optimizerC = optim.Adam(critic.parameters())

    csv_path = RESULT_DIR + MyGlobals.folder_name + "exploit_rate.csv"
    # The context manager closes (and flushes) the CSV on every exit path,
    # including the early ``return`` below and any exception raised mid-run.
    with open(csv_path, "w") as exploit_rate_files:
        header = ','.join(str(rank) for rank in range(1, NUM_ACTION + 1))
        exploit_rate_files.write(header + '\n')

        for iteration in range(num_iters):
            env.replay()
            for episode in range(num_episodes):
                state = env.reset()
                done = False
                # count_exploit[k]: times the sampled action was the
                # (k+1)-th most probable one during this episode.
                count_exploit = [0] * NUM_ACTION

                while not done:
                    log_probs = []
                    values = []
                    rewards = []
                    masks = []
                    for step in count():
                        state = torch.FloatTensor(state).to(device)
                        dist, value = actor(state), critic(state)

                        action = dist.sample()
                        # Actions ordered from most to least probable; the
                        # position of the sampled action is its rank.
                        ranked_actions = torch.topk(
                            dist.probs.flatten(), NUM_ACTION).indices.tolist()
                        count_exploit[ranked_actions.index(action.item())] += 1

                        next_state, reward, done, _ = env.step(
                            action.cpu().numpy())

                        log_probs.append(dist.log_prob(action).unsqueeze(0))
                        values.append(value)
                        rewards.append(torch.tensor(
                            [reward], dtype=torch.float, device=device))
                        # Mask is 0 on the terminal step so no value is
                        # bootstrapped past the end of the episode.
                        masks.append(torch.tensor(
                            [1 - done], dtype=torch.float, device=device))

                        state = next_state

                        if done:
                            if env.old_avg_reward < -1500:
                                # Score collapsed: abort the whole run.
                                return
                            print(step)
                            print('Episode: {}, Score: {}'.format(
                                episode, env.old_avg_reward))
                            break

                        if step > duration:
                            break

                    next_state = torch.FloatTensor(next_state).to(device)
                    next_value = critic(next_state)
                    returns = compute_returns(
                        next_value, rewards, masks, gamma=gamma)

                    log_probs_cat = torch.cat(log_probs)
                    # Returns are regression targets; keep them out of the graph.
                    returns_cat = torch.cat(returns).detach()
                    values_cat = torch.cat(values)

                    advantage = returns_cat - values_cat

                    actor_loss = -(log_probs_cat * advantage.detach()).mean()
                    critic_loss = advantage.pow(2).mean()

                    optimizerA.zero_grad()
                    optimizerC.zero_grad()
                    actor_loss.backward()
                    critic_loss.backward()
                    optimizerA.step()
                    optimizerC.step()

                tempstr = ','.join([str(elem) for elem in count_exploit])
                exploit_rate_files.write(tempstr + "\n")
                print(tempstr)

In [38]:
def test(actor, critic, num_episodes, env):
    if (env.old_avg_reward < -1500):
        return
    for episode in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            state = torch.FloatTensor(state).to(device)
            dist, value = actor(state), critic(state)

            action = dist.probs.argmax()
            next_state, reward, done, _ = env.step(action.cpu().numpy())

            state = next_state

        print('Test Episode: {}, Score: {}'.format(episode, env.old_avg_reward))

# MyGlobals.folder_name = "Actor_Critic/dur10_g_0_99/" + '1' +'/'

In [39]:
def runAC(i, dur, gamma, num_iters=9, num_episodes=121, test_episodes=31,
          seed=123):
    """Build a fresh environment and actor-critic pair, then train and evaluate.

    Results are written under ``MyGlobals.folder_name``, which this function
    sets to ``test/gamma{gamma}/dur{dur}/{i}/``.

    Args:
        i: Run identifier; only used in the output folder name.
        dur: Forwarded to ``train`` as ``duration``.
        gamma: Discount factor forwarded to ``train``.
        num_iters: Outer training iterations. An earlier inline note listed
            9, 10, 12, 15, 20, 22 (presumably values tried here -- confirm).
        num_episodes: Training episodes per iteration.
        test_episodes: Number of evaluation episodes run after training.
        seed: Seed passed to ``env.seed``. NOTE(review): torch's own RNG is
            not seeded here, so weight init and action sampling may still
            vary between runs.

    Returns:
        None.
    """
    MyGlobals.folder_name = f"test/gamma{gamma}/dur{dur}/{i}/"
    env = MixStateEnv()
    env.seed(seed)
    actor = Actor(state_size, action_size).to(device)
    critic = Critic(state_size, action_size).to(device)
    train(actor, critic, num_iters=num_iters, num_episodes=num_episodes,
          duration=dur, gamma=gamma, env=env)
    test(actor, critic, num_episodes=test_episodes, env=env)

In [40]:
# Launch one training + evaluation run: run id 1, duration 30, discount 0.99.
runAC(i=1, dur=30, gamma=0.99)

[WinError 183] Cannot create a file when that file already exists: 'd:\\Lab\\RL\\AODAI_RL\\result/result3/test/gamma0.99/dur30/1/'
[186, 131, 79, 68, 153, 92, 91]
31
Episode: 0, Score: -1.4989184282635148
194,155,134,100,87,74,56
[437, 317, 4, 9, 30, 1, 2]
31
Episode: 1, Score: -23.46053028076649
616,141,30,5,7,0,1
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 2, Score: -159.69988462197796
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 3, Score: -348.8521214569705
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 4, Score: -538.7966152687542
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 5, Score: -728.9677755380002
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 6, Score: -918.4326162781306
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 7, Score: -1107.4870741512743
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 8, Score: -1297.4434487570393
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
31
Episode: 9, Score: -1487.5866542185304
800,0,0,0,0,0,0
[0, 800, 0, 0, 0, 0, 0]
