In [4]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
from environment import *
from mix_state_env import MixStateEnv
from config import *
import copy
from MyGlobal import MyGlobals
from itertools import count
from torch.distributions import Categorical
import random
import math
from collections import namedtuple, deque
import pickle


In [5]:
device = torch.device("cpu")
result_train=list()
result_test=list()
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

state_size = NUM_STATE
action_size = NUM_ACTION
lr = 0.001
eps_start=0.9
eps_end=0.01
eps_decay=1000


class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
        self.Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))
    def push(self, *args):
        self.memory.append(self.Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
replay = ReplayMemory(10000)
class DQNnet(nn.Module):
    def __init__(self,n_observations,n_actions):
        super(DQNnet,self).__init__()
        self.layer1=nn.Linear(n_observations,128)
        self.layer2=nn.Linear(128,128)
        self.layer3=nn.Linear(128,n_actions)
    def forward(self,x):
        x=F.relu(self.layer1(x))
        x=F.relu(self.layer2(x))
        return self.layer3(x)


In [15]:
class DQNAgent:
    def __init__(self) :
        self.env=MixStateEnv()
        self.optimize=0
        self.env.seed(123)
        self.batch_size=128
        self.eps_start=0.9
        self.eps_end=0.05
        self.eps_decay=1000
        self.tau=0.01
        self.lr=1e-4
        self.n_actions=NUM_ACTION
        self.n_observations=NUM_STATE
        self.policy_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.target_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.save_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.save_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer=optim.AdamW(self.policy_net.parameters(),lr=self.lr,amsgrad=True)
        self.memory=ReplayMemory(10000)
        self.stepdone=0
    def select_action(self,state):
            sample=random.random()
            eps_threshold=self.eps_end+(self.eps_start-self.eps_end)*math.exp(-1.*self.stepdone/self.eps_decay)
            self.stepdone+=0.05
            if sample > eps_threshold:
                with torch.no_grad():
                    return self.policy_net(state).max(-1)[1].view(1, 1)
            else:
                return torch.tensor([[random.randint(0,NUM_ACTION-1)]], device=device, dtype=torch.long)    

    def optimize_model(self,gamma):
            # self.start_time=time.time()
            if len(self.memory) < self.batch_size:
                return
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            
            non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                                batch.next_state)), device=device, dtype=torch.bool)
            non_final_next_states = torch.cat([s for s in batch.next_state
                                                        if s is not None])
            batch_state=[]
            for i in batch.state :
                batch_state.append(i.view(1,14))
            state_batch = torch.cat(batch_state)
            action_batch = torch.cat(batch.action)

           
            reward_batch = torch.cat(batch.reward)

            
            state_action_values = self.policy_net(state_batch).gather(1, action_batch)

            next_state_values = torch.zeros(self.batch_size, device=device)
            with torch.no_grad():
                next_state_values[non_final_mask] =self.target_net(non_final_next_states).max(1)[0]
                # next_state_values[non_final_mask] =self.policy_net(non_final_next_states).gather(1,self.target_net(non_final_next_states).max(1)[1].unsqueeze(0))
                
            # Compute the expected Q values
            expected_state_action_values = (next_state_values * gamma) + reward_batch
            

            # Compute Huber loss
            criterion = nn.SmoothL1Loss()
            loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

            # Optimize the model
            self.optimizer.zero_grad()
            loss.backward()
            # In-place gradient clipping
            torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
            self.optimizer.step()
            self.optimize+=1
            # self.end_time=time.time()
            # print(f"optimze_time:{self.end_time-self.start_time},times{self.optimize}")
    def train(self,num_iters,num_episodes,duration,gamma):
    
            for iter in range(num_iters):
                self.env.replay()
                for episode in range(num_episodes):
                    state = self.env.reset()
                    done = False
                    while not done:
                        for i in count():
                            state = torch.FloatTensor(state).to(device)
                            action = self.select_action(state)
                            action1=action.item()
                            next_state, reward, done, _= self.env.step(np.array(action1))
                            reward = torch.tensor([reward], device=device)
                            if done:
                                if (self.env.old_avg_reward < -1500):
                                    return
                                next_state = None
                                print('Episode: {}, Score: {}'.format(
                                    episode, self.env.old_avg_reward))
                                result_train.append(self.env.old_avg_reward*-800)

                                break
                            
                                
                            if (i > duration):
                                break
                            next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
                            self.memory.push(state, action, next_state, reward)
                            state = next_state

                            self.optimize_model(gamma)

             
                            target_net_state_dict = self.target_net.state_dict()
                            policy_net_state_dict = self.policy_net.state_dict()
                            save_net_state_dict=self.save_net.state_dict()
                            for key in policy_net_state_dict:
                                save_net_state_dict[key] = policy_net_state_dict[key]*self.tau + save_net_state_dict[key]*(1-self.tau)
                            self.save_net.load_state_dict(save_net_state_dict)
                        self.target_net.load_state_dict(save_net_state_dict)
                            

    def test(self,num_episodes):
            
            if (self.env.old_avg_reward < -1500):
                return
            for episode in range(num_episodes):
                state = self.env.reset()
                done = False

                while not done:
                    state = torch.FloatTensor(state).to(device)
                    action=action = self.select_action(state)
                    action1=action.item()
                    next_state, reward, done, _= self.env.step(np.array(action1))

                    state = next_state

                print('Test Episode: {}, Score: {}'.format(episode, self.env.old_avg_reward))
                result_test.append(self.env.old_avg_reward*-800)
    def runAC(self,i, dur,gamma):
    # MyGlobals.folder_name = "Actor_Critic_800_30s/dur" + str(dur) + "/" + str(i) +'/'
        MyGlobals.folder_name = f"test/gamma{gamma}/dur{dur}/{i}/"
        self.train(num_iters=9, num_episodes=121,
            duration=dur,gamma=gamma )
        self.test( num_episodes=60, )


In [16]:
Agent=DQNAgent()
Agent.runAC(1,100,0.99)

[WinError 183] Cannot create a file when that file already exists: 'd:\\Lab\\RL\\AODAI_RL\\result/result3/test/gamma0.99/dur100/1/'
[154, 121, 122, 112, 94, 103, 94]
Episode: 0, Score: -1.00588719618301
[188, 103, 95, 105, 102, 100, 107]
Episode: 1, Score: -0.7411724343102697
[162, 111, 109, 107, 105, 107, 99]
Episode: 2, Score: -0.49034550246658043
[170, 107, 110, 110, 107, 107, 89]
Episode: 3, Score: -0.24045627881850426
[195, 108, 108, 100, 113, 109, 67]
Episode: 4, Score: -0.28303912100907214
[198, 107, 106, 104, 96, 100, 89]
Episode: 5, Score: -0.30599773677113123
[179, 112, 109, 106, 114, 109, 71]
Episode: 6, Score: -0.1623039918787435
[168, 109, 115, 105, 107, 108, 88]
Episode: 7, Score: -0.19608544705228068
[186, 111, 104, 105, 105, 110, 79]
Episode: 8, Score: -0.21208893819795815
[192, 107, 111, 109, 107, 107, 67]
Episode: 9, Score: -0.2555821092043173
[171, 113, 110, 112, 111, 109, 74]
Episode: 10, Score: -0.22898323973507193
[192, 106, 108, 108, 110, 110, 66]
Episode: 11, Sc

In [17]:
with open('dqn_train5.pkl', 'wb') as file:
    pickle.dump(result_train, file)
with open('dqn_test5.pkl', 'wb') as file:
    pickle.dump(result_test, file)


In [None]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
from environment import *
from mix_state_env import MixStateEnv
from config import *
import copy
from MyGlobal import MyGlobals
from itertools import count
from torch.distributions import Categorical

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

state_size = NUM_STATE
action_size = NUM_ACTION
lr = 0.001
result_train1=list()
result_test1=list()

class Actor(nn.Module):
    def __init__(self, state_size, action_size):
        super(Actor, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.linear1 = nn.Linear(self.state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, self.action_size)

    def forward(self, state):
        output = F.relu(self.linear1(state))
        output = F.relu(self.linear2(output))
        output = self.linear3(output)
        distribution = Categorical(F.softmax(output, dim=-1))
        return distribution


class Critic(nn.Module):
    def __init__(self, state_size, action_size):
        super(Critic, self).__init__()
        self.state_size = state_size
        self.linear1 = nn.Linear(self.state_size, 128)
        self.linear2 = nn.Linear(128, 256)
        self.linear3 = nn.Linear(256, 1)

    def forward(self, state):
        output = F.relu(self.linear1(state))
        output = F.relu(self.linear2(output))
        value = self.linear3(output)
        return value


def compute_returns(next_value, rewards, masks, gamma=0.99):
    R = next_value
    returns = []
    for step in reversed(range(len(rewards))):
        R = rewards[step] + gamma * R * masks[step]
        returns.insert(0, R)
    return returns


def train(actor, critic, num_iters, num_episodes, duration, gamma, env):
    optimizerA = optim.Adam(actor.parameters())
    optimizerC = optim.Adam(critic.parameters())
    exploit_rate_files = open(
        RESULT_DIR + MyGlobals.folder_name + "exploit_rate.csv", "w")
    exploit_rate_files.write('1')
    for i in range(2, NUM_ACTION + 1):
        exploit_rate_files.write(',' + str(i))
    exploit_rate_files.write('\n')

    for iter in range(num_iters):
        env.replay()
        for episode in range(num_episodes):
            state = env.reset()
            done = False
            count_exploit = [0] * NUM_ACTION

            while not done:
                log_probs = []
                values = []
                rewards = []
                masks = []
                for i in count():
                    state = torch.FloatTensor(state).to(device)
                    dist, value = actor(state), critic(state)

                    action = dist.sample()
                    prob_dist = dist.probs
                    # print(type(prob_dist))
                    # print(prob_dist)
                    # print(action)
                    # print(torch.topk(prob_dist.flatten(), NUM_ACTION).indices.tolist())
                    # print(torch.topk(prob_dist.flatten(), NUM_ACTION).indices.tolist().index(action))
                    # print(torch.topk(
                    #     prob_dist.flatten(), NUM_ACTION))
                    # assert 2 == 3
                    count_exploit[torch.topk(
                        prob_dist.flatten(), NUM_ACTION).indices.tolist().index(action)] += 1
                    # if action == dist.probs.argmax():
                    #     count_exploit += 1
                    action1=action.cpu().numpy()
                    next_state, reward, done, _ = env.step(
                        action1)
                    log_prob = dist.log_prob(action).unsqueeze(0)

                    log_probs.append(log_prob)
                    values.append(value)
                    rewards.append(torch.tensor(
                        [reward], dtype=torch.float, device=device))
                    masks.append(torch.tensor(
                        [1-done], dtype=torch.float, device=device))

                    state = next_state

                    if done:
                        if (env.old_avg_reward < -1500):
                            return
                        print('Episode: {}, Score: {}'.format(
                            episode, env.old_avg_reward))
                        result_train1.append(env.old_avg_reward)

                        # print(dist.probs)
                        #print('Iteration: {}, Score: {}'.format(episode, i))
                        break

                    if (i > duration):
                        break

                next_state = torch.FloatTensor(next_state).to(device)
                next_value = critic(next_state)
                returns = compute_returns(
                    next_value, rewards, masks, gamma=gamma)

                log_probs_cat = torch.cat(log_probs)
                returns_cat = torch.cat(returns).detach()
                values_cat = torch.cat(values)

                advantage = returns_cat - values_cat

                actor_loss = -(log_probs_cat * advantage.detach()).mean()
                critic_loss = advantage.pow(2).mean()

                optimizerA.zero_grad()
                optimizerC.zero_grad()
                
                critic_loss.backward()
                actor_loss.backward()
                optimizerA.step()
                optimizerC.step()
                

            tempstr = ','.join([str(elem) for elem in count_exploit])
            exploit_rate_files.write(tempstr+"\n")
            # exploit_rate_files.write('{}\n'.format(count_exploit))
            print(tempstr)

    exploit_rate_files.close()


def test(actor, critic, num_episodes, env):
    if (env.old_avg_reward < -1500):
        return
    for episode in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            state = torch.FloatTensor(state).to(device)
            dist, value = actor(state), critic(state)

            action = dist.probs.argmax()
            next_state, reward, done, _ = env.step(action.cpu().numpy())

            state = next_state

        print('Test Episode: {}, Score: {}'.format(episode, env.old_avg_reward))
        result_test1.append(env.old_avg_reward)

# MyGlobals.folder_name = "Actor_Critic/dur10_g_0_99/" + '1' +'/'


def runAC(i, dur, gamma):
    # MyGlobals.folder_name = "Actor_Critic_800_30s/dur" + str(dur) + "/" + str(i) +'/'
    MyGlobals.folder_name = f"test/gamma{gamma}/dur{dur}/{i}/"
    env = MixStateEnv()
    env.seed(123)
    actor = Actor(state_size, action_size).to(device)
    critic = Critic(state_size, action_size).to(device)
    # 9, 10, 12, 15, 20, 22 
    train(actor, critic, num_iters=9, num_episodes=121,
          duration=dur, gamma=gamma, env=env)
    test(actor, critic, num_episodes=100, env=env)


runAC(1, 30, 0.99)

KeyboardInterrupt: 

In [None]:
with open('actor_train0.pkl', 'wb') as file:
    pickle.dump(result_train1, file)
with open('actor_test0.pkl', 'wb') as file:
    pickle.dump(result_test1, file)

In [21]:
import numpy as np
import torch
import gym
from matplotlib import pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
from environment import *
from mix_state_env import MixStateEnv
from config import *
import copy
from MyGlobal import MyGlobals
from itertools import count
from torch.distributions import Categorical
import random
import math
from collections import namedtuple, deque
import pickle

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
result_train2=list()
result_test2=list()
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

state_size = NUM_STATE
action_size = NUM_ACTION
lr = 0.001
eps_start=0.9
eps_end=0.05
eps_decay=1000


class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
        self.Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward'))
    def push(self, *args):
        self.memory.append(self.Transition(*args))
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)
    def __len__(self):
        return len(self.memory)
replay = ReplayMemory(10000)
class DQNnet(nn.Module):
    def __init__(self,n_observations,n_actions):
        super(DQNnet,self).__init__()
        self.layer1=nn.Linear(n_observations,128)
        self.layer2=nn.Linear(128,128)
        self.layer3=nn.Linear(128,n_actions)
    def forward(self,x):
        x=F.relu(self.layer1(x))
        x=F.relu(self.layer2(x))
        return self.layer3(x)

class DQNAgent:
    def __init__(self) :
        self.env=MixStateEnv()
        self.optimize=0
        self.env.seed(123)
        self.batch_size=128
        self.eps_start=0.9
        self.eps_end=0.05
        self.eps_decay=1000
        self.tau=0.1
        self.lr=1e-4
        self.n_actions=NUM_ACTION
        self.n_observations=NUM_STATE
        self.policy_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.target_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.save_net=DQNnet(self.n_observations,self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.save_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer=optim.AdamW(self.policy_net.parameters(),lr=self.lr,amsgrad=True)
        self.memory=ReplayMemory(10000)
        self.stepdone=0
    def select_action(self,state):
            sample=random.random()
            eps_threshold=self.eps_end+(self.eps_start-self.eps_end)*math.exp(-1.*self.stepdone/self.eps_decay)
            self.stepdone+=0.05
            if sample > eps_threshold:
                with torch.no_grad():
                    return self.policy_net(state).max(-1)[1].view(1, 1)
            else:
                return torch.tensor([[random.randint(0,NUM_ACTION-1)]], device=device, dtype=torch.long)    

    def optimize_model(self,gamma):
            # self.start_time=time.time()
            if len(self.memory) < self.batch_size:
                return
            transitions = self.memory.sample(self.batch_size)
            batch = Transition(*zip(*transitions))

            
            non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                                batch.next_state)), device=device, dtype=torch.bool)
            non_final_next_states = torch.cat([s for s in batch.next_state
                                                        if s is not None])
            batch_state=[]
            for i in batch.state :
                batch_state.append(i.view(1,14))
            state_batch = torch.cat(batch_state)
            action_batch = torch.cat(batch.action)

           
            reward_batch = torch.cat(batch.reward)

            
            state_action_values = self.policy_net(state_batch).gather(1, action_batch)

            next_state_values = torch.zeros(self.batch_size, device=device)
            with torch.no_grad():
                 next_state_values[non_final_mask] =self.policy_net(non_final_next_states).gather(1,self.target_net(non_final_next_states).max(1)[1].unsqueeze(0))
                # next_state_values[non_final_mask] =self.policy_net(non_final_next_states).gather(1,self.target_net(non_final_next_states).max(1)[1].unsqueeze(0))
                
            # Compute the expected Q values
            expected_state_action_values = (next_state_values * gamma) + reward_batch
            

            # Compute Huber loss
            criterion = nn.SmoothL1Loss()
            loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

            # Optimize the model
            self.optimizer.zero_grad()
            loss.backward()
            # In-place gradient clipping
            torch.nn.utils.clip_grad_value_(self.policy_net.parameters(), 100)
            self.optimizer.step()
            self.optimize+=1
            # self.end_time=time.time()
            # print(f"optimze_time:{self.end_time-self.start_time},times{self.optimize}")
    def train(self,num_iters,num_episodes,duration,gamma):
    
            for iter in range(num_iters):
                self.env.replay()
                for episode in range(num_episodes):
                    state = self.env.reset()
                    done = False
                    while not done:
                        for i in count():
                            state = torch.FloatTensor(state).to(device)
                            action = self.select_action(state)
                            action1=action.item()
                            next_state, reward, done, _= self.env.step(np.array(action1))
                            reward = torch.tensor([reward], device=device)
                            if done:
                                if (self.env.old_avg_reward < -1500):
                                    return
                                next_state = None
                                print('Episode: {}, Score: {}'.format(
                                    episode, self.env.old_avg_reward))
                                result_train2.append(self.env.old_avg_reward*-800)

                                break
                            
                                
                            if (i > duration):
                                break
                            next_state = torch.tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)
                            self.memory.push(state, action, next_state, reward)
                            state = next_state

                            self.optimize_model(gamma)

             
                            
                            target_net_state_dict = self.target_net.state_dict()
                            policy_net_state_dict = self.policy_net.state_dict()
                            for key in policy_net_state_dict:
                                target_net_state_dict[key] = policy_net_state_dict[key]*self.tau + target_net_state_dict[key]*(1-self.tau)
                            self.target_net.load_state_dict(target_net_state_dict)
                            

    def test(self,num_episodes):
            
            if (self.env.old_avg_reward < -1500):
                return
            for episode in range(num_episodes):
                state = self.env.reset()
                done = False

                while not done:
                    state = torch.FloatTensor(state).to(device)
                    action=action = self.select_action(state)
                    action1=action.item()
                    next_state, reward, done, _= self.env.step(np.array(action1))

                    state = next_state

                print('Test Episode: {}, Score: {}'.format(episode, self.env.old_avg_reward))
                result_test2.append(self.env.old_avg_reward*-800)
    def runAC(self,i, dur,gamma):
    # MyGlobals.folder_name = "Actor_Critic_800_30s/dur" + str(dur) + "/" + str(i) +'/'
        MyGlobals.folder_name = f"test/gamma{gamma}/dur{dur}/{i}/"
        self.train(num_iters=9, num_episodes=121,
            duration=dur,gamma=gamma )
        self.test( num_episodes=60, )

Agent=DQNAgent()
Agent.runAC(1,30,0.99)


[WinError 183] Cannot create a file when that file already exists: 'd:\\Lab\\RL\\AODAI_RL\\result/result3/test/gamma0.99/dur30/1/'
[154, 101, 113, 101, 110, 118, 103]
Episode: 0, Score: -0.38440640876666116
[183, 97, 95, 104, 118, 112, 91]
Episode: 1, Score: -0.6896677323550872
[166, 108, 111, 112, 110, 99, 94]
Episode: 2, Score: -0.6803602836653444
[175, 101, 108, 104, 106, 111, 95]
Episode: 3, Score: -0.2465062251250613
[158, 114, 109, 110, 107, 108, 94]
Episode: 4, Score: -0.2184031434569128
[191, 105, 107, 101, 107, 109, 80]
Episode: 5, Score: -0.4748302476152162
[178, 109, 107, 109, 109, 112, 76]
Episode: 6, Score: -0.17972829133060972
[175, 106, 112, 107, 120, 111, 69]
Episode: 7, Score: -0.23915977583565698
[184, 112, 108, 109, 99, 107, 81]
Episode: 8, Score: -0.2099395370307423
[172, 106, 106, 103, 111, 109, 93]
Episode: 9, Score: -0.1288124406949275
[183, 109, 112, 111, 110, 110, 65]
Episode: 10, Score: -0.11486622890789194
[193, 108, 107, 106, 110, 109, 67]
Episode: 11, Score

KeyboardInterrupt: 

In [None]:
with open('dqn_train6.pkl', 'wb') as file:
    pickle.dump(result_train2, file)
with open('dqn_test6.pkl', 'wb') as file:
    pickle.dump(result_test2, file)