# Imports

In [None]:
!pip install box2d-py
!pip install gym[box2d]
!pip install ma_gym



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import gym
from gym import spaces
from google.colab import widgets
import time
from mpl_toolkits import mplot3d
import copy
import random
import gym

# Actor-Critic (AC) - Acrobot 🌎

## NN Definition (Our Q-Approximation)

In [None]:
class Net(nn.Module):
    """Two-layer MLP critic: maps a state vector to one Q-value per action.

    Args:
        state_dim: Number of input features. Defaults to 6, the state size
            this notebook feeds in for Acrobot.
        hidden_dim: Width of the single hidden layer. Defaults to 100.
        n_actions: Number of outputs (one Q-value per action). Defaults to 3.
    """

    def __init__(self, state_dim=6, hidden_dim=100, n_actions=3):
        super().__init__()
        # Attribute names stay fc1/fc2 so saved state_dicts keep the same keys.
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, n_actions)

    def forward(self, x):
        """Return raw, unbounded Q-values; the last dimension of x must be state_dim."""
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


## NN Definition (Actor)

In [None]:
class ActorNet(nn.Module):
    """Policy network: 6 inputs -> 200 -> 100 -> 50 -> 3 action probabilities."""

    def __init__(self):
        super().__init__()
        # Three ReLU hidden layers followed by a 3-way output layer.
        self.fc1 = nn.Linear(6, 200)
        self.fc2 = nn.Linear(200, 100)
        self.fc3 = nn.Linear(100, 50)
        self.fc4 = nn.Linear(50, 3)

    def forward(self, x):
        """Return a probability distribution over the 3 actions (softmax on the last dim)."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        logits = self.fc4(hidden)
        return F.softmax(logits, dim=-1)


## Actor Agent

In [None]:
#Agent that samples its actions from the actor's output distribution
class ActorAgent():
  """Stochastic policy agent.

  Args:
    Actor: Callable that maps one (unbatched) state tensor to a 1-D tensor of
      action weights/probabilities, one entry per action.
    action_space: Action space object; only its `n` attribute is read here
      (in __str__).
  """

  def __init__(self, Actor, action_space):
    self.Actor = Actor
    self.action_space = action_space

  def get_action(self, state):
    """Sample and return one action index (int) for a single state.

    The index is drawn with Python's `random` module, weighted by the actor's
    output, so seeding `random` makes the rollout reproducible.
    """
    # Acting does not need gradients; skip building an autograd graph.
    with torch.no_grad():
      probs = self.Actor(state)
    weights = [float(p) for p in probs]
    return int(random.choices(range(len(weights)), weights)[0])

  def __str__(self):
    # The actor is a function approximator, not a table, so it has no len();
    # only the number of actions is reported.
    return f'This agent samples actions from a policy network with {self.action_space.n} possible actions'

## Greedy-EPS Agent 

In [None]:
#Agent that uses the epsilon-greedy approach
class GreedyEpsAgent():
  """Epsilon-greedy agent on top of a Q-function.

  Args:
    eps: Probability of taking a random action instead of the greedy one.
    Q: Callable mapping a state tensor to a tensor of Q-values.
    action_space: Action space object providing `sample()` and `n`.
  """

  def __init__(self, eps, Q, action_space):
    self.eps = eps
    self.Q = Q
    self.action_space = action_space

  def get_action(self, state):
    """Return a random action with probability eps, else the argmax of Q(state)."""
    if np.random.uniform(0, 1) < self.eps:
      return self.action_space.sample()
    # Greedy action selection needs no gradients.
    with torch.no_grad():
      return int(torch.argmax(self.Q(state)))

  def update_eps(self, eps):
    """Replace the exploration rate."""
    self.eps = eps

  def __str__(self):
    # Q is a function approximator without a len(), so no state count is reported.
    return f'This agent acts epsilon-greedily (eps={self.eps}) over {self.action_space.n} possible actions'

## Actor-Critic: Acrobot Training

In [None]:
#Actor-Critic: Applied to Acrobot
#The actor (ActorNet) picks the actions; the critic (Net) is trained with a
#DQN-style target network and replay buffer and supplies the Q-values that
#weight the actor's log-probabilities.

#Initialize HyperParameters
params = {"copyInterval": 100, "lr": 0.001, "actorLr": 0.000001, "momentum": 0.9, "gamma": 0.99, "epsilon" : 1, "endEpsilon": 0.01, "maxReplay": 256, "miniBatchSize": 10, "numEpisodes": 2000}


#Initialize Environment
env = gym.make('Acrobot-v1')

#Create list for total rewards
totalRewards = []

#Create list for average of last ten rewards (not filled by the loop below)
averages = []

#Initialize Networks
Q = Net()
Q_Target = copy.deepcopy(Q)
copyInterval = params["copyInterval"]
step = 0

#Initialize Network (Actor)
Actor = ActorNet()

#Define the critic loss and optimizer
criterion = nn.MSELoss()
optimizerQ = optim.Adam(Q.parameters(), lr=params["lr"])

#Define optimizer for actor
optimizerA = optim.Adam(Actor.parameters(), lr=params["actorLr"])

#Define Discount Factor (gamma)
gamma = params["gamma"]

#Initialize Greedy Epsilon Agent for Q (not used for acting in the loop below)
epsilon = params["epsilon"]
endEpsilon = params["endEpsilon"]
agent = GreedyEpsAgent(epsilon, Q, env.action_space)

#Initialize agent for Actor
AAgent = ActorAgent(Actor, env.action_space)

#Define Experience Replay list, its max size, and the mini batch size
replay = []
maxReplay = params["maxReplay"]
miniBatchSize = params["miniBatchSize"]

#Number of training episodes
numEpisodes = params["numEpisodes"]
episode = 0

#Loop through episodes, training actor and critic from replayed transitions
while(episode < numEpisodes):
  s = env.reset()
  done = False

  rTracker = 0
  while not done:
    step += 1

    #Take action sampled from the Actor
    a = AAgent.get_action(torch.tensor(s).float())
    sP, r, done, info = env.step(a)
    r = np.clip(r, -1, 1)

    #Take random minibatch for Actor and Critic
    minibatch = random.sample(replay, min(len(replay), miniBatchSize))

    if len(minibatch) > 2:
      batchSize = len(minibatch)

      #Break up minibatch into the different columns of values
      #(stacking through np.array avoids the slow tuple-of-ndarrays conversion)
      columns = list(zip(*minibatch))
      states = torch.tensor(np.array(columns[0])).float().reshape(batchSize, -1)
      actions = torch.tensor(columns[1])
      rewards = torch.tensor(columns[2], dtype=torch.float32)
      newStates = torch.tensor(np.array(columns[3])).float().reshape(batchSize, -1)
      #Stored flag is 1 for non-terminal transitions and 0 for terminal ones
      notDone = torch.tensor(columns[4]).float()

      #TD targets come from the frozen target network; no gradient is needed
      with torch.no_grad():
        targets = rewards + gamma*torch.max(Q_Target(newStates), 1)[0]*notDone

      #Critic estimate for the actions that were actually taken
      qTaken = torch.gather(Q(states), 1, actions).reshape(batchSize)

      #Log-probability of the taken actions under the current policy;
      #the clamp keeps log() finite if a probability underflows to 0
      takenProbs = torch.gather(Actor(states), 1, actions).reshape(batchSize)
      logProbs = torch.log(torch.clamp(takenProbs, min=1e-8))

      #Calculate loss and gradients (actor)
      #detach(): the critic value is only a weight here, so the actor loss
      #must not back-propagate into the critic's parameters
      optimizerA.zero_grad()
      lossA = -1*torch.mean(torch.mul(logProbs, qTaken.detach()))
      lossA.backward()
      optimizerA.step()

      #Calculate loss and gradients (critic)
      optimizerQ.zero_grad()
      lossQ = criterion(qTaken, targets)
      lossQ.backward()
      optimizerQ.step()


    #Storing in replay (oldest transition is dropped once the buffer is full)
    d = int(not done)
    replay.append([s,[a],r,sP, d])
    if len(replay) > maxReplay:
      replay.pop(0)
    s = sP
    rTracker += r
  #Updating target Q if necessary. This check runs once per episode, so the
  #sync happens at the first episode boundary after more than copyInterval steps.
  if step > copyInterval:
    step  = 0
    Q_Target = copy.deepcopy(Q)

  #Increment Episode
  episode += 1
  #Append new Reward to totalRewards
  totalRewards.append(rTracker)


#Create a fresh Acrobot environment and reset it
env = gym.make('Acrobot-v1')
obs = env.reset()

#Run one episode, sampling every action from the trained actor
done = False
while not done:
  a = AAgent.get_action(torch.tensor(obs).float())
  transition = env.step(a)
  obs, reward, done, info = transition


#Plot the total (clipped) reward collected in each training episode
episodeAxis = list(range(len(totalRewards)))
plt.plot(episodeAxis, totalRewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.show()

# Traffic Junction 4 🎮 🎮 🎮 🎮

### Greedy Eps Agent

In [None]:
#Agent that uses the epsilon-greedy approach (multi-agent action space)
class GreedyEpsAgent2():
  """Epsilon-greedy agent for an environment with a joint action space.

  Args:
    eps: Probability of taking a random action instead of the greedy one.
    Q: Callable mapping a state tensor to a tensor of Q-values.
    action_space: Joint action space whose `sample()` returns an indexable
      collection of actions.
  """

  def __init__(self, eps, Q, action_space):
    self.eps = eps
    self.Q = Q
    self.action_space = action_space

  def get_action(self, state):
    """Return a random action with probability eps, else the argmax of Q(state)."""
    if np.random.uniform(0, 1) < self.eps:
      # Only the first entry of the sampled joint action is used.
      # NOTE(review): presumably all agents share the same action set - confirm.
      return self.action_space.sample()[0]
    # Greedy action selection needs no gradients.
    with torch.no_grad():
      return int(torch.argmax(self.Q(state)))

  def update_eps(self, eps):
    """Replace the exploration rate."""
    self.eps = eps

  def _num_actions(self):
    """Number of actions: `n` of the space itself, or of its first member."""
    space = self.action_space
    if hasattr(space, 'n'):
      return space.n
    return space[0].n

  def __str__(self):
    # Q is a function approximator without a len(), so no state count is reported.
    return f'This agent acts epsilon-greedily (eps={self.eps}) over {self._num_actions()} possible actions'

### NN Definition (Our Q-Approximation)

In [None]:
class Switch4Net(nn.Module):
    """Q-network over a joint observation of nAgents*81 features, with 2 outputs."""

    def __init__(self, nAgents):
        super().__init__()
        jointObsSize = nAgents*81
        self.fc1 = nn.Linear(jointObsSize, 100)
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 2)

    def forward(self, x):
        """Return raw Q-values; the last dimension of x must be nAgents*81."""
        first = F.relu(self.fc1(x))
        second = F.relu(self.fc2(first))
        return self.fc3(second)


### Training

In [None]:
#Independent DQN: Applied to the ma_gym Traffic Junction environment
#Every agent owns a Q-network, target network, optimizer and replay buffer;
#all of them are fed the same flattened observation of all agents.

#Initialize Environment
env = gym.make('ma_gym:TrafficJunction4-v0')
numAgents = 4

#Create list for total rewards
totalRewards = [[] for i in range(numAgents)]

#Initialize Networks
Qs = [Switch4Net(numAgents) for i in range(numAgents)]
Q_Targets = [copy.deepcopy(Q) for Q in Qs]
copyInterval = 500
step = 0

#Define the loss function (the update below uses mean absolute error directly;
#criterion is kept for callers but not used in this loop)
criterion = nn.MSELoss()
optimizers = [optim.Adam(Q.parameters(), lr=0.00001) for Q in Qs]

#Define Discount Factor (gamma)
gamma = 0.9

#Initialize Greedy Epsilon Agent for Q
epsilon = 1
endEpsilon = 0.01
agents = [GreedyEpsAgent2(epsilon, Q, env.action_space) for Q in Qs]

#Define Experience Replay list, its max size, and the mini batch size
replays = [[] for i in range(numAgents)]
maxReplay = 512
miniBatchSize = 52

#Number of training episodes
numEpisodes = 1000
episode = 0

#Loop through episodes, training every agent with Q-Learning
while(episode < numEpisodes):
  s = env.reset()
  s = [element for sublist in s for element in sublist]
  done = [False for i in range(env.n_agents)]

  rTracker = [0 for i in range(numAgents)]
  while not all(done):
    step += 1

    #Each agent picks its action epsilon-greedily from its own Q-network
    a = [agent.get_action(torch.tensor([[s]]).float()) for agent in agents]
    sP, r, done, info = env.step(a)
    sP = [element for sublist in sP for element in sublist]

    for i in range(numAgents):
      minibatch = random.sample(replays[i], min(len(replays[i]), miniBatchSize))
      if len(minibatch) > 3:
        batchSize = len(minibatch)

        #Break up minibatch into the different columns of values
        columns = list(zip(*minibatch))
        states = torch.reshape(torch.tensor(columns[0]).float(),[batchSize, 81*numAgents])
        actions = torch.tensor(columns[1])
        rewards = torch.tensor(columns[2]).float()
        newStates = torch.reshape(torch.tensor(columns[3]).float(),[batchSize, 81*numAgents])
        #1 for non-terminal transitions, 0 where the stored done flag is set
        notDone = 1.0 - torch.tensor(columns[4]).float()

        #Select this agent's networks (local names, so the module-level
        #Q / Q_Target of other cells are not overwritten)
        agentQ = Qs[i]
        agentTarget = Q_Targets[i]

        #TD target: no bootstrapping from terminal transitions, and no
        #gradient through the frozen target network
        with torch.no_grad():
          targets = rewards + gamma*torch.max(agentTarget(newStates), 1)[0]*notDone
        outputs = torch.reshape(torch.gather(agentQ(states), 1, actions), [batchSize])

        #Calculate loss and gradients (mean absolute TD error)
        optimizers[i].zero_grad()
        loss = torch.mean(torch.abs(outputs - targets))
        loss.backward()

        optimizers[i].step()

    #Storing in replay (oldest transition is dropped once a buffer is full)
    for i,replay in enumerate(replays):
      replay.append([s,[a[i]],r[i],sP,done[i]])
      if len(replay) > maxReplay:
        replay.pop(0)

    #Updating target Q if necessary
    if step % copyInterval == 0:
      Q_Targets = [copy.deepcopy(Q) for Q in Qs]

    s = sP
    for i in range(numAgents):
      rTracker[i] = rTracker[i] + r[i]

  episode += 1
  for i in range(numAgents):
    totalRewards[i].append(rTracker[i])

  #Update epsilons for greedy-eps: exponential decay from epsilon to endEpsilon
  for agent in agents:
      agent.update_eps(epsilon*(endEpsilon/epsilon)**(episode/numEpisodes))


#Sum every episode's reward over all agents
rs = [sum(agentRewards[ep] for agentRewards in totalRewards) for ep in range(numEpisodes)]

plt.plot(rs)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.show()

#Run one fully greedy episode (eps = 0) and show the final frame
for agent in agents:
  agent.update_eps(0)
s = env.reset()
s = [value for agentObs in s for value in agentObs]
done = [False]*env.n_agents
while not all(done):
  a = [agent.get_action(torch.tensor([[s]]).float()) for agent in agents]
  s, r, done, info = env.step(a)
  s = [value for agentObs in s for value in agentObs]
plt.imshow(env.render('rgb_array'))

# Checkers 🎮 🎮 🎮 🎮 (This one takes a while to train)

### Greedy Eps Agent

In [None]:
#Agent that uses the epsilon-greedy approach (multi-agent action space)
class GreedyEpsAgent2():
  """Epsilon-greedy agent for an environment with a joint action space.

  Args:
    eps: Probability of taking a random action instead of the greedy one.
    Q: Callable mapping a state tensor to a tensor of Q-values.
    action_space: Joint action space whose `sample()` returns an indexable
      collection of actions.
  """

  def __init__(self, eps, Q, action_space):
    self.eps = eps
    self.Q = Q
    self.action_space = action_space

  def get_action(self, state):
    """Return a random action with probability eps, else the argmax of Q(state)."""
    if np.random.uniform(0, 1) < self.eps:
      # Only the first entry of the sampled joint action is used.
      # NOTE(review): presumably all agents share the same action set - confirm.
      return self.action_space.sample()[0]
    # Greedy action selection needs no gradients.
    with torch.no_grad():
      return int(torch.argmax(self.Q(state)))

  def update_eps(self, eps):
    """Replace the exploration rate."""
    self.eps = eps

  def _num_actions(self):
    """Number of actions: `n` of the space itself, or of its first member."""
    space = self.action_space
    if hasattr(space, 'n'):
      return space.n
    return space[0].n

  def __str__(self):
    # Q is a function approximator without a len(), so no state count is reported.
    return f'This agent acts epsilon-greedily (eps={self.eps}) over {self._num_actions()} possible actions'

### NN Definition (Our Q-Approximation)

In [None]:
class Switch4Net(nn.Module):
    """Q-network over a joint observation of nAgents*47 features, with 5 outputs."""

    def __init__(self, nAgents):
        super().__init__()
        jointObsSize = nAgents*47
        self.fc1 = nn.Linear(jointObsSize, 1000)
        self.fc2 = nn.Linear(1000, 500)
        self.fc3 = nn.Linear(500, 250)
        self.fc4 = nn.Linear(250, 5)

    def forward(self, x):
        """Return raw Q-values; the last dimension of x must be nAgents*47."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)


### Training

In [None]:
#Independent DQN: Applied to the ma_gym Checkers environment
#Every agent owns a Q-network, target network, optimizer and replay buffer;
#all of them are fed the same flattened observation of all agents.

#Initialize Environment
env = gym.make('ma_gym:Checkers-v0')
numAgents = 2

#Create list for total rewards (summed over agents, one entry per episode)
totalRewards = []

#Initialize Networks
Qs = [Switch4Net(numAgents) for i in range(numAgents)]
Q_Targets = [copy.deepcopy(Q) for Q in Qs]
copyInterval = 5000
step = 0

#Define the loss function (the update below uses mean absolute error directly;
#criterion is kept for callers but not used in this loop)
criterion = nn.MSELoss()
optimizers = [optim.Adam(Q.parameters()) for Q in Qs]

#Define Discount Factor (gamma)
gamma = 0.99

#Initialize Greedy Epsilon Agent for Q
epsilon = 1
endEpsilon = 0.01
agents = [GreedyEpsAgent2(epsilon, Q, env.action_space) for Q in Qs]

#Define Experience Replay list, its max size, and the mini batch size
replays = [[] for i in range(numAgents)]
maxReplay = 2048
miniBatchSize = 256

#Number of training episodes
numEpisodes = 1000
episode = 0

#Loop through episodes, training every agent with Q-Learning
while(episode < numEpisodes):
  s = env.reset()
  s = [element for sublist in s for element in sublist]
  done = [False for i in range(env.n_agents)]

  rTracker = 0
  while not all(done):
    step += 1

    #Each agent picks its action epsilon-greedily from its own Q-network
    a = [agent.get_action(torch.tensor([[s]]).float()) for agent in agents]
    sP, r, done, info = env.step(a)
    sP = [element for sublist in sP for element in sublist]

    for i in range(numAgents):
      minibatch = random.sample(replays[i], min(len(replays[i]), miniBatchSize))
      if len(minibatch) > 3:
        batchSize = len(minibatch)

        #Break up minibatch into the different columns of values
        columns = list(zip(*minibatch))
        states = torch.reshape(torch.tensor(columns[0]).float(),[batchSize,47*numAgents])
        actions = torch.tensor(columns[1])
        rewards = torch.tensor(columns[2]).float()
        newStates = torch.reshape(torch.tensor(columns[3]).float(),[batchSize, 47*numAgents])
        #1 for non-terminal transitions, 0 where the stored done flag is set
        notDone = 1.0 - torch.tensor(columns[4]).float()

        #Select this agent's networks (local names, so the module-level
        #Q / Q_Target of other cells are not overwritten)
        agentQ = Qs[i]
        agentTarget = Q_Targets[i]

        #TD target: no bootstrapping from terminal transitions, and no
        #gradient through the frozen target network
        with torch.no_grad():
          targets = rewards + gamma*torch.max(agentTarget(newStates), 1)[0]*notDone
        outputs = torch.reshape(torch.gather(agentQ(states), 1, actions), [batchSize])

        #Calculate loss and gradients (mean absolute TD error)
        optimizers[i].zero_grad()
        loss = torch.mean(torch.abs(outputs - targets))
        loss.backward()

        optimizers[i].step()

    #Storing in replay (oldest transition is dropped once a buffer is full)
    for i,replay in enumerate(replays):
      replay.append([s,[a[i]],r[i],sP,done[i]])
      if len(replay) > maxReplay:
        replay.pop(0)

    #Updating target Q if necessary
    if step % copyInterval == 0:
      Q_Targets = [copy.deepcopy(Q) for Q in Qs]

    s = sP
    rTracker += sum(r)
  episode += 1
  totalRewards.append(rTracker)
  #Update epsilons for greedy-eps: exponential decay from epsilon to endEpsilon
  for agent in agents:
      agent.update_eps(epsilon*(endEpsilon/epsilon)**(episode/numEpisodes))

#Plot the summed reward of all agents for every training episode
episodeAxis = list(range(len(totalRewards)))
plt.plot(episodeAxis, totalRewards)
plt.xlabel("Episode")
plt.ylabel("Total Reward")
plt.show()

#Run one fully greedy episode (eps = 0) and show the final frame
for agent in agents:
  agent.update_eps(0)
s = env.reset()
s = [value for agentObs in s for value in agentObs]
done = [False]*env.n_agents
while not all(done):
  a = [agent.get_action(torch.tensor([[s]]).float()) for agent in agents]
  s, r, done, info = env.step(a)
  s = [value for agentObs in s for value in agentObs]
plt.imshow(env.render('rgb_array'))