# Actor Critic

**References:**
- https://gym.openai.com/envs/Blackjack-v0/
- https://gym.openai.com/envs/LunarLander-v2/ (environment used in the main program below)
- https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code

## Import Libraries

In [6]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

import gym
import random
import numpy as np

from ipynb.fs.full.Plotting import *
import matplotlib.pyplot as plt

## Actor Critic Network

In [7]:
class ActorCriticNetwork(nn.Module):
    """Actor-critic network with a shared trunk, a policy head and a value head.

    Two fully connected ReLU layers feed two linear heads: ``pi`` produces one
    unnormalised score per action and ``v`` produces a single value estimate.
    The module also owns its Adam optimizer and moves itself to ``cuda:0``
    when CUDA is available, otherwise it stays on the CPU.

    Args:
        observation_space: Shape tuple of a single observation. It is unpacked
            into ``nn.Linear``, so it is expected to hold exactly one
            dimension (e.g. ``(8,)``).
        action_space: Number of discrete actions (width of the policy head).
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, observation_space, action_space, learning_rate):
        super().__init__()

        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate

        # Shared feature trunk.
        self.fc1 = nn.Linear(*self.observation_space, 2048)
        self.fc2 = nn.Linear(2048, 1536)

        # Output heads: action scores (actor) and state value (critic).
        self.pi = nn.Linear(1536, self.action_space)
        self.v = nn.Linear(1536, 1)

        # Move the module to its device *before* creating the optimizer, as
        # recommended by the torch.optim documentation, so the optimizer is
        # built against the parameters that are actually trained.
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)
        self.optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)

    def forward(self, observation):
        """Run the network on a batch of observations.

        Args:
            observation: Float tensor whose last dimension matches the input
                width of ``fc1``.

        Returns:
            Tuple ``(pi, v)``: the raw policy-head output (no softmax is
            applied here) and the value-head output.
        """
        layer1 = F.relu(self.fc1(observation))
        layer2 = F.relu(self.fc2(layer1))
        pi = self.pi(layer2)
        v = self.v(layer2)

        return (pi, v)

## Reinforcement Learning Agent

In [8]:
class Agent():
    """One-step actor-critic agent built around a single ActorCriticNetwork.

    ``choose_action`` samples an action from the policy head and remembers its
    log-probability; ``learn`` then performs one optimizer step from the TD
    error of the observed transition.

    Args:
        observation_space: Shape tuple of a single observation, forwarded to
            the network constructor.
        action_space: Number of discrete actions, forwarded to the network
            constructor.
        learning_rate: Learning rate forwarded to the network constructor.
        discount_rate: Discount factor applied to the next state's value.
    """

    def __init__(self, observation_space, action_space, learning_rate = 0.0005, discount_rate = 0.99):

        self.observation_space = observation_space
        self.action_space = action_space

        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.actor_critic = ActorCriticNetwork(self.observation_space, self.action_space, self.learning_rate)
        # Log-probability of the most recently sampled action; set by
        # choose_action() and consumed by learn().
        self.log_prob = None

    def _to_batch(self, observation):
        """Return ``observation`` as a float tensor with a leading batch axis of 1.

        Converting the array directly (instead of wrapping it in a Python
        list first) avoids PyTorch's slow list-of-ndarrays construction path.
        """
        tensor = T.as_tensor(observation, dtype=T.float, device=self.actor_critic.device)
        return tensor.unsqueeze(0)

    def choose_action(self, observation):
        """Sample an action for ``observation`` and store its log-probability.

        Args:
            observation: A single (unbatched) observation, e.g. a NumPy array
                or a list of floats.

        Returns:
            The sampled action as a Python int.
        """
        state = self._to_batch(observation)
        logits, _ = self.actor_critic.forward(state)
        probabilities = F.softmax(logits, dim = 1)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        # Kept (with its autograd graph) for the actor loss in learn().
        self.log_prob = action_probs.log_prob(action)

        return action.item()

    def learn(self, state, reward, state_, done):
        """Take one gradient step from a single transition.

        Args:
            state: Observation the last action was chosen in.
            reward: Reward received for that action.
            state_: Observation reached after the action.
            done: Whether the episode ended; when true the next state's value
                is excluded from the TD target.

        Raises:
            RuntimeError: If called before any ``choose_action`` call, since
                no log-probability is available for the actor loss.
        """
        if self.log_prob is None:
            raise RuntimeError("learn() called before choose_action(); no log-probability is stored")

        self.actor_critic.optimizer.zero_grad()

        state = self._to_batch(state)
        state_ = self._to_batch(state_)
        reward = T.tensor(reward, dtype=T.float, device=self.actor_critic.device)

        _, critic_value = self.actor_critic.forward(state)
        _, critic_value_ = self.actor_critic.forward(state_)

        # TD error: r + gamma * V(s') * (1 - done) - V(s).
        # NOTE(review): delta is not detached, so gradients flow through both
        # value estimates and through the actor term; this matches the
        # original behaviour - confirm it is intended.
        delta = reward + self.discount_rate * critic_value_*(1-int(done)) - critic_value

        actor_loss = -self.log_prob*delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()

## Main Program

In [9]:
# Training-run settings: `episodes` is the number of episodes the training
# loop runs; `step` counts environment steps across all episodes.
episodes = 3000
step = 0

In [10]:
# Train the actor-critic agent on LunarLander-v2 and plot the learning curve.
env = gym.make('LunarLander-v2')

observation_space = env.observation_space.shape
action_space = env.action_space.n
agent = Agent(observation_space, action_space)

# Per-episode score and the cumulative step count at the end of each episode.
scores, steps = [], []

for i in range(episodes):
    observation = env.reset()
    done = False
    score = 0

    while not done:
        action = agent.choose_action(observation)
        next_observation, reward, done, _ = env.step(action)
        score += reward
        step += 1
        # Learn from the transition while `observation` is still the state
        # the action was taken in; advancing it first would hand learn() the
        # same state twice and make the TD error meaningless.
        agent.learn(observation, reward, next_observation, done)
        observation = next_observation

    scores.append(score)
    # Record the x-axis value for this episode so that `steps` and `scores`
    # have matching lengths when plotted.
    steps.append(step)
    avg_score = np.mean(scores[-100:])
    print('episode ', i, 'score %.1f' % score, 'average score %.1f' % avg_score)

# Release the environment's resources once training is finished.
env.close()

plot_learning_curve(steps, scores)

episode  0 score -547.3 average score -547.3
episode  1 score -481.5 average score -514.4
episode  2 score -553.7 average score -527.5
episode  3 score -108.7 average score -422.8


KeyboardInterrupt: 