# REINFORCE (Monte Carlo Policy Gradient)



**References:**
- https://gym.openai.com/envs/LunarLander-v2/ (environment used in this notebook)
- https://gym.openai.com/envs/Blackjack-v0/
- https://github.com/philtabor/Actor-Critic-Methods-Paper-To-Code

## Import Libraries

In [20]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch as T

import gym
import random
import numpy as np

from ipynb.fs.full.Plotting import *
import matplotlib.pyplot as plt

## Policy Network

In [21]:
class PolicyNetwork(nn.Module):
    """Three-layer fully connected network that scores each discrete action.

    The network owns its Adam optimizer and moves itself to ``cuda:0`` when
    CUDA is available, otherwise it stays on the CPU.

    Args:
        observation_space: Shape tuple of one observation; it is unpacked as
            the input size of the first linear layer.
        action_space: Number of outputs of the last linear layer.
        learning_rate: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, observation_space, action_space, learning_rate):
        super().__init__()

        self.observation_space = observation_space
        self.action_space = action_space
        self.learning_rate = learning_rate

        # Two hidden layers of equal width followed by the output layer.
        hidden_units = 128
        self.fc1 = nn.Linear(*observation_space, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_space)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, observation):
        """Return the raw (pre-softmax) output of the last layer."""
        hidden = T.relu(self.fc1(observation))
        hidden = T.relu(self.fc2(hidden))
        return self.fc3(hidden)

## Reinforcement Learning Agent

In [28]:
class Agent():
    """REINFORCE agent: samples actions from a policy network and updates it
    once per episode from the discounted returns.

    Usage per episode: call ``choose_action`` for every step, pass each
    reward to ``store_rewards``, then call ``learn`` once the episode ends.

    Args:
        observation_space: Shape tuple of one observation (forwarded to
            ``PolicyNetwork``).
        action_space: Number of discrete actions.
        learning_rate: Learning rate for the policy's optimizer.
        discount_rate: Discount factor (gamma) applied to future rewards.
    """

    def __init__(self, observation_space, action_space, learning_rate = 0.0005, discount_rate = 0.99):

        self.observation_space = observation_space
        self.action_space = action_space

        self.learning_rate = learning_rate
        self.discount_rate = discount_rate

        self.policy = PolicyNetwork(self.observation_space, self.action_space, self.learning_rate)

        # Per-episode buffers; both are emptied at the end of learn().
        self.reward_memory = []
        self.action_memory = []

    def choose_action(self, observation):
        """Sample an action for ``observation`` and remember its log-probability.

        Args:
            observation: A single observation (array-like of numbers).

        Returns:
            The sampled action as a Python int.
        """
        # Go through one ndarray instead of a list of arrays (the slow
        # construction path); T.tensor copies, so later in-place changes to
        # ``observation`` cannot affect the saved autograd graph.
        state = T.tensor(np.asarray(observation), dtype=T.float32,
                         device=self.policy.device).unsqueeze(0)
        # Explicit dim: the implicit-dimension form of softmax is deprecated.
        probabilities = F.softmax(self.policy.forward(state), dim=-1)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.action_memory.append(log_probs)

        return action.item()

    def store_rewards(self, reward):
        """Append the reward received for the most recent action."""
        self.reward_memory.append(reward)

    def learn(self):
        """Run one policy-gradient step on the stored episode, then clear it.

        Does nothing except clear the buffers when no rewards or no
        log-probabilities have been stored.
        """
        if not self.reward_memory or not self.action_memory:
            # Nothing to differentiate; without this guard the loss would
            # remain a plain int and ``backward`` would fail.
            self.action_memory = []
            self.reward_memory = []
            return

        self.policy.optimizer.zero_grad()

        # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3 + ...
        # Computed backwards in a single pass: G_t = R_t+1 + gamma * G_t+1.
        G = np.zeros(len(self.reward_memory), dtype=np.float64)
        running_return = 0.0
        for t in reversed(range(len(self.reward_memory))):
            running_return = self.reward_memory[t] + self.discount_rate * running_return
            G[t] = running_return
        G = T.tensor(G, dtype=T.float).to(self.policy.device)

        # Pair returns with log-probabilities step by step; if the two
        # buffers differ in length only the common prefix is used.
        steps = min(len(self.reward_memory), len(self.action_memory))
        log_probs = T.stack(self.action_memory[:steps]).reshape(-1)
        loss = -(G[:steps] * log_probs).sum()
        loss.backward()
        self.policy.optimizer.step()

        self.action_memory = []
        self.reward_memory = []

## Main Program

In [29]:
# Training configuration.
episodes = 3000  # number of episodes the training loop runs
step = 0  # counter initialised here; the visible training loop does not read it

In [30]:
# Build the environment and size the agent from its observation/action spaces.
env = gym.make('LunarLander-v2')

observation_space = env.observation_space.shape
action_space = env.action_space.n
agent = Agent(observation_space, action_space)

scores = []  # total reward of every finished episode

for i in range(episodes):
    # Roll out one full episode, recording each reward for the update.
    observation = env.reset()
    score = 0
    done = False
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        agent.store_rewards(reward)
        score += reward
        observation = observation_

    # One policy update per episode, using the rewards stored above.
    agent.learn()
    scores.append(score)

    # Running mean over (at most) the last 100 episodes.
    avg_score = np.mean(scores[-100:])
    print('episode ', i, 'score %.2f' % score, 'average score %.2f' % avg_score)

# 1-based episode indices for the x-axis of the learning curve.
x = list(range(1, len(scores) + 1))
# plot_learning_curve(scores, x, figure_file)

  probabilities = F.softmax(self.policy.forward(state))


episode  0 score -205.60 average score -205.60
episode  1 score -258.99 average score -232.30
episode  2 score -212.63 average score -225.74
episode  3 score -265.73 average score -235.74
episode  4 score -139.79 average score -216.55
episode  5 score -99.53 average score -197.05
episode  6 score -156.65 average score -191.27
episode  7 score -368.39 average score -213.41
episode  8 score -256.77 average score -218.23
episode  9 score -328.19 average score -229.23
episode  10 score -78.54 average score -215.53
episode  11 score -316.59 average score -223.95
episode  12 score -77.57 average score -212.69
episode  13 score -90.12 average score -203.93
episode  14 score -307.70 average score -210.85
episode  15 score -117.85 average score -205.04
episode  16 score -116.94 average score -199.86
episode  17 score -68.64 average score -192.57
episode  18 score -141.56 average score -189.88
episode  19 score -272.44 average score -194.01
episode  20 score -94.99 average score -189.30
episode 

KeyboardInterrupt: 

4