<a href="https://colab.research.google.com/github/wileyw/DeepLearningDemos/blob/master/RL/RL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the system packages (xvfb, python-opengl) and the Python packages
# (gym, pyvirtualdisplay) for this notebook. Both commands redirect stdout
# and stderr to /dev/null, so a failed install produces no visible message.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay

from pyvirtualdisplay import Display
# Start an invisible 400x300 virtual display before any environment is
# created. NOTE(review): presumably this is what lets env.render() work on a
# machine without a physical screen - confirm against pyvirtualdisplay docs.
virtual_screen_size = (400, 300)
display = Display(size=virtual_screen_size, visible=0)
display.start()

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.distributions 

class Agent(nn.Module):
    """Small MLP policy that maps a state to a categorical action distribution.

    Args:
        state_shape: Number of features in a state vector (input width of
            the first linear layer).
        action_shape: Number of discrete actions (output width of the last
            linear layer).
    """

    def __init__(self, state_shape, action_shape):
        super(Agent, self).__init__()
        self.state_shape = state_shape
        self.action_shape = action_shape

        # Not used by forward(); kept so the module's attributes are unchanged.
        self.relu = nn.ReLU(inplace=True)
        self.linear1 = nn.Linear(state_shape, 24)
        self.linear2 = nn.Linear(24, 12)
        self.linear3 = nn.Linear(12, action_shape)

    def forward(self, state):
        """Return the action probabilities for ``state``.

        Args:
            state: Float tensor whose last dimension has ``state_shape``
                entries; any leading dimensions are treated as batch
                dimensions.

        Returns:
            Tensor with the same leading dimensions as ``state`` and
            ``action_shape`` entries in the last dimension. The entries
            along the last dimension are non-negative and sum to 1.
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        # Normalise explicitly over the action dimension. Calling softmax
        # without `dim` is deprecated and chooses the dimension implicitly
        # from the input rank, which is wrong for e.g. 3-D batches.
        return F.softmax(self.linear3(x), dim=-1)

def one_hot_encode_action(action, n_actions):
    """Return a float32 vector of length ``n_actions`` with a 1 at ``action``.

    Every other entry of the returned NumPy array is 0.
    """
    one_hot = np.zeros(shape=n_actions, dtype=np.float32)
    one_hot[action] = 1.0
    return one_hot

def get_discounted_rewards(rewards, gamma=0.99):
    """Compute the discounted return for every step of an episode.

    The value at index ``t`` is
    ``rewards[t] + gamma * rewards[t + 1] + gamma**2 * rewards[t + 2] + ...``,
    i.e. the sum of all rewards from step ``t`` to the end of the episode,
    with later rewards weighted exponentially less.

    Args:
        rewards: Sequence of per-step rewards, in the order they were
            received.
        gamma: Discount factor applied once per step into the future.
            Defaults to 0.99.

    Returns:
        A float NumPy array with one discounted return per entry of
        ``rewards`` (an empty array when ``rewards`` is empty).
    """
    discounted_rewards = np.zeros(len(rewards))
    running_add = 0
    # Walk backwards so each return reuses the one already computed for the
    # following step.
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted_rewards[t] = running_add
    return discounted_rewards

# loss = nn.CrossEntropyLoss()
# loss = nn.NLLLoss()

In [None]:
# Setup env.
SEED = 1 # 1337
env = gym.make("CartPole-v1")
env.reset()
# Seed the environment, PyTorch and NumPy with the same value.
# NOTE(review): env.seed() looks like the older gym API - confirm that the
# installed gym version still provides it.
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# setup model
# The network input width is the length of an observation vector and the
# output width is the number of discrete actions. The model is built after
# torch.manual_seed(), so the statement order here matters.
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
model = Agent(n_states, n_actions)

# training settings.
n_episodes = 300
# Receives one entry per finished episode from the training loop.
episode_lengths = []
# When True, the training loop draws every frame inline.
render = False

learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Score Function Method

In [None]:
# Score-function (REINFORCE) training: play one full episode with the current
# stochastic policy, then take a single gradient step that raises the
# log-probability of actions followed by above-average discounted returns.
running_reward = 10  # only referenced by the commented-out logging below
for episode in range(1, n_episodes + 1):
    observation = env.reset()
    ep_reward = 0
    done = False

    # Per-episode history consumed by the policy update below.
    rewards = []
    log_probs = []

    t = 0  # number of environment steps taken in this episode
    while not done:
        if render:
            screen = env.render(mode='rgb_array')  # Skip if want to train w/o image

            plt.imshow(screen)
            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())

        # observation (4,) -> (1, 4)
        observation_reshaped = observation.reshape([1, observation.shape[0]])

        # 1. Get next action by sampling from the policy's distribution.
        torch_obs = torch.from_numpy(observation_reshaped.astype(np.float32))
        # Call the module itself rather than .forward() so that any
        # registered hooks run.
        action_prob_distribution = model(torch_obs)
        m = torch.distributions.Categorical(action_prob_distribution)
        action = m.sample()
        observation, reward, done, info = env.step(action.item())
        ep_reward += reward

        # 2. Record history.
        rewards.append(reward)
        log_probs.append(m.log_prob(action))
        t += 1

    print('Episode finished after {} timesteps'.format(t))
    # t already counts every step taken, so it is stored without an extra +1.
    episode_lengths.append(t)
    print('Average Episode Length: {} from n_episodes: {}'.format(np.mean(episode_lengths), episode))

    # 3. Update policy.
    discounted_rewards = get_discounted_rewards(rewards)
    discounted_rewards -= np.mean(discounted_rewards)
    # The epsilon must be added to the standard deviation itself; adding it
    # to the array inside np.std() leaves the result unchanged and allows a
    # division by zero (NaN loss) when all returns are equal.
    discounted_rewards /= (np.std(discounted_rewards) + 1e-7)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Each log-prob has shape (1,); concatenating gives one entry per step.
    loss = -(torch.cat(log_probs) * discounted_rewards).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    # if episode % 10 == 0:
    #     print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
    #             episode, ep_reward, running_reward))

# ipythondisplay.clear_output(wait=True)
env.close()

In [None]:
# Run the trained policy for one episode (capped at 1000 steps) in a fresh
# environment and print the number of steps it lasted.
model.eval()
env = gym.make("CartPole-v1")
obs = env.reset()
# env.seed(SEED)
# torch.manual_seed(SEED)
# np.random.seed(SEED)

# No parameter update happens here, so skip building autograd graphs.
with torch.no_grad():
    for i in range(1000):
        # obs (4,) -> (1, 4). The width comes from `obs` itself so this cell
        # does not depend on the `observation` variable of the training cell.
        obs_tmp = obs.reshape([1, obs.shape[0]]).astype(np.float32)
        torch_obs = torch.from_numpy(obs_tmp)
        action_prob_distribution = model(torch_obs)
        m = torch.distributions.Categorical(action_prob_distribution)
        action = m.sample()
        obs, reward, done, info = env.step(action.item())

        # screen = env.render(mode='rgb_array')  # Skip if want to run w/o image
        # plt.imshow(screen)
        # ipythondisplay.clear_output(wait=True)
        # ipythondisplay.display(plt.gcf())

        if done:
            break

# ipythondisplay.clear_output(wait=True)
env.close()
# i is the zero-based index of the last step taken.
print(i + 1)


# Reparameterization Trick Method

In [None]:
class ReparamTrickAgent(nn.Module):
    """MLP policy that outputs a Gaussian mean and std per action dimension.

    Two hidden layers (24 and 12 units, ReLU) feed two separate linear
    heads: one for the mean and one for the log-variance. Both heads start
    with weights and biases drawn uniformly from [-3e-3, 3e-3].

    Args:
        state_shape: Number of features in a state vector.
        action_shape: Width of each output head.
    """

    def __init__(self, state_shape, action_shape):
        super(ReparamTrickAgent, self).__init__()
        self.state_shape = state_shape
        self.action_shape = action_shape

        self.relu = nn.ReLU(inplace=True)
        self.linear1 = nn.Linear(state_shape, 24)
        self.linear2 = nn.Linear(24, 12)

        # Heads are created one after the other (mean first) so the random
        # draws happen in the same order as the layers are registered.
        init_w = 3e-3
        self.mu_linear = self._make_head(12, action_shape, init_w)
        self.logvar_linear = self._make_head(12, action_shape, init_w)

    @staticmethod
    def _make_head(in_features, out_features, bound):
        """Build a linear layer whose weight and bias are U(-bound, bound)."""
        head = nn.Linear(in_features, out_features)
        for param in (head.weight, head.bias):
            nn.init.uniform_(param, -bound, bound)
        return head

    def forward(self, state):
        """Return ``(mu, std)`` for ``state``.

        Both head outputs pass through tanh, so ``mu`` lies in [-1, 1] and
        the log-variance lies in [-1, 1]; ``std`` is exp(0.5 * log-variance).
        """
        hidden = torch.relu(self.linear1(state))
        hidden = torch.relu(self.linear2(hidden))
        mu = self.mu_linear(hidden).tanh()
        logvar = self.logvar_linear(hidden).tanh()
        return mu, (0.5 * logvar).exp()

In [None]:
# Setup env.
SEED = 1 # 1337
env = gym.make("CartPole-v1")
env.reset()
# Seed the environment, PyTorch and NumPy with the same value.
# NOTE(review): env.seed() looks like the older gym API - confirm that the
# installed gym version still provides it.
env.seed(SEED)
torch.manual_seed(SEED)
np.random.seed(SEED)

# setup model
# The network input width is the length of an observation vector and each
# output head has one entry per discrete action. The model is built after
# torch.manual_seed(), so the statement order here matters.
n_states = env.observation_space.shape[0]
n_actions = env.action_space.n
reparam_model = ReparamTrickAgent(n_states, n_actions)

# training settings.
n_episodes = 1000
# Receives one entry per finished episode from the training loop.
episode_lengths = []
# When True, the training loop draws every frame inline.
render = False

learning_rate = 0.01
optimizer = torch.optim.Adam(reparam_model.parameters(), lr=learning_rate)

# Train the Gaussian policy: play one full episode, then take a single
# gradient step that raises the log-probability of samples followed by
# above-average discounted returns.
running_reward = 10  # only referenced by the commented-out logging below
for episode in range(1, n_episodes + 1):
    observation = env.reset()
    ep_reward = 0
    done = False

    # Per-episode history consumed by the policy update below.
    rewards = []
    log_probs = []

    t = 0  # number of environment steps taken in this episode
    while not done:
        if render:
            screen = env.render(mode='rgb_array')  # Skip if want to train w/o image

            plt.imshow(screen)
            ipythondisplay.clear_output(wait=True)
            ipythondisplay.display(plt.gcf())

        # observation (4,) -> (1, 4)
        observation_reshaped = observation.reshape([1, observation.shape[0]])

        # 1. Get next action.
        torch_obs = torch.from_numpy(observation_reshaped.astype(np.float32))
        # Call the module itself rather than .forward() so that any
        # registered hooks run.
        mu, std = reparam_model(torch_obs)
        m = torch.distributions.Normal(mu, std)
        # sample() (not rsample()) returns a value detached from the graph,
        # so the gradient reaches the network only through log_prob below.
        sampled = m.sample()
        # Joint log-density of the sample: sum over the action dimension.
        log_prob = m.log_prob(sampled).sum(axis=-1)
        # The discrete action is the index of the largest sampled value.
        action = torch.argmax(sampled)
        observation, reward, done, info = env.step(action.item())
        ep_reward += reward

        # 2. Record history.
        rewards.append(reward)
        log_probs.append(log_prob)
        t += 1

    print('Episode finished after {} timesteps'.format(t))
    # t already counts every step taken, so it is stored without an extra +1.
    episode_lengths.append(t)
    print('Average Episode Length: {} from n_episodes: {}'.format(np.mean(episode_lengths), episode))

    # 3. Update policy.
    discounted_rewards = get_discounted_rewards(rewards)
    discounted_rewards -= np.mean(discounted_rewards)
    # The epsilon must be added to the standard deviation itself; adding it
    # to the array inside np.std() leaves the result unchanged and allows a
    # division by zero (NaN loss) when all returns are equal.
    discounted_rewards /= (np.std(discounted_rewards) + 1e-7)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Each log-prob has shape (1,); concatenating gives one entry per step.
    loss = -(torch.cat(log_probs) * discounted_rewards).sum()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
    # if episode % 10 == 0:
    #     print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
    #             episode, ep_reward, running_reward))

# ipythondisplay.clear_output(wait=True)
env.close()

In [None]:
# Run the trained Gaussian policy for one episode (capped at 1000 steps) in a
# fresh environment and print the number of steps it lasted.
reparam_model.eval()
env = gym.make("CartPole-v1")
obs = env.reset()
# env.seed(SEED)
# torch.manual_seed(SEED)
# np.random.seed(SEED)

# No parameter update happens here, so skip building autograd graphs.
with torch.no_grad():
    for i in range(1000):
        # obs (4,) -> (1, 4). The width comes from `obs` itself so this cell
        # does not depend on the `observation` variable of the training cell.
        obs_tmp = obs.reshape([1, obs.shape[0]]).astype(np.float32)
        torch_obs = torch.from_numpy(obs_tmp)
        mu, std = reparam_model(torch_obs)
        m = torch.distributions.Normal(mu, std)
        # The discrete action is the index of the largest sampled value.
        action = torch.argmax(m.sample())
        obs, reward, done, info = env.step(action.item())

        # screen = env.render(mode='rgb_array')  # Skip if want to run w/o image
        # plt.imshow(screen)
        # ipythondisplay.clear_output(wait=True)
        # ipythondisplay.display(plt.gcf())

        if done:
            break

# ipythondisplay.clear_output(wait=True)
env.close()
# i is the zero-based index of the last step taken.
print(i + 1)
