In [0]:
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  5 17:08:54 2020

@author: Admin
"""

# Quadcopter Env

import easydict
import gym
import numpy as np
from itertools import count
from collections import namedtuple
import logging
import logging.handlers


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import matplotlib.pyplot as plt

from quad_env import QuadEnv

# Cart Pole
# based on:
# https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py

# args = parser.parse_args()

args = easydict.EasyDict({
    "gamma": 0.99,
    "seed": 203,
    "render": False,
    "log_interval": 10,
    "write_logger":True
})

# env = gym.make('LunarLanderContinuous-v2')
env = QuadEnv()

# env.seed(args.seed)
torch.manual_seed(args.seed)

SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])
# state_dim = env.observation_space.shape[0]
state_dim = 18
# action_dim = env.action_space.shape[0]
action_dim = 4

class Policy(nn.Module):
    """
    implements both actor and critic in one model
    """

    def __init__(self, hidden_dim1=64, hidden_dim2=64, output_dim=128):
        super(Policy, self).__init__()
        self.affine1 = nn.Linear(state_dim, hidden_dim1)
        self.affine2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.affine3 = nn.Linear(hidden_dim2, output_dim)
        self.act1 = nn.Tanh()
        # actor's layer
        self.action_mean = nn.Linear(output_dim, action_dim)
        self.action_var = nn.Linear(output_dim, action_dim)
        # critic's layer
        self.value_head = nn.Linear(output_dim, 1)
        # action & reward buffer
        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        """
        forward of both actor and critic
        """
        # TODO map input to:
        # mean of action distribution,
        # variance of action distribution (pass this through a non-negative function),
        # state value

        input_x = x
        x = self.act1(self.affine1(x))
        x = self.act1(self.affine2(x))
        x = self.act1(self.affine3(x))
        action_mean = self.action_mean(x)
        action_var = F.softplus(self.action_var(x))
        action_var = torch.add(action_var, 1e-10)
        state_values = self.value_head(x)  # <= Value Function not value of state
        if any(torch.isnan(x)) or any(torch.isnan(action_mean)) or any(torch.isnan(action_var)):
            print('NaN in forward pass')

        return 1000.0 * action_mean, 1000.0 * action_var, state_values


model = Policy().float()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float()
    mu, sigma, state_value = model(state)
    # sigma += 10**-8

    # create a normal distribution over the continuous action space
    m = Normal(loc=mu, scale=sigma)

    # and sample an action using the distribution
    action = m.sample()

    # save to action buffer
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))

    # the action to take (left or right)
    return action.data.numpy()


def finish_episode():
    """
    Training code. Calculates actor and critic loss and performs backprop.
    """
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []  # list to save actor (policy) loss
    value_losses = []  # list to save critic (value) loss
    returns = []  # list to save the true values

    # calculate the true value using rewards returned from the environment
    #for r in model.rewards[::-1]:
    for i in range(len(model.rewards)):
        # TODO compute the value at state x
        # via the reward and the discounted tail reward
        r = model.rewards[i]
        if i < len(model.rewards) - 1:
          value_next = saved_actions[i + 1][1].item()
        else:
          value_next = 0
        R = args.gamma * value_next + r

        returns.insert(0, R)

    # whiten the returns
    returns = torch.tensor(returns).float()
    returns = (returns - returns.mean()) / (returns.std() + eps)

    for (log_prob, value), R in zip(saved_actions, returns):
        # TODO compute the advantage via subtracting off value
        advantage = R - value.item()

        # TODO calculate actor (policy) loss, from log_prob (saved in select action)
        # and from advantage
        policy_loss = -log_prob * advantage
        # append this to policy_losses
        policy_losses.append(policy_loss)
        # TODO calculate critic (value) loss
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))
    # reset gradients

    optimizer.zero_grad()

    # sum up all the values of policy_losses and value_losses
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()

    # perform backprop
    loss.backward()
    # gradient clipping to solve exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
    optimizer.step()

    # reset rewards and action buffer
    del model.rewards[:]
    del model.saved_actions[:]


def main():
    if args.write_logger:
        log_filename = 'training_log.txt'
        train_logger = logging.getLogger('TrainLogger')
        train_logger.setLevel(logging.DEBUG)
        handler = logging.handlers.RotatingFileHandler(log_filename, maxBytes=10*1024*1024, backupCount=5)
        train_logger.addHandler(handler)

    running_reward = -8000

    # run infinitely many episodes, until performance criteria met
    episodic_rewards = []
    episodes = []

    for i_episode in count(1):
        # reset environment and episode reward
        state = env.reset()
        ep_reward = 0

        for t in range(1, 300):
            # select action from policy
            action = select_action(state)
            if any(np.isnan(action)):
                print('action is NaN')

            # take the action
            state, reward, done, _ = env.step(action)

            if args.render and i_episode % 100 == 0:
                env.render()

            if args.write_logger:
                train_logger.debug('episode {0}, step {1}, state {2}, action {3}, reward {4}'.format(i_episode, t, state, action, reward))

            model.rewards.append(reward)
            ep_reward += reward
            if done:
                episodes.append(i_episode)  # added
                episodic_rewards.append(ep_reward)
                break

        # update cumulative reward
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward

        # perform backprop
        finish_episode()

        # log results
        if i_episode % args.log_interval == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                i_episode, ep_reward, running_reward))

        # check if we have "solved" the problem
        # if running_reward > 200:
        if i_episode > 6000:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))

            # TODO plot episodic_rewards --- submit this plot with your code
            plt.figure()
            plt.plot(episodes, episodic_rewards)
            break


if __name__ == '__main__':
    main()

Episode 10	Last reward: -1461.76	Average reward: -5327.44
Episode 20	Last reward: -978.14	Average reward: -3770.94
Episode 30	Last reward: -1087.20	Average reward: -2906.01
Episode 40	Last reward: -1930.15	Average reward: -2264.26
Episode 50	Last reward: -1043.54	Average reward: -1821.27
Episode 60	Last reward: -1691.61	Average reward: -1539.02
Episode 70	Last reward: -1394.95	Average reward: -1474.12
Episode 80	Last reward: -1100.29	Average reward: -1457.52
Episode 90	Last reward: -1650.87	Average reward: -1408.94
Episode 100	Last reward: -976.39	Average reward: -1289.78
Episode 110	Last reward: -1298.82	Average reward: -1193.99
Episode 120	Last reward: -882.04	Average reward: -1100.40
Episode 130	Last reward: -1284.28	Average reward: -1118.33
Episode 140	Last reward: -1225.89	Average reward: -1129.32
Episode 150	Last reward: -963.51	Average reward: -1179.05
Episode 160	Last reward: -2578.83	Average reward: -1428.39
Episode 170	Last reward: -2576.47	Average reward: -2111.12
Episode 18

# New Section

We did many things
Nothing worked
What we learned: RL is BS
LAME
Just put a PID controller on your vehicles, and you'll be fine...

In [0]:
!unzip quad_sim.zip -d ./

Archive:  quad_sim.zip
  inflating: ./quad_sim/config.py    
  inflating: ./quad_sim/ctrl.py      
   creating: ./quad_sim/quadFiles/
 extracting: ./quad_sim/quadFiles/__init__.py  
  inflating: ./quad_sim/quadFiles/initQuad.py  
  inflating: ./quad_sim/quadFiles/quad.py  
  inflating: ./quad_sim/run_3D_simulation.py  
  inflating: ./quad_sim/trajectory.py  
   creating: ./quad_sim/utils/
  inflating: ./quad_sim/utils/__init__.py  
  inflating: ./quad_sim/utils/animation.py  
  inflating: ./quad_sim/utils/display.py  
  inflating: ./quad_sim/utils/mixer.py  
  inflating: ./quad_sim/utils/quaternionFunctions.py  
  inflating: ./quad_sim/utils/rotationConversion.py  
  inflating: ./quad_sim/utils/stateConversions.py  
  inflating: ./quad_sim/utils/windModel.py  
  inflating: ./quad_sim/waypoints.py  


In [0]:
!pip install gym
!pip install Box2D gym

