<a href="https://colab.research.google.com/github/tamasdoka/str_opt/blob/dev/steering_optimizer_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Steering geometry optimizer with stable baselines

Github Repo: [https://github.com/tamasdoka/str_opt](https://github.com/tamasdoka/str_opt)

In [0]:
#!apt install swig cmake libopenmpi-dev zlib1g-dev
#!pip install stable-baselines[mpi]==2.8.0 box2d box2d-kengz
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x

# Import gym, numpy and stable baselines

In [0]:
%%capture
import gym
import numpy as np

from stable_baselines import DQN

## Cloning and intalling the Gym env and instantiate the agent


In [3]:
!git clone -b dev https://github.com/tamasdoka/str_opt/

Cloning into 'str_opt'...
remote: Enumerating objects: 354, done.[K
remote: Counting objects: 100% (354/354), done.[K
remote: Compressing objects: 100% (212/212), done.[K
remote: Total 354 (delta 152), reused 297 (delta 131), pack-reused 0[K
Receiving objects: 100% (354/354), 1.05 MiB | 1.42 MiB/s, done.
Resolving deltas: 100% (152/152), done.


In [4]:
!pip install -e /content/str_opt/steering-optimizer

Obtaining file:///content/str_opt/steering-optimizer
Installing collected packages: steering-optimizer
  Running setup.py develop for steering-optimizer
Successfully installed steering-optimizer


In [0]:
for env in gym.envs.registry.env_specs.keys():
     if 'steering_optimizer' in env:
          print('Remove {} from registry".format(env)')
          del gym.envs.registry.env_specs[env]

In [0]:
from gym.envs.registration import register

register(
    id='steering_optimizer-v0',
    entry_point='steering_optimizer.envs:StrOptEnv')

In [7]:
%cd /content/str_opt/steering-optimizer/


/content/str_opt/steering-optimizer


In [0]:
env = gym.make('steering_optimizer-v0')

In [9]:
env.check_version()

StrOpt version: dev


## Creating the model

In [0]:
model = DQN('MlpPolicy', env, learning_rate=1e-3, exploration_fraction=0.5, prioritized_replay=True, verbose=1)

Function for model evaluation

In [0]:
def evaluate(model, num_steps=100):
  """
  Evaluate a RL agent
  :param model: (BaseRLModel object) the RL Agent
  :param num_steps: (int) number of timesteps to evaluate it
  :return: (float) Mean reward for the last 100 episodes
  """
  episode_rewards = [0.0]
  rewards = []
  states = []
  obs = env.reset()
  print('state:', obs)
  for i in range(num_steps):
      # _states are only useful when using LSTM policies
      action, _states = model.predict(obs)

      #print('action:', action)

      obs, reward, done, info = env.step(action)

      #print('obs (state), reward, done:', obs, reward, done)
      
      # Stats
      episode_rewards[-1] += reward
      rewards.append(reward)
      states.append(obs)
      if done:
          obs = env.reset()
          episode_rewards.append(0.0)
  # Compute mean reward for the last 100 episodes
  mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
  print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
  best_reward_index = np.argmax(episode_rewards)
  best_state = states[best_reward_index]
  best_reward = rewards[best_reward_index]

  print('Best reward:', best_reward)
  print('Best state:', best_state)
  
  return mean_100ep_reward

Model before training

In [12]:
# Random Agent, before training
mean_reward_before_train = evaluate(model, num_steps=1000)

state: [ -95.62267323 -182.59717912 -419.34229977  -97.08271741]
Mean reward: 0.1 Num episodes: 133
Best reward: 0.01
Best state: [ -93.67301898 -171.07768053 -422.78257589  -86.18850968]


## Train the agent and save it

Warning: this may take a while

In [0]:
# Train the agent
model.learn(total_timesteps=int(2e4), log_interval=10)
# Save the agent
model.save("dqn_steering_opt")
del model  # delete trained model to demonstrate loading

## Load the trained agent

In [14]:
model = DQN.load("dqn_steering_opt")

Loading a model without an environment, this model cannot be trained until it has a valid environment.


In [15]:
# Evaluate the trained agent
mean_reward = evaluate(model, num_steps=5000)

state: [-116.80394909 -203.22429275 -418.18714272 -100.74071472]
Mean reward: -0.2 Num episodes: 1119
Best reward: 0.01
Best state: [-104.18619417 -196.96378768 -417.48413011 -122.96692131]
