Refresh the package index and upgrade the installed system packages

In [0]:
%%bash
# Refresh the package index, then upgrade the installed packages.
# `-y` answers apt's confirmation prompt; a notebook cell has no interactive
# stdin to answer it, so without the flag the upgrade is aborted.
# All output (stdout and stderr) is discarded.
apt-get update > /dev/null 2>&1 && apt-get upgrade -y > /dev/null 2>&1

Install the required versions of the `gym` and `pyglet` packages

In [0]:
%%bash
# Replace whatever gym is installed with the pinned 0.11.0 release.
# Output of both pip commands (stdout and stderr) is discarded.
{
  python -m pip uninstall -y gym
  python -m pip install gym==0.11.0
} > /dev/null 2>&1

In [0]:
%%bash
# Replace whatever pyglet is installed with the pinned 1.3.2 release.
# Output of both pip commands (stdout and stderr) is discarded.
{
  python -m pip uninstall -y pyglet
  python -m pip install pyglet==1.3.2
} > /dev/null 2>&1

Upload `envs.zip` from the repository, unzip it, and replace the corresponding `envs` folder inside the installed `gym` package

In [0]:
%%bash
# Replace gym's bundled `envs` package with the project's copy from envs.zip.

# -o: overwrite a previously extracted ./envs without prompting, so the cell
# can be re-run.
unzip -o envs.zip > /dev/null 2>&1

# Never delete gym's own envs unless there is a replacement to copy in.
if [ ! -d envs ]; then
  echo "envs.zip was not extracted; leaving the installed gym untouched" >&2
  exit 1
fi

# Ask the interpreter where gym is installed instead of hard-coding the
# python3.6 dist-packages path.
GYM_DIR="$(python -c 'import os, gym; print(os.path.dirname(gym.__file__))' 2> /dev/null)"
if [ ! -d "${GYM_DIR}" ]; then
  echo "could not locate the installed gym package" >&2
  exit 1
fi

rm -rf "${GYM_DIR}/envs"
cp -r envs "${GYM_DIR}"

Setup packages for virtual display support

In [0]:
%%bash
# Install the Python wrapper and the system packages used for the virtual
# display. Each install stays a separate command; all output is discarded.
{
  python -m pip install pyvirtualdisplay
  apt-get install -y xvfb python-opengl ffmpeg
  apt-get install -y xorg openbox
  apt-get install -y x11-utils
} > /dev/null 2>&1

Install dependencies for Marvin environment

In [0]:
%%bash
# Install swig and then the Box2D Python packages, in this order.
# All output (stdout and stderr) is discarded.
{
  apt-get install -y swig
  python -m pip install box2d
  python -m pip install box2d-kengz
} > /dev/null 2>&1

Write `marvin.py` file for this project

In [36]:
%%writefile marvin.py
#!/usr/local/bin/python
import argparse
import gym
import numpy as np
import pickle
from gym.wrappers import Monitor
import logging
import multiprocessing as mp
from datetime import datetime as dt


def wrap_env(env):
  """Return `env` wrapped in a gym `Monitor` that targets ``./video``.

  `force=True` is forwarded to `Monitor`; see the gym documentation for its
  exact effect on files already present in that directory.
  """
  return Monitor(env, './video', force=True)


def _parse_args():
  pool = mp.cpu_count()
  seed = 1
  epochs = 100
  print_step = 1
  sigma = 0.1
  alpha = 0.03
  decay = 0.999
  population_size = 42
  max_play_steps = 2424
  max_train_steps = 2424
  layer = [24, 16]
  parser = argparse.ArgumentParser(
    description='A depressed robot training to walk')
  parser.add_argument(
    '-pool',
    metavar='NUMBER',
    type=int,
    default=pool,
    choices=range(1, 10 * pool + 1),
    help='set multiprocessing number for Mravin weights train')
  parser.add_argument(
    '-seed',
    metavar='NUMBER',
    type=int,
    default=seed,
    help='set seed number for random weights generator')
  parser.add_argument(
    '-print',
    metavar='NUMBER',
    type=int,
    default=print_step,
    choices=range(1, 10001),
    help='set print step number')
  parser.add_argument(
    '-epochs',
    metavar='NUMBER',
    type=int,
    default=epochs,
    choices=range(1, 10001),
    help='set number of epochs for Marvin weights train')
  parser.add_argument(
    '-sigma',
    metavar='NUMBER',
    type=float,
    default=sigma,
    help='set sigma number for population weights generator')
  parser.add_argument(
    '-alpha',
    metavar='NUMBER',
    type=float,
    default=alpha,
    help='set alpha number for learning rate')
  parser.add_argument(
    '-decay',
    metavar='NUMBER',
    type=float,
    default=decay,
    help='set decay number for next alfa')
  parser.add_argument(
    '-pop',
    metavar='NUMBER',
    type=int,
    default=population_size,
    choices=range(1, 101),
    help='set population number for Marvin weights train')
  parser.add_argument(
    '-play_steps',
    metavar='NUMBER',
    type=int,
    default=max_play_steps,
    choices=range(1, 10001),
    help='set max number of play steps')
  parser.add_argument(
    '-train_steps',
    metavar='NUMBER',
    type=int,
    default=max_train_steps,
    choices=range(1, 10001),
    help='set max number of train steps')
  parser.add_argument(
    '-layer',
    metavar='NUMBER',
    type=int,
    default=layer,
    choices=range(1, 101),
    nargs='+',
    help='set hidden layer size')
  parser.add_argument(
    '-log',
    metavar='FILE',
    type=argparse.FileType('w'),
    help='write log to a file')
  parser.add_argument(
    '-l',
    '--load',
    metavar='FILE',
    type=argparse.FileType('rb'),
    help='load weights for Marvin agent from a file; skip training process if this option is specified')
  parser.add_argument(
    '-s',
    '--save',
    metavar='FILE',
    type=argparse.FileType('wb'),
    help='save weights to a file after running the program')
  parser.add_argument(
    '-t',
    '--train',
    action='store_true',
    help='run only training process; skip walking process; if load option specified, train with loaded weights')
  parser.add_argument(
    '-w',
    '--walk',
    action='store_true',
    help='display only walking process')
  parser.add_argument(
    '-norm',
    action='store_true',
    help='normalize output value for prediction')
  parser.add_argument(
    '-detail',
    action='store_true',
    help='print detailed log')
  parser.add_argument(
    '-silent',
    action='store_true',
    help='do not print log for each train epoch and play step')
  parser.add_argument(
    '-render',
    action='store_true',
    help='render video of marvin play')
  parser.add_argument(
    '-capture',
    action='store_true',
    help='capture rendering to video file')
  parser.add_argument(
    '-current',
    action='store_true',
    help='print current or default arguments values')

  return parser.parse_args()


def _config_logging(args):
  if args.log is None:
    logging.basicConfig(filename=None, level=logging.INFO, format='%(asctime)s %(message)s')
  else:
    logging.basicConfig(stream=args.log, level=logging.INFO, format='%(asctime)s %(message)s')


def _set_layers_size(args):
  layers_size = [24, 4]
  layers_size[1:-1] = args.layer

  return layers_size


def _generate_random_weights(args):
  """Seed NumPy's global RNG and draw one random weight matrix per layer pair.

  Args:
    args: parsed arguments; `args.seed` seeds `np.random` and `args.layer`
      is consumed through `_set_layers_size`.

  Returns:
    (weights, weights_layers_size): `weights` is a 1-D object array whose
    i-th element has shape (sizes[i], sizes[i + 1]); `weights_layers_size`
    is the list of layer sizes.
  """
  weights_layers_size = _set_layers_size(args)
  np.random.seed(args.seed)

  layer_count = len(weights_layers_size) - 1
  # The matrices have different shapes. Fill an object array element by
  # element instead of calling np.array() on the list: np.array() rejects
  # such ragged input on current NumPy, and older releases fail when the
  # leading dimensions happen to match.
  weights = np.empty(max(layer_count, 0), dtype=object)

  for i in range(layer_count):
    weights[i] = np.random.randn(weights_layers_size[i], weights_layers_size[i + 1])

  return weights, weights_layers_size


def _generate_population(population_size, weights_layers_size):
  population = []

  for _ in range(population_size):
    normal_distribution = []

    for i in range(len(weights_layers_size) - 1):
      weights_layer = np.random.randn(weights_layers_size[i], weights_layers_size[i + 1])
      normal_distribution.append(weights_layer)

    population.append(normal_distribution)

  return np.array(population)


def _predict_action(observation, weights, norm):
  out = np.expand_dims(observation.flatten(), 0)
  
  if norm:
    out = out / np.linalg.norm(out)

  for weights_layer in weights:
    out = np.dot(out, weights_layer)
    out = np.tanh(out)

  return out[0]


def _get_reward(env, weights, max_train_steps, norm):
  env_local = None

  if env is None:
    env = gym.make('Marvin-v0')
    env_local = True

  total_reward = timesteps = 0
  observation = env.reset()
  done = False

  while not done and timesteps < max_train_steps:
    action = _predict_action(observation, weights, norm)
    observation, reward, done, _ = env.step(action)
    total_reward += reward
    timesteps += 1

  if env_local is None:
    env.close()

  return total_reward, timesteps


def _get_population_weights(pop, weights, sigma):
  population_weights = []

  for index, pop_layer in enumerate(pop):
    weights_layer = weights[index] + sigma * pop_layer
    population_weights.append(weights_layer)

  return np.array(population_weights)


def _worker_process(arg):
  env = None
  fp, population_weights, max_train_steps, norm = arg
  ret, _ = fp(env, population_weights, max_train_steps, norm)

  return ret


def _get_rewards(env, population_size, population, weights, sigma, pool, max_train_steps, norm, detail, silent, walk_only):
  if pool:

    population_weights = []
    for i in range(population_size):
      population_weights.append(_get_population_weights(population[i], weights, sigma))
    worker_args = ((_get_reward, pop, max_train_steps, norm) for pop in population_weights)
    rewards = np.array(pool.map(_worker_process, worker_args))
    
  else:
    rewards = np.zeros(population_size)

    for i in range(population_size):
      population_weights = _get_population_weights(population[i], weights, sigma)
      rewards[i], timesteps = _get_reward(env, population_weights, max_train_steps, norm)
    
      if not walk_only and not silent and detail:
        logging.info(f'>>> train population {i + 1} with reward {rewards[i]} and {timesteps} of {max_train_steps} max train steps')

  return rewards


def _update_weights(population_size, population, weights, rewards, sigma, alpha):
  updated_weights = []
  rewards_mean = np.mean(rewards)
  rewards_std = np.std(rewards)

  if rewards_std == 0:
    return weights

  rewards_norm = (rewards - rewards_mean) / rewards_std

  for index, weights_layer in enumerate(weights):
    population_layer = np.array([pop[index] for pop in population])
    factor = alpha / (population_size * sigma)
    updated_layer = weights_layer + factor * np.dot(population_layer.T, rewards_norm).T
    updated_weights.append(updated_layer)

  return np.array(updated_weights)


def _train_marvin_weights(args):
  """Train the weights for `args.epochs` epochs and return them.

  Each epoch draws a new population, evaluates it (in a process pool when
  `args.pool` > 1), updates the weights, multiplies alpha by `args.decay`
  and evaluates the updated weights once for logging. Training starts from
  the weights in `args.load` when that file is given, otherwise from random
  weights. The result is pickled to `args.save` when that file is given.

  Returns:
    The trained weights.
  """
  # Called even when weights are loaded below: it seeds NumPy's global RNG,
  # which the population generator draws from.
  weights, weights_layers_size = _generate_random_weights(args)

  if args.load is not None:
    # SECURITY: pickle.load can execute arbitrary code; only load weight
    # files from a trusted source.
    weights = pickle.load(args.load)
    # Take the layer sizes from the loaded matrices so the generated noise
    # matches them even if `-layer` differs from the saved network.
    weights_layers_size = [np.shape(layer)[0] for layer in weights] + [np.shape(weights[-1])[1]]

  sigma = args.sigma
  alpha = args.alpha
  decay = args.decay
  population_size = args.pop
  processes = args.pool
  epochs = args.epochs
  max_train_steps = args.train_steps
  walk_only = args.walk
  print_step = args.print
  detail = args.detail
  silent = args.silent
  norm = args.norm

  env = gym.make('Marvin-v0')
  pool = False

  try:
    # Initial reset kept from the original flow; its observation is unused.
    env.reset()

    if processes > 1:
      pool = mp.Pool(processes=processes)

    if not walk_only and detail:
      logging.info(f'>>> start Marvin weights train with following arguments')
      _list_args(args)

    for epoch in range(epochs):
      t1 = dt.now()
      population = _generate_population(population_size, weights_layers_size)
      rewards = _get_rewards(env, population_size, population, weights, sigma, pool, max_train_steps, norm, detail, silent, walk_only)
      weights = _update_weights(population_size, population, weights, rewards, sigma, alpha)
      alpha *= decay
      reward, timesteps = _get_reward(env, weights, max_train_steps, norm)

      if not walk_only and (not silent or detail) and (epoch + 1) % print_step == 0:
        logging.info(f'>>> train epoch {epoch + 1} - time {dt.now() - t1} - reward {reward} - timesteps {timesteps} of {max_train_steps}')

    if not walk_only and detail:
      logging.info(f'>>> end Marvin weights train after {epoch + 1} epoch(-s) with final reward {reward}')
      logging.info(f'>>> Marvin parameters alpha {alpha}, weights layers {weights_layers_size}, multiprocessing {processes if pool else 1}')

    if args.save is not None:
      pickle.dump(weights, args.save)
  except BaseException:
    # Stop the workers right away on failure or interruption.
    if pool:
      pool.terminate()
    raise
  else:
    if pool:
      pool.close()
  finally:
    # Always reap the workers and release the environment.
    if pool:
      pool.join()
    env.close()

  return weights


def _play_marvin(weights, args):
  """Play one episode with `weights`, optionally rendering or capturing it.

  Args:
    weights: weight matrices passed to `_predict_action`.
    args: parsed arguments; reads play_steps, render, detail, silent, walk,
      norm and capture.

  The episode ends when the environment reports `done` or after
  `args.play_steps` steps. With `args.capture` the environment is wrapped
  by `wrap_env`. The environment is closed on every exit path.
  """
  max_play_steps = args.play_steps
  render = args.render
  detail = args.detail
  silent = args.silent
  walk_only = args.walk
  norm = args.norm
  # The start and end summary lines share the same condition.
  log_summary = not silent or detail or walk_only

  env = gym.make('Marvin-v0')
  if args.capture:
    env = wrap_env(env)

  play_reward = play_steps = 0
  done = False

  try:
    observation = env.reset()

    if log_summary:
      logging.info(f'>>> start Marvin play with {max_play_steps} max steps')

    while not done and play_steps < max_play_steps:

      if render:
        env.render()

      action = _predict_action(observation, weights, norm)
      observation, reward, done, _ = env.step(action)
      play_reward += reward
      play_steps += 1
      if not silent and detail:
        logging.info(f'>>> play step {play_steps} - reward {reward}')

    if log_summary:
      logging.info(f'>>> end Marvin play after {play_steps} steps with total reward {play_reward} and done status {done}')
  finally:
    # Close even when a step or render call raises, so the environment (and
    # the capture wrapper, when used) is always released.
    env.close()


def _list_args(args):
  logging.info(f'>>> ' + ' | '.join([f'{k}: {v.name}' if (k == 'log' or k == 'load' or k == 'save') and v else f'{k}: {v}' for k, v in vars(args).items()]))


def _main():
  """Command-line entry point: parse arguments, then train and/or play.

  Training runs when `--train` is given or no weights file was loaded;
  otherwise the weights are read from `--load`. The play phase runs unless
  `--train` was given. A failure in any phase is logged together with the
  arguments (or the file name, for loading) and ends the run.
  """
  args = _parse_args()
  _config_logging(args)

  if args.current:
    logging.info(f'>>> current or default arguments values are')
    _list_args(args)

  must_train = args.train or args.load is None

  if must_train:
    try:
      t1 = dt.now()
      weights = _train_marvin_weights(args)
      if not args.walk and args.detail:
        logging.info(f'>>> train time {dt.now() - t1}')
    except Exception as err:
      logging.info(f'>>> error in Marvin train with following arguments - {err}:')
      _list_args(args)
      return
  else:
    # Reached only when a weights file was given and --train was not.
    try:
      weights = pickle.load(args.load)
    except Exception as err:
      logging.info(f'>>> error loading file `{args.load.name}` - {err}')
      return

  # --train means "training only": skip the play phase.
  if args.train:
    return

  try:
    t1 = dt.now()
    _play_marvin(weights, args)
    if args.detail:
      logging.info(f'>>> play time {dt.now() - t1}')
  except Exception as err:
    logging.info(f'>>> error in Marvin play with following arguments - {err}:')
    _list_args(args)
    return


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
  _main()


Overwriting marvin.py


Change file mode to executable and see the result

In [0]:
%%bash
# Give every user execute permission on the script so it can be started as
# ./marvin.py, then list the working directory to check the result.
chmod a+x marvin.py
ls -l

Run a training session as a test

In [28]:
# Training only (-t), 100 epochs of at most 2424 steps each; the resulting
# weights are saved to 100x2.pkl (-s).
!./marvin.py -t -s 100x2.pkl -epochs 100 -train_steps 2424

2020-02-20 13:07:15,360 >>> train epoch 1 - time 0:00:51.098178 - reward -192.436292931441 - timesteps 2424 of 2424
2020-02-20 13:08:06,892 >>> train epoch 2 - time 0:00:51.531554 - reward -1.8541210088130442 - timesteps 2424 of 2424
2020-02-20 13:09:02,275 >>> train epoch 3 - time 0:00:55.383144 - reward -45.76024559623993 - timesteps 2424 of 2424
2020-02-20 13:09:53,129 >>> train epoch 4 - time 0:00:50.852877 - reward -49.84441871894536 - timesteps 2424 of 2424
2020-02-20 13:10:44,599 >>> train epoch 5 - time 0:00:51.470411 - reward 20.594082995404374 - timesteps 2424 of 2424
2020-02-20 13:11:30,697 >>> train epoch 6 - time 0:00:46.097179 - reward 20.676901855225815 - timesteps 2424 of 2424
2020-02-20 13:12:24,146 >>> train epoch 7 - time 0:00:53.448706 - reward 12.09732298426574 - timesteps 2424 of 2424
2020-02-20 13:13:16,813 >>> train epoch 8 - time 0:00:52.666992 - reward 29.089542661381884 - timesteps 2424 of 2424
2020-02-20 13:14:11,514 >>> train epoch 9 - time 0:00:54.700521 -

Set up the virtual display

In [33]:
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
# Create and start a non-visible 1920x1080 virtual display (presumably so the
# environment can render without a physical screen — confirm).
# NOTE: this binds the name `display` to the Display object; IPython's display
# module is imported above under the separate name `ipythondisplay`.
display = Display(visible=0, size=(1920, 1080))
display.start()

<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1920x1080x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1920x1080x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

Run Marvin play

In [44]:
# Load the trained weights (-l skips training), print the detailed log without
# the per-step lines (-detail -silent) and capture the rendering to a video
# file (-capture).
!./marvin.py -l 100x2.pkl -detail -silent -capture

2020-02-20 14:45:39,510 Generating grammar tables from /usr/lib/python3.6/lib2to3/Grammar.txt
2020-02-20 14:45:39,544 Generating grammar tables from /usr/lib/python3.6/lib2to3/PatternGrammar.txt
2020-02-20 14:45:40,019 >>> start Marvin play with 2424 max steps
2020-02-20 14:46:41,368 >>> end Marvin play after 1873 steps with total reward 183.8802827453791 and done status True
2020-02-20 14:46:41,550 >>> play time 0:01:02.266442


Play captured video

In [45]:
def play_video(video_dir='video'):
  """Embed a captured MP4 from `video_dir` in the notebook output.

  Args:
    video_dir: directory searched for `*.mp4` files; defaults to 'video'.

  The first file in sorted name order is base64-encoded and shown in an
  HTML `<video>` element. Prints "No video found" when the directory
  contains no MP4 file.
  """
  # Sorted so the choice is deterministic when several files exist.
  mp4_list = sorted(glob.glob(f'{video_dir}/*.mp4'))

  if not mp4_list:
    print("No video found")
    return

  # Read-only binary mode is sufficient; the context manager closes the file.
  with open(mp4_list[0], 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())

  ipythondisplay.display(
      HTML(data='''
    <video alt="Play video" autoplay controls style="height: 360px;">
      <source src="data:video/mp4;base64,{0}" type="video/mp4" />
    </video>'''.format(encoded.decode('ascii'))))

play_video()