In [1]:
from math import *
import matplotlib.pyplot as plt
from scipy.integrate import ode
import time
import matplotlib.pyplot as plt
import gym
import torch as th
import os
from datetime import datetime
import csv
import numpy as np

from stable_baselines3 import TD3, SAC
from stable_baselines3.common.callbacks import BaseCallback, EvalCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

from bipedalWalkerPMTG import BipedalWalkerPMTG

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Device toggle read by the configuration cell: True selects "cuda", False selects "cpu".
# NOTE(review): this cell carries no execution count, yet the configuration cell
# reads `use_gpu`; make sure it runs first on a fresh kernel.
use_gpu = True

In [2]:
# Addresses
env_name = "BipedalWalker-v3"
model_saved_file = "./models/bipedWalker_PMTG_SAC"
log_dir = "./models/PMTG_SAC-bipedalWalker2d-model"
best_model_dir = "./models/PMTG_SAC-bipedalWalker2d-model/best_single"
video_prefix = "bipedWalker_PMTG_SAC"
csv_addr = "./statistics/bipedWalker2d_PMTG_SAC_rewards.csv"
time_addr = "./statistics/bipedWalker2d_PMTG_SAC_elapcedTime.txt"
# Both spellings are kept for backward compatibility; aliasing keeps them from drifting apart.
csv_path = csv_addr
time_path = time_addr
tb_log = "./PMTG_tensorboard/"
tb_log_name = "PMTG_SAC"
best_video_prefix = "bipedWalker_PMTG_SAC"

# Training Parameters
device = "cuda" if use_gpu else "cpu"
# Defined unconditionally: previously this name only existed on the CPU branch,
# so any later use raised NameError when training on the GPU.
num_cpu = 1

is_hard = False
n_timesteps = int(1e6)
seed = 42
check_freq = 1000       # timesteps between two training-reward checks in the callback
reward_threshold = 300  # mean reward that triggers the candidate evaluation in the callback

# Hyperparameters
learning_rate = 7.3e-4
batch_size = 256
gamma = 0.98
tau = 0.02
buffer_size = 300000
learning_starts = 10000
noise_std = 0.1
gradient_steps = 1
train_freq = 1
n_layers = 2
n_neurons = 256
activation = th.nn.modules.activation.ReLU
ent_coef = 'auto'
log_std_init = -3

net_arch = [n_neurons]*n_layers

# Single source of truth for the policy settings: the same values are handed to
# the model and written to the training-info file. `log_std_init` used to be
# reported in `hyperparameters` without ever being passed to the policy.
# NOTE(review): -3 is, to my knowledge, SB3's documented default for the SAC
# policy, so passing it explicitly should not alter training - confirm against
# the installed version.
policy_kwargs = dict(
    net_arch=net_arch,
    log_std_init=log_std_init,
    activation_fn=activation,
)

hyperparameters = {
    #"env_name": env_name,
    "n_timesteps": n_timesteps,
    "seed": seed,
    # NOTE(review): recorded for reference only; the model cell keeps its
    # action-noise argument commented out.
    "noise_std": noise_std,
    "ent_coef": ent_coef,

    "batch_size": batch_size,
    "buffer_size": buffer_size,
    "tau": tau,
    "learning_starts": learning_starts,
    "gradient_steps": gradient_steps,
    "gamma": gamma,
    "learning_rate": learning_rate,
    "train_freq": train_freq,

    # Copy, so that anything the algorithm adds to its own kwargs dict later
    # does not leak into the report.
    "policy_kwargs": dict(policy_kwargs),
}

# other params
it = 0
os.makedirs(log_dir, exist_ok=True)

In [3]:
# Create environments
# Training environment: wrapped in Monitor so that episode statistics are
# written to `log_dir`, where the training callback reads them back.
env = BipedalWalkerPMTG(is_hard=is_hard, action_repeat=1, act_noise=0.3, rew_scale=1.0)
env = Monitor(env, log_dir)

# Evaluation environment: same settings but with act_noise=0.0. It is wrapped
# in Monitor (without a log file) because `evaluate_policy` warns when the
# environment is not monitored and may then report wrapper-modified rewards.
eval_env = Monitor(
    BipedalWalkerPMTG(is_hard=is_hard, action_repeat=1, act_noise=0.0, rew_scale=1.0)
)

# NOTE(review): `obs` is not used by any visible cell; kept in case a later cell needs it.
obs = env.reset()

In [5]:
# Create model
# Every argument comes from the configuration cell so that the model and the
# recorded hyperparameters cannot drift apart.
model = SAC(
    policy='MlpPolicy',
    env=env,
    # action_noise=action_noise,
    train_freq=train_freq,

    learning_rate=learning_rate,
    batch_size=batch_size,
    tau=tau,
    gamma=gamma,
    buffer_size=buffer_size,
    learning_starts=learning_starts,
    gradient_steps=gradient_steps,
    ent_coef=ent_coef,

    policy_kwargs=policy_kwargs,

    device=device,
    seed=seed,
    verbose=0,
    # Use the configured path instead of repeating the literal here.
    tensorboard_log=tb_log,
)

In [7]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback that tracks the training reward and saves models along the way
    (in practice, we recommend using ``EvalCallback``).

    Every ``check_freq`` timesteps the mean reward of the last 100 training
    episodes is read from the monitor log found in ``log_dir``:

    * a new best mean reward saves the model to ``<log_dir>/best_model``;
    * a mean reward above ``reward_threshold`` triggers a 100-episode
      deterministic evaluation on ``eval_env``. If the evaluation also reaches
      the threshold, the model, the hyperparameters and the reward curve are
      written under ``best_model_dir`` and training is stopped.

    Independently of the above, a numbered snapshot ``model<N>`` is saved into
    ``log_dir`` every ``save_freq`` timesteps.

    ``reward_threshold``, ``eval_env``, ``best_model_dir`` and
    ``hyperparameters`` are read from the notebook's global namespace and must
    be defined before training starts.

    :param check_freq: Number of timesteps between two reward checks (must be positive).
    :param log_dir: Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level.
    :param save_freq: Number of timesteps between two periodic snapshots
      (must be positive).
    """

    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1, save_freq: int = 10000):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_freq = save_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf
        # History exposed through get_mean_reward().
        self.mean_reward_lst = []
        self.timesteps = []
        # History written to rewards.csv when a candidate passes evaluation.
        self.s_reward = []
        self.s_timestep = []
        # Index of the last periodic snapshot that was written.
        self.file_number = 0

    def _init_callback(self) -> None:
        """Nothing to prepare before training starts."""

    def _on_step(self) -> bool:
        """
        Run the reward check and the periodic snapshot.

        :return: ``False`` to stop training once a candidate passed the
          100-episode evaluation, ``True`` otherwise.
        """
        if self.num_timesteps % self.check_freq == 0:
            # Retrieve training reward
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')

            if len(x) > 0:
                # Mean training reward over the last 100 episodes
                mean_reward = np.mean(y[-100:])
                episodes = len(y)

                self.mean_reward_lst.append(mean_reward)
                self.timesteps.append(self.num_timesteps)
                self.s_reward.append(mean_reward)
                self.s_timestep.append(self.num_timesteps)

                if self.verbose > 0:
                    print(f"Num timesteps: {self.num_timesteps}; Episodes: {episodes}")
                    print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    print(f"Saving new best model to {self.save_path}...")
                    self.model.save(self.save_path)

                # 300-scores candidate test
                if mean_reward > reward_threshold and self._evaluate_candidate(episodes):
                    return False

        if self.num_timesteps % self.save_freq == 0:
            self._save_snapshot()

        return True

    def _evaluate_candidate(self, episodes: int) -> bool:
        """Evaluate the current model over 100 episodes; export it and return True if it reaches the threshold."""
        yellow = "\033[0;33m"
        no_color = "\033[0m"
        print("{0}Reward threshold achieved{1}".format(yellow, no_color))
        print("Evaluating model....")
        # Evaluate the model attached to this callback rather than whatever the
        # global name `model` happens to point to.
        mean_reward_100, std_reward = evaluate_policy(
            self.model, eval_env, n_eval_episodes=100, deterministic=True
        )
        print(f"Evaluation over 100 Episodes: {mean_reward_100} ")

        if mean_reward_100 < reward_threshold:
            return False

        self._export_best_run(episodes, mean_reward_100)
        print("MISSION COMPLETED")
        print(f"Score: {mean_reward_100}+/-{std_reward} reached at Episode: {episodes} ")
        return True

    def _export_best_run(self, episodes: int, eval_reward: float) -> None:
        """Write the model, the training info and the reward curve into a time-stamped folder."""
        # create folder for best models
        dt_string = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        best_dir = "{0}/{1}".format(best_model_dir, dt_string)
        os.makedirs(best_dir, exist_ok=True)

        # save the model file
        # NOTE(review): `best_dir` is a directory path; the archive is presumably
        # written next to it with an added extension rather than inside it - confirm.
        self.model.save(best_dir)
        print(f"Saving new best model to {best_dir}...")

        # save file with hyperparameters, episodes number and reward
        print("Saving training info...")
        filename = "{0}/training_info.txt".format(best_dir)
        print(filename)
        # newline="" writes the explicit "\r\n" terminators verbatim on every
        # platform instead of turning them into "\r\r\n" on Windows.
        with open(filename, mode="w", newline="") as info_file:
            info_file.write("Episodes: {0}\r\n".format(episodes))
            info_file.write("Timesteps: {0}\r\n".format(self.num_timesteps))
            info_file.write("Eval reward: {0}\r\n".format(eval_reward))
            info_file.write("Info and Hyperparameters:\r\n")
            for key, value in hyperparameters.items():
                info_file.write("    {0}: {1}\r\n".format(key, value))

        # save reward curve
        print("Saving reward CSV-data...")
        filename = "{0}/rewards.csv".format(best_dir)
        # The csv module expects files opened with newline="".
        with open(filename, mode='w', newline="") as reward_file:
            reward_writer = csv.writer(reward_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            reward_writer.writerows(zip(self.s_reward, self.s_timestep))

    def _save_snapshot(self) -> None:
        """Save the current model as the next numbered snapshot inside ``log_dir``."""
        print("Saving new model file...")
        self.file_number += 1
        filename = "model{0}".format(self.file_number)
        # Use the directory given to this callback, not the global `log_dir`.
        self.model.save(os.path.join(self.log_dir, filename))

    def get_mean_reward(self):
        """Return the recorded mean rewards and the timesteps at which they were taken."""
        return self.mean_reward_lst, self.timesteps

_XSERVTransSocketUNIXCreateListener: ...SocketCreateListener() failed
_XSERVTransMakeAllCOTSServerListeners: server already running
(EE) 
Fatal server error:
(EE) Cannot establish any listening sockets - Make sure an X server isn't already running(EE) 


In [8]:
# Build the training callback from the values set in the configuration cell.
callback = SaveOnBestTrainingRewardCallback(
    check_freq=check_freq,
    log_dir=log_dir,
)

In [9]:
# Train for `n_timesteps` steps; the callback can end the run earlier by returning False.
model.learn(
    total_timesteps=n_timesteps,
    callback=callback,
    tb_log_name=tb_log_name,
)

Num timesteps: 1000; Episodes: 113
Best mean reward: -inf - Last mean reward per episode: 259.92
Saving new best model to ./models/PMTG_SAC-bipedalWalker2d-model/best_model...
Num timesteps: 2000; Episodes: 125
Best mean reward: 259.92 - Last mean reward per episode: 222.91
Num timesteps: 3000; Episodes: 138
Best mean reward: 259.92 - Last mean reward per episode: 182.71
Num timesteps: 4000; Episodes: 151
Best mean reward: 259.92 - Last mean reward per episode: 142.41
Num timesteps: 5000; Episodes: 165
Best mean reward: 259.92 - Last mean reward per episode: 99.16
Num timesteps: 6000; Episodes: 179
Best mean reward: 259.92 - Last mean reward per episode: 57.78
Num timesteps: 7000; Episodes: 194
Best mean reward: 259.92 - Last mean reward per episode: 11.51
Num timesteps: 8000; Episodes: 209
Best mean reward: 259.92 - Last mean reward per episode: -10.29
Num timesteps: 9000; Episodes: 222
Best mean reward: 259.92 - Last mean reward per episode: -10.29
Num timesteps: 10000; Episodes: 235