# Behavior Cloningを使用して実機の自動運転の学習をしたい。
学習データは、自分が運転したデータを使用する。
データの形式は、画像とその時の操作であるsteeringとthrottleの値が保存されている。


In [1]:
import numpy as np
import gymnasium
from gymnasium.wrappers import TimeLimit
from imitation.data import rollout
from imitation.policies.serialize import load_policy
from imitation.util.util import make_vec_env
from imitation.data.wrappers import RolloutInfoWrapper
from stable_baselines3.common.vec_env import VecVideoRecorder, DummyVecEnv

from gymnasium.spaces import Box
from gymnasium.envs.registration import register
import racing_gym
import os
import json
from PIL import Image





## 環境を事務のレジストリに追加する

In [2]:
gymnasium.register(
    id = 'RacingEnv-v0',
    entry_point = 'racing_gym.envs.racing_env:RacingEnv',
    max_episode_steps = 1000,
)


ベクトル化された環境を作成する

In [3]:
import os
import json
import numpy as np
from PIL import Image

def load_expert_data(data_path):
    # expert_dataの初期化
    expert_data = {'images': [], 'actions': []}
    # print(expert_data)
    # count = 0

    # 画像ファイルに対応するJSONファイルを取得
    json_file_list = [json_file for json_file in os.listdir(data_path) if json_file.startswith('record_') and json_file.endswith('.json')]
    # print(len(json_file_list))

    for json_file in json_file_list:
        # count += 1
        # print(count)
        # print(json_file)

        # レコードのファイルパスを構築
        json_path = os.path.join(data_path, json_file)
        # print(json_path)

        # レコードの読み込み
        try:
            with open(json_path, 'r') as json_file:
                record_data = json.load(json_file)
        except FileNotFoundError:
            print(f"エラー：{json_path} でJSONファイルが見つかりませんでした。")
            continue
        except json.JSONDecodeError:
            print(f"エラー：{json_path} のJSONファイルのデコードに失敗しました。")
            continue

        # 画像データの読み込み
        image_file = record_data.get('cam/image_array', '')  # 画像ファイル名をJSONから取得
        # print(image_file)
        image_path = os.path.join(data_path, image_file)
        # print(image_path)
        try:
            image_data = np.array(Image.open(image_path))
        except FileNotFoundError:
            print(f"エラー：{image_path} で画像ファイルが見つかりませんでした。")
            continue

        # expert_dataに追加
        expert_data['images'].append(image_data)
        expert_data['actions'].append([record_data.get('user/angle', 0), record_data.get('user/throttle', 0)])

    return expert_data


In [4]:
expert_data_path = '../../Data/autorace/O/tub_9_24-01-09'
expert_data = load_expert_data(expert_data_path)
print(expert_data['actions'])
env = gymnasium.make('RacingEnv-v0', expert_data=expert_data)

[[1.0, 0.7], [1.0, 0.7], [1.0, 0.8], [1.0, 0.7], [0.0, 0.8], [0.0, 0.8], [0.0, 0.7], [1.0, 0.8], [0.0, 0.8], [1.0, 0.8], [0.0, 0.8], [1.0, 0.65], [1.0, 0.65], [0.0, 0.7], [1.0, 0.7], [1.0, 0.8], [1.0, 0.65], [0.0, 0.65], [0.0, 0.65], [1.0, 0.8], [1.0, 0.8], [1.0, 0.8], [0.0, 0.7], [1.0, 0.65], [1.0, 0.65], [0.0, 0.8], [1.0, 0.8], [0.0, 0.8], [1.0, 0.65], [0.0, 0.8], [0.0, 0.65], [1.0, 0.8], [0.0, 0.7], [1.0, 0.8], [1.0, 0.7], [1.0, 0.65], [1.0, 0.8], [1.0, 0.7], [0.0, 0.8], [0.0, 0.7], [1.0, 0.7], [1.0, 0.8], [0.0, 0.7], [0.0, 0.8], [1.0, 0.8], [0.8247627185888241, 0.65], [0.0, 0.65], [1.0, 0.7], [1.0, 0.8], [0.0, 0.8], [0.0, 0.8], [0.0, 0.7], [0.0, 0.8], [1.0, 0.8], [0.0, 0.65], [0.0, 0.65], [1.0, 0.8], [1.0, 0.65], [1.0, 0.8], [1.0, 0.7], [1.0, 0.8], [1.0, 0.65], [0.6082644123661001, 0.65], [1.0, 0.7], [0.0, 0.65], [1.0, 0.7], [0.0, 0.8], [0.0, 0.7], [0.0, 0.8], [1.0, 0.8], [0.0, 0.8], [0.0, 0.65], [0.0, 0.8], [1.0, 0.8], [1.0, 0.7], [0.0, 0.7], [1.0, 0.7], [0.0, 0.8], [0.0, 0.65], [

  logger.warn(


In [5]:
# env = gym.make('RacingEnv-v0')

# print(list(gym.envs.registry.keys()))
# print(env.expert_data)

In [6]:
import gymnasium
env_id = 'RacingEnv-v0'  # あなたの環境の名前に変更してください
try:
    env = gymnasium.make(env_id, expert_data=expert_data)
    print(f"Environment '{env_id}' is successfully registered.")
except gymnasium.error.Error as e:
    print(f"Error: {e}")


Environment 'RacingEnv-v0' is successfully registered.


In [8]:
# venv = make_vec_env(
#     "RacingEnv-v0",
#     rng = np.random.default_rng(),
#     n_envs = 4,
#     post_wrappers = [lambda env, _: RolloutInfoWrapper(env)],
# )

In [9]:
def _make_env():
    """Helper function to create a single environment. Put any logic here, but make sure to return a RolloutInfoWrapper."""
    _env = gymnasium.make("RacingEnv-v0", expert_data=expert_data)
    _env = RolloutInfoWrapper(_env)
    print(_env.total_step)
    return _env

venv = DummyVecEnv([_make_env for _ in range(4)])

3479
3479
3479
3479


  logger.warn(


In [10]:

# 環境の初期化
env = gymnasium.make("RacingEnv-v0", expert_data=expert_data)

num_episodes = 1000

# 学習ループなどで利用
# for _ in range(num_episodes):
#     action = policy.predict(observation)  # ポリシーによるアクション予測
#     observation, reward, done, info = env.step(action)
#     if done:
#         observation = env.reset()

env.reset()
next_state, reward, done, trunc, info = env.step(action=[0, 0])
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

(224, 224, 3),
 1.49,
 False,
 {}


  logger.deprecation(


In [11]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

model = PPO(MlpPolicy, venv, verbose=1)


# 観測空間の形状を確認
obs_shape = model.observation_space.shape
print(f"Observation space shape: {obs_shape}")

Using cuda device
Wrapping the env in a VecTransposeImage.
Observation space shape: (3, 224, 224)


In [12]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy

expert = PPO(
    policy=MlpPolicy,
    env=env,
    seed=0,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
    n_steps=64,
)
reward, _ = evaluate_policy(expert, env, 10)
print(f"Reward before training: {reward}")


# Note: if you followed step 2a, i.e. registered the environment, you can use the environment name directly

# expert = PPO(
#     policy=MlpPolicy,
#     env="custom/ObservationMatching-v0",
#     seed=0,
#     batch_size=64,
#     ent_coef=0.0,
#     learning_rate=0.0003,
#     n_epochs=10,
#     n_steps=64,
# )
expert.learn(10_000)  # Note: set to 100000 to train a proficient expert
reward, _ = evaluate_policy(expert, expert.get_env(), 10)
print(f"Expert reward: {reward}")



Reward before training: 999.9404208958149
Expert reward: 999.940422


In [13]:
print(env.reset())
print(env.observation_space)

(array([[[ 77,  72,  52],
        [ 79,  74,  54],
        [ 81,  76,  56],
        ...,
        [129, 118, 114],
        [130, 119, 115],
        [130, 119, 115]],

       [[ 77,  72,  52],
        [ 79,  74,  54],
        [ 81,  76,  56],
        ...,
        [131, 120, 116],
        [133, 122, 118],
        [134, 123, 119]],

       [[ 78,  73,  53],
        [ 80,  75,  55],
        [ 82,  77,  57],
        ...,
        [131, 120, 116],
        [133, 122, 118],
        [135, 124, 120]],

       ...,

       [[ 67,  74,  84],
        [ 69,  76,  86],
        [ 70,  77,  87],
        ...,
        [ 84,  87, 102],
        [ 80,  83,  98],
        [ 78,  81,  96]],

       [[ 66,  73,  83],
        [ 67,  74,  84],
        [ 68,  75,  85],
        ...,
        [ 84,  87, 102],
        [ 83,  86, 101],
        [ 83,  86, 101]],

       [[ 65,  72,  82],
        [ 66,  73,  83],
        [ 68,  75,  85],
        ...,
        [ 85,  88, 103],
        [ 87,  90, 105],
        [ 88,  91, 106]

In [14]:
print(expert.get_env().observation_space)
print(venv.observation_space)racing/bc_test.ipynb


Box(0, 255, (3, 224, 224), uint8)
Box(0, 255, (224, 224, 3), uint8)


In [15]:
rng = np.random.default_rng()
rollouts = rollout.rollout(
    expert,
    expert.get_env(),
    rollout.make_sample_until(min_timesteps=None, min_episodes=50),
    rng=rng,
    unwrap=False,
)
transitions = rollout.flatten_trajectories(rollouts)

In [17]:
print(env.expert_data['actions'][0])
# print(venv.step(actions=env.expert_data['actions'][0]))

[1.0, 0.7]


In [18]:
from imitation.algorithms import bc

bc_trainer = bc.BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    demonstrations=transitions,
    rng=rng,
)

In [19]:
reward_before_training, _ = evaluate_policy(bc_trainer.policy, env, 10)
print(f"Reward before training: {reward_before_training}")

Reward before training: 999.9404208958149


In [20]:
bc_trainer.train(n_epochs=1)
reward_after_training, _ = evaluate_policy(bc_trainer.policy, env, 10)
print(f"Reward after training: {reward_after_training}")

0batch [00:00, ?batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 0        |
|    ent_loss       | -0.00284 |
|    entropy        | 2.84     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 129      |
|    loss           | 2.33     |
|    neglogp        | 2.33     |
|    prob_true_act  | 0.0973   |
|    samples_so_far | 32       |
--------------------------------


499batch [00:30, 17.44batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 500      |
|    ent_loss       | -0.00181 |
|    entropy        | 1.81     |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 217      |
|    loss           | 0.806    |
|    neglogp        | 0.808    |
|    prob_true_act  | 0.446    |
|    samples_so_far | 16032    |
--------------------------------


1000batch [01:00, 16.73batch/s]

---------------------------------
| batch_size        | 32        |
| bc/               |           |
|    batch          | 1000      |
|    ent_loss       | -0.000803 |
|    entropy        | 0.803     |
|    epoch          | 0         |
|    l2_loss        | 0         |
|    l2_norm        | 218       |
|    loss           | -0.198    |
|    neglogp        | -0.197    |
|    prob_true_act  | 1.22      |
|    samples_so_far | 32032     |
---------------------------------


1500batch [01:33, 13.88batch/s]

--------------------------------
| batch_size        | 32       |
| bc/               |          |
|    batch          | 1500     |
|    ent_loss       | 0.000197 |
|    entropy        | -0.197   |
|    epoch          | 0        |
|    l2_loss        | 0        |
|    l2_norm        | 219      |
|    loss           | -1.2     |
|    neglogp        | -1.2     |
|    prob_true_act  | 3.31     |
|    samples_so_far | 48032    |
--------------------------------


1562batch [01:37, 16.03batch/s]


Reward after training: 999.9404208958149
