In [None]:
import gymnasium as gym
# from EasyEnv import myEasyGym
from Approach_env import SRC_approach as SRC_test
import numpy as np
from stable_baselines3.common.callbacks import CheckpointCallback
from stable_baselines3.common.env_checker import check_env
from stable_baselines3 import PPO
from CL_env import CurriculumWrapper
# Register the custom task with Gymnasium, instantiate it with on-screen
# rendering, and wrap it with the curriculum wrapper.
gym.envs.register(
    id="Training_ppo_rand_needle",
    entry_point=SRC_test,
    max_episode_steps=2000,
)
wrapped_env = CurriculumWrapper(
    gym.make("Training_ppo_rand_needle", render_mode="human")
)
# `env` and `wrapped_env` are two names for the same wrapped environment.
env = wrapped_env
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn

In [None]:
# Check the environment
# Runs Stable-Baselines3's env checker (imported above) on the curriculum-wrapped
# environment before any training is attempted.
check_env(wrapped_env)

In [None]:
# Reset the wrapped environment. As the last expression of the cell, whatever
# reset() returns is displayed as the cell output.
wrapped_env.reset()

In [None]:
# Random-action smoke test: drive the environment with sampled actions,
# printing every observation and rendering every step.
wrapped_env.reset()
for _ in range(20000):
    action = wrapped_env.action_space.sample()
    step_result = wrapped_env.step(action)
    obs, reward, terminated, truncated, info = step_result
    print(obs)
    wrapped_env.render()
    # Keep stepping until the episode ends, then start a new one.
    if not (terminated or truncated):
        continue
    obs, info = wrapped_env.reset()

In [None]:
# First training: build a fresh PPO agent with an MLP policy on the wrapped
# environment; TensorBoard logs go to ./First_expert_insert/.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
    tensorboard_log="./First_expert_insert/",
)

In [None]:
# Only behavior cloning: fit the PPO policy of `model` to recorded expert
# (observation, action) pairs with a per-dimension cross-entropy loss.
import pickle

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load expert data that you generated yourself or otherwise trust.
with open('expert_data.pkl', 'rb') as f:
    expert_data = pickle.load(f)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
Policy = model.policy.to(device)
observations, actions = zip(*expert_data)
observations = np.array(observations)
actions = np.array(actions)

# Convert to tensors. Actions are stored as integer class indices because they
# are used as cross-entropy targets below.
observations_tensor = torch.tensor(observations, dtype=torch.float32)
actions_tensor = torch.tensor(actions, dtype=torch.long)

# Build the dataset. shuffle=True makes the loader draw a new permutation every
# time it is iterated, so it does not need to be rebuilt to reshuffle.
dataset = TensorDataset(observations_tensor, actions_tensor)
data_loader = DataLoader(dataset, batch_size=1024, shuffle=True)

# Optimizer and loss.
optimizer = Adam(model.policy.parameters(), lr=2e-4)
criterion = nn.CrossEntropyLoss()
# Number of action dimensions; each one is trained as its own classification head.
action_len = env.action_space.shape[0]
batch_idx = 0

# Training loop. The epoch count is deliberately huge; interrupt the cell once
# the loss has converged.
for epoch in range(100000):
    # Accumulate plain Python floats so no autograd history is kept alive
    # across batches.
    loss_avg = 0.0
    batch_idx = 0
    for batch_obs, batch_actions in data_loader:
        batch_idx += 1
        batch_obs = batch_obs.to(device)
        batch_actions = batch_actions.to(device)
        # Forward pass: one categorical distribution per action dimension.
        dist = Policy.get_distribution(batch_obs)
        loss = 0
        for i in range(action_len):
            action_logits = dist.distribution[i].logits
            loss += criterion(action_logits, batch_actions[:, i])
        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_avg += loss.item()

        if (batch_idx%200 == 0):
            print(
                "Train Epoch: {} Batch idx: {} \t Loss: {:.6f}\n".format(
                    epoch,
                    batch_idx,
                    loss.item(),
                )
            )
    # batch_idx is 1-based here, so it equals the number of batches processed;
    # max() guards against an empty loader.
    print(f"Average Loss in {epoch} episode is {loss_avg/max(batch_idx, 1)}\n")


In [None]:
# Save the expert demonstration model
# Writes the current `model` under the name "Expert"; a later cell reloads it
# from "./Expert.zip".
model.save("Expert")

In [None]:
# Train the current model with PPO, checkpointing every 10000 callback steps,
# then store the final weights as "SRC".
checkpoint_callback = CheckpointCallback(
    save_freq=10000,
    save_path='./First_end_effector/Model_temp2',
    name_prefix='SRC',
)
model.learn(
    total_timesteps=2000000,
    callback=checkpoint_callback,
    progress_bar=True,
)
model.save("SRC")

In [None]:
# Continue training from a previously saved policy.
checkpoint_callback = CheckpointCallback(save_freq=10000, save_path='./First_expert_demo/Model_temp', name_prefix='SRC')
# Alternative starting point:
# model_path = "./First_end_effector/Model_temp/SRC_330000_steps.zip"
model_path = "./Expert.zip"
# Load directly instead of constructing a throw-away PPO first: the constructed
# model was immediately overwritten, so its tensorboard_log never took effect.
# Keyword arguments passed to load() override the values stored in the archive.
model = PPO.load(
    model_path,
    env=env,
    verbose=1,
    tensorboard_log="./First_expert_demo/",
)
# reset_num_timesteps=False keeps the timestep counter of the loaded model
# running instead of restarting it from zero.
model.learn(total_timesteps=int(1000000), progress_bar=True,callback=checkpoint_callback,reset_num_timesteps=False)

In [None]:
# Load a saved demo model and attach it to the current environment.
# The model is loaded directly rather than constructing a PPO that is
# immediately overwritten; keyword arguments passed to load() override the
# values stored in the archive, so the intended TensorBoard directory is used.
model_path = "./First_expert_insert/demo_temp1.zip"
model = PPO.load(
    model_path,
    env=env,
    verbose=1,
    tensorboard_log="./First_expert_demo/",
)

In [None]:
# Initialize the model for imitation learning and RL, plus the checkpoint
# callback shared by the RL training cells.
# NOTE(review): n_steps=10 on what appears to be a single environment gives a
# 10-sample rollout, which is smaller than batch_size=512 — confirm this is intended.
model = PPO(
    policy="MlpPolicy",
    env=env,
    learning_rate=1e-4,
    n_steps=10,
    batch_size=512,
    gamma=0.999,
    verbose=1,
    tensorboard_log="./Curriculum_learning_v1/RL_BC_test",
)
checkpoint_callback = CheckpointCallback(
    save_freq=5000,
    save_path='./Curriculum_learning_v1/RL_BC_test',
    name_prefix='rl_model',
)

In [None]:
# Imitation learning only: behavior-clone the policy of `model` on expert data,
# logging the per-epoch average loss to TensorBoard and saving every 100 epochs.
import pickle
from torch.utils.tensorboard import SummaryWriter
import gc
tensorboard_writer = SummaryWriter(log_dir="./Curriculum_learning_v1/BC_baseline/BC_logs")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load expert data that you generated yourself or otherwise trust.
with open('./Expert_Data_Folder/Version2/Insert_20T_100N_0N_1R.pkl', 'rb') as f:
    expert_data = pickle.load(f)

observations, actions = zip(*expert_data)
observations = np.array(observations)
actions = np.array(actions)
# The whole dataset is placed on `device` up front, so batches need no transfer.
observations_tensor = torch.tensor(observations, dtype=torch.float32, device=device)
actions_tensor = torch.tensor(actions, dtype=torch.long, device=device)

dataset = TensorDataset(observations_tensor, actions_tensor)
# Drop the extra references; `dataset` keeps the tensors it needs.
del observations, actions, observations_tensor, actions_tensor
gc.collect()
torch.cuda.empty_cache()
seed=66
torch.manual_seed(seed)

print("Behavior cloning training...\n")
Policy = model.policy.to(device)
data_loader = DataLoader(dataset, batch_size=256, shuffle=True)

optimizer = Adam(model.policy.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
# Number of action dimensions; each one is trained as its own classification head.
action_len = env.action_space.shape[0]
# Divide by the real batch count. The enumerate index is 0-based, so using it as
# the divisor overstated the average and divided by zero for a single batch.
num_batches = max(len(data_loader), 1)
batch_idx = 0
# Epoch numbering starts at 801; record_epoch is the TensorBoard step and is
# incremented before each log, so it runs one ahead of `epoch`.
record_epoch = 801
try:
    for epoch in range(801,2000):
        record_epoch += 1
        # Accumulate Python floats so no autograd history is kept across batches.
        loss_sum = 0.0

        for batch_idx, (batch_obs, batch_actions) in enumerate(data_loader):
            dist = Policy.get_distribution(batch_obs)
            loss = 0
            for i in range(action_len):
                action_logits = dist.distribution[i].logits
                loss += criterion(action_logits, batch_actions[:, i])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item()

        loss_avg = loss_sum / num_batches
        print(f"Average Loss in {epoch} episode is {loss_avg}\n")
        tensorboard_writer.add_scalar("Loss", loss_avg, record_epoch)
        if epoch%100 == 0:
            model.save("./Curriculum_learning_v1/BC_baseline/"+"bc_model"+str(epoch))
finally:
    # Close the writer and release memory even if training is interrupted.
    tensorboard_writer.close()
    del data_loader
    gc.collect()
    torch.cuda.empty_cache()

In [None]:
# Alternate PPO training with behavior cloning: each round runs RL for 15000
# timesteps, then 200 epochs of cloning on the expert data, then saves the model.
import pickle
from torch.utils.tensorboard import SummaryWriter
import gc
tensorboard_writer = SummaryWriter(log_dir="./Curriculum_learning_v1/RL_BC_test/BC_logs")
# Global TensorBoard step for the cloning loss, shared across all rounds.
record_epoch = 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load expert data that you generated yourself or otherwise trust.
with open('./Expert_Data_Folder/Version3/Relative_Insert_200T_50N_1nr_1rr.pkl', 'rb') as f:
    expert_data = pickle.load(f)

observations, actions = zip(*expert_data)
observations = np.array(observations)
actions = np.array(actions)
# The whole dataset is placed on `device` up front, so batches need no transfer.
observations_tensor = torch.tensor(observations, dtype=torch.float32, device=device)
actions_tensor = torch.tensor(actions, dtype=torch.long, device=device)

dataset = TensorDataset(observations_tensor, actions_tensor)
# Drop the extra references; `dataset` keeps the tensors it needs.
del observations, actions, observations_tensor, actions_tensor
gc.collect()
torch.cuda.empty_cache()
seed=66
torch.manual_seed(seed)

criterion = nn.CrossEntropyLoss()
# Number of action dimensions; each one is trained as its own classification head.
action_len = env.action_space.shape[0]

try:
    for train_round in range(100):

        print("RL training...\n")
        model.learn(total_timesteps=int(15000), progress_bar=True,callback=checkpoint_callback,reset_num_timesteps=False)

        print("Behavior cloning training...\n")
        Policy = model.policy.to(device)
        data_loader = DataLoader(dataset, batch_size=1024, shuffle=True)
        # Divide by the real batch count. The enumerate index is 0-based, so
        # using it as the divisor overstated the average and divided by zero
        # for a single batch.
        num_batches = max(len(data_loader), 1)

        # A new Adam instance is created every round, so its moment estimates
        # start from scratch for each cloning phase.
        optimizer = Adam(model.policy.parameters(), lr=1e-4)
        batch_idx = 0

        for epoch in range(200):
            record_epoch += 1
            # Accumulate Python floats so no autograd history is kept across batches.
            loss_sum = 0.0

            for batch_idx, (batch_obs, batch_actions) in enumerate(data_loader):
                dist = Policy.get_distribution(batch_obs)
                loss = 0
                for i in range(action_len):
                    action_logits = dist.distribution[i].logits
                    loss += criterion(action_logits, batch_actions[:, i])
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                loss_sum += loss.item()

            loss_avg = loss_sum / num_batches
            print(f"Average Loss in {epoch} episode is {loss_avg}\n")
            tensorboard_writer.add_scalar("Loss", loss_avg, record_epoch)
        model.save("./Curriculum_learning_v1/RL_BC_test/"+"bc_model"+str(train_round))
        del data_loader  # drop the DataLoader to release memory
        gc.collect()
        torch.cuda.empty_cache()  # free cached CUDA memory for the next RL phase
finally:
    # Close the writer even if a round is interrupted.
    tensorboard_writer.close()

In [None]:
# Reinforcement learning only: continue PPO on the current model without
# resetting its timestep counter, then save the final result.
model.learn(
    total_timesteps=1000000,
    callback=checkpoint_callback,
    reset_num_timesteps=False,
    progress_bar=True,
)
model.save("./Curriculum_learning_v1/RL_BC_test/rl_model_final")

In [None]:
# Save the current model as "rl_model_final" under ./First_RL_expert_insert/.
model.save("./First_RL_expert_insert/"+"rl_model_final")

In [None]:
# Roll out the current model with deterministic actions, rendering each step
# and starting a new episode whenever the current one ends.
obs, info = wrapped_env.reset()
print(obs)
for i in range(10000):
    action, _state = model.predict(obs, deterministic=True)
    step_result = wrapped_env.step(action)
    obs, reward, terminated, truncated, info = step_result
    wrapped_env.render()
    if not (terminated or truncated):
        continue
    obs, info = wrapped_env.reset()

In [None]:
def low_pass_filter(prev_action, new_action, alpha=0.3):
    """
    Blend a new action with the previous one (exponential smoothing).

    prev_action: action applied on the previous step
    new_action: action proposed for the current step
    alpha: smoothing factor, the weight given to new_action;
           prev_action is weighted by (1 - alpha)

    Returns alpha * new_action + (1 - alpha) * prev_action.
    """
    fresh_part = alpha * new_action
    carried_part = (1 - alpha) * prev_action
    return fresh_part + carried_part

# Roll out the model while smoothing its actions with low_pass_filter.
# Start with no previous action so the first step passes through unfiltered.
# NOTE(review): the cloning cells treat each action dimension as categorical
# logits, so the policy presumably outputs integer indices; filtering would
# then produce non-integer values — confirm the environment accepts them.
obs, info = env.reset()
prev_action = None

for i in range(10000):
    current_action, _state = model.predict(obs, deterministic=True)

    # Blend with the previous (already filtered) action when one exists.
    filtered_action = (
        current_action
        if prev_action is None
        else low_pass_filter(prev_action, current_action)
    )
    prev_action = filtered_action
    print(filtered_action)

    step_result = env.step(filtered_action)
    obs, reward, terminated, truncated, info = step_result
    print(info)
    env.render()

    if terminated or truncated:
        obs, info = env.reset()
        prev_action = None  # clear the filter state for the new episode