In [7]:
import gym

# Create the CartPole environment. With render_mode="human" the environment
# draws each frame itself during step() (see the Gym render_mode docs), so no
# explicit env.render() call is made below.
env = gym.make('CartPole-v1', render_mode="human")

try:
    # Run several episodes driven by a purely random policy.
    for episode in range(10):
        # reset() returns (observation, info); unpack it instead of keeping the tuple.
        state, info = env.reset()
        terminated = False
        truncated = False
        step_count = 0

        # An episode ends either because the task failed (terminated) or
        # because the environment cut it short (truncated); honour both.
        while not (terminated or truncated):
            action = env.action_space.sample()
            state, reward, terminated, truncated, info = env.step(action)
            step_count += 1

        print(f"Episode {episode + 1} finished after {step_count} steps")
finally:
    # Release the window/resources even if an episode raises.
    env.close()


Episode 1 finished after 20 steps
Episode 2 finished after 28 steps
Episode 3 finished after 14 steps
Episode 4 finished after 11 steps
Episode 5 finished after 49 steps
Episode 6 finished after 11 steps
Episode 7 finished after 9 steps
Episode 8 finished after 35 steps
Episode 9 finished after 40 steps
Episode 10 finished after 44 steps


测试Git

In [15]:
import gym
import numpy as np

def run_episode(env, parameters, max_steps=300):
    """Play one episode with a linear threshold policy and return its total reward.

    At every step the action is 0 when ``np.matmul(parameters, observation)``
    is negative and 1 otherwise.

    Args:
        env: Environment whose ``reset()`` returns a sequence with the initial
            observation first, and whose ``step(action)`` returns the 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        parameters: Weight vector multiplied with the observation.
        max_steps: Maximum number of steps to take in the episode
            (defaults to 300, the previously hard-coded cap).

    Returns:
        The sum of the rewards collected until the episode ended or
        ``max_steps`` was reached.
    """
    observation = env.reset()[0]
    total_reward = 0
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        observation, reward, terminated, truncated, _info = env.step(action)
        total_reward += reward
        # A truncated episode is over as well; stepping it further without a
        # reset would accumulate reward from an already finished episode.
        if terminated or truncated:
            break
    return total_reward

def train(max_updates=10000, episodes_per_update=10, noise_scaling=0.1, target_reward=290.0):
    """Search for linear-policy weights on CartPole-v1 by random hill climbing.

    Each update perturbs the best weights found so far (or a fresh random
    vector in [-1, 1) if none exist yet) with uniform noise, evaluates the
    candidate over several episodes via ``run_episode`` and keeps it only if
    its average reward beats the best average seen so far.

    Args:
        max_updates: Maximum number of candidates to evaluate.
        episodes_per_update: Episodes averaged to score each candidate.
        noise_scaling: Scale applied to the uniform [-1, 1) perturbation.
        target_reward: Average reward at which the search stops early.

    Returns:
        The best weight vector found (a length-4 NumPy array), or ``None`` if
        no candidate ever scored above 0.
    """
    env = gym.make("CartPole-v1")
    best_params = None
    best_reward = 0

    try:
        for _ in range(max_updates):
            base = best_params if best_params is not None else np.random.rand(4) * 2 - 1
            # Build the candidate as a NEW array. An in-place `+=` on `base`
            # would alias `best_params` and overwrite the stored best weights
            # even when the perturbed candidate turns out to be worse.
            parameters = base + (np.random.rand(4) * 2 - 1) * noise_scaling

            reward_sum = sum(run_episode(env, parameters) for _ in range(episodes_per_update))
            average_reward = reward_sum / episodes_per_update

            if average_reward > best_reward:
                best_reward = average_reward
                best_params = parameters
                if average_reward >= target_reward:
                    print(f"average reward: {average_reward}")
                    break
    finally:
        # Always release the environment, including when evaluation raises.
        env.close()

    return best_params

# Search for good policy weights.
best_parameters = train()

# Evaluate the returned weights for one episode on a fresh environment and
# release that environment afterwards, even if the episode raises.
env = gym.make("CartPole-v1")
try:
    total_reward = run_episode(env, best_parameters)
finally:
    env.close()
print(f"Total reward: {total_reward}")


average reward: 295.3
Total reward: 300.0


In [7]:
import numpy as np
import gym
import time

def get_action(weights, observation):
    """Pick an action from a linear decision rule.

    The first four entries of ``weights`` are combined with ``observation``
    through a dot product and the fifth entry is added as a bias term.

    Returns:
        1 when the weighted sum is greater than or equal to zero, otherwise 0.
    """
    linear_term = np.dot(weights[:4], observation)
    score = linear_term + weights[4]
    return 1 if score >= 0 else 0

def get_sum_reward_by_weights(env, weights, max_steps=1000):
    """Run one episode controlled by ``weights`` and return the total reward.

    Actions are chosen by ``get_action(weights, observation)`` at every step.

    Args:
        env: Environment whose ``reset()`` returns a sequence with the initial
            observation first, and whose ``step(action)`` returns the 5-tuple
            ``(observation, reward, terminated, truncated, info)``.
        weights: Five values passed to ``get_action``.
        max_steps: Maximum number of steps to take in the episode
            (defaults to 1000, the previously hard-coded cap).

    Returns:
        The sum of the rewards collected until the episode ended or
        ``max_steps`` was reached.
    """
    observation = env.reset()[0]  # start from a fresh initial state
    sum_reward = 0
    for _ in range(max_steps):
        action = get_action(weights, observation)
        observation, reward, terminated, truncated, _info = env.step(action)
        sum_reward += reward
        # Stop on truncation as well as termination: once the environment
        # reports the episode as cut short (e.g. by its time limit), further
        # steps would add reward from an episode that is already over.
        if terminated or truncated:
            break
    return sum_reward


def get_weights_by_random_guess():
    """Return a fresh random guess: five weights drawn uniformly from [0, 1)."""
    n_weights = 5
    return np.random.rand(n_weights)

def get_weights_by_hill_climbing(best_weights):
    """Propose hill-climbing candidate weights near the current best.

    Five samples of Gaussian noise (mean 0, standard deviation 0.1) are added
    to ``best_weights``; the input itself is left unmodified.
    """
    perturbation = np.random.normal(loc=0, scale=0.1, size=5)
    return best_weights + perturbation

def get_best_result(algo="random_guess", max_iters=10000, reward_threshold=200):
    """Search for CartPole-v1 control weights and report the best ones found.

    Each iteration proposes candidate weights (by hill climbing around the
    current best when ``algo == "hill_climbing"``, otherwise by an independent
    random guess), scores them with ``get_sum_reward_by_weights`` and keeps
    them if they beat the best reward so far. The search stops as soon as the
    best reward reaches ``reward_threshold``.

    Args:
        algo: ``"hill_climbing"`` or any other value for random guessing.
        max_iters: Maximum number of candidates to evaluate.
        reward_threshold: Best reward at which the search stops early.

    Returns:
        A ``(best_reward, best_weights)`` tuple. The last iteration index, the
        best reward and the best weights are also printed.
    """
    env = gym.make("CartPole-v1")
    # Seeds NumPy's global generator so the proposed weights are repeatable.
    # NOTE(review): env.reset() is called without a seed downstream, so the
    # episode outcomes may still differ between runs - confirm if that matters.
    np.random.seed(10)
    best_reward = 0
    best_weights = np.random.rand(5)  # random starting weights
    iteration = 0  # keeps the final print valid even when max_iters is 0

    try:
        for iteration in range(max_iters):
            if algo == "hill_climbing":
                cur_weights = get_weights_by_hill_climbing(best_weights)
            else:
                cur_weights = get_weights_by_random_guess()

            # Total reward obtained by the candidate weights in one episode.
            cur_sum_reward = get_sum_reward_by_weights(env, cur_weights)

            # Keep the candidate only when it improves on the best so far.
            if cur_sum_reward > best_reward:
                best_reward = cur_sum_reward
                best_weights = cur_weights

            if best_reward >= reward_threshold:
                break
    finally:
        # Release the environment even if an evaluation raises.
        env.close()

    print(iteration, best_reward, best_weights)
    return best_reward, best_weights

# Entry point: execution of this cell starts here.
print(get_best_result("hill_climbing")) # Run the hill-climbing search and print the (best_reward, best_weights) tuple it returns

# env = gym.make("CartPole-v0")
# get_sum_reward_by_weights(env, [0.22479665, 0.19806286, 0.76053071, 0.16911084, 0.08833981])

34 206.0 [ 0.70975396 -0.30061841  0.72104549  1.16557572 -0.00898724]
(206.0, array([ 0.70975396, -0.30061841,  0.72104549,  1.16557572, -0.00898724]))
