### 环境模块gym

In [22]:
import gym
import collections
import numpy as np

In [17]:
# Build the FrozenLake environment used for the interactive demo below.
# is_slippery=False is presumably what makes every move deterministic (the info
# dicts printed below report a transition probability of 1) -- confirm against
# the gym docs. render_mode="ansi" goes with the print(env.render()) calls that
# display the board as text.
env = gym.make('FrozenLake-v1',is_slippery=False,render_mode="ansi")

In [18]:
# Start a new episode. The result is a 2-tuple whose first element is the
# start state (0 here) and whose second element is an info dict.
env.reset()

(0, {'prob': 1})

In [19]:
# render() yields the board as text here, so it is printed explicitly.
# The colour-highlighted cell appears to mark the agent's current position.
print(env.render())


[41mS[0mFFF
FHFH
FFFH
HFFG



In [20]:
# Take action 2, which the next render labels "(Right)".
# The 5-tuple returned is presumably (observation, reward, terminated,
# truncated, info) -- verify against the installed gym version.
env.step(2)

(1, 0.0, False, False, {'prob': 1.0})

In [21]:
# Render again to confirm that the highlighted cell moved one column to the right.
print(env.render())

  (Right)
S[41mF[0mFF
FHFH
FFFH
HFFG



### 基于Q学习的二维世界

In [16]:
# --- Q-learning hyper-parameters (read by Agent) ---
ALPHA = 0.1    # learning rate: weight given to the new estimate when it is blended into the stored Q-value
GAMMA = 0.5    # discount factor applied to the best Q-value of the next state
EPSILON = 0.7  # threshold for the uniform draw in Agent.feedback_env; a draw above it triggers a random action

# --- Evaluation ---
TEST_EPISODES = 30  # number of test episodes averaged after every training step

In [33]:
class Agent:
    """Tabular Q-learning agent for the FrozenLake-v1 environment.

    Attributes:
        env: the training environment owned by the agent.
        state: the state the agent currently occupies in ``env``.
        values: Q-table mapping ``(state, action)`` to the current value
            estimate; pairs that have never been written read as 0.0.

    The hyper-parameters ``GAMMA``, ``ALPHA`` and ``EPSILON`` are read from
    module-level constants at call time.
    """

    def __init__(self):
        self.env = gym.make('FrozenLake-v1',is_slippery=False,render_mode="ansi")
        # reset() returns (observation, info); only the observation is kept.
        self.state = self.env.reset()[0]
        self.values = collections.defaultdict(float)

    def feedback_env(self):
        """Advance the training environment by one step.

        Returns:
            ``(old_state, action, reward, new_state)`` for the transition that
            was just taken. When the episode ended on this step the agent's
            own ``state`` is already reset to a fresh start state, but the
            returned ``new_state`` is still the state the step led to.
        """
        old_state = self.state
        # NOTE(review): a random action is taken when the draw EXCEEDS EPSILON,
        # so EPSILON acts as the probability of exploiting, not of exploring
        # (the reverse of the usual epsilon-greedy convention). Kept as-is;
        # confirm this is intended.
        if np.random.uniform() > EPSILON:
            action = self.env.action_space.sample()
        else:
            _, action = self.best_value_and_action(old_state)
        new_state, reward, terminated, truncated, _ = self.env.step(action)
        # An episode is over both when a terminal state is reached and when
        # the environment truncates it; either way a new episode must start.
        episode_over = terminated or truncated
        self.state = self.env.reset()[0] if episode_over else new_state
        return (old_state, action, reward, new_state)

    def best_value_and_action(self, state):
        """Return ``(value, action)`` of the highest-valued action in ``state``.

        If the highest value is exactly 0, one of the actions sharing that
        maximum is picked uniformly at random, so unexplored states do not
        always yield the first action. Otherwise the first action reaching
        the maximum is returned.
        """
        n_actions = self.env.action_space.n
        best_value, best_action = None, None
        # Scan every action available in the training environment.
        for action in range(n_actions):
            action_value = self.values[(state, action)]
            if best_value is None or best_value < action_value:
                best_value = action_value
                best_action = action
        if best_value == 0:
            # Break the tie only among actions that actually attain the
            # maximum, so a lower-valued action is never reported as best.
            tied = [a for a in range(n_actions)
                    if self.values[(state, a)] == best_value]
            best_action = tied[np.random.randint(len(tied))]
        return best_value, best_action

    def value_update(self, s, a, r, next_s):
        """Apply one Q-learning update for the transition ``(s, a, r, next_s)``.

        The target is ``r + GAMMA * max_a' Q(next_s, a')``; the stored value
        moves towards it by a fraction ``ALPHA``.
        """
        best_v, _ = self.best_value_and_action(next_s)
        new_val = r + GAMMA * best_v
        old_val = self.values[(s, a)]
        self.values[(s, a)] = old_val * (1-ALPHA) + new_val * ALPHA

    def play_episode(self, env):
        """Run one full episode in ``env`` with the current Q-table.

        Actions come from ``best_value_and_action``; the Q-table is only
        looked up here, never updated. The episode stops when ``env`` reports
        termination or truncation.

        Returns:
            The sum of rewards collected during the episode.
        """
        total_reward = 0.0
        state = env.reset()[0]
        while True:
            _, action = self.best_value_and_action(state)
            new_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # Also stop on truncation; otherwise a policy that never reaches
            # a terminal state would keep this loop running forever.
            if terminated or truncated:
                break
            state = new_state
        return total_reward

In [34]:
# Evaluation runs in its own environment, so test episodes never disturb the
# episode the agent is currently training on.
test_env = gym.make('FrozenLake-v1',is_slippery=False,render_mode="ansi")
agent = Agent()

best_reward = 0.0
iter_no = 0
solved = False
while not solved:
    iter_no += 1

    # One environment step followed by one Q-table update.
    s, a, r, next_s = agent.feedback_env()
    agent.value_update(s, a, r, next_s)

    # Mean return over TEST_EPISODES evaluation episodes.
    reward = 0.0
    for i in range(TEST_EPISODES):
        reward = reward + agent.play_episode(test_env)
    reward = reward / TEST_EPISODES

    if best_reward < reward:
        print("Best reward updated %.3f -> %.3f" % (best_reward, reward))
        best_reward = reward

    # Stop once the mean test return exceeds 0.80.
    solved = reward > 0.80

print("Solved in %d iterations!" % iter_no)


Best reward updated 0.000 -> 0.033
Best reward updated 0.033 -> 0.067
Best reward updated 0.067 -> 0.100
Best reward updated 0.100 -> 0.133
Best reward updated 0.133 -> 0.200
Best reward updated 0.200 -> 0.233
Best reward updated 0.233 -> 0.300
Best reward updated 0.300 -> 0.467
Best reward updated 0.467 -> 0.500
Best reward updated 0.500 -> 0.533
Best reward updated 0.533 -> 0.700
Best reward updated 0.700 -> 0.800
Best reward updated 0.800 -> 0.900
Solved in 948 iterations!


In [35]:
# Inspect the learned Q-table. Keys are (state, action) pairs; the defaultdict
# creates an entry on every lookup, so pairs still at 0.0 were queried but have
# not accumulated any value yet.
agent.values

defaultdict(float,
            {(0, 0): 0.0,
             (0, 1): 0.0,
             (0, 2): 0.0,
             (0, 3): 0.0,
             (4, 0): 0.0,
             (4, 1): 0.0,
             (4, 2): 0.0,
             (4, 3): 0.0,
             (8, 0): 0.0,
             (8, 1): 0.0,
             (8, 2): 1.2500000000000004e-05,
             (8, 3): 0.0,
             (9, 0): 0.0,
             (9, 1): 0.0,
             (9, 2): 0.0015325000000000004,
             (9, 3): 0.0,
             (13, 0): 0.0,
             (13, 1): 0.0,
             (13, 2): 0.0,
             (13, 3): 0.0,
             (14, 0): 0.0,
             (14, 1): 0.0,
             (14, 2): 0.40951000000000004,
             (14, 3): 0.0,
             (10, 0): 0.0,
             (10, 1): 0.04073,
             (10, 2): 0.0,
             (10, 3): 0.0,
             (6, 0): 0.0,
             (6, 1): 0.0013300000000000002,
             (6, 2): 0.0,
             (6, 3): 0.0,
             (2, 0): 0.0,
             (2, 1): 9.8000000000000