# In [3]:
import logging
import numpy as np
import pandas as pd
import random
from gym import spaces
import gym
from collections import defaultdict
# Module-level logger named after this module; nothing in the visible code logs through it.
logger = logging.getLogger(__name__)

class MazeEnv(gym.Env):
    """Deterministic grid maze with 18 walkable cells and one terminal cell.

    States are the integers 1..18; the pixel centre of each one is stored in
    ``self.x`` / ``self.y`` (and, as a numpy array, in ``self.scdict``).
    Actions are the strings ``'n'``, ``'s'``, ``'e'`` and ``'w'``.  ``self.t``
    maps ``"<state>_<action>"`` to the next state; a move without an entry
    leaves the agent where it is.  State 16 is terminal, and the two moves
    that enter it (``'11_s'`` and ``'15_e'``) are the only ones with a
    non-zero reward (100.0).

    Besides ``step`` / ``render`` the class bundles an every-visit Monte-Carlo
    evaluation of the uniformly random policy (``gen_random_samples`` ->
    ``mc_evaluate`` -> ``output_best_policy``), which ``reset`` runs on every
    call.
    """

    # Metadata read by gym when rendering / recording the environment.
    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 2
    }

    def __init__(self):
        """Build the state, action, reward and transition tables."""
        super().__init__()

        # State space and the pixel centre of every state (index i <-> state i + 1).
        self.states = list(range(1, 19))
        self.x = [140, 220, 300, 460, 140, 220, 300, 460, 300,
                  380, 460, 140, 220, 300, 380, 460, 140, 220]
        self.y = [460, 460, 460, 460, 380, 380, 380, 380, 300,
                  300, 300, 220, 220, 220, 220, 220, 140, 140]
        self.scdict = {
            state: np.array([px, py])
            for state, px, py in zip(self.states, self.x, self.y)
        }

        # Terminal states are kept in a dict; only membership is ever tested.
        self.terminate_states = {16: 1}

        self.actions = ['n', 's', 'e', 'w']
        self.acdict = dict(enumerate(self.actions))  # action index -> action

        # Rewards keyed by "<state>_<action>"; any other move yields 0.
        self.rewards = {'11_s': 100.0, '15_e': 100.0}

        # Transitions keyed by "<state>_<action>"; a missing key means the move
        # is blocked and the agent stays in place.
        self.t = {
            '1_s': 5, '1_e': 2,
            '2_w': 1, '2_e': 3, '2_s': 6,
            '3_s': 7, '3_w': 2,
            '4_s': 8,
            '5_n': 1, '5_e': 6,
            '6_e': 7, '6_w': 5, '6_n': 2,
            '7_s': 9, '7_w': 6, '7_n': 3,
            '8_s': 11, '8_n': 4,
            '9_e': 10, '9_s': 14, '9_n': 7,
            '10_s': 15, '10_w': 9, '10_e': 11,
            '11_w': 10, '11_s': 16, '11_n': 8,
            '12_e': 13, '12_s': 17,
            '13_e': 14, '13_s': 18, '13_w': 12,
            '14_e': 15, '14_w': 13, '14_n': 9,
            '15_e': 16, '15_w': 14, '15_n': 10,
            '17_n': 12, '17_e': 18,
            '18_n': 13, '18_w': 17,
        }

        self.gamma = 0.8         # discount factor
        self.viewer = None       # created lazily by render()
        self.state = None        # current state; set by reset()/myreset()/setAction()
        self.mc_policy = None

    def getTerminal(self):
        """Return the dict of terminal states."""
        return self.terminate_states

    def getGamma(self):
        """Return the discount factor."""
        return self.gamma

    def getStates(self):
        """Return the list of states."""
        return self.states

    def getAction(self):
        """Return the list of available actions."""
        return self.actions

    def getTerminate_states(self):
        """Return the dict of terminal states (same object as ``getTerminal``)."""
        return self.getTerminal()

    def setAction(self, s):
        """Set the current *state* to ``s`` (the name is historical)."""
        self.state = s

    def gen_random_samples(self, num):
        """Roll out ``num`` episodes under a uniformly random policy.

        Every episode starts in a uniformly chosen state and runs until
        ``transform`` reports a terminal step.

        Returns:
            ``(state_sample, action_sample, reward_sample)``: three lists with
            one inner list per episode, aligned step by step.
        """
        state_sample = []
        action_sample = []
        reward_sample = []
        for _episode in range(num):
            visited, taken, earned = [], [], []
            s = self.states[int(random.random() * len(self.states))]
            is_done = False
            while not is_done:
                a = np.random.choice(self.actions)
                s_next, r, is_done, _info = self.transform(s, a)
                visited.append(s)
                taken.append(a)
                earned.append(r)
                s = s_next
            state_sample.append(visited)
            action_sample.append(taken)
            reward_sample.append(earned)
        return state_sample, action_sample, reward_sample

    def mc_evaluate(self, state_sample, action_sample, reward_sample):
        """Estimate action values by every-visit Monte-Carlo averaging.

        Args:
            state_sample, action_sample, reward_sample: per-episode lists as
                produced by ``gen_random_samples``.

        Returns:
            ``defaultdict(float)`` mapping ``"<state>_<action>"`` to the mean
            discounted return observed after taking that action in that state.
        """
        v_s_a = defaultdict(float)
        n_s_a = defaultdict(int)
        for states, actions, rewards in zip(state_sample, action_sample,
                                            reward_sample):
            # Discounted return from every step, computed back to front.  This
            # avoids dividing by gamma (undefined for gamma == 0, and it
            # amplifies rounding error on long episodes).
            returns = [0.0] * len(rewards)
            G = 0.0
            for step in range(len(rewards) - 1, -1, -1):
                G = rewards[step] + self.gamma * G
                returns[step] = G

            # Running mean over every visit of each (state, action) pair.
            for s, a, g in zip(states, actions, returns):
                key = '%d_%s' % (s, a)
                n_s_a[key] += 1
                v_s_a[key] += (g - v_s_a[key]) / n_s_a[key]
        return v_s_a

    def output_best_policy(self, v_s_a):
        """Return the greedy action for every state as a ``pd.Series``.

        Args:
            v_s_a: mapping ``"<state>_<action>"`` -> value; missing pairs count
                as 0.0.

        Ties are resolved in favour of the action listed first in
        ``self.actions``.
        """
        # dtype=object: the series stores action strings, not numbers.
        best_policy = pd.Series(index=self.states, dtype=object)
        for state in self.states:
            best_policy[state] = max(
                self.actions,
                key=lambda action: v_s_a.get('%d_%s' % (state, action), 0.0),
            )
        return best_policy

    def transform(self, state, action):
        """Apply ``action`` in ``state`` without touching ``self.state``.

        Returns:
            ``(next_state, reward, is_terminal, info)``.  From a terminal state
            the result is ``(state, 0, True, {})``.
        """
        if state in self.terminate_states:
            return state, 0, True, {}
        key = "%d_%s" % (state, action)   # "<state>_<action>" lookup key

        next_state = self.t.get(key, state)   # blocked move: stay in place
        r = self.rewards.get(key, 0.0)
        is_terminal = next_state in self.terminate_states
        return next_state, r, is_terminal, {}

    def step(self, action):
        """Apply ``action`` to the current state and advance the environment.

        Returns:
            ``(next_state, reward, is_terminal, info)`` exactly as
            ``transform`` does for ``self.state``.

        Raises:
            TypeError: if no state has been set yet (``self.state`` is None).
        """
        next_state, r, is_terminal, info = self.transform(self.state, action)
        self.state = next_state
        return next_state, r, is_terminal, info

    def reset(self):
        """Pick a random start state and re-estimate a greedy policy.

        NOTE: unlike the standard gym API this returns a 3-tuple
        ``(state, best_policy, pixel_coordinates)``, and it runs 100 random
        Monte-Carlo episodes on every call.
        """
        self.state = self.states[int(random.random() * len(self.states))]
        state_sample, action_sample, reward_sample = self.gen_random_samples(100)
        Q = self.mc_evaluate(state_sample, action_sample, reward_sample)
        best_policy = self.output_best_policy(Q)
        return self.state, best_policy, self.scdict[self.state]

    def myreset(self):
        """Reset to state 1 and return its pixel coordinates."""
        self.state = 1
        return self.scdict[self.state]

    def seed(self, seed=None):
        """Create ``self.np_random`` from ``seed`` and return ``[seed]``.

        NOTE(review): the sampling code in this class draws from the global
        ``random`` / ``np.random`` generators, not from ``self.np_random``, so
        this call does not make rollouts reproducible on its own.
        """
        # Imported locally: the file's top-level imports do not provide it.
        from gym.utils import seeding
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def _build_viewer(self, screen_width, screen_height):
        """Create the viewer and add grid lines, cell markers and the robot."""
        from gym.envs.classic_control import rendering
        viewer = rendering.Viewer(screen_width, screen_height)

        # Grid: six horizontal lines (line1..line6), then six vertical ones
        # (line7..line12).
        segments = [((100, y), (500, y)) for y in (260, 180, 100, 340, 420, 500)]
        segments += [((x, 500), (x, 100)) for x in (100, 180, 260, 340, 420, 500)]
        for number, (start, end) in enumerate(segments, 1):
            line = rendering.Line(start, end)
            line.set_color(0, 0, 0)
            setattr(self, 'line%d' % number, line)
            viewer.add_geom(line)

        # Cell markers: black circles fill the cells that have no state; the
        # yellow one sits on state 16.  Attribute names are kept as they were.
        black = (0, 0, 0)
        markers = [
            ('kulo1', (380, 460), black),
            ('kulo2', (380, 380), black),
            ('gold1', (140, 300), black),
            ('gold2', (220, 300), black),
            ('gold3', (300, 140), black),
            ('gold4', (380, 140), black),
            ('gold5', (460, 140), black),
            ('gold6', (460, 220), (1, 0.9, 0)),
        ]
        for name, centre, colour in markers:
            circle = rendering.make_circle(40)
            self.circletrans = rendering.Transform(translation=centre)
            circle.add_attr(self.circletrans)
            circle.set_color(*colour)
            setattr(self, name, circle)
            viewer.add_geom(circle)

        # The robot is added last so it is drawn on top of everything else.
        self.robot = rendering.make_circle(30)
        self.robotrans = rendering.Transform()
        self.robot.add_attr(self.robotrans)
        self.robot.set_color(0.8, 0.6, 0.4)
        viewer.add_geom(self.robot)
        return viewer

    def render(self, mode='human', close=False):
        """Draw the maze and the robot at the current state.

        Args:
            mode: ``'human'`` or ``'rgb_array'``; the latter makes the viewer
                return the frame as an array.
            close: when true, close the viewer (if any) and return ``None``.

        Returns:
            Whatever the viewer's ``render`` returns, or ``None`` when closing
            or when no state has been set yet.
        """
        if close:
            if self.viewer is not None:
                self.viewer.close()
                self.viewer = None
            return
        screen_width = 600
        screen_height = 600

        if self.viewer is None:
            # Assigned only once the scene is complete, so a failure while
            # building it does not leave a half-initialised viewer behind.
            self.viewer = self._build_viewer(screen_width, screen_height)

        if self.state is None:
            return None
        # States are 1-based, the coordinate lists 0-based.
        self.robotrans.set_translation(self.x[self.state - 1],
                                       self.y[self.state - 1])

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

