In [34]:
import numpy as np
from enum import Enum

class State():
    """A cell position on the grid, identified by its row and column.

    Instances compare and hash by (row, column), so they can be used as
    dictionary keys. The attributes are plain mutable fields: do not change
    them while the instance is stored as a key, or the lookup will break.
    """
    def __init__(self,row=-1,column=-1):
        self.row=row
        self.column=column
    def __repr__(self):
        return "state: {} {}".format(self.row,self.column)
    def clone(self):
        """Return a new, independent State with the same coordinates."""
        return State(self.row,self.column)
    def __hash__(self):
        # Kept consistent with __eq__: equal states produce equal hashes.
        return hash((self.row,self.column))
    def __eq__(self,other):
        # Hand foreign types back to Python's default comparison instead of
        # raising AttributeError when `other` has no row/column attribute
        # (e.g. `state == None`).
        if not isinstance(other,State):
            return NotImplemented
        return self.row==other.row and self.column==other.column
    
class Action(Enum):
    """The four moves an agent can make on the grid.

    Each direction's value is the negative of its opposite direction's value;
    Env.transit_func uses `Action(action.value*-1)` to look up the reverse of
    an action, so the values must keep that property.
    """
    # Vertical pair: up and down negate each other.
    up=1
    down=-1
    # Horizontal pair: left and right negate each other.
    left=2
    right=-2

class Env():
    """Stochastic grid-world environment.

    Cell codes read by this class: 0 is a cell the agent can act from,
    9 is a blocked cell that can never be entered, and 1 / -1 are cells that
    end the episode with reward 1 / -1.
    """
    def __init__(self,grid,move_prob=0.8):
        """Store the grid and the motion model, then place the agent.

        grid: 2-D sequence of cell codes, indexed as grid[row][column].
        move_prob: probability of actually moving in the chosen direction.
        """
        self.grid=grid
        self.agent_state=State()
        self.default_reward=-0.1
        self.move_prob=move_prob
        self.reset()

    @property
    def row_length(self):
        """Number of rows in the grid."""
        return len(self.grid)

    @property
    def column_length(self):
        """Number of columns, measured on the first row."""
        return len(self.grid[0])

    @property
    def actions(self):
        """All actions, in the order up, down, left, right."""
        return [Action.up,Action.down,Action.left,Action.right]

    @property
    def states(self):
        """Every non-blocked cell as a State, in row-major order."""
        return [
            State(r,c)
            for r in range(self.row_length)
            for c in range(self.column_length)
            if self.grid[r][c]!=9
        ]

    def transit_func(self,state,action):
        """Return {next_state: probability} for taking `action` in `state`.

        The chosen direction gets move_prob, the opposite direction gets 0,
        and the two remaining directions get (1-move_prob)/2 each. Moves that
        end in the same cell have their probabilities summed. An empty dict
        is returned when no action is possible from `state`.
        """
        if not self.can_action_at(state):
            return {}
        reverse=Action(action.value*-1)

        distribution={}
        for candidate in self.actions:
            if candidate==action:
                weight=self.move_prob
            elif candidate!=reverse:
                weight=(1-self.move_prob)/2
            else:
                weight=0

            landing=self._move(state,candidate)
            # Several actions may land on the same cell (e.g. bumping a wall).
            distribution[landing]=(
                distribution[landing]+weight if landing in distribution else weight
            )
        return distribution

    def can_action_at(self,state):
        """Return True only when the cell at `state` has code 0."""
        return bool(self.grid[state.row][state.column]==0)

    def _move(self,state,action):
        """Return the cell reached by deterministically applying `action`.

        The agent stays on `state` (the same object is returned) when the
        move would leave the grid or enter a blocked cell.
        Raises Exception when no action is possible from `state`.
        """
        if not self.can_action_at(state):
            raise Exception("cant move from here")
        target=state.clone()

        # Shift the copied position one cell in the requested direction.
        if action==Action.up:
            target.row-=1
        elif action==Action.down:
            target.row+=1
        elif action==Action.left:
            target.column-=1
        elif action==Action.right:
            target.column+=1

        # Positions outside the grid are not allowed: stay put.
        inside=(0<=target.row<self.row_length) and (0<=target.column<self.column_length)
        if not inside:
            return state
        # Blocked cells cannot be entered either.
        if self.grid[target.row][target.column]==9:
            return state
        return target

    def reward_func(self,state):
        """Return (reward, done) for arriving at `state`."""
        fallback=self.default_reward
        cell=self.grid[state.row][state.column]
        if cell==1:
            return 1,True
        if cell==-1:
            return -1,True
        return fallback,False

    def reset(self):
        """Put the agent on the bottom-left cell and return that state."""
        start=State(self.row_length-1,0)
        self.agent_state=start
        return start

    def step(self,action):
        """Apply `action` to the agent and return (next_state, reward, done).

        The agent's position is left untouched when no transition exists.
        """
        landed,reward,done=self.transit(self.agent_state,action)
        if landed is None:
            return landed,reward,done
        self.agent_state=landed
        return landed,reward,done

    def transit(self,state,action):
        """Sample one transition; return (next_state, reward, done).

        Returns (None, None, True) when no action is possible from `state`.
        """
        distribution=self.transit_func(state,action)
        if not distribution:
            return None,None,True
        candidates=list(distribution.keys())
        weights=list(distribution.values())
        # Sampling is random, so a low-probability outcome can be drawn here.
        chosen=np.random.choice(candidates,p=weights)

        reward,done=self.reward_func(chosen)
        return chosen,reward,done
    

In [30]:
import random

class Agent():
    """Agent that ignores the current state and acts at random."""
    def __init__(self,env):
        # Read the environment's action list once, at construction time.
        self.actions=env.actions

    def policy(self):
        """Return one of the stored actions, drawn with random.choice."""
        return random.choice(self.actions)


In [38]:
# Seed both random sources so that Restart & Run All reproduces the same
# episodes: Agent.policy draws from `random`, Env.transit from `np.random`.
SEED=0
random.seed(SEED)
np.random.seed(SEED)

# Number of episodes to simulate.
N_EPISODES=10

# Cell codes: 0 = free cell, 9 = blocked cell, 1 / -1 = terminal cells.
grid=[[0,0,0,1],
     [0,9,0,-1],
     [0,0,0,0]]

env=Env(grid)
agent=Agent(env)

for i in range(N_EPISODES):
    state=env.reset()
    total_reward=0
    done=False
    print(state)
    while not done:
        action=agent.policy()
        next_state,reward,done=env.step(action)
        # env.step returns reward=None when the agent cannot act from its
        # current cell; skip the sum then instead of raising TypeError.
        if reward is not None:
            total_reward+=reward
        state=next_state
        print("now_state={}  <-selected={},total_reward={}".format(state,action,total_reward))
    print("{}:{}:{}".format(i,total_reward,done))
    

state: 2 0
now_state=state: 2 0  <-selected=Action.left,total_reward=-0.1
now_state=state: 2 0  <-selected=Action.down,total_reward=-0.2
now_state=state: 2 1  <-selected=Action.down,total_reward=-0.30000000000000004
now_state=state: 2 1  <-selected=Action.left,total_reward=-0.4
now_state=state: 2 2  <-selected=Action.up,total_reward=-0.5
now_state=state: 2 1  <-selected=Action.left,total_reward=-0.6
now_state=state: 2 1  <-selected=Action.down,total_reward=-0.7
now_state=state: 2 1  <-selected=Action.down,total_reward=-0.7999999999999999
now_state=state: 2 1  <-selected=Action.down,total_reward=-0.8999999999999999
now_state=state: 2 1  <-selected=Action.up,total_reward=-0.9999999999999999
now_state=state: 2 0  <-selected=Action.left,total_reward=-1.0999999999999999
now_state=state: 1 0  <-selected=Action.up,total_reward=-1.2
now_state=state: 1 0  <-selected=Action.right,total_reward=-1.3
now_state=state: 1 0  <-selected=Action.left,total_reward=-1.4000000000000001
now_state=state: 2 0 