# Pythonで学ぶ強化学習

In [10]:
from enum import Enum
import numpy as np
import random
# Import syntax reminder: from <module file name> import <class name>, e.g. "from environment import Environment"

# code1-1

In [14]:
class State():
    """A grid position identified by a row index and a column index.

    Instances compare and hash by ``(row, column)``, which lets them serve as
    dictionary keys.  The attributes are mutable: do not change them while
    the instance is stored in a dict or set, because its hash would change.
    """

    def __init__(self, row=-1, column=-1):
        """Store the coordinates; both default to -1."""
        self.row = row
        self.column = column

    def __repr__(self):
        return "<State: [{},{}]>".format(self.row, self.column)

    def clone(self):
        """Return a new, independent State with the same coordinates."""
        return State(self.row, self.column)

    def __hash__(self):
        # Must stay consistent with __eq__: equal states hash equally.
        return hash((self.row, self.column))

    def __eq__(self, other):
        """Return True when ``other`` is a State at the same position.

        Comparing against any other type (for example ``None``) yields
        ``NotImplemented`` so Python falls back to its default handling and
        the comparison evaluates to False instead of raising AttributeError.
        """
        if not isinstance(other, State):
            return NotImplemented
        return self.row == other.row and self.column == other.column
    
    
    
    
class Action(Enum):
    """The four moves an agent can request.

    Opposite directions carry values that are negatives of each other, so
    negating a member's value and looking it up yields the reverse move.
    """

    UP = 1
    DOWN = -1
    LEFT = 2
    RIGHT = -2


In [25]:
# Enum members can be looked up by value; 2 maps to Action.LEFT.
Action(2)

<Action.LEFT: 2>

# code1-2

In [23]:
class Environment():
    """Grid world in which an agent moves stochastically between cells.

    ``grid`` is a 2d array whose cell values mean:
         0 : ordinary cell
        -1 : damage cell (game end)
         1 : reward cell (game end)
         9 : block cell (can't locate agent)
    """

    def __init__(self, grid, move_prob=0.8):
        """Store the grid and settings, then put the agent on the start cell.

        ``move_prob`` is the probability that the requested action is the
        one actually carried out; see ``transit_func`` for the remainder.
        """
        self.grid = grid
        self.agent_state = State()
        # Reward handed out for every cell that is neither 1 nor -1.
        self.default_reward = -0.04
        self.move_prob = move_prob
        self.reset()

    @property
    def row_length(self):
        """Number of rows in the grid."""
        return len(self.grid)

    @property
    def column_length(self):
        """Number of columns, taken from the first row."""
        return len(self.grid[0])

    @property
    def actions(self):
        """All actions an agent may choose from, as a fresh list."""
        return [
            Action.UP,
            Action.DOWN,
            Action.LEFT,
            Action.RIGHT,
        ]

    @property
    def states(self):
        """Every cell the agent can occupy, i.e. all cells except blocks (9)."""
        return [
            State(row, column)
            for row in range(self.row_length)
            for column in range(self.column_length)
            if self.grid[row][column] != 9
        ]

    # code1-3
    def transit_func(self, state, action):
        """Return a ``{next_state: probability}`` dict for ``action`` at ``state``.

        The requested action gets ``move_prob``, the opposite direction gets
        0, and the two remaining directions share the rest equally.  Moves
        that end on the same cell have their probabilities added together.
        An empty dict is returned when no action is possible at ``state``.
        """
        distribution = {}
        if not self.can_action_at(state):
            return distribution

        # Opposite moves have negated enum values.
        reverse = Action(action.value * -1)

        for candidate in self.actions:
            if candidate == action:
                weight = self.move_prob
            elif candidate == reverse:
                weight = 0
            else:
                weight = (1 - self.move_prob) / 2

            landing = self._move(state, candidate)
            distribution[landing] = distribution.get(landing, 0) + weight

        return distribution

    def can_action_at(self, state):
        """Return True only when ``state`` sits on an ordinary (0) cell."""
        return bool(self.grid[state.row][state.column] == 0)

    def _move(self, state, action):
        """Return the state reached by applying ``action`` at ``state``.

        The original ``state`` object is returned when the move would leave
        the grid or enter a block cell.  Raises Exception when no action is
        possible at ``state``.
        """
        if not self.can_action_at(state):
            raise Exception("Can't move from here!")

        # (row, column) offset for each action; any other value stays put.
        offsets = {
            Action.UP: (-1, 0),
            Action.DOWN: (1, 0),
            Action.LEFT: (0, -1),
            Action.RIGHT: (0, 1),
        }
        row_shift, column_shift = offsets.get(action, (0, 0))

        target = state.clone()
        target.row += row_shift
        target.column += column_shift

        inside_grid = (
            0 <= target.row < self.row_length
            and 0 <= target.column < self.column_length
        )
        if not inside_grid or self.grid[target.row][target.column] == 9:
            return state
        return target

    def reward_func(self, state):
        """Return ``(reward, done)`` for arriving at ``state``.

        Reward and damage cells end the game with rewards 1 and -1; every
        other cell yields ``default_reward`` and the game continues.
        """
        outcome = (self.default_reward, False)

        cell = self.grid[state.row][state.column]
        if cell == 1:
            outcome = (1, True)
        elif cell == -1:
            outcome = (-1, True)

        return outcome

    def reset(self):
        """Move the agent to the bottom-left cell and return that state."""
        start = State(self.row_length - 1, 0)
        self.agent_state = start
        return start

    def step(self, action):
        """Apply ``action`` to the agent and return ``(next_state, reward, done)``.

        The agent's position is only updated when a next state exists.
        """
        moved_to, reward, done = self.transit(self.agent_state, action)
        if moved_to is not None:
            self.agent_state = moved_to

        return moved_to, reward, done

    def transit(self, state, action):
        """Sample one transition from ``transit_func``.

        Returns ``(None, None, True)`` when no transition is possible,
        otherwise the sampled next state with its reward and done flag.
        """
        distribution = self.transit_func(state, action)
        if not distribution:
            return None, None, True

        candidates = list(distribution)
        weights = [distribution[candidate] for candidate in candidates]

        chosen = np.random.choice(candidates, p=weights)
        reward, done = self.reward_func(chosen)
        return chosen, reward, done
        

# code1-6

In [22]:
class Agent():
    """An agent that acts uniformly at random."""

    def __init__(self, env):
        """Remember the actions offered by ``env``."""
        self.actions = env.actions

    def policy(self, state):
        """Pick one of the known actions at random; ``state`` is not used."""
        chosen = random.choice(self.actions)
        return chosen
    
    
def main(episodes=50):
    """Play ``episodes`` games with a random agent and print each total reward.

    A fixed 3x4 sample grid is used: one reward cell (1), one damage cell
    (-1) and one block cell (9).  Every game starts from ``env.reset()`` and
    runs until the environment reports ``done``.

    ``episodes`` defaults to 50, the count the loop previously hard-coded.
    """
    grid = [
        [0, 0, 0, 1],
        [0, 9, 0, -1],
        [0, 0, 0, 0]
    ]
    env = Environment(grid)
    agent = Agent(env)

    # Play the requested number of games (50 by default).
    for i in range(episodes):
        state = env.reset()
        total_reward = 0
        done = False

        while not done:
            action = agent.policy(state)
            next_state, reward, done = env.step(action)
            total_reward += reward
            state = next_state

        print("Episode {}: Agent gets {} reward.".format(i, total_reward))
        
        
        
# Entry point: run the demo only when this code is executed directly.
if __name__ == "__main__":
    main()

    
    

Episode 0: Agent gets -0.72 reward.
Episode 1: Agent gets 2.4000000000000004 reward.
Episode 2: Agent gets -0.31999999999999995 reward.
Episode 3: Agent gets 2.24 reward.
Episode 4: Agent gets 2.8000000000000007 reward.
Episode 5: Agent gets 1.28 reward.
Episode 6: Agent gets -0.44000000000000006 reward.
Episode 7: Agent gets -0.44000000000000006 reward.
Episode 8: Agent gets 0.6800000000000008 reward.
Episode 9: Agent gets 1.6800000000000002 reward.
Episode 10: Agent gets 1.4 reward.
Episode 11: Agent gets -0.11999999999999977 reward.
Episode 12: Agent gets -0.76 reward.
Episode 13: Agent gets 2.16 reward.
Episode 14: Agent gets -0.6799999999999999 reward.
Episode 15: Agent gets 2.3200000000000003 reward.
Episode 16: Agent gets 0.48000000000000065 reward.
Episode 17: Agent gets 1.160000000000001 reward.
Episode 18: Agent gets -0.07999999999999974 reward.
Episode 19: Agent gets 0.04000000000000026 reward.
Episode 20: Agent gets -0.23999999999999988 reward.
Episode 21: Agent gets 1.6400