参考：http://www.jonki.net/entry/2016/05/05/174519  
本家：http://mnemstudio.org/path-finding-q-learning-tutorial.htm

<img src="http://mnemstudio.org/ai/path/images/agent_clip_image002.gif" width=500>

<img src="http://cdn-ak.f.st-hatena.com/images/fotolife/j/jonki/20160505/20160505171911.png" width=400>

In [3]:
# -*- coding: utf-8 -*-
"""
Created on Thu May  5, 2016

@author: jonki
"""

import numpy as np
import random
import sys

# sample ref
# http://mnemstudio.org/path-finding-q-learning-tutorial.htm

# Reward table: R[s, a] is the immediate reward for taking action a in state s.
# An action is named by the index of the state it leads to; -1 marks a move
# that is not allowed.
R = np.array(
    [
        [-1, -1, -1, -1, 0, -1],    # from state 0
        [-1, -1, -1, 0, -1, 100],   # from state 1
        [-1, -1, -1, 0, -1, -1],    # from state 2
        [-1, 0, 0, -1, 0, -1],      # from state 3
        [0, -1, -1, 0, -1, 100],    # from state 4
        [-1, 0, -1, -1, 0, 100],    # from state 5
    ]
)

# Q-table, one entry per (state, action) pair, starting at zero.
Q = np.zeros(R.shape)

# Hyper-parameters of the learning run.
LEARNING_COUNT = 1000   # number of Q-value updates performed by learn()
GAMMA = 0.8             # discount factor applied to the next state's best Q
GOAL_STATE = 5          # reaching this state ends an episode

class QLearning(object):
    """Tabular Q-learning agent driven by the module-level tables.

    The agent reads the reward matrix ``R`` and the constants
    ``LEARNING_COUNT``, ``GAMMA`` and ``GOAL_STATE``, and updates the
    Q-table ``Q`` in place.  An action is identified by the index of the
    state it leads to, and ``R[s, a] == -1`` marks a move that is not allowed.
    """

    def __init__(self):
        """Create an agent; it keeps no per-instance state."""

    def learn(self):
        """Perform ``LEARNING_COUNT`` Q-value updates with random exploration.

        Starting from a random state, each step picks one of the allowed
        actions uniformly at random and applies

            Q(s, a) = R(s, a) + GAMMA * max_a' Q(s', a')

        When the goal state is reached, the next step starts again from a
        random state.
        """
        state = self._getRandomState()

        for _ in range(LEARNING_COUNT):
            # Explore: choose uniformly among the moves allowed from here.
            possible_actions = self._getPossibleActionsFromState(state)
            action = random.choice(possible_actions)

            # In this example the action value is also the next state.
            next_state = action
            next_possible_actions = self._getPossibleActionsFromState(next_state)
            max_next_q = self._getMaxQvalueFromStateAndPossibleActions(
                next_state, next_possible_actions)
            Q[state, action] = R[state, action] + GAMMA * max_next_q

            # Reaching the goal ends the episode; restart from a random state.
            if next_state == GOAL_STATE:
                state = self._getRandomState()
            else:
                state = next_state

    def _getRandomState(self):
        """Return a uniformly random state index in ``[0, R.shape[0])``."""
        return random.randrange(R.shape[0])

    def _getPossibleActionsFromState(self, state):
        """Return the array of action indices allowed from ``state``.

        An action is allowed when its reward in row ``state`` of ``R`` is
        not -1.  The process exits via ``sys.exit`` if ``state`` is out of
        range.
        """
        if not 0 <= state < R.shape[0]:
            sys.exit("invaid state: %d" % state)

        # np.where on a 1-D mask returns a 1-tuple; unpack the index array.
        return np.where(R[state] != -1)[0]

    def _getMaxQvalueFromStateAndPossibleActions(self, state, possible_actions):
        """Return the largest ``Q[state][a]`` over ``a`` in ``possible_actions``."""
        return max(Q[state][a] for a in possible_actions)

    def dumpQvalue(self):
        """Print the Q-table converted to integers for readability."""
        print(Q.astype(int))

    def runGreedy(self, start_state = 0):
        """Follow the highest-valued allowed action until the goal is reached.

        Starting at ``start_state``, each step prints the current state,
        picks an allowed action with the maximal Q-value (ties are broken
        uniformly at random) and moves to the state of the same index.
        """
        print("===== START =====")
        state = start_state
        while state != GOAL_STATE:
            print("current state: %d" % state)
            possible_actions = self._getPossibleActionsFromState(state)

            # Take the maximum over the allowed actions themselves rather
            # than comparing against a fixed 0: otherwise a state whose
            # allowed actions all have negative Q-values yields no candidate.
            action_values = [Q[state][a] for a in possible_actions]
            best_value = max(action_values) if action_values else None
            best_action_candidates = [
                a for a, value in zip(possible_actions, action_values)
                if value == best_value
            ]

            # Break ties between equally good actions at random.
            best_action = random.choice(best_action_candidates)
            print("-> choose action: %d" % best_action)
            state = best_action  # the action value is also the next state
        print("state is %d, GOAL!!" % state)
            
            

In [4]:
# Train an agent and show the Q-table it learned.
QL = QLearning()
QL.learn()
QL.dumpQvalue()

# Replay the greedy policy from every state except the last row of R
# (state 5, which is the goal).
for s in range(len(R) - 1):
    QL.runGreedy(s)

[[  0   0   0   0 350   0]
 [  0   0   0 280   0 460]
 [  0   0   0 280   0   0]
 [  0 350 224   0 350   0]
 [280   0   0 280   0 438]
 [  0 360   0   0 338 450]]
===== START =====
current state: 0
-> choose action: 4
current state: 4
-> choose action: 5
state is 5, GOAL!!
===== START =====
current state: 1
-> choose action: 5
state is 5, GOAL!!
===== START =====
current state: 2
-> choose action: 3
current state: 3
-> choose action: 4
current state: 4
-> choose action: 5
state is 5, GOAL!!
===== START =====
current state: 3
-> choose action: 1
current state: 1
-> choose action: 5
state is 5, GOAL!!
===== START =====
current state: 4
-> choose action: 5
state is 5, GOAL!!


In [8]:
# Reward matrix with the same values as the one defined at the top of the
# notebook; -1 marks a move that is not allowed.
R = np.array([
[-1, -1, -1, -1,  0,  -1],
[-1, -1, -1,  0, -1, 100],
[-1, -1, -1,  0, -1,  -1],
[-1,  0,  0, -1,  0,  -1],
[ 0, -1, -1,  0, -1, 100],
[-1,  0, -1, -1,  0, 100]
])
# Indices of the entries in row 4 that are not -1, i.e. the actions
# allowed from state 4.
list(np.where(np.array(R[4] != -1)))[0]

array([0, 3, 5], dtype=int64)

In [9]:
# Boolean mask over row 4 of R: True where the entry is not -1.
np.array(R[4] != -1)

array([ True, False, False,  True, False,  True], dtype=bool)

In [11]:
# np.where on the 1-D mask returns a 1-tuple holding the array of indices
# where the mask is True.
np.where(np.array(R[4] != -1))

(array([0, 3, 5], dtype=int64),)

http://d.hatena.ne.jp/Kshi_Kshi/20111227/1324993576