# Sarsa法 三目並べ

In [4]:
def state_encode(state3):
    """Encode a 3x3 board as a symmetry-invariant integer state id.

    The board is read as a 9-digit base-3 number (cell 0 is the most
    significant digit) under each of the 8 symmetries of the square. The
    smallest of the 8 values is used, so boards that differ only by a
    rotation or a reflection share one id.

    Args:
        state3: Sequence of 9 cell values (0, 1 or 2) in row-major order.

    Returns:
        int: The smallest base-3 code over the 8 symmetric boards, plus 1
        (the offset of the original encoding is kept).
    """
    # Each row lists, for every output cell, the input cell it is read from.
    symmetries = np.array([
        [0, 1, 2, 3, 4, 5, 6, 7, 8],  # original board
        [2, 1, 0, 5, 4, 3, 8, 7, 6],  # left-right mirror
        [6, 3, 0, 7, 4, 1, 8, 5, 2],  # rotation by 90 degrees (clockwise)
        [0, 3, 6, 1, 4, 7, 2, 5, 8],  # transpose (main diagonal)
        [8, 7, 6, 5, 4, 3, 2, 1, 0],  # rotation by 180 degrees
        [6, 7, 8, 3, 4, 5, 0, 1, 2],  # up-down mirror
        [2, 5, 8, 1, 4, 7, 0, 3, 6],  # rotation by 90 degrees (counter-clockwise)
        [8, 5, 2, 7, 4, 1, 6, 3, 0],  # transpose (anti-diagonal)
    ])
    weights = 3 ** np.arange(8, -1, -1)

    board = np.asarray(state3)
    # One base-3 code per symmetric board.
    codes = board[symmetries].dot(weights)

    # Pick the smallest of the 8 candidates; return a plain int because the
    # id is used as an array index by the caller.
    return int(codes.min()) + 1

In [5]:
import numpy as np
import random
import matplotlib.pyplot as plt

def action_decision(t, state3, policy):
    """Pick the agent's move by sampling the policy over the empty cells.

    The original rejection-sampling loop never terminated when the policy
    put all of its probability on occupied cells (e.g. a greedy policy whose
    best cell is already taken). Sampling directly from the policy restricted
    to the empty cells gives the same distribution and always terminates.

    Args:
        t: Step index within the episode; at step 0 cell 0 is always played.
        state3: Sequence of 9 cell values, 0 meaning empty.
        policy: Sequence of 9 non-negative action weights.

    Returns:
        int: Index of the chosen cell.

    Raises:
        ValueError: If ``t > 0`` and the board has no empty cell.
    """
    if t == 0:
        return 0

    free_cells = [i for i in range(len(state3)) if state3[i] == 0]
    if not free_cells:
        raise ValueError("no empty cell left to play")

    free_mass = sum(policy[i] for i in free_cells)
    # No usable probability on any empty cell (zero or NaN): fall back to a
    # uniform choice instead of spinning forever.
    if not free_mass > 0:
        return random.choice(free_cells)

    threshold = random.random() * free_mass
    cumulative = 0
    for i in free_cells:
        cumulative += policy[i]
        if cumulative > threshold:
            return i
    # Only reachable through floating-point rounding of the running sum.
    return free_cells[-1]

def opponent_action(state3):
    """Choose the opponent's move: block a threatened line, else play randomly.

    The 8 winning lines are scanned in a fixed order. The first line holding
    exactly two 1-marks and one empty cell is blocked by returning that empty
    cell. When no such line exists, cell indices are drawn uniformly from
    0..8 until an empty one comes up.

    Args:
        state3: Sequence of 9 cell values (0 = empty, 1 and 2 = the players).

    Returns:
        int: Index of the cell the opponent plays.
    """
    winning_lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
        (0, 4, 8), (2, 4, 6),              # diagonals
    )

    # Block the first line in which player 1 is one move away from winning.
    for line in winning_lines:
        empty_cells = [cell for cell in line if state3[cell] == 0]
        ones = sum(1 for cell in line if state3[cell] == 1)
        if len(empty_cells) == 1 and ones == 2:
            return empty_cells[0]

    # Nothing to block: keep drawing until an empty cell is hit.
    candidate = random.randint(0, 8)
    while state3[candidate] != 0:
        candidate = random.randint(0, 8)
    return candidate
    

def check_fin(state3):
    """Report whether the game is over and with which outcome.

    Args:
        state3: Sequence of 9 cell values (0 = empty, 1 and 2 = the players).

    Returns:
        int: 1 if a line is filled with 1 (win), 2 if a line is filled with 2
        (loss), 3 if no cell is empty (draw), 0 if the game continues. Lines
        are examined in a fixed order and the first completed one decides.
    """
    winning_lines = (
        (0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
        (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
        (0, 4, 8), (2, 4, 6),              # diagonals
    )

    for line in winning_lines:
        marks = [state3[cell] for cell in line]
        if all(mark == 1 for mark in marks):
            return 1  # win
        if all(mark == 2 for mark in marks):
            return 2  # loss

    # No completed line: a draw when the board is full, otherwise play on.
    board_full = 0 not in state3
    return 3 if board_full else 0

def reward_decision(fin):
    """Map a game-outcome code to its reward.

    Args:
        fin: Outcome code (1 = win, 2 = loss, 3 = draw).

    Returns:
        10 for a win, -10 for a loss, 0 for a draw, and None for any other
        code.
    """
    reward_table = ((1, 10), (2, -10), (3, 0))
    for outcome, reward in reward_table:
        if fin == outcome:
            return reward
    return None

def action_train(policy, t, state3):
    """Play one move for whichever side is to move and report the result.

    The agent (mark 1) moves when both sides have the same number of marks
    on the board; otherwise the opponent (mark 2) moves. The board is
    modified in place.

    Args:
        policy: Action weights passed on to ``action_decision``.
        t: Step index passed on to ``action_decision``.
        state3: Mutable sequence of 9 cell values; updated with the new mark.

    Returns:
        tuple: ``(action, reward, state3, fin)`` where ``fin`` comes from
        ``check_fin`` and ``reward`` from ``reward_decision``.
    """
    cells = list(state3)
    agent_marks = sum(1 for cell in cells if cell == 1)
    opponent_marks = sum(1 for cell in cells if cell == 2)

    # Equal mark counts mean it is the agent's turn.
    if agent_marks == opponent_marks:
        action, mark = action_decision(t, state3, policy), 1
    else:
        action, mark = opponent_action(state3), 2

    state3[action] = mark
    fin = check_fin(state3)
    return action, reward_decision(fin), state3, fin

In [6]:
def SarsaPolycyIteration(L, M, options):
    """Run L rounds of policy iteration on tic-tac-toe, M episodes per round.

    Every round plays M games with a policy derived from the current Q
    table, accumulates a fresh table from the observed rewards, and then
    replaces Q with that table. A summary line is printed after each round.

    Args:
        L: Number of policy-iteration rounds.
        M: Number of episodes (games) per round.
        options: Dict with keys 'pmode' (1: greedy, 2: epsilon-greedy,
            3: softmax), 'alpha' and 'gamma', plus 'epsilon' for pmode 2 and
            'tau' for pmode 3.

    Returns:
        list: For every round, the fraction of episodes that ended with
        result 1 (win).
    """
    nstates = 3**9   # number of states
    nactions = 9     # number of actions
    T = 9            # maximum number of steps per episode
    # Initialise the Q function.
    Q = np.zeros([nstates, nactions])
    rate = []

    for _ in range(L):
        results = np.zeros([M])
        # NOTE(review): this seeds NumPy's generator, but the moves are drawn
        # with the `random` module (action_decision / opponent_action), so it
        # does not make the rounds reproducible -- confirm the intent.
        np.random.seed(0)
        newQ = np.zeros([nstates, nactions])

        # Episodes
        for m in range(M):
            state3 = np.zeros([nactions])

            # Steps
            for t in range(T):
                # Encode the board. The id indexes the Q tables, so it must
                # be an integer (NumPy rejects float indices).
                state = int(state_encode(state3))

                # Build the policy for this state.
                policy = np.zeros([nactions])
                pmode = options['pmode']

                if pmode == 1:    # greedy
                    policy[np.argmax(Q[state])] = 1

                elif pmode == 2:  # epsilon-greedy
                    epsilon = options['epsilon']
                    policy = np.ones([nactions]) * epsilon / nactions
                    policy[np.argmax(Q[state])] = 1 - epsilon + epsilon / nactions

                elif pmode == 3:  # softmax
                    # Subtracting the maximum leaves the probabilities
                    # unchanged and keeps exp() from overflowing.
                    scaled = Q[state] / options['tau']
                    weights = np.exp(scaled - scaled.max())
                    policy = weights / weights.sum()

                # Select and execute the action.
                action, reward, state3, fin = action_train(policy, t, state3)

                if reward is None:
                    reward = 0

                if t > 0:
                    # NOTE(review): the bootstrap term takes the max over
                    # actions (as in Q-learning) rather than the value of the
                    # action actually taken -- confirm against the intended
                    # SARSA update.
                    newQ[pstate][paction] += options['alpha'] * (
                        reward
                        - newQ[pstate][paction]
                        + options['gamma'] * max(newQ[state])
                    )

                # Game over
                if fin > 0:
                    results[m] = fin
                    break

                pstate = state
                paction = action

        # Replace the action values with the table built in this round.
        Q = newQ

        wins = int(np.sum(results == 1))
        draws = int(np.sum(results == 3))
        losses = int(np.sum(results == 2))
        rate.append(wins / M)

        print("Win: {0}/{1}, Draw: {2}/{3}, Lose: {4}/{5}".format(wins, M,
                                                                  draws, M,
                                                                  losses, M))

    return rate

In [7]:
if __name__ == '__main__':
    # Options -- pmode: policy type (1: greedy, 2: e-greedy, 3: softmax),
    # epsilon: probability of a random action, gamma: discount factor,
    # alpha: factor applied to each value update.
    options = {
        'pmode': 2,
        'epsilon': 0.1,
        'gamma': 0.9,
        'alpha': 1,
    }
    rate = SarsaPolycyIteration(100, 100, options)

    # Plot the per-round win rate and write it to disk.
    plt.clf()
    rounds = range(len(rate))
    plt.plot(rounds, rate)
    plt.savefig('sarsa.png')

Win: 13/100, Draw: 47/100, Lose: 40/100
Win: 24/100, Draw: 58/100, Lose: 18/100
Win: 42/100, Draw: 41/100, Lose: 17/100
Win: 43/100, Draw: 42/100, Lose: 15/100
Win: 43/100, Draw: 41/100, Lose: 16/100
Win: 27/100, Draw: 45/100, Lose: 28/100
Win: 33/100, Draw: 43/100, Lose: 24/100
Win: 26/100, Draw: 49/100, Lose: 25/100
Win: 31/100, Draw: 41/100, Lose: 28/100
Win: 29/100, Draw: 54/100, Lose: 17/100
Win: 25/100, Draw: 56/100, Lose: 19/100
Win: 22/100, Draw: 53/100, Lose: 25/100
Win: 27/100, Draw: 62/100, Lose: 11/100
Win: 42/100, Draw: 48/100, Lose: 10/100
Win: 29/100, Draw: 61/100, Lose: 10/100
Win: 28/100, Draw: 52/100, Lose: 20/100
Win: 43/100, Draw: 41/100, Lose: 16/100
Win: 40/100, Draw: 38/100, Lose: 22/100
Win: 29/100, Draw: 51/100, Lose: 20/100
Win: 29/100, Draw: 54/100, Lose: 17/100
Win: 21/100, Draw: 60/100, Lose: 19/100
Win: 20/100, Draw: 54/100, Lose: 26/100
Win: 30/100, Draw: 44/100, Lose: 26/100
Win: 21/100, Draw: 57/100, Lose: 22/100
Win: 19/100, Draw: 57/100, Lose: 24/100




In [105]:
import numpy as np
import math

# Scratch cell: Gaussian basis features over a position/velocity grid and a
# linear action-value estimate built from them.

C = np.empty((0,2), float) # state means (centres): [position, velocity]

posi_grid = [-1.2, -0.3, 0.6]
velo_grid = [-0.07, -0.02, 0.02, 0.07]

# One centre per (position, velocity) pair: 3 * 4 = 12 rows.
for i in posi_grid:
    for j in velo_grid:
        C = np.append(C, np.array([[i, j]]), axis=0)
        
print(C)
s = np.array([0.1, 0.1], dtype=float)
t = np.array([0.1, 0.2], dtype=float)  # not used in the rest of this cell

# Phi[i] = exp(-||s - C[i]||^2 / 2): one basis value per centre.
Phi = np.zeros(len(C))
for i in range(len(C)):
    diff = s - C[i]
    print(diff)
    square_diff = diff[0]**2 + diff[1]**2
    Phi[i] = math.exp(- square_diff / 2)
    
print(Phi)
# Index of the first centre with the largest basis value.
print(np.where(Phi == max(Phi))[0][0])

# One weight row per action (3 rows); Q[i] is the weighted sum of the features.
Theta = np.ones([3, len(C)])
print(Theta)
Q = np.zeros(3)
for i in range(len(Theta)):
    Q[i] = sum(Phi * Theta[i])
    
print(Q)

# Numerators and normaliser of a softmax over Q.
print(np.exp(Q))
print(sum(np.exp(Q)))


# Scratch cell (continued): small checks of NumPy shapes and broadcasting.

# Inverse of a 1x1 matrix.
A = np.array([[5]])
print(np.linalg.inv(A))

Phi = np.zeros([1,1])
print(Phi)
#print(np.linalg.inv(2))

# (1x3).(3x1) gives a 1x1 matrix; its inverse times Phi broadcasts to 1x3.
Phi = np.ones([1,3])
print(Phi)
print(np.dot(Phi,Phi.T))
print(np.linalg.inv(np.dot(Phi,Phi.T))*Phi)


# (3x12).(12x1) gives a 3x1 column of values, one per row of Theta.
Phi = np.ones([12,1])
Theta = np.ones([3,12])
Q = np.dot(Theta, Phi)
print(Q)

# Row index of the first maximal entry of Q.
print(np.where(Q==max(Q))[0][0])
#action = np.where(Q==max(Q))[0][0]


# Appending scalars to an empty array yields a flat 1-D array.
Phi_fun = np.array([])
Phi = np.ones([1,3])
Phi2 = np.zeros([1,3])
Phi_fun = np.append(Phi_fun, Phi[0][0])
Phi_fun = np.append(Phi_fun, Phi2[0][0])
print(Phi_fun)
ob = np.array([0.5, 0.5])
print(ob)

# Stacking 1x12 rows onto an empty (0, 12) array along axis 0.
P = np.empty((0,12))
Phi = np.zeros([1, 12])
Phi2 = np.ones([1, 12])
P = np.append(P, Phi, axis=0)
P = np.append(P, Phi2, axis=0)

print(P)

# Same stacking pattern for a column of scalars.
r = np.empty((0,1))
r0_list = np.append(r, [[3]],  axis=0)
print(r0_list)

# The reshaped 3x12 result is only printed; Theta itself is not reassigned.
Theta = np.ones([1, 36])
print(np.reshape(Theta, (3,12)))
print(Theta)


Theta = np.zeros([1, 3])
Theta1 = np.ones([1, 3])
Phi = np.ones([1, 12])

print(Theta)
print(Phi)
T = Theta + Theta1 + 5  # computed but not printed in this cell
# (3x1) * (1x12) broadcasts to a 3x12 array.
print(Theta.T * Phi)

[[-1.2  -0.07]
 [-1.2  -0.02]
 [-1.2   0.02]
 [-1.2   0.07]
 [-0.3  -0.07]
 [-0.3  -0.02]
 [-0.3   0.02]
 [-0.3   0.07]
 [ 0.6  -0.07]
 [ 0.6  -0.02]
 [ 0.6   0.02]
 [ 0.6   0.07]]
[ 1.3   0.17]
[ 1.3   0.12]
[ 1.3   0.08]
[ 1.3   0.03]
[ 0.4   0.17]
[ 0.4   0.12]
[ 0.4   0.08]
[ 0.4   0.03]
[-0.5   0.17]
[-0.5   0.12]
[-0.5   0.08]
[-0.5   0.03]
[ 0.42339489  0.42647565  0.42818497  0.4293641   0.90987323  0.91649378
  0.9201671   0.92270104  0.86983651  0.87616574  0.87967743  0.88209987]
7
[[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
 [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]]
[ 8.8844343  8.8844343  8.8844343]
[ 7218.72989801  7218.72989801  7218.72989801]
21656.189694
[[ 0.2]]
[[ 0.]]
[[ 1.  1.  1.]]
[[ 3.]]
[[ 0.33333333  0.33333333  0.33333333]]
[[ 12.]
 [ 12.]
 [ 12.]]
0
[ 1.  0.]
[ 0.5  0.5]
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
 [ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]]
[[ 3.]]
[[ 1.  1.  1

In [20]:
import numpy as np
import math
import random

# Scratch cell: small experiments with clamping, max-by-value lookup and
# stacking rows with np.append.
import operator

# Three random values in [0, 1) as a 3x1 column.
mu = np.array([[random.random()] for _ in range(3)])
print(mu)

# Clamp every entry from below at 0.5.
mu = np.maximum(mu, 0.5)
print(mu)

# Named `image_scores` rather than `dict`, so the builtin type is not
# shadowed for the code that runs after this cell.
image_scores = {'1.jpeg':1,'2.jpeg':2,'3.jpeg':3}

# (key, value) pair with the largest value.
valMax = max(image_scores.items(), key=operator.itemgetter(1))
print( valMax[0]) #('3.jpeg', 3)

# Appending to an empty 1-D array, then element-wise arithmetic.
r = np.array([])
r = np.append(r, [3],  axis=0)
r = np.append(r, [2],  axis=0)
print(r)
print(r * (r+2))

# Stack transposed 3x1 columns as rows of a (n, 3) array.
rew = np.ones([12, 1])
mu_numer = np.empty((0,3))
mu = np.zeros([3,1])
mu2 = np.ones([3,1])
print(mu)
mu_numer = np.append(mu_numer, mu.T, axis=0)
mu_numer = np.append(mu_numer, mu2.T, axis=0)
mu_numer = np.append(mu_numer, mu2.T, axis=0)
print(mu_numer)
# Builtin sum() adds the rows, i.e. gives the column sums.
print(np.array([sum(mu_numer).T]).T)
# (12x1) - (1x3) broadcasts to 12x3 before the rows are summed.
print(sum((rew - np.array([sum(mu_numer).T]))*3))



[[ 0.5098831 ]
 [ 0.03267914]
 [ 0.08555759]]
[[ 0.5098831]
 [ 0.5      ]
 [ 0.5      ]]
3.jpeg
[ 3.  2.]
[ 15.   8.]
[[ 0.]
 [ 0.]
 [ 0.]]
[[ 0.  0.  0.]
 [ 1.  1.  1.]
 [ 1.  1.  1.]]
[[ 2.]
 [ 2.]
 [ 2.]]
[-36. -36. -36.]


In [33]:
import numpy as np

# Scratch cell: build one (2, vocab_size) block of rows and append it to an
# initially empty (0, 2, 10) stack.
vocab_size = 10
onehot_lists = np.empty((0, 2, 10))

# Row 0 is filled with -1; row 1 is a one-hot row with the 1 at index 3.
onehot_list = np.concatenate(
    (
        np.empty((0, 10)),
        np.array([[-1] * vocab_size]),
        np.array([[0] * 3 + [1] + [0] * (vocab_size - 3 - 1)]),
    ),
    axis=0,
)
print(onehot_list)
print(onehot_lists)

# Add a leading axis so the block becomes one entry of the stack.
onehot_lists = np.concatenate((onehot_lists, np.asarray([onehot_list])), axis=0)
print(onehot_lists)

[[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
 [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]]
[]
[[[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1.]
  [ 0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]]]
