In [1]:
import tensorflow as tf
import tensorflow.contrib.slim as slim
import numpy as np

In [2]:
# Bandit initialization.
# Each entry is the threshold a standard-normal draw must exceed for that arm
# to pay out (see pullBandit), so the arm with the lowest value pays out most often.
bandit_arms = [0.2, 0, -0.2, -2]
# Number of arms the agent can choose between.
num_arms = len(bandit_arms)
def pullBandit(bandit):
    """Pull one arm of the bandit and return its reward.

    Args:
        bandit: Threshold of the chosen arm. A single standard-normal sample
            is drawn and compared against it.

    Returns:
        1 (positive reward) if the sample is greater than ``bandit``,
        otherwise -1 (negative reward).
    """
    # One draw per pull; kept as a length-1 array exactly like np.random.randn(1) returns it.
    draw = np.random.randn(1)
    return 1 if draw > bandit else -1

In [None]:
# Multi-armed bandit problem

In [3]:
# Agent
# Discard any previously built default graph and start from a clean one, so that
# re-running this cell does not leave duplicate nodes behind.
# A placeholder is an input slot whose tensor value is supplied later; feed_dict
# assigns the values of all placeholders in one go.


tf.reset_default_graph()

# These two lines establish the feed-forward part of the network:
# one trainable weight per arm (initialised to 1), passed through a softmax.
weights = tf.Variable(tf.ones([num_arms]))
output = tf.nn.softmax(weights)

# The remaining lines establish the training procedure. We feed the reward and chosen action into the network
# to compute the loss, and use it to update the network.
reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
action_holder = tf.placeholder(shape=[1],dtype=tf.int32)

# Slice out the single softmax output that belongs to the action that was taken.
responsible_output = tf.slice(output,action_holder,[1])
# loss = -log(output[action]) * reward
loss = -(tf.log(responsible_output)*reward_holder)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
update = optimizer.minimize(loss)

In [4]:
# sess.run(x) evaluates x together with every operation that x depends on.
# feed_dict temporarily overrides the value of the given tensors for that one run call.
# np.random.choice lets you pick the population (an array, or range(n) for an int n),
# how many items to draw, whether to draw with replacement, and the per-item probabilities.
total_episodes = 1000 #Set total number of episodes to train agent on.
total_reward = np.zeros(num_arms) #Set scoreboard for bandit arms to 0.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):

        #Choose action according to Boltzmann distribution.
        actions = sess.run(output)
        # Sample the arm *index* directly. Sampling a probability value and then
        # looking it up with argmax(actions == value) always resolves ties to the
        # lowest index, which forces arm 0 while the softmax is still uniform
        # (all weights start at 1) and keeps biasing any arms that remain tied.
        action = np.random.choice(num_arms,p=actions)

        reward = pullBandit(bandit_arms[action]) #Get our reward from picking one of the bandit arms.

        #Update the network.
        sess.run(update, feed_dict={reward_holder:[reward],action_holder:[action]})

        #Update our running tally of scores.
        total_reward[action] += reward
        if i % 50 == 0:
            print("Running reward for the " + str(num_arms) + " arms of the bandit: " + str(total_reward))

    # Read the trained weights once, after the last update. Fetching them in the
    # same run call as `update` leaves it unspecified whether the value is from
    # before or after the step, and would leave `ww` undefined for zero episodes.
    ww = sess.run(weights)
print("\nThe agent thinks arm " + str(np.argmax(ww)+1) + " is the most promising....")
# The best arm is the one with the lowest threshold, hence argmax of the negated list.
if np.argmax(ww) == np.argmax(-np.array(bandit_arms)):
    print("...and it was right!")
else:
    print("...and it was wrong!")

Running reward for the 4 arms of the bandit: [1. 0. 0. 0.]
Running reward for the 4 arms of the bandit: [ 2.  0. -1.  8.]
Running reward for the 4 arms of the bandit: [ 0.  4.  3. 22.]
Running reward for the 4 arms of the bandit: [-1.  4.  3. 41.]
Running reward for the 4 arms of the bandit: [-4.  3.  3. 55.]
Running reward for the 4 arms of the bandit: [-10.   3.   0.  68.]
Running reward for the 4 arms of the bandit: [-10.   7.  -1.  81.]
Running reward for the 4 arms of the bandit: [-11.   3.   0. 101.]
Running reward for the 4 arms of the bandit: [-11.  -3.   4. 117.]
Running reward for the 4 arms of the bandit: [-10.   6.   8. 131.]
Running reward for the 4 arms of the bandit: [-12.  10.   7. 144.]
Running reward for the 4 arms of the bandit: [-20.  11.   5. 155.]
Running reward for the 4 arms of the bandit: [-17.  10.   1. 171.]
Running reward for the 4 arms of the bandit: [-23.   7.   2. 181.]
Running reward for the 4 arms of the bandit: [-22.   8.   2. 195.]
Running reward for 

In [None]:
# Contextual bandit problem

In [5]:
class contextual_bandit():
    """Environment holding several bandits, one of which is active at a time.

    Attributes:
        state: Row index of the currently active bandit.
        bandits: 2-D array of thresholds, one row per bandit and one column per arm.
        num_bandits: Number of rows in ``bandits``.
        num_actions: Number of columns in ``bandits``.
    """

    def __init__(self):
        # List out our bandits. An arm pays out when a standard-normal draw exceeds
        # its threshold, so currently arms 4, 2, and 1 (respectively) are the most optimal.
        self.bandits = np.array([
            [0.2, 0, -0.0, -5],
            [0.1, -5, 1, 0.25],
            [-5, 5, 5, 5],
        ])
        self.num_bandits, self.num_actions = self.bandits.shape
        self.state = 0

    def getBandit(self):
        """Pick a bandit uniformly at random, make it the current state, and return its index."""
        chosen = np.random.randint(0, len(self.bandits))
        self.state = chosen
        return chosen

    def pullArm(self,action):
        """Pull arm ``action`` of the current bandit.

        Returns 1 if a single standard-normal draw is greater than the arm's
        threshold, otherwise -1.
        """
        # Look the threshold up first, then draw, matching the order of the lookup and the RNG call.
        threshold = self.bandits[self.state, action]
        draw = np.random.randn(1)
        return 1 if draw > threshold else -1

In [6]:
class agent():
    """Single-layer policy network for the contextual bandit.

    Args:
        lr: Learning rate handed to the gradient-descent optimizer.
        s_size: Number of states; used as the one-hot encoding width.
        a_size: Number of actions; used as the layer's output width.
    """

    def __init__(self, lr, s_size, a_size):
        # Feed-forward part of the network: the agent takes a state and produces an action.
        self.state_in = tf.placeholder(shape=[1], dtype=tf.int32)
        one_hot_state = slim.one_hot_encoding(self.state_in, s_size)
        # One fully connected layer, no bias, sigmoid activation, weights initialised to ones.
        action_scores = slim.fully_connected(
            one_hot_state,
            a_size,
            biases_initializer=None,
            activation_fn=tf.nn.sigmoid,
            weights_initializer=tf.ones_initializer(),
        )
        self.output = tf.reshape(action_scores, [-1])
        self.chosen_action = tf.argmax(self.output, 0)

        # Training part: feed the reward and the chosen action into the network
        # to compute the loss, and use it to update the network.
        self.reward_holder = tf.placeholder(shape=[1], dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1], dtype=tf.int32)
        # The single output entry that belongs to the action that was taken.
        self.responsible_weight = tf.slice(self.output, self.action_holder, [1])
        log_responsible = tf.log(self.responsible_weight)
        self.loss = -(log_responsible * self.reward_holder)
        sgd = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = sgd.minimize(self.loss)

In [7]:
tf.reset_default_graph() #Clear the Tensorflow graph.

cBandit = contextual_bandit() #Load the bandits.
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions) #Load the agent.
weights = tf.trainable_variables()[0] #The weights we will evaluate to look into the network.

total_episodes = 10000 #Set total number of episodes to train agent on.
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions]) #Set scoreboard for bandits to 0.
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

# Launch the tensorflow graph
with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit() #Get a state from the environment.

        #Choose either a random action or one from our network (epsilon-greedy with chance e).
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})

        reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.

        #Update the network.
        feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
        sess.run(myAgent.update, feed_dict=feed_dict)

        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))

    # Read the trained weights once, after the last update. Fetching them in the
    # same run call as the update op leaves it unspecified whether the value is
    # from before or after the step, and would leave `ww` undefined for zero episodes.
    ww = sess.run(weights)
# Row `bandit_idx` of the weight matrix holds the per-action weights for that bandit;
# the best arm of a bandit is the one with the lowest threshold, hence argmin.
for bandit_idx in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[bandit_idx])+1) + " for bandit " + str(bandit_idx+1) + " is the most promising....")
    if np.argmax(ww[bandit_idx]) == np.argmin(cBandit.bandits[bandit_idx]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

Instructions for updating:
Use `tf.cast` instead.
Mean reward for each of the 3 bandits: [ 0.    0.   -0.25]
Mean reward for each of the 3 bandits: [33.75 34.25 38.25]
Mean reward for each of the 3 bandits: [66.25 68.5  77.5 ]
Mean reward for each of the 3 bandits: [100.   106.5  114.25]
Mean reward for each of the 3 bandits: [141.   147.   144.25]
Mean reward for each of the 3 bandits: [181.25 183.5  178.5 ]
Mean reward for each of the 3 bandits: [215.25 222.25 215.75]
Mean reward for each of the 3 bandits: [252.   260.75 250.5 ]
Mean reward for each of the 3 bandits: [288.   300.75 287.5 ]
Mean reward for each of the 3 bandits: [327.   343.25 318.  ]
Mean reward for each of the 3 bandits: [363.   381.25 358.5 ]
Mean reward for each of the 3 bandits: [398.5  423.   397.25]
Mean reward for each of the 3 bandits: [439.25 460.75 430.25]
Mean reward for each of the 3 bandits: [475.   502.5  461.75]
Mean reward for each of the 3 bandits: [513.5  539.5  491.25]
Mean reward for each of the 3