In [1]:
#######################################################################
# Copyright (C)                                                       #
# 2016 Shangtong Zhang(zhangshangtong.cpp@gmail.com)                  #
# 2016 Jan Hakenberg(jan.hakenberg@gmail.com)                         #
# 2016 Tian Jun(tianjun.cpp@gmail.com)                                #
# 2016 Kenta Shimada(hyperkentakun@gmail.com)                         #
# update --->                                                         #
# 2018 Ran Xiao(xiaoranone@gmail.com)                                 #
# python3.5                                                           #
#######################################################################


In [2]:
from __future__ import print_function
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


本节是《强化学习》第二章示例的代码：10 臂赌博机问题。首先定义一个 k 臂赌博机（Bandit）类。


In [37]:
class Bandit:
    '''A k-armed bandit problem bundled with one action-value learning agent.

    Each arm's true value is drawn once from N(trueReward, 1) at construction;
    pulling an arm returns that true value plus fresh N(0, 1) noise.

    Parameters:
        kArm: int, number of arms.
        epsilon: float, probability of choosing a uniformly random arm
            (epsilon-greedy exploration); 0 disables exploration.
        initial: float, initial estimate assigned to every arm.
        stepSize: float, constant step size used by the fixed-step and
            gradient updates.
        sampleAverages: if True, update estimates with incremental sample
            averages instead of the constant stepSize.
        UCB: if not None, the exploration coefficient of
            upper-confidence-bound action selection.
        gradient: if True, treat qEst as action preferences and use the
            gradient bandit algorithm (softmax selection).
        gradientBaseline: if True, the gradient update uses the running
            average reward as baseline; otherwise the baseline is 0.
        trueReward: float, mean around which the true arm values are drawn.
    '''
    def __init__(self, kArm=10, epsilon=0., initial=0., stepSize=0.1,
                 sampleAverages=False, UCB=None, gradient=False,
                 gradientBaseline=False, trueReward=0.):
        self.k = kArm
        self.epsilon = epsilon
        self.stepSize = stepSize
        self.sampleAverages = sampleAverages
        self.indices = np.arange(self.k)  # the kArm possible actions
        self.time = 0  # total number of actions taken so far
        self.UCB = UCB
        self.gradient = gradient
        self.gradientBaseline = gradientBaseline
        self.averageReward = 0
        self.trueReward = trueReward

        # True value of each action, drawn from N(trueReward, 1).
        self.qTrue = [np.random.randn() + trueReward for _ in range(self.k)]

        # Current estimate (or preference, in gradient mode) of each action.
        self.qEst = np.full(self.k, initial, dtype=float)

        # Number of times each action has been taken.
        self.actionCount = [0] * self.k

        # Index of the arm with the highest true value.
        self.bestAction = np.argmax(self.qTrue)

    def _softmax(self):
        '''Return the softmax distribution over the current preferences qEst.'''
        # Subtracting the maximum leaves the result unchanged but keeps
        # exp() from overflowing for large preferences.
        expEst = np.exp(self.qEst - np.max(self.qEst))
        return expEst / np.sum(expEst)

    def getAction(self):
        '''Choose the next action: explore with probability epsilon, else exploit.

        Returns:
            Index of the selected arm.
        '''
        # Explore: with probability epsilon pick a uniformly random arm.
        # self.indices is sampled, not shuffled in place, so it stays aligned
        # with the probability vector used by the gradient branch below.
        if self.epsilon > 0:
            if np.random.binomial(1, self.epsilon) == 1:
                return np.random.choice(self.indices)

        # Exploit with an upper-confidence-bound bonus. The "+ 1" belongs in
        # the denominator so that never-tried arms do not divide by zero.
        if self.UCB is not None:
            bonus = np.sqrt(np.log(self.time + 1) / (np.asarray(self.actionCount) + 1))
            return np.argmax(self.qEst + self.UCB * bonus)

        # Gradient bandit: sample an arm from the softmax of the preferences.
        if self.gradient:
            self.actionProb = self._softmax()
            return np.random.choice(self.indices, p=self.actionProb)

        # Plain greedy choice.
        return np.argmax(self.qEst)

    def takeAction(self, action):
        '''Pull arm `action`, update the agent's estimates and return the reward.

        Parameters:
            action: int, index of the arm to pull.

        Returns:
            The sampled reward, drawn from N(qTrue[action], 1).
        '''
        reward = np.random.randn() + self.qTrue[action]
        self.time += 1
        # Incremental running mean of every reward received so far.
        self.averageReward = (self.time - 1.0) / self.time * self.averageReward + reward / self.time
        self.actionCount[action] += 1

        if self.sampleAverages:
            # Incremental sample-average update of the chosen arm.
            self.qEst[action] += 1.0 / self.actionCount[action] * (reward - self.qEst[action])
        elif self.gradient:
            oneHot = np.zeros(self.k)
            oneHot[action] = 1
            baseline = self.averageReward if self.gradientBaseline else 0
            # Recompute the policy from the current preferences so the update
            # is correct even when the preceding getAction() explored (or was
            # never called) and left self.actionProb stale or unset.
            self.actionProb = self._softmax()
            # Raise the preference of the taken arm and lower the others in
            # proportion to (reward - baseline).
            self.qEst = self.qEst + self.stepSize * (reward - baseline) * (oneHot - self.actionProb)
        else:
            # Constant step-size update of the chosen arm only.
            self.qEst[action] += self.stepSize * (reward - self.qEst[action])

        return reward
        

In [38]:
# Module-level figure counter. The plotting functions below advance it via
# `global figureIndex`, and epsilonGreedy passes it to plt.figure() as the
# figure number.
figureIndex = 0

# Figure 2.1: reward distributions of one sample 10-armed problem.
def figure2_1():
    '''Draw a violin plot of 200 sampled rewards for each of 10 actions.

    Every column is N(0, 1) noise shifted by its own N(0, 1) offset. The plot
    is drawn on a new figure numbered by the global `figureIndex`, which is
    then incremented.
    '''
    global figureIndex
    # Open a dedicated figure; otherwise the plot lands on the implicit
    # current figure, which a later plt.figure(figureIndex) call would
    # re-select and draw over.
    plt.figure(figureIndex)
    figureIndex += 1
    sns.violinplot(data=np.random.randn(200,10) + np.random.randn(10))
    plt.xlabel('Action')
    plt.ylabel('Reward distribution')


def banditSimulation(nBandits, time, bandits):
    '''Run every bandit of every group for `time` steps and average per step.

    Parameters:
        nBandits: number of bandits to run in each group (the first nBandits
            entries of each group are used).
        time: number of steps each bandit is run for.
        bandits: sequence of groups; each group is an indexable collection of
            objects exposing getAction(), takeAction(action) and bestAction.

    Returns:
        (bestActionCounts, averageRewards): two lists with one float array of
        length `time` per group, holding for each step the fraction of bandits
        that chose their bestAction and the mean reward over the group.
    '''
    nGroups = len(bandits)
    bestActionCounts = [np.zeros(time, dtype='float') for _ in range(nGroups)]
    averageRewards = [np.zeros(time, dtype='float') for _ in range(nGroups)]

    for groupIdx in range(nGroups):
        group = bandits[groupIdx]
        optimalHits = bestActionCounts[groupIdx]
        rewardTotals = averageRewards[groupIdx]

        for run in range(nBandits):
            agent = group[run]
            for step in range(time):
                chosen = agent.getAction()
                rewardTotals[step] += agent.takeAction(chosen)
                if chosen == agent.bestAction:
                    optimalHits[step] += 1

        # Convert per-step sums into per-step averages (in place).
        optimalHits /= nBandits
        rewardTotals /= nBandits

    return bestActionCounts, averageRewards


# for figure 2.2
def epsilonGreedy(nBandits, time):
    '''Compare epsilon-greedy agents (epsilon = 0, 0.1, 0.01) on fresh bandits.

    Builds `nBandits` sample-average Bandit instances per epsilon, runs them
    through banditSimulation for `time` steps, and plots two figures: the
    per-step fraction of runs choosing the best action, and the per-step
    average reward. Each figure is numbered by the global `figureIndex`,
    which is incremented after use.

    Parameters:
        nBandits: number of independent bandits simulated per epsilon.
        time: number of steps each bandit is run for.
    '''
    global figureIndex
    epsilons = [0, 0.1, 0.01]
    # One group of independent bandits per epsilon value.
    bandits = [[Bandit(epsilon=eps, sampleAverages=True) for _ in range(nBandits)]
               for eps in epsilons]

    bestActionCounts, averageRewards = banditSimulation(nBandits, time, bandits)

    plt.figure(figureIndex)
    figureIndex += 1
    # Note: the plotted values are fractions in [0, 1], not percentages.
    for eps, counts in zip(epsilons, bestActionCounts):
        plt.plot(counts, label='epsilon='+str(eps))
    plt.xlabel('Steps')
    plt.ylabel('% optimal action')
    plt.legend()

    plt.figure(figureIndex)
    figureIndex += 1
    for eps, rewards in zip(epsilons, averageRewards):
        plt.plot(rewards, label='epsilon='+str(eps))
    plt.xlabel('Steps')
    plt.ylabel('average reward')
    plt.legend()
    
    
    

In [None]:
# Seed NumPy's global RNG so Restart & Run All reproduces the same figures.
SEED = 0
N_BANDITS = 2000  # independent bandits simulated per epsilon value
N_STEPS = 1000    # steps each bandit is run for
np.random.seed(SEED)

figure2_1()

epsilonGreedy(N_BANDITS, N_STEPS)

plt.show()