In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns


In [1]:
class Bandit:
    """k-armed bandit testbed with several action-selection strategies.

    Each arm ``a`` has a hidden true value ``qTrue[a]`` drawn once from
    ``N(trueReward, 1)``; pulling the arm yields a reward drawn from
    ``N(qTrue[a], 1)``.  ``qEst`` holds the agent's per-arm estimates (or,
    in gradient mode, the per-arm preferences).

    Parameters
    ----------
    kArm : int
        Number of arms.
    epsilon : float
        Probability of picking a uniformly random arm in ``getAction``.
    initial : float
        Initial value of every entry of ``qEst``.
    stepSize : float
        Constant step size used by the constant-step-size update and by the
        gradient-bandit update.
    sampleAverages : bool
        If true, ``qEst`` is updated with incremental sample averages (takes
        precedence over ``gradient``).
    UCBParam : float or None
        If not ``None``, actions are chosen by upper-confidence-bound
        selection with this exploration coefficient.
    gradient : bool
        If true, ``qEst`` is treated as action preferences, actions are
        sampled from their softmax, and the gradient-bandit update is used.
    gradientBaseline : bool
        If true (and ``gradient`` is true), the running average reward is
        used as the baseline in the gradient update; otherwise it is 0.
    trueReward : float
        Mean of the distribution the true arm values are drawn from.
    """

    def __init__(self, kArm=10, epsilon=0., initial=0., stepSize=0.1,
                 sampleAverages=False, UCBParam=None, gradient=False,
                 gradientBaseline=False, trueReward=0.):
        self.k = kArm
        self.stepSize = stepSize
        self.sampleAverages = sampleAverages
        self.indices = np.arange(self.k)
        self.time = 0
        self.UCBParam = UCBParam
        self.gradient = gradient
        self.gradientBaseline = gradientBaseline
        self.averageReward = 0
        self.trueReward = trueReward
        self.epsilon = epsilon

        # True arm values: one standard-normal draw per arm, shifted by
        # trueReward.  Kept as a plain list, as before.
        self.qTrue = [np.random.randn() + trueReward for _ in range(self.k)]
        # Estimates (or preferences in gradient mode) start at `initial`.
        self.qEst = np.full(self.k, initial, dtype=float)
        # Number of times each arm has been pulled.
        self.actionCount = [0] * self.k

        self.bestAction = np.argmax(self.qTrue)

    def _softmax(self):
        """Return the softmax of ``qEst`` as a probability vector."""
        # Subtracting the max leaves the result unchanged mathematically but
        # prevents overflow in exp() for large preferences.
        expEst = np.exp(self.qEst - np.max(self.qEst))
        return expEst / np.sum(expEst)

    def getAction(self):
        """Choose an arm index according to the configured strategy.

        Order of precedence: epsilon-random exploration, then UCB, then
        softmax sampling (gradient mode), then greedy on ``qEst``.
        """
        if self.epsilon > 0:
            if np.random.binomial(1, self.epsilon) == 1:
                return np.random.choice(self.indices)

        if self.UCBParam is not None:
            # The +1 terms keep log() and the division well-defined before
            # any arm has been pulled.  The count list must be converted to
            # an array *before* adding 1 (list + int is a TypeError).
            counts = np.asarray(self.actionCount) + 1
            UCBEst = self.qEst + self.UCBParam * np.sqrt(
                np.log(self.time + 1) / counts)
            return np.argmax(UCBEst)

        if self.gradient:
            self.actionProb = self._softmax()
            return np.random.choice(self.indices, p=self.actionProb)

        return np.argmax(self.qEst)

    def takeAction(self, action):
        """Pull arm ``action``, update the agent's state, return the reward.

        The reward is drawn from ``N(qTrue[action], 1)``.  ``time``,
        ``actionCount`` and the running ``averageReward`` are updated first,
        then ``qEst`` is updated by exactly one rule: sample averages,
        gradient bandit, or constant step size.  ``qTrue`` is never modified.
        """
        reward = np.random.randn() + self.qTrue[action]
        self.time += 1
        self.actionCount[action] += 1
        # Incremental mean of all rewards seen so far (including this one).
        self.averageReward += (reward - self.averageReward) / self.time

        if self.sampleAverages:
            self.qEst[action] += (reward - self.qEst[action]) / self.actionCount[action]
        elif self.gradient:
            # Gradient-bandit update on the preferences:
            #   H(a) += stepSize * (R - baseline) * (1[a == action] - pi(a))
            baseline = self.averageReward if self.gradientBaseline else 0.0
            oneHot = np.zeros(self.k)
            oneHot[action] = 1.0
            # Recomputed here so the update does not depend on getAction()
            # having populated actionProb beforehand; qEst has not changed
            # since the action was chosen, so the probabilities are the same.
            self.actionProb = self._softmax()
            self.qEst += self.stepSize * (reward - baseline) * (oneHot - self.actionProb)
        else:
            # Constant step-size (exponential recency-weighted) update.
            self.qEst[action] += self.stepSize * (reward - self.qEst[action])

        return reward



Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/friedrichwilkegrosche/miniforge3/envs/hivemind

  added / updated specs:
    - seaborn


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    bottleneck-1.3.5           |  py310h96f19d2_0         104 KB
    numexpr-2.8.3              |  py310h5a06f4b_0         112 KB
    openssl-1.1.1q             |       h1a28f6b_0         2.2 MB
    pandas-1.4.3               |  py310hc377ac9_0         9.6 MB
    pytz-2022.1                |  py310hca03da5_0         195 KB
    ------------------------------------------------------------
                                           Total:        12.2 MB

The following NEW packages will be INSTALLED:

  bottleneck         pkgs/main/osx-arm64::bottleneck-1.3.5-py310h96f19d2_0
  numexpr            pkgs/main/osx-arm64::numexpr-2.8