In [1]:
import numpy.random as rn
from random import random
import numpy as np
# Local imports
from SMPyBandits.Environment import Evaluator, tqdm
# Import arms
from SMPyBandits.Arms import Bernoulli


If you want the speed up brought by numba.jit, try to manually install numba and check that it works (installing llvmlite can be tricky, cf. https://github.com/numba/numba#custom-python-environments
Info: Using the Jupyter notebook version of the tqdm() decorator, tqdm_notebook() ...
ERROR: 'resource' module not available, but it is in the standard library.
Have you messed up your Python installation?
Are you on Windows? In this case, it's okay.
Please submit a new bug on https://github.com/SMPyBandits/SMPyBandits/issues/new
If you want the speed up brought by numba.jit, try to manually install numba and check that it works (installing llvmlite can be tricky, cf. https://github.com/numba/numba#custom-python-environments


In [2]:
# Import algorithms
from SMPyBandits.Policies import EmpiricalMeans, EpsilonGreedy, UCB
from SMPyBandits.Policies.IndexPolicy import IndexPolicy
from SMPyBandits.Policies.BasePolicy import BasePolicy
from SMPyBandits.Policies.with_proba import with_proba

In [None]:
#EmpiricalMeans?

In [None]:
#EmpiricalMeans??

In [3]:
# MAB environment parameters
HORIZON = 10000
REPETITIONS = 100 # Number of repetition of the experiment (to have an average)
N_JOBS = 1 # Number of CPU cores

#: Default value for epsilon for `YourEpsilonGreedy`
EPSILON = 0.1

In [4]:
ENVIRONMENTS = [  # 1)  Bernoulli arms
        {   # An easy but widely adopted problem
            "arm_type": Bernoulli,
            "params": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
        },
        {   # An other problem, best arm = last, with three groups: very bad arms (0.01, 0.02), middle arms (0.3 - 0.6) and very good arms (0.78, 0.8, 0.82)
            "arm_type": Bernoulli,
            "params": [0.01, 0.02, 0.3, 0.4, 0.5, 0.6, 0.795, 0.8, 0.805]
        },
        {   # A very hard problem, as used in [Cappé et al, 2012]
            "arm_type": Bernoulli,
            "params": [0.01, 0.01, 0.01, 0.02, 0.02, 0.02, 0.05, 0.05, 0.1]
        },
    ]

In [5]:

class YourEpsilonGreedy(BasePolicy):
    r""" The epsilon-greedy random policy.
    """

    def __init__(self, nbArms, epsilon=EPSILON, lower=0., amplitude=1.):
        super(YourEpsilonGreedy, self).__init__(nbArms, lower=lower, amplitude=amplitude)
        assert 0 <= epsilon <= 1, "Error: the 'epsilon' parameter for YourEpsilonGreedy has to be in [0, 1]."  # For DEBUG
        self._epsilon = epsilon


    # This decorator @property makes this method an attribute, cf. https://docs.python.org/3/library/functions.html#property
    @property
    def epsilon(self):  # This allows us to use time-dependent epsilon coef
        return self._epsilon

    def choice(self):
        """With a probability of epsilon, explore (uniform choice), otherwhise exploit based on empirical mean rewards."""
        #----------Your Code----------#
        
        if random() < self.epsilon:
            # Exploration: choose a random arm
            return rn.randint(self.nbArms)
        else:
            # Exploitation: choose the best arm
            return np.argmax(self.rewards)
        
        #----------End of Your Code----------#
   

In [15]:
POLICIES = [
        # --- EmpiricalMeans (or Greedy) algorithm, a baseline for comparison
        {
            "archtype": EmpiricalMeans,
            "params": {}
        },
        # --- UCB algorithm, a baseline for comparison
        {
            "archtype": UCB,
            "params": {}
        },
        # --- YourEpsilonGreedy algorithm
        {
            "archtype": YourEpsilonGreedy,
            "params": {"epsilon": 0.01}
        },
    ]

In [16]:
configuration = {
    # --- Duration of the experiment
    "horizon": HORIZON,
    # --- Number of repetition of the experiment (to have an average)
    "repetitions": REPETITIONS,
    # --- Parameters for the use of joblib.Parallel
    "n_jobs": N_JOBS,    # Number of CPU cores
    "verbosity": 6,      # Max joblib verbosity
    # --- Arms
    "environment": ENVIRONMENTS,
    # --- Algorithms
    "policies": POLICIES,
}
configuration

{'horizon': 10000,
 'repetitions': 100,
 'n_jobs': 1,
 'verbosity': 6,
 'environment': [{'arm_type': SMPyBandits.Arms.Bernoulli.Bernoulli,
   'params': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
  {'arm_type': SMPyBandits.Arms.Bernoulli.Bernoulli,
   'params': [0.01, 0.02, 0.3, 0.4, 0.5, 0.6, 0.795, 0.8, 0.805]},
  {'arm_type': SMPyBandits.Arms.Bernoulli.Bernoulli,
   'params': [0.01, 0.01, 0.01, 0.02, 0.02, 0.02, 0.05, 0.05, 0.1]}],
 'policies': [{'archtype': SMPyBandits.Policies.EmpiricalMeans.EmpiricalMeans,
   'params': {}},
  {'archtype': SMPyBandits.Policies.UCB.UCB, 'params': {}},
  {'archtype': __main__.YourEpsilonGreedy, 'params': {'epsilon': 0.01}}]}

In [17]:
# Creating the Evaluator object
evaluation = Evaluator(configuration)

Number of policies in this comparison: 3
Time horizon: 10000
Number of repetitions: 100
Sampling rate for plotting, delta_t_plot: 1
Number of jobs for parallelization: 1
Using this dictionary to create a new environment:
 {'arm_type': <class 'SMPyBandits.Arms.Bernoulli.Bernoulli'>, 'params': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}


Creating a new MAB problem ...
  Reading arms of this MAB problem from a dictionnary 'configuration' = {'arm_type': <class 'SMPyBandits.Arms.Bernoulli.Bernoulli'>, 'params': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]} ...
 - with 'arm_type' = <class 'SMPyBandits.Arms.Bernoulli.Bernoulli'>
 - with 'params' = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
 - with 'arms' = [B(0.1), B(0.2), B(0.3), B(0.4), B(0.5), B(0.6), B(0.7), B(0.8), B(0.9)]
 - with 'means' = [0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
 - with 'nbArms' = 9
 - with 'maxArm' = 0.9
 - with 'minArm' = 0.1

This MAB problem has: 
 - a [Lai & Robbins] complexity constant C(mu) = 7.52 ... 
 - a

In [19]:
for envId, env in tqdm(enumerate(evaluation.envs), desc="Problems"):
    # Evaluate just that env
    evaluation.startOneEnv(envId, env)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for envId, env in tqdm(enumerate(evaluation.envs), desc="Problems"):


ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [20]:
def plotAll(evaluation, envId):
    evaluation.printFinalRanking(envId)
    evaluation.plotRegrets(envId)

In [21]:
envId = 0
plotAll(evaluation, envId)


Giving the final ranks ...

Final ranking for this environment #0 : (using less accurate estimate of the regret)


IndexError: list index out of range