In [146]:
import random
import numpy as np
import pandas as pd
import scipy.stats as stats
from plotnine import *
from abc import ABC, abstractmethod

def reward_estimation(reward, last_reward, n):
    if(n == 0):
        return reward
    else:
        reward + 1/n*abs(last_reward - reward) 


class State:
    def __init__(self, number_of_choices):
        self.choices = np.array(range(number_of_choices))
        self.rewards = np.repeat(0, number_of_choices)
        self.taken = np.repeat(0, number_of_choices) 
        
class Agent(ABC):
 
    def __init__(self, number_of_choices):
        self.reward = 0
        self.State = State(number_of_choices)
        self.reward_sum = 0
      

    @abstractmethod
    def take_action(self):
        pass
    
    
    
class Greedy(Agent):
    def __init__(self, epsilon, number_of_choices):
        self.epsilon = epsilon
        super().__init__(number_of_choices)
        
    
    def take_action(self):
        explore = np.random.choice([False, True], 1, p = [1 - self.epsilon, self.epsilon])[0]
        choice = np.argmax(self.State.rewards)
        if explore == True:
            exploration_choices = np.delete(self.State.choices, choice)
            choice = random.choice(exploration_choices)
        return(choice)
    
    def get_reward(bandit):
        self.reward =  bandit.return_reward()
        self.reward_sum += self.reward
        
    def update_state(self):
        pass
    
    def display_state(self):
        print("choices: ",self.State.choices) 
        print("rewards estimations: ",self.State.rewards) 
        print("number of picks: ", self.State.taken) 
    

class Bandit:
    def __init__(self):
        self.__mean = random.randint(0,9)
        self.__sd = random.randint(0,9)
        
    def return_reward(self):
        return random.gauss(self.__mean, self.__sd)
        
    

In [147]:
random.seed(27)
n = 10
steps = 2000



bandits = list()
for i in range(10):
    bandits.append(Bandit())

In [148]:
greedy = Greedy(epsilon = 0.1, number_of_choices = 9)

In [150]:
greedy.display_state()
for i in range(1):
    choice = greedy.take_action()
    choice
    greedy.get_reward(bandits[choice])
    greedy.reward

choices:  [0 1 2 3 4 5 6 7 8]
rewards estimations:  [0 0 0 0 0 0 0 0 0]
number of picks:  [0 0 0 0 0 0 0 0 0]


TypeError: get_reward() takes 1 positional argument but 2 were given

In [7]:
choice_reward_df
choice_reward_df['choice'] = choice_reward_df['choice'].astype('category')

In [11]:
mu = 0
sigma = 1
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
y = stats.norm.pdf(x, mu, sigma)
y

array([0.00443185, 0.00530579, 0.00632878, 0.00752133, 0.00890582,
       0.0105065 , 0.01234943, 0.01446241, 0.01687483, 0.01961746,
       0.02272223, 0.02622189, 0.03014961, 0.03453857, 0.03942137,
       0.0448295 , 0.05079264, 0.05733801, 0.06448952, 0.07226707,
       0.08068571, 0.08975477, 0.09947714, 0.10984842, 0.12085626,
       0.13247967, 0.14468855, 0.15744319, 0.17069405, 0.18438164,
       0.1984366 , 0.21277993, 0.22732351, 0.24197072, 0.2566174 ,
       0.27115285, 0.28546117, 0.29942268, 0.31291556, 0.3258175 ,
       0.33800759, 0.34936814, 0.35978656, 0.36915722, 0.37738323,
       0.38437808, 0.3900672 , 0.39438923, 0.39729716, 0.39875915,
       0.39875915, 0.39729716, 0.39438923, 0.3900672 , 0.38437808,
       0.37738323, 0.36915722, 0.35978656, 0.34936814, 0.33800759,
       0.3258175 , 0.31291556, 0.29942268, 0.28546117, 0.27115285,
       0.2566174 , 0.24197072, 0.22732351, 0.21277993, 0.1984366 ,
       0.18438164, 0.17069405, 0.15744319, 0.14468855, 0.13247

In [99]:
random.choice(np.array([2,3,4]))

4

In [141]:
 bandits[random.choice(np.array([2,3,4]))].return_reward()

2.843913705364525

In [155]:
bandits[1].return_reward()

6.7365679410544