In [5]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.stats import beta

sns.set_style("whitegrid")

In [36]:
class Environment:
    def __init__(self,variants,payouts,n_trials,variance=False):
        self.variants = variants
        if variance:
            self.payouts = np.clip(payouts+np.random.normal(0,0.04,size=len(variants)),0,0.2)
        else:
            self.payouts = payouts
        self.n_trials=n_trials
        self.total_reward = 0
        self.n_k = len(variants)
        self.shape = (self.n_k,self.n_trials)
    def run(self,agent):
        for i in range(self.n_trials):
            x_choosen = agent.choose_k()
            reward = np.random.binomial(1,p=self.payouts[x_choosen])
            agent.reward = reward
            agent.update()
            self.total_reward +=reward
        agent.collect_data()
        return self.total_rewards

            

In [37]:
class BaseSampler:
    def __init__(self,env,n_samples=None,n_learning=None,e=0.05):
        self.env=env
        self.shape=(env.n_k,n_samples)
        self.variants=env.variants
        self.payouts=env.payouts
        self.n_trials = env.n_trials
        self.ad_i=np.zeros(env.n_trials)
        self.r_i=np.zeros(env.n_trials)
        self.thetas=np.zeros(self.n_trials)
        self.regret_i=np.zeros(env.n_trials)
        self.thetaregret=np.zeros(env.n_trials)
        
        self.a=np.ones(env.n_k)
        self.b=np.ones(env.n_k)
        self.theta = np.zeros(env.n_k)
        self.data = None
        self.reward = 0
        self.total_reward = 0
        self.k=0
        self.i=0
        self.n_samples=n_samples
        self.n_learning=n_learning
        self.e=e
        self.ep=np.random.uniform(0,1,size=env.n_trials)
        self.exploit=(1-e)
        
    def collectdata(self):
        self.data = pd.DataFrame(dict(ad=self.ad_i,reward=self.r_i,regret = self.regret_i))
        

In [38]:
class RandomSampler(BaseSampler):
    def __init__(self,env):
        super().__init__(env)
    def choose_k(self):
        self.k=np.random.choice(self.variants)
        return self.k
    def update(self):
        self.thetaregret[self.i]=np.max(self.theta)-self.theta[self.k]
        self.a[self.k]+=self.reward
        self.b[self.k]+=1
        self.theta=self.a/self.b
        self.ad_i[self.i]=self.k
        self.r_i[self.i]=self.reward
        self.i+=1
        

In [39]:
class eGreedy(BaseSampler):
    def __init__(self,env,n_learning,e):
        super().__init(env,n_learning,e)
    def choose_k(self):
        if(self.i<self.n_learning):
            self.k =np.random.choice(self.variants)
        else:
            self.k = np.argmax(self.theta)
        if(self.ep[self.i]>self.exploit):
            self.k=np.random.choice(self.variants)
        else:
            self.k=self.k
        return self.k
    def update(self):
        self.a[self.k]+=self.reward
        self.b[self.k]+=1
        self.theta= self.a/self.b
        self.thetas[self.i]=self.theta[self.k]
        self.thetaregret[self.i]=np.max(self.thetas) - self.theta[self.k]
        self.ad_i[self.i]=self.k
        self.r_i[self.i]=self.reward
        self.i+=1

In [40]:
n_trials = 10000
machines = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
payouts = [0.023, 0.03, 0.029, 0.001, 0.05, 0.06, 0.0234, 0.035, 0.01, 0.11]
labels = ["V" + str(i) + (str(p)) for i, p in zip(machines, payouts)]
labels

['V00.023',
 'V10.03',
 'V20.029',
 'V30.001',
 'V40.05',
 'V50.06',
 'V60.0234',
 'V70.035',
 'V80.01',
 'V90.11']

In [41]:
en0 = Environment(machines, payouts, n_trials)
rs = RandomSampler(env=en0)
en0.run(agent=rs)

AttributeError: 'Environment' object has no attribute 'total_reward'