# COMP90051 Project 2

In [2]:
# Do not edit. These are the only imports permitted.
%matplotlib inline
import numpy as np
from numpy.linalg import inv
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod

## 1. Implement ε-Greedy and UCB

In [3]:
class MAB(ABC):
    """
    Abstract base class for a multi-armed bandit (MAB).

    Concrete bandits must override both `play` and `update`; the class
    cannot be instantiated until they do.
    """

    @abstractmethod
    def play(self, tround, context):
        """Abstract hook; the base implementation does nothing (returns None)."""
        return None

    @abstractmethod
    def update(self, arm, reward, context):
        """Abstract hook; the base implementation does nothing (returns None)."""
        return None


In [4]:
class EpsGreedy(MAB):
    """
    Epsilon-Greedy multi-armed bandit.

    Arguments
    =========
    narms : int
        number of arms; arms are identified by the integers 1..narms

    epsilon : float
        probability of picking an arm uniformly at random (explore)
        instead of an arm with the highest value estimate (exploit)

    Q0 : float, default=np.inf
        value estimate assigned to every arm that has not been pulled yet
    """

    def __init__(self, narms, epsilon, Q0=np.inf):
        self.narms, self.epsilon, self.Q0 = narms, epsilon, Q0
        self.arm_list = list(range(1, narms + 1))
        # arm -> current value estimate (Q0 until the arm is pulled,
        # afterwards the sample mean of its observed rewards).
        self.arm_dict = dict.fromkeys(self.arm_list, Q0)
        # arm -> number of rewards observed so far.
        self.counts = dict.fromkeys(self.arm_list, 0)
        # Every reward passed to update(), in order of arrival.
        self.reward_list = []

    def play(self, tround, context=None):
        """
        Choose an arm for this round.

        `tround` and `context` are accepted for interface compatibility
        but are not used by this policy.

        Returns the chosen arm as an int in 1..narms.
        """
        if np.random.random() > self.epsilon:
            # Exploit: pick among the arms sharing the best estimate,
            # breaking ties uniformly at random.
            best = max(self.arm_dict.values())
            candidates = [arm for arm, q in self.arm_dict.items() if q == best]
            return int(np.random.choice(candidates))
        # Explore: uniform over all arms.
        return int(np.random.randint(1, self.narms + 1))

    def update(self, arm, reward, context=None):
        """
        Record `reward` for `arm` and refresh that arm's sample mean.

        `context` is accepted for interface compatibility but is unused.
        """
        self.reward_list.append(reward)
        self.counts[arm] += 1
        pulls = self.counts[arm]
        if pulls == 1:
            # First observation replaces the Q0 placeholder (which may be inf).
            self.arm_dict[arm] = float(reward)
        else:
            # Incremental mean: avoids storing and re-averaging the history.
            self.arm_dict[arm] += (reward - self.arm_dict[arm]) / pulls

In [5]:
class UCB(MAB):
    """
    Upper Confidence Bound (UCB) multi-armed bandit.

    Arguments
    =========
    narms : int
        number of arms; arms are identified by the integers 1..narms

    rho : float
        multiplier of the exploration term: an arm pulled n times scores
        mean + sqrt(rho * log(tround) / n)

    Q0 : float, default=np.inf
        score assigned to every arm that has not been pulled yet
    """

    def __init__(self, narms, rho, Q0=np.inf):
        self.narms, self.rho, self.Q0 = narms, rho, Q0
        self.arm_list = list(range(1, narms + 1))
        # arm -> sample mean of observed rewards (Q0 until first pull).
        self.arm_dict = dict.fromkeys(self.arm_list, Q0)
        # arm -> number of rewards observed so far. Whether an arm has been
        # played is decided from this count, never by comparing the mean
        # with Q0 (a played arm's mean can legitimately equal a finite Q0).
        self.counts = dict.fromkeys(self.arm_list, 0)
        # Every reward passed to update(), in order of arrival.
        self.reward_list = []

    def play(self, tround, context=None):
        """
        Choose the arm with the highest upper confidence bound.

        `context` is accepted for interface compatibility but is unused.
        Ties are broken uniformly at random.

        Returns the chosen arm as an int in 1..narms.
        """
        # log(t) is undefined/negative for t < 1, which would put NaN under
        # the square root; clamp so a 0-based round counter is harmless.
        log_t = np.log(max(tround, 1))
        scores = {}
        for arm in self.arm_list:
            pulls = self.counts[arm]
            if pulls == 0:
                scores[arm] = self.Q0
            else:
                bonus = np.sqrt(self.rho * log_t / pulls)
                scores[arm] = self.arm_dict[arm] + bonus
        best = max(scores.values())
        candidates = [arm for arm, score in scores.items() if score == best]
        return int(np.random.choice(candidates))

    def update(self, arm, reward, context=None):
        """
        Record `reward` for `arm` and refresh that arm's sample mean.

        `context` is accepted for interface compatibility but is unused.
        """
        self.reward_list.append(reward)
        self.counts[arm] += 1
        pulls = self.counts[arm]
        if pulls == 1:
            # First observation replaces the Q0 placeholder (which may be inf).
            self.arm_dict[arm] = float(reward)
        else:
            # Incremental mean: avoids storing and re-averaging the history.
            self.arm_dict[arm] += (reward - self.arm_dict[arm]) / pulls

## 2. Off-Policy Evaluation

In [6]:
# Load the logged events; each row is split by column into
# arm (col 0), reward (col 1) and the context features (cols 2..101).
file = np.loadtxt('dataset.txt')
arms, rewards, contexts = file[:, 0], file[:, 1], file[:, 2:102]

In [7]:
def offlineEvaluate(mab, arms, rewards, contexts, nrounds=None):
    """
    Replay logged events against a bandit and collect its rewards.

    The log is scanned in order. For each event the bandit is asked to
    play; if its choice equals the logged arm, the event counts as one
    round: the bandit is updated with the logged reward and the reward is
    recorded. Non-matching events are discarded.

    Arguments
    =========
    mab : object with play(tround, context) and update(arm, reward, context)
    arms : sequence of logged arm ids, one per event
    rewards : sequence of logged rewards, one per event
    contexts : sequence of logged contexts, one per event
    nrounds : int or None
        number of matching rounds to collect; None means use the whole log

    Returns
    =======
    list of the rewards of the matched events from this call, in order.
    It is shorter than `nrounds` if the log runs out first.
    """
    n_events = len(arms)
    matched_rewards = []
    event = 0
    # Number of matched rounds so far; this is the `tround` handed to
    # play(), so the first round is played with tround == 0.
    tround = 0
    while event < n_events and (nrounds is None or tround < nrounds):
        if mab.play(tround, contexts[event]) == arms[event]:
            # Keep the reward's real value; int() would truncate
            # fractional rewards towards zero.
            reward = float(rewards[event])
            mab.update(int(arms[event]), reward, contexts[event])
            matched_rewards.append(reward)
            tround += 1
        event += 1
    return matched_rewards

In [9]:
# Replay the log with a 10-arm epsilon-greedy bandit (epsilon = 0.05)
# for up to 800 matching rounds and report the mean reward.
mab_EpsGreedy = EpsGreedy(narms=10, epsilon=0.05)
results_EpsGreedy = offlineEvaluate(
    mab_EpsGreedy, arms, rewards, contexts, nrounds=800
)
print('EpsGreedy average reward', np.mean(results_EpsGreedy))

EpsGreedy average reward 0.27625


In [10]:
# Replay the log with a 10-arm UCB bandit (rho = 1.0) for up to
# 800 matching rounds and report the mean reward.
mab_UCB = UCB(narms=10, rho=1.0)
results_UCB = offlineEvaluate(
    mab_UCB, arms, rewards, contexts, nrounds=800
)
print('UCB average reward', np.mean(results_UCB))

UCB average reward 0.15


## 3. Contextual Bandits

In [11]:
class LinUCB(MAB):
    """
    Contextual multi-armed bandit (LinUCB with one linear model per arm).

    Arguments
    =========
    narms : int
        number of arms; arms are identified by the integers 1..narms

    ndims : int
        number of features in each arm's context vector

    alpha : float
        multiplier of the confidence-width (exploration) term
    """

    def __init__(self, narms, ndims, alpha):
        self.narms, self.ndims, self.alpha = narms, ndims, alpha
        self.arm_list = list(range(1, narms + 1))
        # One independent array per arm (dict.fromkeys would make every
        # arm reference the same array object).
        self.A = {arm: np.eye(ndims) for arm in self.arm_list}
        self.b = {arm: np.zeros((ndims, 1)) for arm in self.arm_list}
        # Every reward passed to update(), in order of arrival.
        self.reward_list = []

    def _arm_contexts(self, context):
        """Return `context` as a (narms, ndims) array, one row per arm."""
        return np.asarray(context, dtype=float).reshape(self.narms, self.ndims)

    def play(self, tround, context):
        """
        Choose the arm with the highest upper confidence bound.

        `context` must hold narms * ndims values, laid out arm by arm.
        `tround` is accepted for interface compatibility but is unused.
        Ties are broken uniformly at random.

        Returns the chosen arm as an int in 1..narms.
        """
        arm_contexts = self._arm_contexts(context)
        scores = {}
        for arm in self.arm_list:
            x = arm_contexts[arm - 1].reshape(self.ndims, 1)
            A_inv = inv(self.A[arm])  # computed once, used for both terms
            theta = A_inv @ self.b[arm]
            estimate = (theta.T @ x).item()
            width = np.sqrt((x.T @ A_inv @ x).item())
            scores[arm] = estimate + self.alpha * width
        best = max(scores.values())
        candidates = [arm for arm, score in scores.items() if score == best]
        return int(np.random.choice(candidates))

    def update(self, arm, reward, context):
        """
        Update the model of `arm` with the observed `reward`.

        The arm's feature vector is taken from the `context` passed here,
        so update() does not depend on a preceding play() call.
        """
        x = self._arm_contexts(context)[arm - 1].reshape(self.ndims, 1)
        self.reward_list.append(reward)
        self.A[arm] = self.A[arm] + x @ x.T
        self.b[arm] = self.b[arm] + reward * x

In [12]:
# Replay the log with a 10-arm LinUCB bandit (10 features per arm,
# alpha = 1.0) for up to 800 matching rounds and report the mean reward.
mab_LinUCB = LinUCB(narms=10, ndims=10, alpha=1.0)
results_LinUCB = offlineEvaluate(
    mab_LinUCB, arms, rewards, contexts, nrounds=800
)
print('LinUCB average reward', np.mean(results_LinUCB))

LinUCB average reward 0.53


## 4. Evaluation
### 4.A.

In [None]:
# Plot the running mean reward of LinUCB: Y[i] is the average of the first
# i + 1 rewards it received, X[i] is the corresponding round number.
# The rewards are copied into a new array first, so the bandit's own
# reward_list is never modified (appending to it while looping over it
# would never terminate).
rewards_LinUCB = np.asarray(mab_LinUCB.reward_list, dtype=float)
X = list(range(1, len(rewards_LinUCB) + 1))
Y = (np.cumsum(rewards_LinUCB) / np.arange(1, len(rewards_LinUCB) + 1)).tolist()
plt.plot(X, Y, label="1", c="r")
plt.show()

### 4.B.

## 5. KernelUCB

In [None]:
# Do not edit. Special import for this section.
from sklearn.metrics.pairwise import rbf_kernel

In [None]:
class KernelUCB(MAB):
    """
    Kernelised contextual multi-armed bandit (Kernelised LinUCB)
    
    Arguments
    =========
    narms : int
        number of arms

    ndims : int
        number of dimensions for each arm's context

    gamma : float
        positive real explore-exploit parameter
    
    eta : float
        positive real explore-exploit parameter
    
    kern : callable
        a kernel function from sklearn.metrics.pairwise
    """
    def __init__(self, narms, ndims, gamma, eta, kern):
        self.narms, self.ndims = narms, ndims
        self.gamma, self.eta, self.kern = gamma, eta, kern
        # Context vectors of the arms that were actually pulled, one row
        # per update() call, in order.
        self.history = np.empty((0, ndims))
        # Column vector of the rewards matching the rows of `history`.
        self.y = np.empty((0, 1))
        # Inverse of (kern(history, history) + gamma * I), maintained
        # incrementally by update().
        self.K_inv = np.empty((0, 0))
        # Every reward passed to update(), in order of arrival.
        self.reward_list = []

    def _arm_contexts(self, context):
        """Return `context` as a (narms, ndims) array, one row per arm."""
        return np.asarray(context, dtype=float).reshape(self.narms, self.ndims)

    def play(self, tround, context):
        """
        Choose the arm with the highest kernelised upper confidence bound.

        `context` must hold narms * ndims values, laid out arm by arm.
        `tround` is accepted for interface compatibility but is unused:
        "first round" means that no reward has been observed yet.
        Ties are broken uniformly at random.

        Returns the chosen arm as an int in 1..narms.
        """
        arm_contexts = self._arm_contexts(context)
        if self.history.shape[0] == 0:
            # Nothing observed yet: u = [1, 0, ..., 0], i.e. play arm 1.
            return 1
        # k[n, s] = kern(context of arm n, s-th stored context)
        k = np.asarray(self.kern(arm_contexts, self.history))
        k_Kinv = k @ self.K_inv
        self_similarity = np.diag(np.asarray(self.kern(arm_contexts, arm_contexts)))
        variance = self_similarity - np.sum(k_Kinv * k, axis=1)
        # Round-off can push a tiny variance just below zero.
        sigma = np.sqrt(np.maximum(variance, 0.0))
        u = (k_Kinv @ self.y).ravel() + self.eta / np.sqrt(self.gamma) * sigma
        candidates = np.flatnonzero(u == u.max()) + 1
        return int(np.random.choice(candidates))

    def update(self, arm, reward, context):
        """
        Store the pulled arm's context and reward and extend `K_inv`.

        `K_inv` is grown by one row/column with the block-matrix inverse
        formula instead of re-inverting the full kernel matrix.
        """
        x = self._arm_contexts(context)[arm - 1].reshape(1, self.ndims)
        k_xx = float(np.asarray(self.kern(x, x))[0, 0]) + self.gamma
        if self.history.shape[0] == 0:
            self.K_inv = np.array([[1.0 / k_xx]])
        else:
            # b: kernel between every stored context and the new one.
            b = np.asarray(self.kern(self.history, x)).reshape(-1, 1)
            Kinv_b = self.K_inv @ b
            K22 = 1.0 / (k_xx - (b.T @ Kinv_b).item())
            # K_inv is symmetric, so K_inv b b^T K_inv == Kinv_b Kinv_b^T.
            K11 = self.K_inv + K22 * (Kinv_b @ Kinv_b.T)
            K12 = -K22 * Kinv_b
            self.K_inv = np.block([[K11, K12], [K12.T, np.array([[K22]])]])
        self.history = np.vstack([self.history, x])
        self.y = np.vstack([self.y, [[reward]]])
        self.reward_list.append(reward)