In [56]:
import random
import numpy as np
from collections import Counter

In [59]:
class LinUCB():
    '''
        This class implements LinUCB Disjoint algorithm
    '''
    
    def __init__( self,  kArms, alpha, ctxtVecLen):
        
        '''
            alpha : Controls exploration; 
                    Higher the alpha, wider the confidence interval and higher the chance of trying any given arm;
           
            kArms: Number of arms
            
            ctxtVecLen: determines the shape of Design Matrix & Response Vector
            
        '''
        
        self.kArms      = kArms
        self.alpha      = alpha
        self.ctxtVecLen = ctxtVecLen
        
        # A: (ctxtVecLen x ctxtVecLen) matrix = D_a.T * D_a + I_d  ( Where a is indexing on arms array)
        # The inverse of A is used in ridge regression 
        self.A_a = [np.identity(ctxtVecLen)  for eArm in range(kArms) ]        
        
        # b: (ctxtVecLen x 1) corresponding response vector. 
        # Equals to D_a.T * c_a in ridge regression formulation
        self.b_a = [np.zeros([ctxtVecLen,1]) for eArm in range(kArms) ]
        
    def __repr__(self):
        return f'LinUCB: kArms:{kArms}, alpha: {alpha}'.format(self.alpha, self.kArms, self.ctxtVecLen) 
    
    def select_arm(self, ctxtVec):
        ''' Returns the index of the next arm to pull '''
        
        # Reshape covariates input into (d x 1) shape vector
        ctxtVec = ctxtVec.reshape([-1,1])

        ucb = []
        for idx in range(kArms):

            # Find A inverse for ridge regression
            A_inv = np.linalg.inv(self.A_a[idx])

            # Perform ridge regression to obtain estimate of covariate coefficients theta
            # theta is (ctxtVecLen x 1) dimension vector
            theta = np.dot(A_inv, self.b_a[idx])

            # Find ucb based on p formulation (mean + std_dev) 
            # p is (1 x 1) dimension vector

            ucb.append( 
                np.dot(theta.T, ctxtVec) + self.alpha * np.sqrt(np.dot(ctxtVec.T, np.dot(A_inv,ctxtVec))) 
            )        
        
        max_val = max( ucb  )            
        return ucb.index(max_val) +1
    
    def update(self, ctxtVec, chosen_arm, reward):
        '''        
            After we pull an arm, we get a reward signal back from our system. This function update our algorithm's beliefs
            about the quality of the arm we just chose by providing this reward information.
        
            chosen_arm : The numeric index of the most recently chosen arm
            reward     : The reward received from chossing that arm
        '''
        # Reshape covariates input into (d x 1) shape vector
        ctxtVec = ctxtVec.reshape([-1,1])
        
        # Update A which is (d * d) matrix.
        self.A_a[chosen_arm] += np.dot(ctxtVec, ctxtVec.T)
        
        # Update b which is (d x 1) vector
        # reward is scalar
        self.b_a[chosen_arm] += reward * ctxtVec


kArms      = 10
alpha      = 1.5
ctxtVecLen = 100

LinUCBPolicy = LinUCB(kArms, alpha, ctxtVecLen)


with open('./dataset.txt') as fp:
    for eachRecord in fp:
        data_arm = int(eachRecord.split(' ')[0])
        data_reward = float(eachRecord.split()[1])
        covariate_string_list = eachRecord.split()[2:]
        data_x_array = np.array([float(eCov) for eCov in eachRecord.split()[2:]])
        break
    


tCtr = Counter()
noOfMatches = 0
cumulative_reward = 0 
with open('./dataset.txt') as fp:
    for idx, eachRecord in enumerate(fp):
        if idx > 10000:
            break
        data_arm = int(eachRecord.split(' ')[0])
        data_reward = float(eachRecord.split()[1])
        covariate_string_list = eachRecord.split()[2:]        
        data_x_array = np.array([float(eCov) for eCov in eachRecord.split()[2:]])
        selected_arm = LinUCBPolicy.select_arm( data_x_array )
        if selected_arm == data_arm:
            noOfMatches += 1
            cumulative_reward = cumulative_reward + data_reward
            LinUCBPolicy.update(data_x_array, selected_arm, data_reward) 

noOfMatches, cumulative_reward



(1020, 21.0)

In [60]:
21.0/1020

0.020588235294117647