In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from functools import partial 
# Seed NumPy's legacy global RNG: every random draw in this notebook goes through
# np.random.*, so a fresh "Restart & Run All" reproduces the same instances and noise.
np.random.seed(0)

In [None]:
def generate_instance(T=100000, N=6, d=20, mu_noise_level=0.1,
                      reward_noise_level=0.1, covariate_diversity=True):
    '''Draw one random problem instance for the simulation.

    All arguments are optional; calling generate_instance() with no arguments
    reproduces the previously hard-coded configuration.

    Parameters
    ----------
    T : int
        Number of interaction rounds (one context is pre-generated per round).
    N : int
        Number of arms.
    d : int
        Dimension of the context vectors and of each arm's parameter vector.
    mu_noise_level : float
        Scale of the noise on observed arm parameter vectors.
    reward_noise_level : float
        Scale of the noise on realized rewards.
    covariate_diversity : bool
        Forwarded to get_contexts to select how contexts are sampled.

    Returns
    -------
    dict
        Keys: 'T', 'N', 'd', 'mu_noise_level', 'reward_noise_level',
        'mu_true' (N x d array of standard-normal draws) and 'contexts'
        (whatever get_contexts returns for d, T, covariate_diversity).
    '''
    # Draw order matters under a fixed seed: arm parameters first, then contexts.
    mu_true = np.random.randn(N,d)
    contexts = get_contexts(d,T,covariate_diversity)

    return {
        'T':T,
        'N':N,
        'd':d,
        'mu_noise_level':mu_noise_level,
        'reward_noise_level':reward_noise_level,
        'mu_true':mu_true,
        'contexts':contexts,
    }

In [None]:
def mu_noise(d,mu_noise_level=0.5):
    '''Return a length-d vector of standard-normal draws scaled by mu_noise_level.

    Used as the perturbation added to an arm's true parameter vector.
    '''
    standard_draw = np.random.randn(d)
    return mu_noise_level * standard_draw

def reward_noise(reward_noise_level=0.1):
    '''Return one standard-normal draw scaled by reward_noise_level.

    Used as the additive noise on a realized reward.
    '''
    unit_draw = np.random.randn()
    return reward_noise_level * unit_draw

def get_choice_myopic(context,mu_x,payments=None):
    '''Pick the arm a myopic, deterministic agent would choose.

    The agent scores every arm as np.dot(mu_x, context), adds the per-arm
    payments when given, and takes the arm with the largest score.
    (tbd: stochastic agents.)

    Parameters
    ----------
    context : array
        Context vector of the arriving agent.
    mu_x : array
        Arm parameter vectors, one row per arm.
    payments : array or None
        Optional per-arm bonus added to the scores; None means no payment.

    Returns
    -------
    Index of the highest-scoring arm (first one on ties, per argmax).
    '''
    utilities = np.dot(mu_x,context)
    if payments is not None:
        # Out-of-place add: an in-place `+=` writes the sum back into the
        # dot-product buffer and raises a casting error when that buffer is
        # integer-typed and payments are floats.
        utilities = utilities + payments
    return utilities.argmax()

def get_contexts(d,T,covariate_diversity=True):
    '''Pre-generate the context vector for each of the T rounds.

    Parameters
    ----------
    d : int
        Dimension of each context vector.
    T : int
        Number of rounds; contexts are keyed 0 .. T-1.
    covariate_diversity : bool
        If truthy, each context is an independent standard-normal vector.
        Otherwise contexts are drawn from a multivariate normal with an
        all-ones mean and covariance ones((d,d)) + cov0 cov0^T, where cov0
        is a 0.1-scaled standard-normal d x d matrix drawn once per call.

    Returns
    -------
    dict mapping round index t to a length-d array.
    '''
    # Truthiness test rather than `is True`: an identity check would send
    # truthy non-bool flags (e.g. np.bool_(True) or 1) to the correlated branch.
    if covariate_diversity:
        return {t: np.random.randn(d) for t in range(T)}

    mean = np.ones(d)
    cov0 = 0.1*np.random.randn(d,d)
    # ones + cov0 cov0^T is a sum of two positive semi-definite matrices.
    cov = np.ones((d,d)) + np.dot(cov0,cov0.transpose())
    return {t: np.random.multivariate_normal(mean, cov) for t in range(T)}


In [None]:
def interactions(instance,get_payments):
    '''Simulate T rounds between the platform and myopic agents on one instance.

    Each round: an agent arrives with a pre-generated context, the platform
    chooses payments via get_payments, the agent picks the arm maximizing
    estimated utility plus payment, a noisy observation of that arm's
    parameter vector is revealed, and the platform updates its estimate.

    Parameters
    ----------
    instance : dict
        Needs 'mu_true', 'N', 'd', 'mu_noise_level', 'T' and 'contexts';
        'reward_noise_level' is used when present (0.1 otherwise).
    get_payments : callable
        Called as get_payments(context, contexts, history, t, T, N, mu_estimated);
        returns a per-arm payment array, or None for no payment.

    Returns
    -------
    pseudoregret : array of length T
        Cumulative pseudo-regret measured against the arm that is best under mu_true.
    history : dict
        Maps round t to {'choice', 'payments', 'mu_realized', 'reward_realized'}.
    '''
    mu_true = instance['mu_true']
    N = instance['N']
    d = instance['d']
    mu_noise_level = instance['mu_noise_level']
    # 0.1 is reward_noise's own default, kept as the fallback for instance
    # dicts that do not carry the key.
    reward_noise_level = instance.get('reward_noise_level', 0.1)
    T = instance['T']
    contexts = instance['contexts']

    #Initial N rounds are free: one noisy observation per arm
    mu_estimated = np.zeros_like(mu_true)
    for arm in range(N):
        mu_estimated[arm] = mu_true[arm] + mu_noise(d,mu_noise_level)
    # Observations folded into each arm's estimate so far (the free one counts).
    n_obs = np.ones(N)

    #variables for logging
    pseudoregret_inst = np.zeros(T)
    history = {}

    #T interactions
    for t in range(T):
        #agent arrives
        context = contexts[t]

        #platform decides to pay
        payments = get_payments(context,contexts,history,t,T,N,mu_estimated)

        #agent decides arm and information is revealed
        choice = get_choice_myopic(context,mu_estimated,payments)
        mu_realized = mu_true[choice] + mu_noise(d,mu_noise_level)
        reward_realized = np.dot(mu_true[choice],context) + reward_noise(reward_noise_level)

        #Platform updates estimates: running mean over this arm's own
        #observations. Weighting by the global round index t would throw away
        #the free initial sample at t=0 and under-weight rarely pulled arms.
        n = n_obs[choice]
        mu_estimated[choice] = (n*mu_estimated[choice] + mu_realized)/(n+1)
        n_obs[choice] += 1

        #Regret computation
        opt_choice = get_choice_myopic(context,mu_true,None)
        pseudoregret_inst[t] = np.dot(mu_true[opt_choice] - mu_true[choice],context)

        #History
        history[t] = {'choice':choice,'payments':payments,
                      'mu_realized':mu_realized,'reward_realized':reward_realized}

    # .min() raises on an empty array, so skip the sanity check when T == 0.
    assert T == 0 or pseudoregret_inst.min() >= 0
    pseudoregret = np.cumsum(pseudoregret_inst)
    return pseudoregret,history

In [None]:
def get_payments_none(context,contexts,history,t,T,N,mu_estimated):
    '''Payment rule that never pays: every argument is ignored and None is returned.'''
    no_payment = None
    return no_payment

In [None]:
def get_payments_iehu(context,contexts,history,t,T,N,mu_estimated):
    '''Decide whether to pay the current agent, and how much, for one arm.

    Steps:
      1. Count pulls per arm from history; the phase estimate s is the
         smallest count plus one.
      2. For every arm, count the past contexts (rounds 0 .. t-1) on which it
         strictly beats every other arm under mu_estimated, and normalize the
         counts to fractions (phivec).
      3. If the smallest fraction exceeds 1/log(s), no payment is made.
         Otherwise the arm with the smallest fraction receives a payment equal
         to its largest positive estimated-utility gap to any other arm on the
         current context (0 if it is already the best arm).

    Parameters
    ----------
    context : array
        Context of the arriving agent.
    contexts : mapping or sequence
        Contexts indexable by round; only rounds 0 .. t-1 are read.
    history : dict
        Past rounds; each value must carry a 'choice' entry.
    t, T, N : int
        Current round, horizon (unused here) and number of arms.
    mu_estimated : array (N x d)
        Current per-arm parameter estimates.

    Returns
    -------
    None when no payment is made, else a length-N array that is zero except
    at the payment-eligible arm.
    '''
    mu_estimated = np.asarray(mu_estimated)

    #count of arm pulls
    arm_pulls = np.zeros(N)
    for record in history.values():
        arm_pulls[record['choice']] += 1

    #phase estimate
    s = arm_pulls.min()+1 #plus 1 because of the initial N pulls

    #payment eligible arm estimation
    phivec = np.zeros(N)
    if t > 0:
        past_contexts = np.array([contexts[k] for k in range(t)])
        for i in range(N):
            # Rows: estimate of arm i minus the estimate of each other arm.
            gaps = mu_estimated[i] - np.delete(mu_estimated, i, axis=0)
            margins = np.dot(past_contexts, gaps.T)
            # Arm i "wins" a context when no margin against another arm is <= 0.
            phivec[i] = np.count_nonzero(~(margins <= 0).any(axis=1))

    # Normalize only when some arm won at least once. With all-zero counts
    # (e.g. t == 0) a 0/0 division would fill phivec with NaN and emit a
    # RuntimeWarning; leaving the zeros selects the same branch and the same arm.
    total = phivec.sum()
    if total > 0:
        phivec = phivec/total

    if phivec.min() > 1/np.log(s+1e-6):
        return None #no payment

    #estimating payment amount
    peligible_arm = phivec.argmin()

    largest = 0
    for i in range(N):
        if i != peligible_arm:
            gap = np.dot(context,mu_estimated[i] - mu_estimated[peligible_arm])
            if gap > largest:
                largest = gap

    payment = np.zeros(N)
    payment[peligible_arm] = largest
    return payment
    

In [None]:
# Number of independent Monte-Carlo repetitions (a fresh instance per repetition).
n_mc_runs = 5

# Payment rules under comparison, keyed by display name.
# Alternative configurations:
#   algos = {'iehu':get_payments_iehu}
#   algos = {'greedy':get_payments_none, 'iehu':get_payments_iehu}
algos = {'greedy':get_payments_none}

# Per-algorithm result containers: one pseudo-regret slot and one history slot per run.
prs_all = {name: [0]*n_mc_runs for name in algos}
history_all = {name: {} for name in algos}

# Every algorithm is evaluated on the same instance within a repetition.
instance = {}
for mc_run in range(n_mc_runs):
    instance[mc_run] = generate_instance()
    for algo in algos:
        prs_all[algo][mc_run],history_all[algo][mc_run] = interactions(
            instance = instance[mc_run],
            get_payments = algos[algo],
        )

In [None]:
# One figure per algorithm: cumulative pseudo-regret averaged over the Monte-Carlo runs.
for algo in algos:
    print(algo)
    fig, ax = plt.subplots()
    ax.plot(np.mean(prs_all[algo],axis=0),label=algo)
    ax.set_xlabel('round t')
    ax.set_ylabel('cumulative pseudo-regret (mean over runs)')
    ax.set_title(algo)
    # Without a legend the label passed to plot() is never displayed.
    ax.legend()
plt.show()

In [None]:
# a = [history_all['greedy'][0][x]['choice'] for x in history_all['greedy'][0]]
# b = [history_all['iehu'][0][x]['choice'] for x in history_all['iehu'][0]]
# [x for x in zip(a,b)]