In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)

In [None]:
def generate_instance(T=1000, N=5, d=2, n_explore=None,
                      reward_noise_level=0.1, covariate_diversity=True):
    '''Draw one random problem instance for the simulation.

    Called without arguments this reproduces the original hard-coded setup
    (T=1000, N=5, d=2, n_explore=N, reward_noise_level=0.1, diverse contexts).

    Parameters
    ----------
    T : int
        Number of rounds; one context is pre-generated per round.
    N : int
        Number of arms (rows of `mu_true`).
    d : int
        Context dimension (columns of `mu_true`).
    n_explore : int or None
        Value stored under the 'n_explore' key. None means "use N".
    reward_noise_level : float
        Value stored under the 'reward_noise_level' key.
    covariate_diversity : bool
        Forwarded to `get_contexts` to select how contexts are sampled.

    Returns
    -------
    dict
        Keys 'T', 'N', 'd', 'n_explore', 'reward_noise_level', 'mu_true'
        (an (N, d) array of standard-normal draws) and 'contexts' (the
        output of `get_contexts`).
    '''
    if n_explore is None:
        n_explore = N

    # Draw order matters for reproducibility under a fixed global seed:
    # arm parameters first, then the contexts.
    mu_true = np.random.randn(N,d)
    contexts = get_contexts(d,T,covariate_diversity)

    return {
        'T':T,
        'N':N,
        'd':d,
        'n_explore':n_explore,
        'reward_noise_level':reward_noise_level,
        'mu_true':mu_true,
        'contexts':contexts,
    }

In [None]:
def reward_noise(reward_noise_level=0.1):
    '''Return one standard-normal draw scaled by `reward_noise_level`.

    Consumes a single sample from NumPy's global random state.
    '''
    standard_draw = np.random.randn()
    return reward_noise_level * standard_draw

def get_choice_myopic(context,mu_x,payments=None):
    '''Pick the arm a myopic, deterministic agent would choose. tbd: stochastic

    Parameters
    ----------
    context : array-like
        Context vector for the current round.
    mu_x : array-like
        Per-arm parameters; `np.dot(mu_x, context)` gives one utility per arm.
    payments : array-like or None
        Optional per-arm payments added to the utilities before the argmax.

    Returns
    -------
    Index of the arm with the largest (utility + payment); ties resolve to
    the lowest index, as with `argmax`.
    '''
    utilities = np.dot(mu_x,context)
    if payments is not None:
        # Out-of-place add on purpose: `utilities += payments` raises a
        # casting error when the utilities are integer-typed and the
        # payments are floats.
        utilities = utilities + payments
    return utilities.argmax()

def get_contexts(d,T,covariate_diversity=True):
    '''Pre-generate one d-dimensional context per round.

    Parameters
    ----------
    d : int
        Context dimension.
    T : int
        Number of rounds.
    covariate_diversity : bool
        Any truthy value (including `np.True_` or 1) selects contexts with
        independent standard-normal entries. A falsy value selects contexts
        drawn from a multivariate normal with all-ones mean and covariance
        `ones((d, d)) + cov0 @ cov0.T`, where `cov0` is a 0.1-scaled
        standard-normal (d, d) matrix drawn once per call.

    Returns
    -------
    dict
        Maps each round index t in range(T) to its context vector.
    '''
    # Truthiness instead of `is True`: the identity test sent NumPy booleans
    # and other truthy flags down the non-diverse branch.
    if covariate_diversity:
        return {t: np.random.randn(d) for t in range(T)}

    mean = np.ones(d)
    cov0 = 0.1*np.random.randn(d,d)
    # All-ones matrix plus a Gram matrix.
    cov = np.ones((d,d)) + np.dot(cov0,cov0.transpose())
    return {t: np.random.multivariate_normal(mean, cov) for t in range(T)}


In [None]:
def interactions(instance,platform):
    '''Simulate T rounds of platform/agent interaction on one instance.

    Parameters
    ----------
    instance : dict
        Must provide 'mu_true', 'N', 'd', 'T', 'n_explore' and 'contexts'.
        'reward_noise_level' is used for the reward noise when present and
        defaults to 0.1 otherwise.
    platform : callable
        Called as `platform(t, context, history)` each round and expected to
        return `(payments, mu_estimated, initial_forced_choice)`.

    Returns
    -------
    (pseudoregret, history)
        `pseudoregret` is the cumulative sum of the per-round pseudo-regret
        (length T). `history` maps each arm index to lists 'X', 'y', 't',
        'payments' for the rounds in which that arm was pulled, and has a
        'meta' entry with 'N', 'n_explore', 'prev_choice' and
        'prev_mu_estimated'.
    '''
    mu_true = instance['mu_true']
    N = instance['N']
    d = instance['d']
    T = instance['T']
    n_explore = instance['n_explore']
    contexts = instance['contexts']
    # Honour the instance's configured noise level; it used to be ignored
    # in favour of reward_noise()'s default.
    noise_level = instance.get('reward_noise_level', 0.1)

    #variables for logging
    pseudoregret_inst = np.zeros(T)
    history = {i:{'X':[],'y':[],'t':[],'payments':[]} for i in range(N)} #data for each arm
    history['meta'] = {'N':N,'n_explore':n_explore,'prev_choice':None,'prev_mu_estimated':np.zeros_like(mu_true)}

    #T interactions
    for t in range(T):
        #agent arrives
        context = contexts[t]

        #platform estimates mus and decides payments based on history
        payments,mu_estimated,initial_forced_choice = platform(t,context,history)

        if initial_forced_choice is not None: #do initial forced exploration
            choice = initial_forced_choice
        else: #agent decides arm
            choice = get_choice_myopic(context,mu_estimated,payments)

        #reward information is revealed
        reward_realized = np.dot(mu_true[choice],context) + reward_noise(noise_level)

        #data collection
        history[choice]['X'].append(context)
        history[choice]['y'].append(reward_realized)
        history[choice]['t'].append(t)
        history[choice]['payments'].append(payments)
        history['meta']['prev_choice'] = choice
        history['meta']['prev_mu_estimated'] = mu_estimated

        #regret computation: gap to the best arm under the true parameters
        opt_choice = get_choice_myopic(context,mu_true,None)
        pseudoregret_inst[t] = np.dot(mu_true[opt_choice] - mu_true[choice],context)

    # .min() raises on an empty array, so skip the sanity check when T == 0.
    assert T == 0 or pseudoregret_inst.min() >= 0
    pseudoregret = np.cumsum(pseudoregret_inst)
    return pseudoregret,history

In [None]:
def estimate_mu(t,history):
    '''Refit the per-arm least-squares estimates from the logged data.

    Every arm that has at least one logged observation is refit, not only
    the previously chosen one. Otherwise data gathered for the other arms
    (e.g. during forced round-robin exploration) would never reach their
    estimates, which would stay at their initial values until the arm
    happened to be pulled again.

    Parameters
    ----------
    t : int
        Current round index (unused; kept for the platform call signature).
    history : dict
        Per-arm logs under integer keys (each with 'X' and 'y' lists) and a
        'meta' entry whose 'prev_mu_estimated' array holds one row per arm.

    Returns
    -------
    The `history['meta']['prev_mu_estimated']` array, updated in place.
    Rows of arms without data are left unchanged.
    '''
    mu_estimated = history['meta']['prev_mu_estimated']

    for arm in range(len(mu_estimated)):
        arm_log = history.get(arm)
        if not arm_log or len(arm_log['y']) == 0:
            continue  # nothing observed for this arm yet
        X = np.array(arm_log['X'])
        y = np.array(arm_log['y'])
        # lstsq also copes with fewer observations than dimensions.
        mu_estimated[arm] = np.linalg.lstsq(X, y, rcond=None)[0]

    return mu_estimated
    
def platform_greedy(t,context,history):
    '''Greedy platform: forced round-robin exploration, then no payments.

    For the first n_explore*N rounds the arm t % N is forced and the stored
    estimate is passed through unchanged. Afterwards the estimate comes from
    `estimate_mu` and the agent chooses freely.

    Returns
    -------
    (payments, mu_estimated, initial_forced_choice); payments is always None
    and initial_forced_choice is None once exploration is over.
    '''
    meta = history['meta']
    n_arms = meta['N']
    forced_rounds = meta['n_explore']*n_arms

    if t < forced_rounds:
        # Round-robin assignment during the forced exploration phase.
        return None, meta['prev_mu_estimated'], t % n_arms

    return None, estimate_mu(t,history), None

In [None]:
# Monte Carlo experiment: every run draws a fresh instance and each
# algorithm is simulated on that same instance.
n_mc_runs = 5
algos = {'greedy':platform_greedy}

# Per algorithm: one cumulative-regret curve and one history log per run.
prs_all = {algo: [0]*n_mc_runs for algo in algos}
history_all = {algo: {} for algo in algos}
instance = {}

for mc_run in range(n_mc_runs):
    instance[mc_run] = generate_instance()
    for algo, platform_fn in algos.items():
        prs_all[algo][mc_run], history_all[algo][mc_run] = interactions(
            instance=instance[mc_run], platform=platform_fn)

In [None]:
# Cumulative pseudo-regret averaged over the Monte Carlo runs,
# one figure per algorithm.
for algo in algos:
    print(algo)
    fig, ax = plt.subplots()
    ax.plot(np.mean(prs_all[algo],axis=0),label=algo)
    ax.set_xlabel('round t')
    ax.set_ylabel('cumulative pseudo-regret (mean over runs)')
    ax.set_title(algo)
    ax.legend()  # the label passed to plot() is only shown via a legend
plt.show()

In [None]:
# Last estimated arm parameters logged in each Monte Carlo run (greedy).
[run_log['meta']['prev_mu_estimated'] for run_log in history_all['greedy'].values()]

In [None]:
# True arm parameters of each Monte Carlo instance, for comparison.
[run_instance['mu_true'] for run_instance in instance.values()]

In [None]:
# Cumulative pseudo-regret of every individual Monte Carlo run,
# one figure per algorithm.
for algo in algos:
    print(algo)
    fig, ax = plt.subplots()
    # One labelled curve per run so the legend can tell the runs apart.
    for run_index, run_regret in enumerate(prs_all[algo]):
        ax.plot(run_regret,label='{} run {}'.format(algo, run_index))
    ax.set_xlabel('round t')
    ax.set_ylabel('cumulative pseudo-regret')
    ax.set_title(algo)
    ax.legend()
plt.show()