In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

import pymc3 as pm
import theano
import theano.tensor as tt
import arviz as az

In [9]:
def generate_data(alpha, beta, n=100, 
                  p_r={'high_var': [.95, .05], 'low_var': [.5,.5]},
                  rs = np.array(([5.0, -495.0],[-5.0, 495.0],[10.0, -100.0],[-10.0, 100.0])),
                  sQ = np.zeros((4, 2)),
                  rng=None
                 ):
    """Simulate a Q-learning agent deciding whether to play each of 4 machines.

    On every trial one machine is presented. The agent chooses action 1
    (play) or action 0 (do not play) via a softmax over that machine's two
    Q-values. Playing draws a reward from the machine's two possible outcomes
    and updates the "play" Q-value with a delta rule; not playing yields a
    reward of 0 and leaves the Q-table unchanged.

    Parameters
    ----------
    alpha : float
        Step size of the delta-rule update ``Q += alpha * (r - Q)``.
    beta : float
        Multiplier applied to the Q-values before the softmax.
    n : int
        Requested number of trials. If not a multiple of 4 it is rounded down
        to the nearest multiple (a message is printed) so that every machine
        appears equally often.
    p_r : dict
        Outcome probabilities keyed by ``'high_var'`` (machines 0 and 1) and
        ``'low_var'`` (machines 2 and 3). Each entry gives the probabilities
        of the two outcomes listed in the matching row of ``rs``.
    rs : array-like, shape (4, 2)
        The two possible rewards of each machine.
    sQ : array-like, shape (4, 2)
        Initial Q-table, one ``[not play, play]`` row per machine. It is
        copied, never modified.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted the global ``np.random`` state is
        used, so ``np.random.seed(...)`` controls the simulation.

    Returns
    -------
    machs : ndarray of int, shape (n,)
        Machine index (0-3) presented on each trial.
    acts : ndarray of int, shape (n,)
        Chosen action on each trial (1 = play, 0 = do not play).
    rews : ndarray, shape (n,)
        Reward received on each trial (0 when the machine was not played).
        Integer dtype when every entry of ``rs`` is integral, float otherwise.
    Qs : ndarray of float, shape (n, 4, 2)
        Snapshot of the full Q-table taken after each trial's update.
    """
    # Fall back to the module-level RNG so existing np.random.seed() calls keep working
    if rng is None:
        rng = np.random

    # Pre-specify machines for each trial in a randomly balanced manner
    if n%4 != 0:
        print("Number of trials is not divisable by 4.\nCreating trials for %s trials."%(str(n-(n%4))))
        n = n-(n%4)

    machs = np.tile(np.arange(4), n // 4)
    rng.shuffle(machs)

    rs = np.asarray(rs, dtype=float)

    # Actions are filled in trial by trial from the evolving Q-values.
    # (The np.int alias was removed in NumPy 1.24; the builtin int is equivalent.)
    acts = np.zeros(n, dtype=int)

    # Keep integer rewards for integral outcomes, but do not silently
    # truncate fractional outcomes if a caller supplies them.
    rew_dtype = int if np.all(np.mod(rs, 1) == 0) else float
    rews = np.zeros(n, dtype=rew_dtype)

    # Stores the expected value for each of 4 machines in each trial for each action
    Qs = np.zeros((n, 4, 2))

    # Initialize Q table
    # Denotes expected value of each action
    # Should look like [0, 0] for each machine
    # *** The expected value of not playing should not change from 0! ***
    # Could these initial expected values/beliefs also be estimated from data?
    # E.g. what if kids have more optimistic priors about each machine though they learn at the same rate
    # A float copy protects the caller's array (and the shared default) and
    # prevents integer truncation of the updates when sQ has an int dtype.
    Q = np.array(sQ, dtype=float)

    for i in range(n):

        cur_machine = machs[i]

        # Softmax over the two actions. Subtracting the max leaves the
        # probabilities unchanged but prevents exp() from overflowing to inf
        # (which would produce NaN probabilities) for large beta * Q.
        logits = beta * Q[cur_machine]
        exp_Q = np.exp(logits - np.max(logits))
        prob_a = exp_Q / np.sum(exp_Q)

        # Simulate choice
        a = rng.choice([0, 1], p=prob_a)

        # Simulate reward if machine is played
        if a == 1:

            # Machines 2 and 3 are the low-variance pair, 0 and 1 the high-variance pair
            cur_p = 'low_var' if cur_machine > 1 else 'high_var'

            # Sample reward for current machine given its reward probs and outcome options
            r = rng.choice(rs[cur_machine], p=p_r[cur_p])

            # Update Q table only if the machine is played
            # And only the value of playing NOT of not playing
            Q[cur_machine, a] += alpha * (r - Q[cur_machine, a])

        # If the machine is not played then Q remains unchanged and no reward is received
        else:
            r = 0.0

        # Store values (assigning into Qs[i] copies the current Q values)
        acts[i] = a
        rews[i] = r
        Qs[i] = Q

    return machs, acts, rews, Qs

In [10]:
# Ground-truth parameters used to simulate the session
true_alpha = .3
true_beta = 1
n = 80

# generate_data draws from NumPy's global RNG; seed it so that
# Restart & Run All reproduces the same simulated session.
np.random.seed(0)
machines, actions, rewards, all_Qs = generate_data(true_alpha, true_beta, n)

In [11]:
# Machine index (0-3) presented on each trial, in shuffled balanced order
machines

array([2, 3, 0, 1, 1, 3, 2, 3, 0, 1, 3, 3, 0, 1, 1, 2, 3, 3, 2, 1, 1, 2,
       0, 1, 3, 0, 1, 1, 0, 2, 1, 2, 0, 1, 1, 3, 0, 3, 2, 3, 0, 1, 2, 0,
       0, 2, 2, 0, 3, 3, 2, 2, 1, 3, 0, 3, 1, 0, 2, 3, 3, 1, 2, 2, 3, 0,
       3, 0, 2, 1, 0, 2, 1, 0, 2, 0, 1, 2, 3, 0])

In [12]:
# Simulated choice on each trial: 1 = machine played, 0 = not played
actions

array([1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1])

In [13]:
# Reward received on each trial (0 whenever the machine was not played)
rewards

array([-100,  -10,    5,    0,    0,    0,    0,    0,    5,   -5,    0,
          0,    5,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          5,    0,    0,    5,   -5,    0,    5,    0,    0,    0,    5,
          0,    0,    0,    5,    0,    0,  -10,    5,   -5,    0,    5,
          5,    0,    0,    5,    0,    0,    0,    0,    0,    0,    5,
          0,    0,    5,    0,    0,    0,    0,    0,    0,    0,    5,
          0,    5,    0,    0,    5,    0,    0,    5,    0,    5,    0,
          0,    0,    5])

In [27]:
# Q-table snapshot after the first trial and after the last trial, stacked for comparison
all_Qs[[0, -1]]

array([[[  0.        ,   0.        ],
        [  0.        ,   0.        ],
        [  0.        , -30.        ],
        [  0.        ,   0.        ]],

       [[  0.        ,   4.99601039],
        [  0.        ,  -3.285     ],
        [  0.        , -30.        ],
        [  0.        ,  -5.1       ]]])

In [16]:
# Another simulated session with the same parameters; display its final Q-table
machines2, actions2, rewards2, all_Qs2 = generate_data(alpha=true_alpha, beta=true_beta, n=n)
all_Qs2[-1]

array([[   0.    , -146.2005],
       [   0.    ,   -2.55  ],
       [   0.    ,  -27.9   ],
       [   0.    ,   -5.1   ]])

In [17]:
# A third simulated session with the same parameters; display its final Q-table
machines3, actions3, rewards3, all_Qs3 = generate_data(alpha=true_alpha, beta=true_beta, n=n)
all_Qs3[-1]

array([[   0.        , -145.02373781],
       [   0.        ,   -1.96480345],
       [   0.        ,  -30.        ],
       [   0.        ,   73.25225799]])