# logistic_regression_agent.py
import math
import sys

import numpy as np
import emcee as mc                # EnsembleSampler used in sample_vp (import was missing)
import bayes_logistic as bl       # fit_bayes_logistic used in sample_b0b1 (import was missing)
import matplotlib.pyplot as plt   # used in plot_reg (import was missing)
from scipy.special import expit

import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects import FloatVector
######### V, P SAMPLING FUNCTIONS

# Sample (v, p) values from the posterior over (mu_V, mu_P) given the reward
# history R and the delay history T.
def sample_vp(R, T, prior_params):
    burn_in = 100
    sample_iterations = 500
    ndim = 2
    nwalkers = 50
    pos = [np.random.randn(ndim) for i in range(nwalkers)]
    sampler = mc.EnsembleSampler(nwalkers, ndim, lnprob, args=(R, T, prior_params))
    sampler.run_mcmc(pos, sample_iterations)
    samples = sampler.chain[:, burn_in:, :].reshape((-1, ndim))
    return samples[-1, :]
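# Usage sketch (illustrative numbers; in the main loop below, R and T are the
# accumulated reward and delay histories reshaped into column vectors):
#   v, p = sample_vp(np.array([[1.0], [0.9]]),
#                    np.array([[0], [1]]),
#                    (1.0, 0.1, 0.1, 0.001))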
# MCMC samplers conventionally work in log space, so the prior, likelihood,
# and posterior below are all returned as logarithms.

# Log-prior on (mu_V, mu_P) under independent Gaussian priors.
def lnprior(theta, prior_params):
    v_prior_mean, v_prior_var, p_prior_mean, p_prior_var = prior_params
    v, p = theta
    # the same log-Gaussian arithmetic repeats for v and p; see the log_gauss
    # sketch after lnprob
    vp_t1 = math.log(1 / math.sqrt(2 * math.pi * v_prior_var))
    vp_t2 = (v - v_prior_mean)**2 / (2 * v_prior_var)
    vp = vp_t1 - vp_t2
    pp_t1 = math.log(1 / math.sqrt(2 * math.pi * p_prior_var))
    pp_t2 = (p - p_prior_mean)**2 / (2 * p_prior_var)
    pp = pp_t1 - pp_t2
    return vp + pp
# Log-likelihood of (mu_V, mu_P) given the data (R, T). If V ~ N(mu_V, 1)
# and P ~ N(mu_P, 1), then the observed reward V - t*P is distributed as
# N(mu_V - t*mu_P, 1 + t^2).
def lnlike(theta, R, T):
    v, p = theta
    s = 0
    for i in range(R.shape[0]):
        like_var = 1 + (T[i, 0] ** 2)
        like_mean = v - (T[i, 0] * p)
        const_factor = math.log(1 / math.sqrt(2 * math.pi * like_var))
        exp_factor = (R[i, 0] - like_mean) ** 2 / (2 * like_var)
        s += const_factor - exp_factor
    return s
# Unnormalized log-posterior: log of the prior-likelihood product.
def lnprob(theta, R, T, prior_params):
    lp = lnprior(theta, prior_params)
    ll = lnlike(theta, R, T)
    return lp + ll
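# As noted in lnprior, the repeated density arithmetic could be factored into
# a single log-Gaussian helper; a minimal sketch (left unused so the
# functions above keep their original form):
def log_gauss(x, mean, var):
    # log density of N(mean, var) evaluated at x
    return math.log(1 / math.sqrt(2 * math.pi * var)) - (x - mean) ** 2 / (2 * var)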
######### PROMPT SAMPLING FUNCTIONS

# Sample a multinomial distribution over the number of concurrent tasks from
# its Dirichlet posterior (prior_alpha plus the observed task counts).
def sample_multinomial(counts, prior_alpha):
    multinom = np.random.dirichlet(counts + prior_alpha)
    return multinom
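# For example, with a uniform Dirichlet(1, 1, 1, 1) prior and observed counts
# [4, 0, 0, 0], this draws from Dirichlet(5, 1, 1, 1), whose mean
# (5/8, 1/8, 1/8, 1/8) shifts toward the frequently observed single-task case.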
######### P(SUCCESS) SAMPLING FUNCTIONS

# Given prior means and a prior Hessian (inverse prior covariance), return
# the MAP parameters of a Bayesian logistic regression on the data (X, Y).
def sample_b0b1(X, Y, prior_means, prior_Hessians):
    w_fit_MAP, H_posterior = bl.fit_bayes_logistic(Y, X, prior_means, prior_Hessians)
    return w_fit_MAP
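# Usage sketch (illustrative shapes, following the bl.fit_bayes_logistic call
# above): X is an (n, 2) design matrix with an intercept column, Y an (n,)
# vector of 0/1 outcomes, and the prior Hessian a length-2 diagonal:
#   X = np.column_stack([np.ones(4), np.arange(4.0)])
#   Y = np.array([0., 0., 1., 1.])
#   w = sample_b0b1(X, Y, np.zeros(2), np.ones(2) * 0.001)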
######### DATA PROCESSING FUNCTIONS

# Simulate the reward earned for num_tasks tasks attempted at training time t.
# Under the tensor representation the tasks run in parallel, so successes are
# summed into one lump reward at delay 0; under the basis representation they
# run serially, so the i-th task's reward is penalized by i * true_P.
def get_reward(num_tasks, t, is_tensor, true_params):
    true_V, true_P, true_b0, true_b1 = true_params
    R = []
    T = []
    if is_tensor:
        reward = 0
        for i in range(num_tasks):
            if np.random.uniform(0, 1) < lin_logistic(t, true_b1, true_b0):
                reward += true_V
        if reward != 0:
            R = [reward]
            T = [0]
    else:
        for i in range(num_tasks):
            if np.random.uniform(0, 1) < lin_logistic(t, true_b1, true_b0):
                R.append(true_V - i * true_P)
                T.append(i)
    return (R, T)
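# For example, with true_V = 1 and true_P = 0.1, a trial where all three of
# three tasks succeed yields R = [1.0, 0.9, 0.8], T = [0, 1, 2] in the basis
# case, but the single lump sum R = [3], T = [0] in the tensor case.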
# Expected discounted reward over the rest of the experiment window [t_0, T];
# gamma is the discount factor.
def expected_discounted_reward(b0, b1, V, P, gamma, t_0, T, max_tasks, task_dist):
    DISCOUNT_SCALE_FACTOR = 0.025
    s = 0
    for t in np.arange(t_0, T + 1):
        discount_factor = gamma ** ((t - t_0) * DISCOUNT_SCALE_FACTOR)
        s += discount_factor * expected_trial_reward(b0, b1, V, P, t, max_tasks, task_dist)
    return s
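# For example, with gamma = 0.98 a reward 40 trials after t_0 is discounted
# by 0.98 ** (40 * 0.025) = 0.98: the scale factor stretches the effective
# discounting horizon by a factor of 1 / 0.025 = 40.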
# Expected reward on a single trial at time t. If all i serial tasks succeed,
# the rewards sum to sum_{j=0}^{i-1} (V - j*P) = i*V - P*i*(i-1)/2
# = (i/2) * (2*V - (i-1)*P), which is the num_terms * sum_term product below.
def expected_trial_reward(b0, b1, V, P, t, max_tasks, task_dist):
    s = 0
    for i in np.arange(1, max_tasks + 1):
        prob_task = task_dist[i - 1]
        prob_success = lin_logistic(t, b1, b0)
        num_terms = i / 2
        sum_term = (2 * V) - ((i - 1) * P)
        s += prob_task * prob_success * num_terms * sum_term
    return s
######### VARIOUS UTILITY FUNCTIONS

# logistic of the form 1 / (1 + exp(-k * (x - x0)))
def logistic(x, x0, k):
    return expit(k * (x - x0))

# logistic of the form 1 / (1 + exp(-(b_1 * x + b_0)))
def lin_logistic(x, b_1, b_0):
    return expit(b_1 * x + b_0)

# translate midpoint form k * (x - m) to linear form b_0 + b_1 * x
def km_to_b0b1(k, m):
    return (-m * k, k)

# translate linear form b_0 + b_1 * x back to midpoint form k * (x - m)
def b0b1_to_km(b0, b1):
    return (b1, -b0 / b1)
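# Round-trip check of the two parameterizations (numbers are illustrative):
# k * (x - m) = k * x - k * m, so b_1 = k and b_0 = -k * m.
#   km_to_b0b1(0.01, 500)    # (-5.0, 0.01)
#   b0b1_to_km(-5.0, 0.01)   # (0.01, 500.0), recovering (k, m)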
# Pick an action (0 = basis, 1 = tensor) via softmax or epsilon-greedy over
# the two representation values.
def pick_action(basis_val, tensor_val, params, algorithm="softmax"):
    if algorithm == "softmax":
        tau = params  # softmax temperature
        vals = np.array([basis_val, tensor_val])
        denom = np.sum(np.exp(vals / tau))
        dist = np.exp(vals / tau) / denom
        choice = np.random.choice(2, p=dist)
    else:  # epsilon-greedy: explore with probability eps, break ties at random
        eps = params
        if np.random.uniform(0, 1) < eps:
            choice = np.random.choice(2)
        elif basis_val == tensor_val:
            choice = np.random.choice(2)
        elif basis_val > tensor_val:
            choice = 0
        else:
            choice = 1
    return choice
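# Note: np.exp(vals / tau) can overflow once the value gap is large relative
# to tau; a standard guard (a sketch, not applied above) shifts by the max
# before exponentiating, which leaves the distribution unchanged:
#   z = vals / tau - np.max(vals / tau)
#   dist = np.exp(z) / np.sum(np.exp(z))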
# Plot a fitted regression's success probability over t in [0, 200);
# optionally scatter a data point (x, data) over the curve.
def plot_reg(reg, x=None, data=None):
    T = np.arange(0, 200).reshape(-1, 1)
    if x is not None and data is not None:
        plt.scatter([x], data)
    probs = reg.predict_proba(T)
    plt.plot(T, probs[:, 1])
#### main script
# experiment parameters
max_tasks = 4                 # max number of concurrent tasks
num_trials = 1000             # number of experiment trials
true_V = 1                    # reward for correctness
true_P = float(sys.argv[1])   # punishment for time delay
prop_mt = float(sys.argv[2])  # proportion of multitasking trials
gamma = float(sys.argv[3])    # reward discount factor
epsilon = 0.1                 # epsilon-greedy exploration parameter
delta = 0                     # generalization parameter

# training parameters: gradient/midpoint pairs translated to linear form
basis_true_k = 0.01                              # basis training-curve gradient
basis_true_m = num_trials * float(sys.argv[4])   # basis training-curve midpoint
basis_true_b0, basis_true_b1 = km_to_b0b1(basis_true_k, basis_true_m)
tensor_true_k = 0.01                             # tensor training-curve gradient
tensor_true_m = num_trials * float(sys.argv[5])  # tensor training-curve midpoint
tensor_true_b0, tensor_true_b1 = km_to_b0b1(tensor_true_k, tensor_true_m)

# true task distribution: single-task with probability 1 - prop_mt, the rest
# spread uniformly over the multitask counts
true_tasks = np.zeros((max_tasks,))
for i in range(max_tasks):
    if i == 0:
        true_tasks[i] = 1 - prop_mt
    else:
        true_tasks[i] = prop_mt / (max_tasks - 1)

basis_true_params = (true_V, true_P, basis_true_b0, basis_true_b1)
tensor_true_params = (true_V, 0, tensor_true_b0, tensor_true_b1)  # P = 0: no serial delay
## various prior parameters
# Dirichlet parameter (uniform over task counts)
prior_alpha = np.ones((max_tasks,))

# logistic regression priors, basis representation
prior_midpoint = num_trials / 2
prior_gradient = 0.1
basis_logistic_prior_means = np.array(km_to_b0b1(prior_gradient, prior_midpoint))
basis_logistic_prior_Hessian = np.ones((2,)) * 0.001  # very uncertain

# logistic regression priors, tensor representation
prior_midpoint = num_trials / 2
prior_gradient = 0.1
tensor_logistic_prior_means = np.array(km_to_b0b1(prior_gradient, prior_midpoint))
tensor_logistic_prior_Hessian = np.ones((2,)) * 0.001  # very uncertain

# rpy2 setup; bayesglm comes from R's `arm` package
base = importr('base')
utils = importr('utils')
arm = importr('arm')

# reward inference priors
v_prior_mean = true_V
v_prior_var = 0.1
p_prior_mean = true_P
p_prior_var = 0.001
vp_prior_params = (v_prior_mean, v_prior_var, p_prior_mean, p_prior_var)
# simulation setup: the histories are seeded with pseudo-observations so the
# inference steps are well-defined from the first trial
R = [true_V, true_V]             # accumulated rewards
T = [0, 0]                       # timesteps at which rewards were accumulated
b_X = [num_trials - 1, 0]        # basis: times of observed outcomes
b_Y = [1, 0]                     # basis: success (1) / failure (0) at those times
t_X = [num_trials - 1, 0]        # tensor: times of observed outcomes
t_Y = [1, 0]                     # tensor: success (1) / failure (0) at those times
task_counts = np.zeros((max_tasks,))  # counts of sampled task numbers
task_counts[0] = 4

# initial representation reward values
basis_val = 0
tensor_val = 0
basis_pick_count = 2
basis_generalization_count = 0
tensor_pick_count = 2
tensor_generalization_count = 0

# simulation loop
choices = []
for t in np.arange(2, num_trials):
    # pick the number of concurrent tasks
    num_tasks = np.random.choice(np.arange(1, max_tasks + 1), p=true_tasks)
    task_counts[num_tasks - 1] += 1
    # pick a representation, epsilon-greedy over the two values
    # (pick_action returns 1 for tensor)
    is_tensor = (pick_action(basis_val, tensor_val, params=epsilon, algorithm="e-greedy") == 1)
    choices.append(is_tensor)
    # get reward for all tasks
    if is_tensor:
        trial_R, trial_T = get_reward(num_tasks, tensor_pick_count + tensor_generalization_count, is_tensor, tensor_true_params)
        # add data to the tensor success history
        for i in range(len(trial_T)):
            t_X.append(tensor_pick_count + tensor_generalization_count)
            t_Y.append(1)
        for i in range(num_tasks - len(trial_T)):
            t_X.append(tensor_pick_count + tensor_generalization_count)
            t_Y.append(0)
        tensor_pick_count += 1
        basis_pick_count += delta
    else:
        trial_R, trial_T = get_reward(num_tasks, basis_pick_count + basis_generalization_count, is_tensor, basis_true_params)
        # add data to the basis success history
        for i in range(len(trial_T)):
            b_X.append(basis_pick_count + basis_generalization_count)
            b_Y.append(1)
        for i in range(num_tasks - len(trial_T)):
            b_X.append(basis_pick_count + basis_generalization_count)
            b_Y.append(0)
        basis_pick_count += 1
        tensor_pick_count += delta
    # add data to the reward history
    R += trial_R
    T += trial_T

    # infer V, P from the reward history (the MCMC version is disabled here;
    # the true values are used instead)
    # infer_V, infer_P = sample_vp(np.array(R).reshape(-1, 1), np.array(T).reshape(-1, 1), vp_prior_params)
    infer_V = true_V
    infer_P = true_P

    # infer the logistic training-curve parameters from the success histories
    # via arm's bayesglm; cbind(y, 1 - y) supplies the success/failure counts
    b_X_rpy2 = FloatVector(b_X).r_repr()
    b_Y_rpy2 = FloatVector(b_Y).r_repr()
    robjects.r('''
        b_data = data.frame(%s, cbind(%s, 1 - %s))
        b_reg = bayesglm((cbind(%s, 1 - %s)) ~ %s, b_data, family="binomial", prior.mean = %f, prior.mean.for.intercept = %f)
    ''' % (b_X_rpy2, b_Y_rpy2, b_Y_rpy2, b_Y_rpy2, b_Y_rpy2, b_X_rpy2, basis_true_b1, basis_true_b0))
    b_reg = robjects.r['b_reg']
    t_X_rpy2 = FloatVector(t_X).r_repr()
    t_Y_rpy2 = FloatVector(t_Y).r_repr()
    robjects.r('''
        t_data = data.frame(%s, cbind(%s, 1 - %s))
        t_reg = bayesglm((cbind(%s, 1 - %s)) ~ %s, t_data, family="binomial", prior.mean = %f, prior.mean.for.intercept = %f)
    ''' % (t_X_rpy2, t_Y_rpy2, t_Y_rpy2, t_Y_rpy2, t_Y_rpy2, t_X_rpy2, tensor_true_b1, tensor_true_b0))
    t_reg = robjects.r['t_reg']

    # infer the task distribution from the observed task counts
    infer_task_dist = sample_multinomial(task_counts, prior_alpha)

    # recompute the expected discounted basis reward value (the version using
    # the fitted coefficients is disabled; the true parameters are used)
    # basis_val = float(expected_discounted_reward(b_reg.rx2('coefficients')[0], b_reg.rx2('coefficients')[1], infer_V, infer_P, gamma, basis_pick_count + basis_generalization_count, num_trials - tensor_pick_count + basis_generalization_count, max_tasks, infer_task_dist))
    basis_val = float(expected_discounted_reward(basis_true_b0, basis_true_b1, infer_V, infer_P, gamma, basis_pick_count + basis_generalization_count, num_trials - tensor_pick_count + basis_generalization_count, max_tasks, infer_task_dist))
    # recompute the expected tensor reward value; P = 0 since execution is parallelized
    # tensor_val = float(expected_discounted_reward(t_reg.rx2('coefficients')[0], t_reg.rx2('coefficients')[1], infer_V, 0, gamma, tensor_pick_count + tensor_generalization_count, num_trials - basis_pick_count + tensor_generalization_count, max_tasks, infer_task_dist))
    tensor_val = float(expected_discounted_reward(tensor_true_b0, tensor_true_b1, infer_V, 0, gamma, tensor_pick_count + tensor_generalization_count, num_trials - basis_pick_count + tensor_generalization_count, max_tasks, infer_task_dist))
print("%d,%d" % (tensor_pick_count, basis_pick_count))
print(choices)
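# Example invocation (argument values are illustrative, not canonical):
#   python logistic_regression_agent.py 0.1 0.5 0.98 0.25 0.5
# where the arguments are true_P, prop_mt, gamma, and the basis and tensor
# training-curve midpoints as fractions of num_trials.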