In [1]:
import click
import numpy as np
import time
from gurobipy import *
import collections

class fluid_model():
    """Fluid linear-programming model of a finite-horizon bandit ("MAB").

    A state is a pair ``(i, j)``; the LP code calls the first coordinate
    ``win`` and the second ``los``.  Action ``1`` (pull) moves ``(i, j)`` to
    ``(i + 1, j)`` with probability ``(i + 1) / (i + j + 2)`` and to
    ``(i, j + 1)`` otherwise, earning that same probability as reward.
    Action ``0`` (idle) leaves the state unchanged and earns nothing.

    Attributes filled in by the solve pipeline:
        m, objVal, duals: the Gurobi model, its objective value and the
            duals of the ``T`` resource constraints.
        sorted: per period, the reachable states ordered by decreasing
            advantage of pulling over idling.
        z, x, y: per ``(t, state)`` mass present, mass pulled, mass idled.
        pull_reward: total reward collected by the priority policy.
        A, B, C: ``(t, state)`` keys that are fully pulled, partially
            pulled and fully idled, respectively.
        diffusion_index: per ``(t, state)`` index computed by
            ``calculate_diffusion_index``.
    """

    def __init__(self, T, alphas):
        """Store the horizon ``T`` and the per-period pull budgets ``alphas``.

        ``alphas[t]`` is the right-hand side of the period-``t`` resource
        constraint, so it must be indexable by ``0 .. T - 1``.
        """
        self.alphas = alphas
        self.T = T
        self.m, self.x, self.y, self.z = None, {}, {}, {}
        self.duals = None
        self.A, self.B, self.C = set(), set(), set()

    def p(self, s, a, sprime):
        """Return the probability of moving from ``s`` to ``sprime`` under ``a``.

        Raises:
            ValueError: if ``a`` is neither 0 (idle) nor 1 (pull).
        """
        if a == 0:
            # Idling keeps the state where it is.
            return float(s == sprime)
        if a == 1:
            wins, losses = s[0], s[1]
            next_wins, next_losses = sprime[0], sprime[1]
            p_win = (wins + 1) / (wins + losses + 2)
            if wins == next_wins and losses + 1 == next_losses:
                return 1 - p_win
            if wins + 1 == next_wins and losses == next_losses:
                return p_win
            return 0
        raise ValueError(f"unknown action {a!r}; expected 0 (idle) or 1 (pull)")

    def reward(self, s, a):
        """Return the one-step reward of action ``a`` in state ``s``.

        Idling (``a == 0``) earns ``0.0``; any other action is treated as a
        pull and earns ``(s[0] + 1) / (s[0] + s[1] + 2)``.
        """
        if a == 0:
            return 0.0
        return (s[0] + 1) / (s[0] + s[1] + 2)

    def __LP_sol(self, T, alphas):
        """Build and solve the fluid LP; return the optimized Gurobi model.

        Variables ``n[t, i, j]`` hold the mass in state ``(i, j)`` at period
        ``t`` and ``x[t, i, j]`` the part of it that is pulled.  The
        objective value is also stored on ``self.objVal`` and printed.
        """
        # Named `periods` so the imported `time` module is not shadowed.
        periods = list(range(T))
        wins = list(range(T))
        losses = list(range(T))
        _, reward = multidict({(t, i, j): (i + 1) / (i + j + 2)
                               for t in periods
                               for i in wins
                               for j in losses})
        m = Model("MAB")
        n = m.addVars(periods, wins, losses, name="n")
        x = m.addVars(periods, wins, losses, name="x")

        # Objective: total expected reward of the pulled mass.
        m.setObjective(sum(x[t, i, j] * reward[(t, i, j)]
                           for t in periods for i in wins for j in losses),
                       GRB.MAXIMIZE)

        # Resource constraints.  They must stay the FIRST T constraints of
        # the model: __solve reads their duals as m.PI[0:T].
        m.addConstrs(sum(x[t, i, j] for i in wins for j in losses)
                     == alphas[t] for t in periods)

        # Initial condition: all mass starts in state (0, 0).
        m.addConstr(n[0, 0, 0] == 1)
        m.addConstr(sum(n[0, i, j] for i in wins for j in losses) == 1)

        # Pulled mass is non-negative and cannot exceed the mass present.
        m.addConstrs(x[t, i, j] <= n[t, i, j]
                     for t in periods for i in wins for j in losses)
        m.addConstrs(x[t, i, j] >= 0
                     for t in periods for i in wins for j in losses)

        # Fluid balance: mass at (t, i, j) = inflow from a win at (i-1, j)
        # + inflow from a loss at (i, j-1) + mass that idled at (i, j).
        m.addConstrs(
            n[t, i, j]
            - x[t - 1, i - 1, j] * reward[(t - 1, i - 1, j)]
            - x[t - 1, i, j - 1] * (1 - reward[(t - 1, i, j - 1)])
            - (n[t - 1, i, j] - x[t - 1, i, j]) == 0
            for t in range(1, T) for i in range(1, T) for j in range(1, T))
        # Boundary i == 0: no inflow from a win.
        m.addConstrs(
            n[t, 0, j]
            - x[t - 1, 0, j - 1] * (1 - reward[(t - 1, 0, j - 1)])
            - (n[t - 1, 0, j] - x[t - 1, 0, j]) == 0
            for t in range(1, T) for j in range(1, T))
        # Boundary j == 0: no inflow from a loss.
        m.addConstrs(
            n[t, i, 0]
            - x[t - 1, i - 1, 0] * reward[(t - 1, i - 1, 0)]
            - (n[t - 1, i, 0] - x[t - 1, i, 0]) == 0
            for t in range(1, T) for i in range(1, T))
        # Corner (0, 0): only idled mass remains.
        m.addConstrs(n[t, 0, 0] - (n[t - 1, 0, 0] - x[t - 1, 0, 0]) == 0
                     for t in range(1, T))
        m.setParam('OutputFlag', False)
        m.optimize()
        print('Obj: %g' % m.objVal)
        self.objVal = m.objVal

        return m

    def __solve(self, method=1):
        """Solve the LP, rank states by advantage and roll the policy forward.

        ``method`` is forwarded to Gurobi's global ``Method`` parameter.
        Fills ``self.m``, ``self.duals``, ``self.sorted``, ``self.x``,
        ``self.y``, ``self.z`` and ``self.pull_reward``.
        """
        setParam("Method", method)
        T = self.T
        m = self.__LP_sol(self.T, self.alphas)
        self.m = m
        # Duals of the T resource constraints (added first in __LP_sol).
        self.duals = m.PI[0: T]

        self.sorted = {t: [(a, b) for a in range(T) for b in range(T) if a + b <= t]
                       for t in range(T)}

        # Backward pass: value of each state when a pull at period t is
        # charged the dual price duals[t]; advantage = pull value - idle value.
        v = {}
        for t in range(T - 1, -1, -1):
            advantage = {(a, b): 0 for a in range(T) for b in range(T) if a + b <= t}
            for a in range(T):
                for b in range(T):
                    if a + b > t:
                        continue
                    if t == T - 1:
                        r_pull = self.reward((a, b), 1) - self.duals[t]
                        r_idle = 0
                    else:
                        r_pull = self.reward((a, b), 1) - self.duals[t] + \
                                 self.p((a, b), 1, (a + 1, b)) * v[t + 1, (a + 1, b)] + \
                                 self.p((a, b), 1, (a, b + 1)) * v[t + 1, (a, b + 1)]
                        r_idle = v[t + 1, (a, b)]
                    advantage[(a, b)] = r_pull - r_idle
                    # Ties go to pulling, as r_pull is kept unless r_idle is larger.
                    v[(t, (a, b))] = max(r_pull, r_idle)
            self.sorted[t].sort(reverse=True, key=lambda state: advantage[state])

        # Forward pass: spend each period's budget on states in priority order.
        self.z[(0, (0, 0))] = 1
        self.pull_reward = 0
        for t in range(T):
            resource = self.alphas[t]
            for a, b in self.sorted[t]:
                self.x[(t, (a, b))] = min(self.z[(t, (a, b))], resource)
                self.pull_reward += self.x[(t, (a, b))] * self.reward((a, b), 1)
                self.y[(t, (a, b))] = self.z[(t, (a, b))] - self.x[(t, (a, b))]
                resource = resource - self.x[(t, (a, b))]
            if t == T - 1:
                continue
            # Next-period mass: idled mass plus inflow from wins and losses.
            for a, b in self.sorted[t + 1]:
                self.z[(t + 1, (a, b))] = (self.y[(t, (a, b))] if a + b <= t else 0) + \
                                          a / (a + b + 1) * (self.x[(t, (a - 1, b))] if a >= 1 else 0) + \
                                          b / (a + b + 1) * (self.x[(t, (a, b - 1))] if b >= 1 else 0)

    def calculate_occupation_measure_and_classify_state(self, epsilon=10**(-6)):
        """Solve the model and sort every ``(t, state)`` key into A, B or C.

        With ``x`` the pulled mass and ``y`` the idled mass of a key:
        A has ``x > epsilon`` and ``y < epsilon``; B has both above
        ``epsilon``; C has ``x < epsilon`` and ``y > epsilon``.  Keys that
        match none of these are left unclassified.
        """
        self.__solve()

        # Drop classifications left over from an earlier solve.
        self.A.clear()
        self.B.clear()
        self.C.clear()

        for key in self.z:
            pulled, idled = self.x[key], self.y[key]
            if pulled > epsilon and idled < epsilon:
                self.A.add(key)
            elif pulled > epsilon and idled > epsilon:
                self.B.add(key)
            elif pulled < epsilon and idled > epsilon:
                self.C.add(key)
        return

    def check_degeneracy(self):
        """Print a confirmation if the model is non-degenerate, else raise.

        Non-degenerate here means that set B holds exactly one key for each
        period ``0 .. T - 1`` and the policy reward matches the LP objective
        to within ``1e-13``.

        Raises:
            Exception: if either condition fails.
        """
        if sorted([key[0] for key in self.B]) == [i for i in range(self.T)] and \
                    abs(self.objVal - self.pull_reward) < 10**(-13):
            print(f"""
            Model objective: {self.pull_reward}, differs from {self.objVal} (solution from LP) by less than 1e-13.
            Model is non-degenerate.
            """)
        else:
            raise Exception("Model is degenerate.\n")

    def calculate_diffusion_index(self, epsilon=10**(-8)):
        """Solve the model and store ``self.diffusion_index``.

        For each period (backwards) and state, the index is the pull value
        minus the idle value under the running value function ``v``.  The
        pulled state (``x > epsilon``) with the smallest index serves as the
        reference ``sbar`` when updating ``v`` for states in set A.

        Raises:
            ValueError: if some period has no state with ``x > epsilon``.
        """
        self.calculate_occupation_measure_and_classify_state()
        T = self.T
        v = {(t, (a, b)): 0 for t in range(T + 1) for a in range(t + 1) for b in range(t + 1) if a + b <= t}
        diffusion_index = {(t, (a, b)): 0 for t in range(T) for a in range(t + 1) for b in range(t + 1) if a + b <= t}

        for t in range(T - 1, -1, -1):
            state_t = [(a, b) for a in range(t + 1) for b in range(t + 1) if a + b <= t]
            for s in state_t:
                s_w = (s[0] + 1, s[1])  # state after win
                s_l = (s[0], s[1] + 1)  # state after loss
                diffusion_index[(t, s)] = self.reward(s, 1) + self.p(s, 1, s_w)*v[(t + 1, s_w)] \
                                          + self.p(s, 1, s_l)*v[(t + 1, s_l)] \
                                          - self.reward(s, 0) - self.p(s, 0, s)*v[(t + 1, s)]

            pulled = [(diffusion_index[(t, s)], s) for s in state_t if self.x[(t, s)] > epsilon]
            if not pulled:
                raise ValueError(
                    f"no state is pulled at period {t}; cannot pick a reference state")

            # Reference state: smallest index among pulled states.
            _, sbar = min(pulled)
            for s in state_t:
                if (t, s) in self.A:
                    v[(t, s)] = diffusion_index[(t, s)] - diffusion_index[(t, sbar)] + v[(t + 1, s)]
                else:
                    v[(t, s)] = self.reward(s, 0) + v[(t + 1, s)]
        self.diffusion_index = diffusion_index

In [2]:
# import numpy as np
# import pandas as pd
# import time
# from numba import jit

# @jit(nopython=True)
# def batch_simulation(T, K, alpha, times, obj_val, a0= 1, b0 = 1):
#     rewards = []
#     start = time.time()
#     for _ in range(times):
#         state = defaultdict(int)
#         state[(a0, b0)] = K
#         reward_total = 0
#         for t in range(T):
#             # sample posterior
#             posterior_sample = []
#             for a, b in state:
#                 val_list = np.random.beta(a, b, state[(a, b)])
#                 for val in val_list:
#                     posterior_sample.append((val, a, b))
#             posterior_sample.sort(reverse = True)
            
#             # choose the arms to pull
#             for i in range(int(alpha * K)):
#                 _, a, b = posterior_sample[i]
#                 theta = np.random.beta(a, b)
#                 outcome = int(np.random.binomial(1, theta))
                
#                 reward_total += outcome
                
#                 state[(a, b)] -= 1
#                 tmp = state.get((a + outcome, b + 1 - outcome), 0)
#                 state[(a + outcome, b + 1 - outcome)] = tmp  + 1
            
#         rewards.append(reward_total)
#     rewards = np.array(rewards)
#     end = time.time()
#     return obj_val * K - np.mean(rewards), K, times, np.mean(rewards), np.std(rewards), end - start

# def wrapper(args):
#     return batch_simulation(*args)

In [3]:
import numpy as np
import pandas as pd
import time
from numba import jit

def batch_simulation(T, K, alpha, times, obj_val, a0= 1, b0 = 1):
    """Simulate Thompson sampling on ``K`` arms for ``T`` periods, ``times`` times.

    Every arm starts in state ``(a0, b0)``.  Each period one Beta(a, b) draw
    is taken per arm, the ``int(alpha * K)`` arms with the largest draws are
    pulled, and each pulled arm moves to ``(a + 1, b)`` with probability
    ``a / (a + b)`` and to ``(a, b + 1)`` otherwise.  The reward credited
    for a pull is the expected value ``a / (a + b)``, not the realised
    outcome.

    This function is intentionally not compiled with
    ``numba.jit(nopython=True)``: the body relies on
    ``collections.defaultdict``, ``time`` and plain Python containers,
    which nopython mode cannot type.

    Returns:
        Tuple ``(opt_gap, K, times, mean_reward, std_reward, seconds)``
        where ``opt_gap = obj_val * K - mean_reward``.
    """
    rewards = []
    start = time.perf_counter()
    pulls_per_period = int(alpha * K)  # loop-invariant
    for _ in range(times):
        # state maps (a, b) -> number of arms currently in that state.
        state = collections.defaultdict(int)
        state[(a0, b0)] = K
        reward_total = 0
        for t in range(T):
            # One posterior draw per arm, tagged with the arm's state.
            posterior_sample = []
            for a, b in state:
                for val in np.random.beta(a, b, state[(a, b)]):
                    posterior_sample.append((val, a, b))
            posterior_sample.sort(reverse = True)

            # Count how many arms of each state are among the top draws.
            pull = {key: 0 for key in state}
            for _, a, b in posterior_sample[:pulls_per_period]:
                pull[(a, b)] += 1

            reward_total += np.sum([a / (a + b) * pull[(a, b)] for a, b in pull])
            # Iterating `pull` (a snapshot of the keys) lets `state` grow safely.
            for a, b in pull:
                success = np.random.binomial(pull[(a, b)], a / (a + b))
                state[(a, b)] -= pull[(a, b)]
                state[(a + 1, b)] += success
                state[(a, b + 1)] += pull[(a, b)] - success

        rewards.append(reward_total)
    rewards = np.array(rewards)
    end = time.perf_counter()
    return obj_val * K - np.mean(rewards), K, times, np.mean(rewards), np.std(rewards), end - start

def wrapper(args):
    """Call ``batch_simulation`` with the positional arguments packed in ``args``.

    ``Pool.map`` hands each task a single object, so the argument tuple is
    unpacked here.
    """
    outcome = batch_simulation(*args)
    return outcome

In [4]:
from multiprocessing import Pool

def _reseed_worker():
    """Re-seed NumPy's global RNG inside a newly started pool worker."""
    np.random.seed()


def parallel_TS(T, N, alpha, n_proc, obj_val, times=1000):
    """Run ``n_proc`` simulation batches in a process pool and pool the results.

    Each batch is ``wrapper((T, N, alpha, times, obj_val))`` and yields
    ``(opt_gap, N, M, mean, std, time)``.

    Forked workers start with a copy of the parent's NumPy RNG state, so
    without re-seeding every worker would replay the same random stream.
    The pool initializer gives each worker fresh entropy so the batches are
    independent.

    Returns:
        Tuple ``(opt_gap, N, m, mean, std, comp_time)`` where ``m`` is the
        total number of runs, ``opt_gap`` and ``mean`` are averaged over
        batches, ``std`` is ``sqrt(sum(times * std_i**2)) / m`` and
        ``comp_time`` is the longest single batch duration.
    """
    args = (T, N, alpha, times, obj_val)
    # NOTE(review): the pool has one process per batch, which can exceed the
    # number of CPU cores for large n_proc.
    with Pool(n_proc, initializer=_reseed_worker) as pool:
        # res = [..., [opt_gap, N, M, mean, std, time],...]
        res = pool.map(wrapper, [args] * n_proc)
    m = times * n_proc
    mean = np.mean([item[3] for item in res])
    std = np.sqrt(np.sum([times * item[4] ** 2 for item in res])) / (n_proc * times)
    comp_time = np.max([item[-1] for item in res])
    opt_gap = np.mean([item[0] for item in res])
    return opt_gap, N, m, mean, std, comp_time


In [5]:
def thompson_sampling_simulation(N, alpha, model, times=1000):
    """Run the parallel Thompson-sampling simulation for ``N`` arms.

    The horizon and the benchmark objective are read from ``model``
    (``model.T`` and ``model.objVal``).  The number of batches handed to
    ``parallel_TS`` is ``int(N * 50 / times) + 1``, each of ``times`` runs.
    """
    batches = int(N * 50 / times) + 1
    return parallel_TS(model.T, N, alpha, batches, model.objVal, times)

In [None]:
import pandas as pd
import os

# Horizon and per-period pull budget; the same budget is used in every period.
T = 15
alpha = 1/3
model = fluid_model(T, [alpha for _ in range(T)])
# Solves the LP and fills in model.objVal, which the simulation reads.
model.calculate_diffusion_index()

def create_file(model, file_name, start=150, end=38400):
    """Simulate for N = start, 2*start, ... (while N <= end) and save to CSV.

    If ``file_name`` already exists nothing is done, even when that file
    holds only partial results.  Otherwise the CSV is written once with the
    row labels only and rewritten after every N, one column per N.  The
    pull fraction is read from the module-level ``alpha``.
    """
    if os.path.exists(file_name):
        return

    row_labels = ["opt-gap", "N", "M", "expect-reward", "std", "time"]
    df = pd.DataFrame(index=row_labels)
    df.to_csv(file_name, index=True)

    N = start
    while N <= end:
        df[N] = thompson_sampling_simulation(N, alpha, model, times=1000)
        # Checkpoint after every N so finished columns survive an interruption.
        df.to_csv(file_name, index=True)
        print(f"N: {N} finished. {df[N]}")
        N *= 2
    return
# Sweep the default N range and checkpoint to "ts-15-2"; returns at once if
# that file already exists.
create_file(model, file_name="ts-15-2")

In [6]:
import pandas as pd
import os

# Horizon and per-period pull budget; the same budget is used in every period.
T = 15
alpha = 1/3
model = fluid_model(T, [alpha for _ in range(T)])
# Solves the LP and fills in model.objVal, which the simulation reads.
model.calculate_diffusion_index()

def create_file(model, file_name, start=150, end=38400):
    """Simulate for N = start, 2*start, ... (while N <= end) and save to CSV.

    If ``file_name`` already exists nothing is done, even when that file
    holds only partial results.  Otherwise the CSV is written once with the
    row labels only and rewritten after every N, one column per N.  The
    pull fraction is read from the module-level ``alpha``.
    """
    if os.path.exists(file_name):
        return

    row_labels = ["opt-gap", "N", "M", "expect-reward", "std", "time"]
    df = pd.DataFrame(index=row_labels)
    df.to_csv(file_name, index=True)

    N = start
    while N <= end:
        df[N] = thompson_sampling_simulation(N, alpha, model, times=1000)
        # Checkpoint after every N so finished columns survive an interruption.
        df.to_csv(file_name, index=True)
        print(f"N: {N} finished. {df[N]}")
        N *= 2
    return
# Sweep the default N range and checkpoint to "ts-15-1"; returns at once if
# that file already exists.
create_file(model, file_name="ts-15-1")

Using license file /Users/xiangyuzhang/gurobi.lic
Academic license - for non-commercial use only
Changed value of parameter Method to 1
   Prev: -1  Min: -1  Max: 5  Default: -1
Obj: 3.5162
N: 150 finished. opt-gap            42.376073
N                 150.000000
M                8000.000000
expect-reward     485.053370
std                 0.126438
time               12.999925
Name: 150, dtype: float64
N: 300 finished. opt-gap             83.796159
N                  300.000000
M                16000.000000
expect-reward      971.062727
std                  0.133321
time                36.643834
Name: 300, dtype: float64
N: 600 finished. opt-gap            168.364110
N                  600.000000
M                31000.000000
expect-reward     1941.353662
std                  0.130936
time               109.318295
Name: 600, dtype: float64


Process ForkPoolWorker-57:
Process ForkPoolWorker-100:
Process ForkPoolWorker-84:
Process ForkPoolWorker-105:
Process ForkPoolWorker-94:
Process ForkPoolWorker-110:
Process ForkPoolWorker-114:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/a

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3326, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-6-600bd7461a40>", line 23, in <module>
    create_file(model, "ts-15-1")
  File "<ipython-input-6-600bd7461a40>", line 17, in create_file
    res = thompson_sampling_simulation(N, alpha, model, times=1000)
  File "<ipython-input-5-e1e7ffd4e1a7>", line 5, in thompson_sampling_simulation
    return parallel_TS(T, N, alpha, n_proc, obj_val, times)
  File "<ipython-input-4-d560e9d4ddc7>", line 7, in parallel_TS
    res = p.map(wrapper, [args for _ in range(n_proc)])
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 268, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 651, in get
    self.wait(timeout)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py

KeyboardInterrupt: 

In [None]:
# NOTE(review): create_file returns immediately when the target exists, so if
# the partial "ts-15-1" left by the interrupted run above is still on disk this
# call does nothing; remove or rename that file first to resume the sweep.
create_file(model, file_name="ts-15-1")