In [1]:
import pandas as pd
import numpy as np
import time

def reward(s):
    """Return the expected one-pull reward of an arm in state ``s``.

    ``s`` is a pair ``(a, b)``; the result is ``a / (a + b)``.
    """
    successes, failures = s
    total = successes + failures
    return successes / total

def transition(s, a, ss):
    """Return the probability of moving from state ``s`` to ``ss`` under action ``a``.

    Parameters
    ----------
    s, ss : tuple
        Current and successor states, each a pair ``(x, y)``.
    a : int
        ``0`` leaves the state unchanged with probability 1; ``1`` moves to
        ``(x + 1, y)`` with probability ``x / (x + y)`` and to ``(x, y + 1)``
        with probability ``y / (x + y)``.

    Returns
    -------
    int or float
        The transition probability; ``0`` for every unreachable successor.

    Raises
    ------
    ValueError
        If ``a`` is neither 0 nor 1.
    """
    if a == 0:
        return int(s == ss)
    if a == 1:
        x, y = s
        c, d = ss
        if (c, d) == (x, y + 1):
            return y / (x + y)
        if (c, d) == (x + 1, y):
            return x / (x + y)
        # Any other successor is unreachable. This includes states whose total
        # count matches but where neither coordinate is unchanged, e.g.
        # (1, 1) -> (3, 0), for which the old code fell through and returned None.
        return 0
    raise ValueError(f"unknown action {a!r}; expected 0 or 1")

def score(s, theta = 0.5):
    """Return the priority index of state ``s = (a, b)``.

    The index is ``mean + theta * std`` where ``mean = a / (a + b)`` and
    ``std = sqrt(a * b / ((a + b)**2 * (a + b + 1)))``, i.e. the mean and
    standard deviation of a Beta(a, b) distribution. ``theta`` controls how
    much the spread is rewarded.
    """
    a, b = s
    n = a + b
    posterior_mean = a / n
    posterior_std = np.sqrt(a * b / n / n / (n + 1))
    return posterior_mean + posterior_std * theta

def pull(current, K, alpha = 1/3):
    """Decide how many arms of each state to pull this period.

    States are ranked by ``score`` (highest first; equal scores are ordered by
    the larger state tuple) and a budget of ``int(alpha * K)`` pulls is spent
    greedily down that ranking.

    Parameters
    ----------
    current : dict
        Maps a state ``(a, b)`` to the number of arms currently in it.
    K : int
        Population size used to derive the pull budget.
    alpha : float, optional
        Fraction of ``K`` that may be pulled in one period.

    Returns
    -------
    tuple
        ``(allocation, reward_period)``: ``allocation`` maps each selected
        state to its number of pulls, and ``reward_period`` is the sum of
        ``pulls * reward(state)`` over the allocation.
    """
    ranked = sorted(((score(s), s) for s in current), reverse=True)

    allocation = {}
    reward_period = 0
    resource = int(alpha * K)
    for _, s in ranked:
        if resource <= 0:
            break
        # min() also covers resource == current[s]. The previous strict
        # `>` / `<` comparisons matched neither branch in that case, skipping
        # the best-ranked state and spending the budget on a worse one.
        take = min(resource, current[s])
        allocation[s] = take
        reward_period = reward_period + take * reward(s)
        resource = resource - take
    return allocation, reward_period

def proceed(current, pull):
    """Advance the population one period given the pull allocation.

    Parameters
    ----------
    current : dict
        Maps a state ``(a, b)`` to the number of arms in it.
    pull : dict
        Maps a state to the number of its arms pulled this period.

    Returns
    -------
    dict
        The next-period state counts. Each pulled arm in ``(a, b)`` moves to
        ``(a + 1, b)`` with probability ``reward((a, b))`` (sampled with one
        binomial draw per pulled state) and otherwise to ``(a, b + 1)``.
    """
    # States that were not selected keep their full count.
    next_time = {arm: count for arm, count in current.items() if arm not in pull}

    # Partially pulled states keep only the arms that stayed idle.
    next_time.update(
        (arm, count - pull[arm])
        for arm, count in current.items()
        if arm in pull and count > pull[arm]
    )

    for arm, n_pulled in pull.items():
        a, b = arm
        successes = np.random.binomial(n_pulled, reward(arm), 1)[0]
        success_state = (a + 1, b)
        failure_state = (a, b + 1)
        next_time[success_state] = next_time.get(success_state, 0) + successes
        next_time[failure_state] = next_time.get(failure_state, 0) + (n_pulled - successes)
    return next_time



def batched_simulation(T, K, alpha, times, obj_val):
    """Run ``times`` independent simulations of the index policy and summarize them.

    Every run starts with all ``K`` arms in state ``(1, 1)`` and alternates
    ``pull`` / ``proceed`` for ``T`` periods, accumulating the per-period
    reward reported by ``pull``.

    Parameters
    ----------
    T : int
        Number of periods per run.
    K : int
        Number of arms.
    alpha : float
        Fraction of arms that may be pulled each period.
    times : int
        Number of simulation runs in this batch.
    obj_val : float
        Per-arm benchmark value; ``obj_val * K`` is compared with the mean
        simulated reward.

    Returns
    -------
    tuple
        ``(opt_gap, K, times, mean_reward, std_reward, elapsed_seconds)`` where
        ``opt_gap = obj_val * K - mean_reward``.
    """
    reward_list = []
    start = time.time()
    for _ in range(times):
        current = {(1, 1): K}
        reward_total = 0
        for _ in range(T):
            # Forward alpha explicitly: it used to be dropped here, so the
            # budget always fell back to pull's default of 1/3.
            pulling, reward_period = pull(current, K, alpha)
            reward_total = reward_total + reward_period
            current = proceed(current, pulling)
        reward_list.append(reward_total)
    rewards = np.array(reward_list)
    end = time.time()
    mean_reward = np.mean(rewards)
    return obj_val * K - mean_reward, K, times, mean_reward, np.std(rewards), end - start

def wrapper(args):
    """Unpack one argument tuple and run ``batched_simulation`` with it.

    ``Pool.map`` passes a single object per task, so the positional arguments
    of ``batched_simulation`` travel as one tuple and are expanded here.
    """
    outcome = batched_simulation(*args)
    return outcome

In [2]:
from multiprocessing import Pool

def _reseed_worker():
    """Give the calling worker process its own NumPy global RNG state.

    Under the ``fork`` start method every worker inherits a copy of the
    parent's ``np.random`` state, so without reseeding all workers replay the
    same random stream and the batches are duplicates, not independent samples.
    """
    np.random.seed()


def parallel_UCB(T, N, alpha, n_proc, obj_val, times=1000):
    """Run ``n_proc`` batches of ``times`` simulations in parallel and pool them.

    Parameters
    ----------
    T : int
        Number of periods per simulation run.
    N : int
        Number of arms.
    alpha : float
        Fraction of arms that may be pulled each period.
    n_proc : int
        Number of worker processes, and also the number of batches.
    obj_val : float
        Per-arm benchmark value forwarded to ``batched_simulation``.
    times : int, optional
        Simulation runs per batch.

    Returns
    -------
    tuple
        ``(opt_gap, N, m, mean, std, comp_time)`` where ``m = times * n_proc``
        is the total number of runs, ``mean`` / ``opt_gap`` are averages of
        the per-batch values, ``std`` is the standard error of the pooled mean
        (``sqrt(sum_i times * std_i**2) / (n_proc * times)``), and
        ``comp_time`` is the slowest batch's elapsed time.
    """
    args = (T, N, alpha, times, obj_val)
    # NOTE(review): n_proc is both the pool size and the task count, so large N
    # spawns a very large number of processes; consider capping the pool size.
    with Pool(n_proc, initializer=_reseed_worker) as p:
        # res = [..., [opt_gap, N, M, mean, std, time],...]
        res = p.map(wrapper, [args for _ in range(n_proc)])
    m = times * n_proc
    mean = np.mean([item[3] for item in res])
    std = np.sqrt(np.sum([times * item[4] ** 2 for item in res])) / (n_proc * times)
    comp_time = np.max([item[-1] for item in res])
    opt_gap = np.mean([item[0] for item in res])
    return opt_gap, N, m, mean, std, comp_time


In [3]:
def UCB_simulation(N, alpha, model, times=1000):
    """Simulate the policy for ``N`` arms using the settings stored on ``model``.

    The number of periods is read from ``model.T`` and the benchmark from
    ``model.objVal``. The number of parallel batches is
    ``int(N * 50 / times) + 1``, so the total run count grows with ``N``.
    Returns whatever ``parallel_UCB`` returns.
    """
    horizon, benchmark = model.T, model.objVal
    batches = 1 + int(N * 50 / times)
    return parallel_UCB(horizon, N, alpha, batches, benchmark, times)

In [4]:
import pandas as pd
import os
from lp import fluid_model

# Problem configuration. T and alpha are handed to fluid_model (alpha repeated
# once per period); presumably T is the horizon and alpha the per-period pull
# fraction -- confirm against lp.fluid_model.
# NOTE: `alpha` is also read as a module-level global inside create_file below,
# so rebinding it after this cell changes what the simulations use.
T, alpha = 20, 1/3
model = fluid_model(T, [alpha]*T)
# UCB_simulation later reads model.T and model.objVal from this object.
model.calculate_diffusion_index()

def create_file(model, file_name, start=150, end=38400):
    """Sweep the population size and checkpoint the results to a CSV file.

    Does nothing if ``file_name`` already exists. Otherwise simulates
    ``N = start, 2*start, 4*start, ...`` while ``N <= end``, storing each
    result tuple as a column labelled ``N`` and rewriting the CSV after every
    size so partial progress is kept on disk.

    The pull fraction comes from the module-level ``alpha``, not from an
    argument. Returns ``None``.
    """
    def population_sizes():
        # Doubling schedule: start, 2*start, ... up to and including `end`.
        size = start
        while size <= end:
            yield size
            size = size * 2

    if not os.path.exists(file_name):
        df = pd.DataFrame(index=["opt-gap", "N", "M", "expect-reward", "std", "time"])
        df.to_csv(file_name, index=True)
        for N in population_sizes():
            df[N] = UCB_simulation(N, alpha, model, times=1000)
            df.to_csv(file_name, index=True)
            print(f"N: {N} finished. {df[N]}")

Using license file /home/xz556/gurobi.lic
Academic license - for non-commercial use only
Changed value of parameter Method to 1
   Prev: -1  Min: -1  Max: 5  Default: -1
Obj: 4.81431


In [5]:
# Run the sweep with create_file's default range (N = 150, doubling while
# N <= 38400) and checkpoint to "ucb-20"; this is a no-op if that file exists.
create_file(model, "ucb-20")

N: 150 finished. opt-gap             2.264984
N                 150.000000
M                8000.000000
expect-reward     719.881790
std                 0.194248
time               30.376941
Name: 150, dtype: float64
N: 300 finished. opt-gap              3.060103
N                  300.000000
M                16000.000000
expect-reward     1441.233445
std                  0.182625
time                32.340412
Name: 300, dtype: float64
N: 600 finished. opt-gap              6.263231
N                  600.000000
M                31000.000000
expect-reward     2882.323865
std                  0.194516
time                46.782664
Name: 600, dtype: float64
N: 1200 finished. opt-gap              7.847624
N                 1200.000000
M                61000.000000
expect-reward     5769.326568
std                  0.183868
time                76.156630
Name: 1200, dtype: float64
N: 2400 finished. opt-gap              15.916589
N                  2400.000000
M                121000.000000
e