### Does offline(known) and offline(unknown) equilavent?

In [1]:
import numpy as np
from datetime import datetime
import coptpy as cp 
from coptpy import COPT

def offline_policy_linear(env_dict,T_type='unknown', runtime = False, Qcons = False):
    env = cp.Envr()
    model = env.createModel("offline_policy")
    model.setParam('Logging', False)

    B,bmax,fun = env_dict['B'],env_dict['bmax'],env_dict['f']
    T = env_dict['N'] if T_type == 'known' else env_dict['N_max']
    x = model.addMVar(T, lb=0, ub=bmax,vtype=COPT.CONTINUOUS, nameprefix="x")
    f = np.array([fun[t]['params']['f1'] for t in range(T)])
    if T_type == 'known':
        def loss(x):
            return f@x
        model.addConstr(x@np.ones(T)<=B)
    elif T_type == 'unknown':
        def loss(x):
            return f@np.diag(env_dict['Qs'][:T])@x
        if Qcons:
            model.addConstr(x@np.array(env_dict['Qs'][:T])<=B)
        else:
            model.addConstr(x@np.ones(T)<=B)
    start_time = datetime.now()
    model.setObjective(loss(x), sense=COPT.MAXIMIZE)
    model.solve()
    if model.status == COPT.OPTIMAL:
        solution=[(x[t].X) for t in range(T)]
        fun = model.objval
    else:
        print("Optimization was stopped with status:", model.status)
        solution=None
        fun=None
    if runtime:
        print('optimization time for N={T}:', datetime.now()-start_time)
    return solution, fun

In [29]:
from scipy.stats import rv_discrete

args = {
        'N_type': 'normal',
        'N_min': 5,
        'N_max':200, # NOTE: when Q_exp, set large N_max(200)!
                    # NOTE: when Q_power, set large N_max(70)!
        'N_mean':10,
        'N_std':1,
        'B_type':'mean',
        'astype':'multiplicative',
        'as_scale':5,
        'as_stepsize':10,
        'max_scale':50,
        'rounds':3,
        'Q_beta':0.99,
        'Q_alpha':0.75,
        'f_shape':'linear',
        'f_coef':rv_discrete(values=([2,3], [0.5,0.5])),# x**0.9
    }

In [7]:
N_mean = 3
N_min, N_max = N_mean, N_mean 
rv = rv_discrete(values=([N_mean],[1]))
Qs = np.array([1 - rv.cdf(t - 0.001) for t in range(1, N_max + 1)]+[0])
Qs
probs = [Qs[n]-Qs[n+1] for n in range(N_max)]
probs.append(Qs[-1])
probs

[0.0, 0.0, 1.0, 0.0]

In [73]:
from scipy.stats import truncnorm
from src.utils import rv_cont2dsct
N_mean = 5
N_std = 1
N_min, N_max = N_mean-2*N_std, N_mean+2*N_std #[3,7]
a, b = (N_min - N_mean) / N_std, (N_max - N_mean) / N_std
rv_continuous = truncnorm(a, b, loc=N_mean, scale=N_std) 
rv = rv_cont2dsct(rv_continuous,N_min,N_max)
Qs = np.array([1 - rv.cdf(t - 0.001) for t in range(1, N_max + 1)]+[0])
print(Qs)
probs = [Qs[n]-Qs[n+1] for n in range(N_max)]
probs.append(Qs[-1])
print(probs)
sum(probs)

[1.         1.         1.         0.95384276 0.70058933 0.29941067
 0.04615724 0.        ]
[0.0, 0.0, 0.04615723572698305, 0.2532534356100451, 0.4011786573259435, 0.2532534356100453, 0.04615723572698305, 0.0]


1.0

In [74]:
from scipy.stats import truncnorm, truncexpon
from src.utils import rv_cont2dsct
N_mean = 5
N_min = 1
N_max = 2*N_mean 
b = (N_max - N_min) / N_mean
rv_continuous = truncexpon(b=b, loc=N_min, scale=N_mean)
rv = rv_cont2dsct(rv_continuous,N_min,N_max)
Qs = np.array([1 - rv.cdf(t - 0.001) for t in range(1, N_max + 1)]+[0])
print(Qs)
probs = [Qs[n]-Qs[n+1] for n in range(N_max)]
probs.append(Qs[-1])
print(probs)
sum(probs)

[1.         0.88599203 0.68949151 0.5286105  0.39689227 0.2890505
 0.20075713 0.12846863 0.06928381 0.02082738 0.        ]
[0.11400797317891087, 0.1965005138243826, 0.16088101366364715, 0.131718233472787, 0.10784176848527638, 0.08829337232521173, 0.07228849921561509, 0.05918481740167769, 0.04845643012205836, 0.02082737831043313, 0.0]


1.0

In [None]:
from src.Env import Envr
env = Envr(args)
env_dict = env.draw_instance()
Qs = env_dict['Qs']
probs = [Qs[n]-Qs[n+1] for n in range(env_dict['N_max'])]
probs.append(Qs[-1])
known_reward = 0
for T in range(env.N_min,env.N_max+1):
    env_dict['N'] = T
    _,fun_known = offline_policy_linear(env_dict,'known',False)
    known_reward += probs[T-1]*fun_known
_,fun_unknown = offline_policy_linear(env_dict,'unknown',False,True)
print(known_reward,fun_unknown)

In [16]:
def offline_policy_sqrt(env_dict,T_type='unknown', runtime = False, Qcons = False):
    env = cp.Envr()
    model = env.createModel("offline_policy")
    model.setParam('Logging', False)

    B,bmax,fun = env_dict['B'],env_dict['bmax'],env_dict['f']
    T = env_dict['N'] if T_type == 'known' else env_dict['N_max']
    x = model.addMVar(T, lb=0, ub=np.sqrt(bmax),vtype=COPT.CONTINUOUS, nameprefix="x")
    f = np.array([fun[t]['params']['f1'] for t in range(T)])
    if T_type == 'known':
        def loss(x):
            return f@x
        model.addQConstr(x@x<=B)
    elif T_type == 'unknown':
        def loss(x):
            return f@np.diag(env_dict['Qs'][:T])@x
        if Qcons:
            model.addQConstr(x@np.diag(env_dict['Qs'][:T])@x<=B)
        else:
            model.addQConstr(x@x<=B)
    start_time = datetime.now()
    
    model.setObjective(loss(x), sense=COPT.MAXIMIZE)
    model.solve()
    if model.status == COPT.OPTIMAL:
        solution=[(x[t].X)**2 for t in range(T)]
        fun = model.objval
    else:
        print("Optimization was stopped with status:", model.status)
        solution=None
        fun=None
    if runtime:
        print('optimization time for N={T}:', datetime.now()-start_time)
    return solution, fun

In [None]:
from src.Env import Envr
args['f_shape'] = 'sqrt'
env = Envr(args)
env_dict = env.draw_instance()
Qs = env_dict['Qs']
probs = [Qs[n]-Qs[n+1] for n in range(env_dict['N_max'])]
probs.append(Qs[-1])
known_reward = 0
for T in range(env.N_min,env.N_max+1):
    env_dict['N'] = T
    _,fun_known = offline_policy_sqrt(env_dict,'known',False)
    known_reward += probs[T-1]*fun_known
_,fun_unknown_Q = offline_policy_sqrt(env_dict,'unknown',False,True)
_,fun_unknown = offline_policy_sqrt(env_dict,'unknown',False,False)
print(known_reward,fun_unknown_Q,fun_unknown)

In [60]:
def wrapper(args,f_shape,random_coef,N_type):
    args['N_type'] = N_type 
    args['f_shape'] = f_shape
    if random_coef:
        args['f_coef']=rv_discrete(values=([2,3], [0.5,0.5]))
    else:
        args['f_coef']=rv_discrete(values=([2,2], [0.5,0.5]))
    env = Envr(args)
    env_dict = env.draw_instance()
    Qs = env_dict['Qs']
    probs = [Qs[n]-Qs[n+1] for n in range(env_dict['N_max'])]
    probs.append(Qs[-1])
    known_reward = 0
    for T in range(env.N_min,env.N_max+1):
        env_dict['N'] = T
        _,fun_known = offline_policy_linear(env_dict,'known',False)
        known_reward += probs[T-1]*fun_known
    _,fun_unknown_Q = offline_policy_linear(env_dict,'unknown',False,True)
    _,fun_unknown = offline_policy_linear(env_dict,'unknown',False,False)
    result = [f"{f_shape}_RandomCoef_{random_coef}_{N_type}: exact {known_reward} approx_Q {fun_unknown_Q} approx {fun_unknown}"]
    return result

In [None]:
all_result = []
result = wrapper(args,'linear',True,'normal')
all_result.append(result)

result = wrapper(args,'linear',False,'normal')
all_result.append(result)

result = wrapper(args,'sqrt',True,'normal')
all_result.append(result)

result = wrapper(args,'sqrt',False,'normal')
all_result.append(result)


In [68]:
for result in all_result:
    print(result)

['linear_RandomCoef_True_normal: exact 167.02831319698186 approx_Q 167.0283131969817 approx 161.90412135301594']
['linear_RandomCoef_False_normal: exact 120.0 approx_Q 120.0 approx 120.0']
['sqrt_RandomCoef_True_normal: exact 162.04957890126454 approx_Q 162.04957890126448 approx 159.2947146967115']
['sqrt_RandomCoef_False_normal: exact 120.0 approx_Q 120.0 approx 120.0']


In [None]:
all_result = []
result = wrapper(args,'linear',True,'exponential')
all_result.append(result)

result = wrapper(args,'linear',False,'exponential')
all_result.append(result)

result = wrapper(args,'sqrt',True,'exponential')
all_result.append(result)

result = wrapper(args,'sqrt',False,'exponential')
all_result.append(result)

In [70]:
for result in all_result:
    print(result)

['linear_RandomCoef_True_exponential: exact 90.38967692322203 approx_Q 104.14306039271825 approx 87.1304439713688']
['linear_RandomCoef_False_exponential: exact 70.26807430484193 approx_Q 84.02145777433813 approx 70.26807430484193']
['sqrt_RandomCoef_True_exponential: exact 87.73898576320818 approx_Q 101.49236923270436 approx 83.86335925448854']
['sqrt_RandomCoef_False_exponential: exact 70.26807430484193 approx_Q 84.02145777433813 approx 70.26807430484193']


In [72]:
env.N_min

48

## 结论
b is always linear

f is linear, 与约束不加Q等价 [9.974094096084459 9.974094096084459]

f is sqrt, 始终不等价(加 Q [9.948586323679674 10.000000049839336],
                        不加Q [9.948586323679674 9.482202012847173])