Import packages.

In [2]:
import gym
import numpy as np
import dual_sourcing

Set up configurations for the environment.

In [23]:
Ghat = np.random.geometric(p=0.5)
Lambda = 10
init_inventory  = 0
for i in range(Ghat):
    init_inventory = init_inventory - np.random.poisson(Lambda)
print(init_inventory)

-19


In [27]:
CONFIG = {'Lr': 5, 'Le': 1, 'cr': 100, 'ce': 105, 'lambda': Lambda,
          'h': 1, 'b': 19, 'starting_state': np.append([0]*6, init_inventory), 'max_order': 20, 'max_inventory': 1000}

Make an instance of the environment.

In [28]:
env = gym.make('DualSourcing-v0', config=CONFIG)

Print the environment settings.

In [29]:
print(env.state)
print(env.action_space)
print(env.observation_space)

[  0   0   0   0   0   0 -19]
MultiDiscrete([21 21])
MultiDiscrete([  21   21   21   21   21   21 1000])


Test the step function.

In [5]:
env.seed(0)
env.state = [0] * 7
print(env.step([8, 2]))
print('***')
env.state = [5, 4, 5, 6, 6, 2, 2]
print(env.step([3, 1]))

(array([  0,   0,   0,   0,   8,   2, -10]), 0, 10, {})
***
(array([ 4,  5,  6,  6,  3,  1, -2]), -812, 11, {})


Evaluate an estimate of the value function for a certain policy by doing multiple episodes of simulation.

In [33]:
def evaluate(env, n_episodes, numiters, policy, *args):
    # policy: policy function
    # env: gym environment
    # n_episodes: number of total episodes to run (outer iteration)
    # numiters: number of time steps (inner iteration)
    
    av_reward = np.zeros(n_episodes)
    
    for i in range(n_episodes):
        av_r = 0
        env.reset() # reset environment
        for t in range(numiters):
            action = policy(*args) # use a constant action
            state, reward, demand, done, info = env.step(action)
            av_r = av_r + reward
        av_reward[i] = av_r / numiters
        
    return np.mean(av_reward), np.std(av_reward) # return average reward and std

In [6]:
def constant_policy(env):
    return np.array([0, 10])

In [38]:
evaluate(env, 100, 10000, constant_policy, env)

(-2879.9507599999993, 1905.6715052930335)

In [60]:
def TBS(env, Q, Se):
    ip = np.sum(env.state[1:env.Le]) + np.sum(env.state[env.Lr+1:])
    return Q, max(0, ip-Se)

In [67]:
evaluate(env, 100, 1000, TBS, env, 5, 3)

(-48426.34678, 1150.3982653473936)

In [1]:
def evaluate_done(env, n_episodes, numiters, policy, *args):
    # policy: policy function
    # env: gym environment
    # n_episodes: number of total episodes to run (outer iteration)
    # numiters: number of time steps (inner iteration)
    
    av_reward = np.zeros(n_episodes)
    
    for i in range(n_episodes):
        av_r = 0
        env.reset() # reset environment
        for t in range(numiters):
            action = policy(*args) # use a constant action
            state, reward, demand, done, info = env.step(action)
            av_r = av_r + reward
            if done:
                break
        print(done, t)
    
        av_reward[i] = av_r / numiters
    
        
    return np.mean(av_reward), np.std(av_reward) # return average reward and std

In [35]:
evaluate_done(env, 100, 1000000, constant_policy, env)

True 2644
True 7356
True 267
True 167
True 6281
True 949
True 6909
True 107
True 1144
True 77
True 430
True 73
True 136
True 27
True 133
True 63668
True 36
True 113
True 459
True 5375
True 168
True 184
True 1064
True 167
True 39
True 320
True 78
True 239
True 1690
True 141
True 86
True 274
True 47
True 211
True 4272
True 3435
True 123
True 63
True 224
True 9662
True 125
True 428
True 595
True 114
True 89
True 21
True 606
True 13881
True 193
True 98
True 79
True 109
True 1080
True 43
True 4795
True 125
True 195
True 1260
True 4867
True 28
True 3635
True 345
True 88
True 3170
True 57
True 1523
True 186
True 122
True 418
True 150
True 114
True 276
True 7167
True 2640
True 252
True 61
True 130
True 38
True 383
True 8741
True 42
True 157
True 2519
True 4641
True 186
True 429
True 5543
True 32
True 295706
True 211
True 42
True 25
True 722
True 89
True 127
True 1277
True 5151
True 90
True 335
True 266


(-89.18548736999999, 779.2597477737679)

In [37]:
evaluate_done(env, 100, 10000, constant_policy, env)

False 9999
False 9999
True 53
True 399
True 834
True 2116
False 9999
True 1399
True 154
True 345
True 162
False 9999
True 4685
True 8033
True 114
True 62
True 1064
True 176
True 416
True 408
True 140
True 347
True 7639
True 116
True 43
True 138
True 654
True 7138
True 57
False 9999
True 116
True 1565
True 49
True 275
True 414
True 394
True 384
False 9999
True 75
True 249
True 5805
True 159
True 230
True 1312
True 147
True 675
True 28
True 62
True 54
True 85
True 605
True 160
True 52
True 3509
True 2847
True 46
True 274
True 41
False 9999
True 41
True 1568
True 14
True 85
True 323
True 253
True 4606
True 909
True 68
True 6227
True 21
True 418
True 308
True 45
True 320
True 4734
False 9999
True 1637
True 75
True 135
True 1310
True 3622
True 2118
True 602
True 1030
True 442
True 697
True 120
False 9999
True 394
True 53
False 9999
True 398
True 335
True 20
True 377
True 1010
True 44
True 963
True 417
True 132


(-881.7967240000002, 1934.8869036319534)