In [1]:
from opt.mc_sim import *
from common.variables import *
from sim.sim_functions import *
import torch
import gym
from gym import spaces
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete, Tuple, MultiBinary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
s = SourcingEnv()

In [3]:
class CustomGymEnv(Env):
    #metadata = {'render.modes': ['human']}  
    
    def __init__(self, sourcing_env):
        self.SourcingEnv = sourcing_env
        self.counter = 0
        
        # Actions we can take, down, stay, up
        self.action_space = MultiDiscrete([30,30])

        # Inventory Observation State
        self.observation_space = Box(low=np.array([-30, 0, 0, 0, 0]), high=np.array([30, 30, 30, 1, 1]), shape=(5,), dtype=int)
        #Tuple(Box(-30,30,shape=(1,), dtype=int), Discrete(30), Discrete(30), MultiBinary(2))
                                       
    
    def step(self, action):        
        reward = self.reward_func(self.SourcingEnv.current_state, action)
        next_state, event, i, event_probs, supplier_index = self.SourcingEnv.step(action)
        self.counter += 1
        
        info = {}
        
        if self.counter < PERIODS:
            done = False
        else:
            done = True
        
        next_state_array = np.array(next_state.get_list_repr())
        return next_state_array, reward, done, info
    
    def reset(self):
        self.SourcingEnv = SourcingEnv()
        return np.array(self.SourcingEnv.current_state.get_list_repr())
        
    def reward_func(self, state, action):
        reward_hb = H_COST * state.s if state.s >= 0 else  -B_PENALTY * state.s 
        reward = reward_hb + np.sum(np.multiply(action, PROCUREMENT_COST_VEC))
        reward = float(reward)
        return -reward
    
    #def render(self):
        # Implement viz
       # pass

In [4]:
s = SourcingEnv()

In [5]:
custom_gym_env = CustomGymEnv(s)

In [6]:
m_state = custom_gym_env.SourcingEnv.current_state
m_state.get_list_repr()

[0, 0, 0, 1, 1]

In [7]:
str(m_state)

'Stock: 0, n backorders: [0. 0.], supplier status (on/off): [1. 1.]'

In [8]:
from stable_baselines3.common.env_checker import check_env

In [9]:
check_env(custom_gym_env) #warn=True)

In [10]:
episodes = 15
for episode in range(1, episodes+1):
    state = custom_gym_env.reset()
    done = False
    cost = 0
    
    
    while not done:
        #env.render()
        action = custom_gym_env.action_space.sample()
        n_state, reward, done, info = custom_gym_env.step(action)
        cost+=reward
        observation = custom_gym_env.step(action)
    print('Episode:{} Cost:{} Observation {}' .format(episode, cost, observation))
    

custom_gym_env.close()

Episode:1 Cost:-355.0 Observation (array([26, 10, 26,  1,  1]), -355.0, True, {})
Episode:2 Cost:-770.0 Observation (array([17, 17,  2,  1,  1]), -770.0, True, {})
Episode:3 Cost:-795.0 Observation (array([15, 15, 48,  1,  1]), -795.0, True, {})
Episode:4 Cost:-810.0 Observation (array([16, 16, 36,  1,  1]), -810.0, True, {})
Episode:5 Cost:-1195.0 Observation (array([14, 50, 14,  1,  1]), -1195.0, True, {})
Episode:6 Cost:-1285.0 Observation (array([26, 26, 46,  1,  1]), -1285.0, True, {})
Episode:7 Cost:-60.0 Observation (array([12,  0, 12,  1,  1]), -60.0, True, {})
Episode:8 Cost:-880.0 Observation (array([19, 19, 10,  1,  1]), -880.0, True, {})
Episode:9 Cost:-1305.0 Observation (array([29, 29,  0,  1,  1]), -1305.0, True, {})
Episode:10 Cost:-440.0 Observation (array([ 8,  8, 32,  1,  1]), -440.0, True, {})
Episode:11 Cost:-530.0 Observation (array([11, 11, 14,  1,  1]), -530.0, True, {})
Episode:12 Cost:-825.0 Observation (array([18, 18,  6,  1,  1]), -825.0, True, {})
Episode:1

In [11]:
custom_gym_env.observation_space

Box([-30   0   0   0   0], [30 30 30  1  1], (5,), int64)

In [12]:
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [13]:
log_path = os.path.join('Training', 'Logs')

In [14]:
model = PPO("MlpPolicy", custom_gym_env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [15]:
model.learn(total_timesteps=100000)

Logging to Training/Logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -740     |
| time/              |          |
|    fps             | 1206     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -674        |
| time/                   |             |
|    fps                  | 978         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014608638 |
|    clip_fraction        | 0.116       |
|    clip_range           | 0.2         |
|    entropy_loss         | -6.8        |
|    explained_variance   | 0           |

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -145        |
| time/                   |             |
|    fps                  | 846         |
|    iterations           | 11          |
|    time_elapsed         | 26          |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.052655995 |
|    clip_fraction        | 0.645       |
|    clip_range           | 0.2         |
|    entropy_loss         | -5.27       |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 2.26e+04    |
|    n_updates            | 100         |
|    policy_gradient_loss | -0.108      |
|    value_loss           | 5.23e+04    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1     

--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1        |
|    ep_rew_mean          | -4.3     |
| time/                   |          |
|    fps                  | 837      |
|    iterations           | 21       |
|    time_elapsed         | 51       |
|    total_timesteps      | 43008    |
| train/                  |          |
|    approx_kl            | 0.177923 |
|    clip_fraction        | 0.856    |
|    clip_range           | 0.2      |
|    entropy_loss         | -1.16    |
|    explained_variance   | 0        |
|    learning_rate        | 0.0003   |
|    loss                 | 44.8     |
|    n_updates            | 200      |
|    policy_gradient_loss | -0.121   |
|    value_loss           | 82.9     |
--------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | -1.9       |
| time/          

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 835         |
|    iterations           | 31          |
|    time_elapsed         | 75          |
|    total_timesteps      | 63488       |
| train/                  |             |
|    approx_kl            | 0.000356866 |
|    clip_fraction        | 0.00181     |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.00611    |
|    explained_variance   | 0           |
|    learning_rate        | 0.0003      |
|    loss                 | 0.000616    |
|    n_updates            | 300         |
|    policy_gradient_loss | -0.00276    |
|    value_loss           | 1.26        |
-----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 1   

--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 1        |
|    ep_rew_mean          | 0        |
| time/                   |          |
|    fps                  | 833      |
|    iterations           | 41       |
|    time_elapsed         | 100      |
|    total_timesteps      | 83968    |
| train/                  |          |
|    approx_kl            | 0.0      |
|    clip_fraction        | 0        |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.00295 |
|    explained_variance   | nan      |
|    learning_rate        | 0.0003   |
|    loss                 | 0.00247  |
|    n_updates            | 400      |
|    policy_gradient_loss | 0        |
|    value_loss           | 0.00274  |
--------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | 0          |
| time/          

<stable_baselines3.ppo.ppo.PPO at 0x7fc171b5c250>

In [16]:
model.save('PPO')

In [17]:
evaluate_policy(model, custom_gym_env, n_eval_episodes=10, render=False)



(0.0, 0.0)

In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cpu
