In [1]:
from opt.mc_sim import *
from common.variables import *
from sim.sim_functions import *
import torch
import gym
from gym import spaces
from gym import Env
from gym.spaces import Discrete, Box, MultiDiscrete, Tuple, MultiBinary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
s = SourcingEnv()

In [3]:
class CustomGymEnv(Env):
        
    def __init__(self, sourcing_env):
        self.SourcingEnv = sourcing_env
        self.counter = 0
        
        # Actions we can take, down, stay, up
        self.action_space = MultiDiscrete([2,INVEN_LIMIT])

        # Inventory Observation State
        self.observation_space = Box(-30,30,shape=(5,), dtype=int)
        #Tuple(Box(-30,30,shape=(1,), dtype=int), Discrete(30), Discrete(30), MultiBinary(2))
                                       
    
    def step(self, action):        
        reward = self.reward_func(self.SourcingEnv.current_state, action)
        next_state, event, i, event_probs, supplier_index = self.SourcingEnv.step(action)
        self.counter += 1
        
        info = {}
        
        if self.counter < PERIODS:
            done = False
        else:
            done = True
        
        next_state_array = np.array(next_state.get_list_repr())
        return next_state_array, reward, done, info
    
    def reset(self):
        self.SourcingEnv = SourcingEnv()
        return np.array(self.SourcingEnv.current_state.get_list_repr())
        
    def reward_func(self, state, action):
        reward_hb = H_COST * state.s if state.s >= 0 else B_PENALTY * state.s 
        reward = reward_hb + np.sum(np.multiply(action, PROCUREMENT_COST_VEC))
        reward = float(reward)
        return -reward

In [4]:
INVEN_LIMIT

30

In [5]:
Discrete(INVEN_LIMIT)

Discrete(30)

In [6]:
s = SourcingEnv()

In [7]:
custom_gym_env = CustomGymEnv(s)

In [8]:
m_state = custom_gym_env.SourcingEnv.current_state
m_state.get_list_repr()

[0, 0, 0, 1, 1]

In [9]:
PERIODS = 10

In [10]:
PERIODS

10

In [11]:
custom_gym_env.step(np.array([1,3]))

(array([0, 1, 3, 1, 1]), -60.0, False, {})

In [12]:
from stable_baselines3.common.env_checker import check_env

In [13]:
check_env(custom_gym_env) #warn=True)

In [14]:
episodes = 15
for episode in range(1, episodes+1):
    state = custom_gym_env.reset()
    done = False
    cost = 0
    
    
    while not done:
        
        action = custom_gym_env.action_space.sample()
        n_state, reward, done, info = custom_gym_env.step(action)
        cost+=reward
        observation = custom_gym_env.step(action)
    print('Episode:{} Cost:{} Observation {}' .format(episode, cost, observation))
custom_gym_env.close()

Episode:1 Cost:-60.0 Observation (array([0, 1, 6, 1, 1]), -50.0, True, {})
Episode:2 Cost:-65.0 Observation (array([1, 1, 8, 1, 1]), -65.0, True, {})
Episode:3 Cost:-115.0 Observation (array([14,  2, 14,  1,  1]), -115.0, True, {})
Episode:4 Cost:-175.0 Observation (array([26,  2, 26,  1,  1]), -175.0, True, {})
Episode:5 Cost:-35.0 Observation (array([7, 0, 7, 1, 1]), -35.0, True, {})
Episode:6 Cost:-100.0 Observation (array([20,  0, 20,  1,  1]), -100.0, True, {})
Episode:7 Cost:-170.0 Observation (array([25,  2, 25,  1,  1]), -170.0, True, {})
Episode:8 Cost:-65.0 Observation (array([0, 1, 8, 1, 1]), -55.0, True, {})
Episode:9 Cost:-125.0 Observation (array([25,  0, 25,  1,  1]), -125.0, True, {})
Episode:10 Cost:-95.0 Observation (array([ 0,  2, 20,  0,  1]), -95.0, True, {})
Episode:11 Cost:-55.0 Observation (array([10,  0, 11,  1,  1]), -45.0, True, {})
Episode:12 Cost:-105.0 Observation (array([21,  0, 21,  1,  1]), -105.0, True, {})
Episode:13 Cost:-5.0 Observation (array([-1, 

In [15]:
import os
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [16]:
log_path = os.path.join('Training', 'Logs')

In [17]:
model = PPO("MlpPolicy", custom_gym_env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [18]:
model.learn(total_timesteps=5000)

Logging to Training/Logs/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -87.6    |
| time/              |          |
|    fps             | 1191     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1           |
|    ep_rew_mean          | -82.2       |
| time/                   |             |
|    fps                  | 973         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.029239912 |
|    clip_fraction        | 0.474       |
|    clip_range           | 0.2         |
|    entropy_loss         | -4.08       |
|    explained_variance   | 0           |

<stable_baselines3.ppo.ppo.PPO at 0x7fa3fcc567c0>

In [19]:
model.save('PPO')

In [20]:
evaluate_policy(model, custom_gym_env, n_eval_episodes=10, render=False)



(-5.0, 0.0)

In [1]:
!git log

[33mcommit 03bfc78fcfd1d90eac72576df5638e161338f18d[m[33m ([m[1;36mHEAD -> [m[1;32msamuel/work[m[33m)[m
Author: Samuel Misfeldt <samisfeldt@hotmail.com>
Date:   Wed Nov 16 13:07:32 2022 +0000

    Env working + PPO

[33mcommit 612dd286ba06bff8635aa2a21a70ba190484a437[m[33m ([m[1;31morigin/samuel/work[m[33m)[m
Author: samisfeldt <samisfeldt@hotmail.com>
Date:   Mon Nov 14 16:56:14 2022 +0100

    sam 1

[33mcommit 85f0957c0802a4841809bd80edac138e969efca7[m[33m ([m[1;31morigin/master[m[33m, [m[1;31morigin/HEAD[m[33m, [m[1;32mmaster[m[33m)[m
Merge: f053e3b de1b00c
Author: Larkin Liu <liu.larkin@gmail.com>
Date:   Mon Aug 8 15:22:35 2022 +0200

    Merge pull request #8 from tum-logistik/larkin/dual-index2
    
    Larkin/dual index2

[33mcommit de1b00c7fb7304e6a5b6abe6dd9204ede44453ae[m[33m ([m[1;31morigin/larkin/dual-index2[m[33m)[m
Author: larkz <liu.larkin@gmail.com>
Date:   Mon Aug 8 15:06:29 2022 +0200

    updated du

In [4]:
!git config --global user.name "Samuel-Misfeldt"

In [5]:
!git push

Username for 'https://github.com': ^C


In [6]:
!git checkout samuel/work

M	SourcingGym.ipynb
Already on 'samuel/work'
Your branch is ahead of 'origin/samuel/work' by 1 commit.
  (use "git push" to publish your local commits)
