# Experiment 1 Setup

- Environment is fixed with seed 4
- Map is undiscovered

General env_setup:
- Num rooms: 3
- Items: Souls (2), Health Potion (0-1), Enemies (Level 1, 1-2)

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel;
# the requirement is quoted so the shell never treats [extra] as a glob pattern.
%pip install "stable-baselines3[extra]"

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install --upgrade pip' command.[0m


# Initialization Environment
Initialization and setup of the custom environment

In [2]:
from env import RoguesSoulsEnv

# Build the custom environment.
# max_steps=500 presumably caps the episode length — TODO confirm in env.py.
env = RoguesSoulsEnv(max_steps=500)
# Fix the environment seed so the same layout is used on every run.
env.set_seed(4)
# NOTE(review): the meaning of mode 1 is defined in env.py; presumably it is the
# "map is undiscovered" setting described in the notebook header — confirm.
env.set_mode(1)
# Needs resetting to apply above changes
state = env.reset()
# Print the initial state of the environment to the cell output.
env.render(mode='human')

------------------------


                        
                        
                        
                        
     |------|           
     |....*.|           
     |......+####       
     |.>...b|   #       
     |..$...| |-+--|    
     |------| |.*..|    
              |..$b|    
              |....|    
              |....|    
    |----|    |....|    
    |....|    |-+--|    
    |....|      #       
    |.@..+#######       
    |....|              
    |....|              
    |----|              
                        
                        
                        
                        
		LEVEL: 1/1	  HP: 16 	SOULS: 0
		DEFENSE: 2	MAX HP: 16 	POWER: 4
------------------------


# Flattening the observation space

Stable-Baselines3 requires the observation space to be a 1D vector for better policy optimisation.


In [3]:
from gym.wrappers import FlattenObservation
# Wrap the environment so that every observation is returned as a flat 1D array.
# Note: this rebinds `env`, so re-running this cell wraps the environment again.
env = FlattenObservation(env)

# Initialization agent
The agent is based on the Stable-Baselines3 DQN algorithm.

In [4]:
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import SubprocVecEnv, DummyVecEnv
# Wrap the single environment in a vectorised environment, the interface that
# Stable-Baselines3 algorithms operate on.
# The current object is bound as a default argument so the factory returns
# exactly that object instead of looking `env` up when it is called — by then
# the name may already refer to the DummyVecEnv created on this very line.
env = DummyVecEnv([lambda env=env: env])
# gamma=0.5 discounts future rewards heavily; exploration_fraction=0.8 spreads
# the epsilon decay over the first 80% of the training budget.
model = DQN('MlpPolicy', env, verbose=1, gamma=0.5, exploration_fraction=0.8)

Using cpu device


# Adding the necessary check function
This function computes the agent's average score over n episodes.

In [5]:
def get_average(env, model, n: int = 100) -> float:
    """Return the mean episode score obtained by ``model`` on ``env``.

    Runs ``n`` complete episodes. In each one the environment is reset and the
    model's predicted action is applied step by step until the environment
    reports ``done``; the rewards of the episode are summed into its score.

    Args:
        env: Object exposing ``reset()`` -> observation and ``step(action)`` ->
            ``(observation, reward, done, info)``.
        model: Object exposing ``predict(observation)`` -> ``(action, state)``.
        n: Number of episodes to average over; must be at least 1.

    Returns:
        The sum of all episode scores divided by ``n``. If the environment
        returns array rewards (presumably the case for a vectorised env —
        verify against the caller), the result has that array type instead of
        being a plain float.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f'n must be at least 1, got {n}')

    total = 0
    for _episode in range(n):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            # Only the action is needed; the second return value is ignored.
            action, _state = model.predict(obs)
            obs, reward, done, _info = env.step(action)
            score += reward
        total += score

    return total / n

# Training the agent

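The agent is trained for a fixed budget of 250,000 timesteps, after which its average score over 25 episodes is reported. (The original goal was to keep training until the average performance is above 180.)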

In [6]:
# Earlier incremental variant, kept for reference only: train in chunks of
# 25,000 timesteps and stop as soon as the 25-episode average exceeds 100.
# (get_average takes the environment as its first argument.)
# for i in range(5):
#     tot_steps = 25000
#     model.learn(total_timesteps = tot_steps, reset_num_timesteps=False)
#     avg = get_average(env, model, 25)
#     print(f'Total iterations: {(i+1)*tot_steps}; Average {avg}')
#     if avg > 100:
#         break
    
# Current run: one training call with a fixed budget of 250,000 timesteps.
# reset_num_timesteps=False keeps the model's timestep counter running across
# learn() calls instead of restarting it.
model.learn(total_timesteps = 250000, reset_num_timesteps=False)
# Evaluate the trained agent: mean score over 25 episodes.
avg = get_average(env, model, 25)
print(f'Average: {avg}')

----------------------------------
| rollout/            |          |
|    exploration rate | 0.991    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 317      |
|    time_elapsed     | 6        |
|    total timesteps  | 2000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.981    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 328      |
|    time_elapsed     | 12       |
|    total timesteps  | 4000     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.972    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 329      |
|    time_elapsed     | 18       |
|    total timesteps  | 6000     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration rate | 0.745    |
| time/               |          |
|    episodes         | 108      |
|    fps              | 320      |
|    time_elapsed     | 167      |
|    total timesteps  | 53636    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.00017  |
|    n_updates        | 908      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.736    |
| time/               |          |
|    episodes         | 112      |
|    fps              | 317      |
|    time_elapsed     | 175      |
|    total timesteps  | 55636    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.142    |
|    n_updates        | 1408     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

----------------------------------
| rollout/            |          |
|    exploration rate | 0.567    |
| time/               |          |
|    episodes         | 184      |
|    fps              | 290      |
|    time_elapsed     | 314      |
|    total timesteps  | 91150    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 4.62e-05 |
|    n_updates        | 10287    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.558    |
| time/               |          |
|    episodes         | 188      |
|    fps              | 289      |
|    time_elapsed     | 322      |
|    total timesteps  | 93150    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000144 |
|    n_updates        | 10787    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

----------------------------------
| rollout/            |          |
|    exploration rate | 0.388    |
| time/               |          |
|    episodes         | 260      |
|    fps              | 282      |
|    time_elapsed     | 455      |
|    total timesteps  | 128753   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000122 |
|    n_updates        | 19688    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.379    |
| time/               |          |
|    episodes         | 264      |
|    fps              | 282      |
|    time_elapsed     | 463      |
|    total timesteps  | 130753   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 3.11     |
|    n_updates        | 20188    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

----------------------------------
| rollout/            |          |
|    exploration rate | 0.212    |
| time/               |          |
|    episodes         | 336      |
|    fps              | 276      |
|    time_elapsed     | 601      |
|    total timesteps  | 165912   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.017    |
|    n_updates        | 28977    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.202    |
| time/               |          |
|    episodes         | 340      |
|    fps              | 275      |
|    time_elapsed     | 609      |
|    total timesteps  | 167912   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000304 |
|    n_updates        | 29477    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 412      |
|    fps              | 271      |
|    time_elapsed     | 752      |
|    total timesteps  | 203814   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 7.36e-05 |
|    n_updates        | 38453    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 416      |
|    fps              | 270      |
|    time_elapsed     | 761      |
|    total timesteps  | 205814   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 5.57e-05 |
|    n_updates        | 38953    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 488      |
|    fps              | 267      |
|    time_elapsed     | 904      |
|    total timesteps  | 241764   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 7.41e-05 |
|    n_updates        | 47940    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rate | 0.05     |
| time/               |          |
|    episodes         | 492      |
|    fps              | 267      |
|    time_elapsed     | 911      |
|    total timesteps  | 243764   |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.000115 |
|    n_updates        | 48440    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration rat

# Results

In [7]:
import time
# Roll out the trained agent for a fixed number of episodes and report, for
# each one, the accumulated reward and the environment's 'Completed' flag.
episodes = 10
print(f'TESTING MODEL ON {episodes} EPISODES\n')
for e in range(1, episodes+1):
    obs, done, score = env.reset(), False, 0

    # At least one step is always taken, so `info` is bound when read below.
    while True:
        # To watch an episode step by step, call env.render() here and pause
        # with input('[ENTER]') or time.sleep(1).
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    # `info` is indexed per sub-environment; entry 0 is the only one used here.
    completed = info[0].get('Completed')
    print(f'Episode: {e}, Score: {score}, Completed: {completed}')
    

TESTING MODEL ON 10 EPISODES

Episode: 1, Score: [-100.5], Completed: False
Episode: 2, Score: [-100.65], Completed: False
Episode: 3, Score: [-100.4], Completed: False
Episode: 4, Score: [-100.35], Completed: False
Episode: 5, Score: [-100.35], Completed: False
Episode: 6, Score: [-100.35], Completed: False
Episode: 7, Score: [-100.55], Completed: False
Episode: 8, Score: [-100.5], Completed: False
Episode: 9, Score: [-100.4], Completed: False
Episode: 10, Score: [-100.5], Completed: False
