# 0. Install and Import Dependencies

In [1]:
#!pip install tensorflow tensorflow-gpu stable_baselines3 gym box2d-py --user

In [2]:
import gym 
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
environment_name = 'LunarLander-v2'

# 1. Test Random Environment

In [4]:
env = gym.make(environment_name)

In [7]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:-419.02412227660744
Episode:2 Score:-98.52299655204354
Episode:3 Score:-57.66074030709397
Episode:4 Score:-76.8116972862527
Episode:5 Score:-316.48069072682597
Episode:6 Score:-102.28035240309637
Episode:7 Score:-138.9406618187568
Episode:8 Score:-124.83134815994737
Episode:9 Score:-319.4322794264382
Episode:10 Score:-131.2695117725625


# 2. Build and Train the Model

In [8]:
env = gym.make(environment_name)
env = DummyVecEnv([lambda: env])
model = DQN('MlpPolicy', env, verbose = 1)

Using cpu device


In [33]:
model.learn(total_timesteps=400000)

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.992    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 1354     |
|    time_elapsed     | 0        |
|    total_timesteps  | 322      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.983    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 1499     |
|    time_elapsed     | 0        |
|    total_timesteps  | 695      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.974    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 1738     |
|    time_elapsed     | 0        |
|    total_timesteps  | 1087     |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x29d431d3730>

# 3. Save and Test the Model

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=True)
env.close()

In [37]:
model.save("ACER_model")

In [11]:
del model

In [None]:
model = DQN.load("ACER_model", env=env)

In [58]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
    

KeyboardInterrupt: 

In [59]:
env.close()

In [57]:
env.observation_space.sample()

array([-0.89411455, -0.03638446, -0.3488878 , -1.7915399 ,  0.1893308 ,
        0.15538439, -0.43273813, -0.40387493], dtype=float32)

In [60]:
model.get_parameters()

{'policy': OrderedDict([('q_net.q_net.0.weight',
               tensor([[ 1.8926e-02, -1.0627e-01, -2.6043e-01,  2.1749e+00,  8.1710e-01,
                         7.0344e-02,  3.7524e-01, -7.7261e-01],
                       [ 5.5984e-01, -5.1702e-03, -8.7707e-01,  2.9409e-01,  7.4230e-01,
                        -6.7646e-01,  6.4690e-01, -6.5594e-02],
                       [-7.1153e-01,  8.7199e-02, -2.0720e-01, -3.7351e-01, -7.5740e-01,
                        -9.5548e-01, -2.9785e-01, -1.1635e+00],
                       [-2.7997e-01, -6.1002e-01, -4.6881e-01, -1.3145e+00, -6.7791e-02,
                         2.1477e-01, -2.0528e-02,  3.2795e-01],
                       [ 2.9522e-01,  7.8300e-02, -8.5914e-02,  7.1199e-01,  1.3669e+00,
                         6.9634e-01,  5.9726e-01, -7.4142e-02],
                       [ 2.3909e-01,  6.9849e-01,  4.9418e-01, -6.6342e-01, -6.2505e-02,
                         7.1409e-01, -4.1572e-01, -2.9602e-01],
                       [ 5.0020e-