# **Q-Learning**

In [None]:
from google.colab import drive
!pip install stable-baselines3[extra] gymnasium
!pip install gymnasium[accept-rom-license,atari]
!pip install pyvirtualdisplay
!sudo apt-get install -y python-opengl ffmpeg
!sudo apt-get install -y xvfb
!pip install swig
!pip install gymnasium[box2d]

import base64
from IPython import display as ipythondisplay
from pathlib import Path
from gymnasium.wrappers import RecordVideo
import gymnasium as gym
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
import numpy as np
import math

# --- Q-learning hyperparameters -------------------------------------------
LEARNING_RATE = 0.1
DISCOUNT = 0.95
EPISODES = 3e4  # set to >=3e4 to ensure training works for this problem

# --- Problem dimensions -----------------------------------------------------
OBSERVATION_DIM = 8
NUM_ACTIONS = 4
NUM_BINS = 4  # 8 use 2 or 3 bits for each observation dimension

# Exploration rate read by run_game(); starts fully random.
epsilon = 1
#epsilon_change = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

# Discrete-action Lunar Lander with wind and turbulence enabled, rendering to
# RGB arrays. Used below to print the observation-space bounds.
env = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=True,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
env.reset()

# Show the upper and lower bounds of each observation component.
print(env.observation_space.high)
print(env.observation_space.low)
# This function converts the floating point state values into
# discrete values. This is often called binning.  We divide
# the range that the state values might occupy and assign
# each region to a bucket.
# Then we map the binned state to a single integer key that is used as the row index of the Q-table.
def discretizeLunarState(s, obs_space, numBins=4):
    """Map a continuous Lunar Lander observation to a single integer key.

    The first six components of ``s`` are coarsened to small integers,
    binned against the bounds of ``obs_space`` into ``numBins`` buckets and
    packed, together with the two leg-contact flags, into one non-negative
    integer suitable for indexing a Q-table.

    Args:
        s: Indexable observation with at least 8 numeric entries; entries 6
            and 7 are treated as 0/1 flags.
        obs_space: Object exposing ``high`` and ``low`` sequences holding the
            per-component bounds (finite bounds are assumed).
        numBins: Number of buckets per continuous component.

    Returns:
        An int in ``[0, numBins**6 * 4)``.
    """
    highs = obs_space.high
    lows = obs_space.low

    # Coarse, clamped integer version of each continuous component.
    normalized = (
        min(5, max(-5, int(s[0] / 0.05))),
        min(5, max(-1, int(s[1] / 0.1))),
        min(3, max(-3, int(s[2] / 0.1))),
        min(3, max(-3, int(s[3] / 0.1))),
        min(3, max(-3, int(s[4] / 0.1))),
        min(3, max(-3, int(s[5] / 0.1))),
    )

    state_key = 0
    for i, value in enumerate(normalized):
        bin_width = (highs[i] - lows[i]) / numBins
        bucket = int((value - lows[i]) / bin_width)
        # Clamp so every component occupies exactly one base-numBins digit;
        # without this, out-of-range buckets bleed into neighbouring digits
        # or make the key negative.
        bucket = min(numBins - 1, max(0, bucket))
        # Mixed-radix packing; equals a log2(numBins)-bit shift when numBins
        # is a power of two, and stays correct when it is not.
        state_key = state_key * numBins + bucket

    # Each boolean leg flag gets its own bit. The shifted value must be
    # assigned back; a bare `state_key << 1` expression has no effect.
    state_key = (state_key << 1) + int(s[6])
    state_key = (state_key << 1) + int(s[7])

    return state_key


# Sanity check: discretize one freshly reset observation.
obs = env.reset()
state = discretizeLunarState(obs[0], env.observation_space, NUM_BINS)
print(obs)
# `state` is the single integer key returned by discretizeLunarState; it is
# used as the row index into the Q(s, a) table.
print(state)

# Set up the Q-table with shape (num_states, num_actions): one row per
# possible discrete state key, one column per action, all zeros initially.
q_table_shape = (NUM_BINS**8, NUM_ACTIONS)
q_table = np.zeros(q_table_shape)
print(q_table.shape)

render = 0

# Run one game.  The q_table to use is provided.  We also
# provide a flag to indicate if the game should be
# rendered/animated.  Finally, we also provide
# a flag to indicate if the q_table should be updated.
def run_game(env, q_table, render, should_update, exploit=False):
    """Play one episode and optionally apply Q-learning updates in place.

    Args:
        env: Gymnasium-style environment whose ``reset()`` returns
            ``(observation, info)`` and whose ``step()`` returns
            ``(observation, reward, terminated, truncated, info)``.
        q_table: 2-D array indexed by ``[discrete_state, action]``; modified
            in place when ``should_update`` is true.
        render: Accepted for call compatibility; not used by this function.
        should_update: When true, apply the Q-learning update after each step.
        exploit: When true, always take the greedy action (no exploration).

    Returns:
        The sum of rewards collected during the episode.

    Reads the module-level ``epsilon``, ``LEARNING_RATE``, ``DISCOUNT`` and
    ``NUM_BINS`` values.
    """
    observation, _ = env.reset()
    discrete_state = discretizeLunarState(observation, env.observation_space, NUM_BINS)
    total_reward = 0
    done = False
    truncated = False

    # Stop on either a terminal state or a time-limit truncation.
    while not (done or truncated):
        # Epsilon-greedy selection; `exploit` short-circuits the random draw.
        if exploit or np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, env.action_space.n)

        new_state, reward, done, truncated, info = env.step(action)
        total_reward += reward
        new_state_disc = discretizeLunarState(new_state, env.observation_space, NUM_BINS)

        if should_update:
            current_q = q_table[discrete_state, action]
            # A terminal state has no future return, so do not bootstrap from
            # it. Its discrete key may be shared with non-terminal states
            # whose Q-values would otherwise leak into the target.
            # Truncation is not terminal, so bootstrapping still applies.
            max_future_q = 0.0 if done else np.max(q_table[new_state_disc])
            target = reward + DISCOUNT * max_future_q
            q_table[discrete_state, action] = (
                (1 - LEARNING_RATE) * current_q + LEARNING_RATE * target
            )

        discrete_state = new_state_disc

    return total_reward

episode = 0
success_count = 0

# Silent training environment: no render mode, wind disabled for a calmer,
# simpler problem.
train_env = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,  # set to False for simpler /calm environment
    wind_power=15.0,
    turbulence_power=1.5,
)

# Run the required number of training episodes (numbered from 1).
for episode in range(1, math.ceil(EPISODES) + 1):
    # Module-level flag reset each episode; run_game keeps its own local one.
    done = False

    # Play one episode with Q-table updates enabled.
    reward = run_game(train_env, q_table, False, True)
    print ("episode ", episode, " finished. reward: ", reward)

    # An episode counts as a success when its total reward reaches 200.
    if reward >= 200:
        success_count += 1

    # Reduce epsilon as training progresses.
    # NOTE(review): math.log(EPISODES) is about 10.3 for EPISODES = 3e4, so
    # epsilon shrinks by that factor every episode and is effectively zero
    # after a handful of episodes. Confirm this schedule is intended.
    epsilon = epsilon / math.log(EPISODES)

print(success_count)

# Evaluation environment: same physics settings as train_env, but rendering
# to RGB arrays.
eval_env = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,  # must be same as train environment
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)



Collecting stable-baselines3[extra]
  Using cached stable_baselines3-2.3.2-py3-none-any.whl (182 kB)
Collecting shimmy[atari]~=1.3.0 (from stable-baselines3[extra])
  Using cached Shimmy-1.3.0-py3-none-any.whl (37 kB)
Collecting autorom[accept-rom-license]~=0.6.1 (from stable-baselines3[extra])
  Using cached AutoROM-0.6.1-py3-none-any.whl (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13->stable-baselines3[extra])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13->stable-baselines3[extra])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13->stable-baselines3[extra])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13->stable-baselines3[extra])
  Using cached nvidia_cudnn_cu1

Collecting autorom[accept-rom-license]~=0.4.2 (from gymnasium[accept-rom-license,atari])
  Using cached AutoROM-0.4.2-py3-none-any.whl (16 kB)
Collecting shimmy[atari]<1.0,>=0.1.0 (from gymnasium[accept-rom-license,atari])
  Using cached Shimmy-0.2.1-py3-none-any.whl (25 kB)
Installing collected packages: shimmy, autorom
  Attempting uninstall: shimmy
    Found existing installation: Shimmy 1.3.0
    Uninstalling Shimmy-1.3.0:
      Successfully uninstalled Shimmy-1.3.0
  Attempting uninstall: autorom
    Found existing installation: AutoROM 0.6.1
    Uninstalling AutoROM-0.6.1:
      Successfully uninstalled AutoROM-0.6.1
Successfully installed autorom-0.4.2 shimmy-0.2.1


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
episode  19079  finished. reward:  172.64008107180445
episode  19080  finished. reward:  181.90323916464456
episode  19081  finished. reward:  205.9564530369779
episode  19082  finished. reward:  190.634011993272
episode  19083  finished. reward:  120.77096502505741
episode  19084  finished. reward:  -85.4269470524271
episode  19085  finished. reward:  224.88040326310437
episode  19086  finished. reward:  221.6075627416252
episode  19087  finished. reward:  186.42687112925142
episode  19088  finished. reward:  140.08495010622414
episode  19089  finished. reward:  174.95143369564255
episode  19090  finished. reward:  -66.92414971278279
episode  19091  finished. reward:  -80.9946734725857
episode  19092  finished. reward:  30.17199444182731
episode  19093  finished. reward:  -410.06903994399187
episode  19094  finished. reward:  189.04822306856568
episode  19095  finished. reward:  174.1762603323196
episode  19096  finished

In [None]:
print("----- Q-LEARNING TESTING & EVAL -----")

NUM_QLEARN_EVAL_EPISODES = 3
QLEARN_VIDEO_FOLDER = './videos_lander_qlearn'

eval_env.reset()


def video_callable(episode_id):
    """Episode trigger for RecordVideo: record every episode."""
    return True


eval_env = RecordVideo(eval_env, video_folder=QLEARN_VIDEO_FOLDER, episode_trigger=video_callable)

# Greedy rollouts with Q-table updates disabled.
eval_rewards = [
    run_game(eval_env, q_table, True, False, exploit=True)
    for _ in range(NUM_QLEARN_EVAL_EPISODES)
]
mean_reward = sum(eval_rewards) / NUM_QLEARN_EVAL_EPISODES

print ("mean reward: ", mean_reward)

# Close the wrapper before reading the files so every recording is flushed.
eval_env.close()

# Display the recorded videos, one per evaluation episode.
for episode_index in range(NUM_QLEARN_EVAL_EPISODES):
    video_path = Path(QLEARN_VIDEO_FOLDER) / f"rl-video-episode-{episode_index}.mp4"
    # read_bytes() opens read-only and closes the file handle for us.
    encoded_video = base64.b64encode(video_path.read_bytes()).decode('ascii')
    # The data URI must carry the real MIME type, video/mp4.
    ipythondisplay.display(HTML(data='''
    <video width="640" height="480" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
    </video>
'''.format(encoded_video)))

  and should_run_async(code)


# **DQN**

In [None]:
import gymnasium as gym
from stable_baselines3 import DQN
import torch as th
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines3.common.evaluation import evaluate_policy

# Create and initialize a fresh Lunar Lander environment for DQN training.
train_env = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,  # Should also learn even with wind enabled
    wind_power=15.0,
    turbulence_power=1.5,
)

time_step = train_env.reset()

# Network architecture for the Q-network. Only arguments accepted by the
# policy constructor may appear here; an unknown key such as "batch_norm"
# raises TypeError when the agent is built.
policy_kwargs = {
    "net_arch": [32, 32],
    "activation_fn": th.nn.ReLU,
}

# Instantiate the agent on the training environment created above (not on
# the earlier `env`, which has wind enabled and renders frames).
# NOTE(review): gamma=1 disables discounting; confirm this is intended.
dqn = DQN(
    "MlpPolicy",
    train_env,
    verbose=1,
    policy_kwargs=policy_kwargs,
    learning_rate=0.001,
    gamma=1,
    batch_size=128,
)  # provide appropriate parameters, net arch requires experimentation

# total_timesteps is an integer count, so do not pass a float like 3e5.
Timesteps = int(3e5)  # set to >=100000 to converge
dqn.learn(total_timesteps=Timesteps)

# Save the agent
dqn.save("dqn_lander")

In [None]:
print("----- DQN TESTING & EVAL ----")

# Settings for a fresh evaluation environment; physics options mirror the
# training environment, with RGB-array rendering enabled.
dqn_eval_env_kwargs = dict(
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,  # must be same as train environment
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)
eval_env = gym.make("LunarLander-v2", **dqn_eval_env_kwargs)

# Roll out the trained DQN agent for 10 episodes and report mean/std reward.
mean_reward, std_reward = evaluate_policy(dqn, eval_env, n_eval_episodes=10)

print(f"Mean reward: {mean_reward} +/- {std_reward}")

# **PPO**

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import VecFrameStack
import torch as th

# Train the agent
# total_timesteps is an integer count, so do not pass a float like 4e5.
TIMESTEPS = int(4e5)
# experiment with number of steps
# Set up the training environment without video for speed.
env_train = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
)

env_train.reset()

# Initialize the agent, use Proximal Policy Optimization (PPO).
# PPO requires a policy and an environment; calling PPO() with no arguments
# raises TypeError.
lander_ppo = PPO("MlpPolicy", env_train, verbose=1)

# todo: experiment with number of steps
lander_ppo.learn(total_timesteps=TIMESTEPS)

# Save the model
lander_ppo.save("lander_ppo_model")

# Evaluate the trained agent while the training environment is still open.
env_train.reset()
mean_reward, std_reward = evaluate_policy(lander_ppo, env_train, n_eval_episodes=10)

print(f"Mean reward: {mean_reward} +/- {std_reward}")

# Close the environment this cell created (not the unrelated `env`) once
# evaluation is finished.
env_train.close()



In [None]:
print("----- PPO TESTING & EVAL ----")

NUM_PPO_EVAL_EPISODES = 3

eval_env = gym.make(
    "LunarLander-v2",
    continuous=False,  # set to False for simpler discrete version
    gravity=-10.0,
    enable_wind=False,
    wind_power=15.0,
    turbulence_power=1.5,
    render_mode="rgb_array",
)

obs = eval_env.reset()
# Single source of truth for where videos are written and later read from.
video_folder = './videos_lander_ppo'
# Record every episode. The trigger is defined here so this cell does not
# depend on a name created in another cell.
eval_env = RecordVideo(
    eval_env,
    video_folder=video_folder,
    episode_trigger=lambda episode_id: True,
)

# Load the trained agent
# NOTE: if you have loading issue, you can pass `print_system_info=True`
# to compare the system on which the model was trained vs the current one
# model = DQN.load("dqn_lunar", env=env, print_system_info=True)
lander_ppo = PPO.load("lander_ppo_model", env=eval_env)

# Evaluate agent
mean_reward, std_reward = evaluate_policy(
    lander_ppo, eval_env, n_eval_episodes=NUM_PPO_EVAL_EPISODES
)
print("average reward: ", mean_reward)

# Close the environment before reading the files so every recording is
# flushed to disk.
eval_env.close()

# Display the recorded videos, one per evaluation episode.
for episode_index in range(NUM_PPO_EVAL_EPISODES):
    video_path = Path(video_folder) / f"rl-video-episode-{episode_index}.mp4"
    # read_bytes() opens read-only and closes the file handle for us.
    encoded_video = base64.b64encode(video_path.read_bytes()).decode('ascii')
    # The data URI must carry the real MIME type, video/mp4.
    ipythondisplay.display(HTML(data='''
    <video width="640" height="480" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" />
    </video>
'''.format(encoded_video)))