# Topic 3: Value-Based Q-Learning

## Q-Learning

In [None]:
import numpy as np
import gym
import time

# Environment: Taxi-v3. `.env` takes the environment underneath the object
# returned by gym.make (presumably to drop the step-limit wrapper -- confirm).
# Alternative kept for reference: gym.make('FrozenLake-v0')
env = gym.make("Taxi-v3").env

# Hyperparameters read by the training loop.
episodes = 10_000  # number of training episodes
epsilon = 0.1      # probability of taking a random action instead of the greedy one
gamma = 0.8        # discount applied to the next state's best Q value
lr = 0.1           # step size of each Q-table update

In [None]:
# Start from an all-zero Q table, indexed as Q[state, action].
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))
print(Q)

In [None]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.

for episode in range(1, episodes + 1):
    print(f"Episode {episode}/{episodes}")
    done = False
    s = env.reset()

    while not done:
        # Explore with probability epsilon, otherwise act greedily on Q.
        explore = np.random.random() < epsilon
        a = env.action_space.sample() if explore else np.argmax(Q[s])

        s_, r, done, _ = env.step(a)

        # Move Q[s, a] towards the target r + gamma * max over a' of Q[s_, a'].
        td_target = r + gamma * np.max(Q[s_])
        Q[s, a] += lr * (td_target - Q[s, a])

        s = s_

In [None]:
# Print Final Q Table
# Shows the table after training, for comparison with the all-zero table
# printed right after initialisation.
print(Q)

In [None]:
## Compute the # of Steps and Total Rewards

# Roll out one episode acting greedily on the learned Q table, rendering every
# step and tallying the number of steps and the accumulated reward.
#
# The environment above was created with `.env`, so no step limit is applied
# by gym: if the greedy policy ever cycles between states, `done` never turns
# True and the loop would spin forever. `max_steps` bounds the rollout.
# NOTE(review): 200 mirrors the step limit gym registers for Taxi-v3 -- confirm.
max_steps = 200

s = env.reset()
done = False
step_count = 0
total_reward = 0

while not done and step_count < max_steps:
    env.render()
    a = np.argmax(Q[s,:])          # greedy action, no exploration
    s_, r, done, _ = env.step(a)
    s = s_
    step_count += 1
    total_reward += r
    time.sleep(0.1)                # slow down so the rendering is watchable

if not done:
    # The cap was hit: the learned policy did not finish the episode.
    print("Stopped after {} steps without finishing the episode".format(max_steps))

print("Total steps: ",step_count)
print("Total rewards: ",total_reward)

### Activity: Q Learning on Frozen-Lake Environment 

### Before Training

In [None]:
import gym
import time

env = gym.make('FrozenLake-v1')

# Baseline before any learning: play a single episode with uniformly random
# actions, rendering each step and tallying steps and reward.
# The environment is reset once; the original reset it twice in a row, which
# only threw away the first initial state.
env.reset()
done = False
step_count = 0
total_reward = 0

while not done:
    env.render()
    action = env.action_space.sample()   # random action, no policy yet
    state, reward, done, info = env.step(action)
    step_count += 1
    total_reward += reward
    time.sleep(0.1)                      # slow down so the rendering is watchable


In [None]:
# Report the tallies gathered during the rollout.
for label, value in (
    ("Total steps: ", step_count),
    ("Total rewards: ", total_reward),
):
    print(label, value)

### After Q Learning

In [None]:
import numpy as np
import gym
import time

# Environment for the FrozenLake Q-learning activity.
env = gym.make('FrozenLake-v1')

# Hyperparameters read by the training loop.
episodes = 10_000  # number of training episodes
epsilon = 0.2      # probability of taking a random action instead of the greedy one
gamma = 0.9        # discount applied to the next state's best Q value
lr = 0.01          # step size of each Q-table update

In [None]:
# All-zero Q table, indexed as Q[state, action].
Q = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)
print(Q)

In [None]:
# Tabular Q-learning on FrozenLake with epsilon-greedy action selection.

for episode in range(1, episodes + 1):
    print(f"Episode {episode}/{episodes}")
    done = False
    s = env.reset()

    while not done:
        # A random draw below epsilon means "explore"; otherwise exploit Q.
        explore = np.random.random() < epsilon
        a = env.action_space.sample() if explore else np.argmax(Q[s])

        s_, r, done, _ = env.step(a)

        # Off-policy target: reward plus discounted best value of the next state.
        td_target = r + gamma * np.max(Q[s_])
        Q[s, a] += lr * (td_target - Q[s, a])

        s = s_

In [None]:
# Print Final Q Table
# Shows the table after training, for comparison with the all-zero table
# printed right after initialisation.
print(Q)

In [None]:
# Play one episode greedily with the learned Q table and tally the results.

total_reward = 0
step_count = 0
done = False
s = env.reset()

while not done:
    env.render()
    a = np.argmax(Q[s])            # greedy action, no exploration
    s_, r, done, _ = env.step(a)
    total_reward += r
    step_count += 1
    s = s_
    time.sleep(0.1)                # slow down so the rendering is watchable

In [None]:
# Report the tallies gathered during the greedy rollout.
for label, value in (
    ("Total steps: ", step_count),
    ("Total rewards: ", total_reward),
):
    print(label, value)

## SARSA

In [None]:
import numpy as np
import gym
import time

# Environment: Taxi-v3. `.env` takes the environment underneath the object
# returned by gym.make (presumably to drop the step-limit wrapper -- confirm).
# Alternative kept for reference: gym.make('FrozenLake-v0')
env = gym.make("Taxi-v3").env

# Hyperparameters read by choose_action and the SARSA loop.
episodes = 10_000  # number of training episodes
epsilon = 0.1      # probability of taking a random action instead of the greedy one
gamma = 0.8        # discount applied to the next state-action value
lr = 0.1           # step size of each Q-table update

In [None]:
# Start from an all-zero Q table, indexed as Q[state, action].
Q = np.zeros(shape=(env.observation_space.n, env.action_space.n))
print(Q)

In [None]:
## Action Policy

def choose_action(s, Q, eps=None):
    """Pick an action for state ``s`` with an epsilon-greedy rule.

    Args:
        s: Row index of the current state in ``Q``.
        Q: 2-D array of action values, indexed as ``Q[state, action]``.
        eps: Probability of picking a uniformly random action. When left as
            ``None`` the module-level ``epsilon`` is used, so existing
            ``choose_action(s, Q)`` calls behave as before.

    Returns:
        The index of the chosen action.
    """
    if eps is None:
        eps = epsilon
    if np.random.random() < eps:
        # Explore. The number of actions is the number of columns of Q, so the
        # function does not need the global ``env``.
        return np.random.randint(0, Q.shape[1])
    # Exploit: greedy action (np.argmax returns the first maximum on ties).
    return np.argmax(Q[s, :])

In [None]:
# On-policy SARSA: the update bootstraps from the action that will actually be
# taken next, not from the greedy one.

for episode in range(1, episodes + 1):
    print(f"Episode {episode}/{episodes}")
    done = False
    s = env.reset()
    a = choose_action(s, Q)

    while not done:
        s_, r, done, _ = env.step(a)
        a_ = choose_action(s_, Q)

        # TD error against the next state-action pair (s_, a_).
        td_error = r + gamma * Q[s_, a_] - Q[s, a]
        Q[s, a] += lr * td_error

        s, a = s_, a_

In [None]:
## Compute the # of Steps and Total Rewards

# Roll out one episode acting greedily on the learned Q table, rendering every
# step and tallying the number of steps and the accumulated reward.
#
# The environment above was created with `.env`, so no step limit is applied
# by gym: if the greedy policy ever cycles between states, `done` never turns
# True and the loop would spin forever. `max_steps` bounds the rollout.
# NOTE(review): 200 mirrors the step limit gym registers for Taxi-v3 -- confirm.
max_steps = 200

s = env.reset()
done = False
step_count = 0
total_reward = 0

while not done and step_count < max_steps:
    env.render()
    a = np.argmax(Q[s,:])          # greedy action, no exploration
    s_, r, done, _ = env.step(a)
    s = s_
    step_count += 1
    total_reward += r
    time.sleep(0.1)                # slow down so the rendering is watchable

if not done:
    # The cap was hit: the learned policy did not finish the episode.
    print("Stopped after {} steps without finishing the episode".format(max_steps))



In [None]:
# Report the tallies gathered during the greedy rollout.
for label, value in (
    ("Total steps: ", step_count),
    ("Total rewards: ", total_reward),
):
    print(label, value)

### Activity: SARSA on Frozen-Lake Environment 

In [None]:
import numpy as np
import gym
import time

# Environment for the FrozenLake SARSA activity.
env = gym.make('FrozenLake-v1')

# Hyperparameters read by choose_action and the SARSA loop.
episodes = 10_000  # number of training episodes
epsilon = 0.2      # probability of taking a random action instead of the greedy one
gamma = 0.9        # discount applied to the next state-action value
lr = 0.01          # step size of each Q-table update

In [None]:
# All-zero Q table, indexed as Q[state, action].
Q = np.zeros(
    shape=(env.observation_space.n, env.action_space.n),
)
print(Q)

In [None]:
## Action Policy

def choose_action(s, Q, eps=None):
    """Select an action for state ``s`` using an epsilon-greedy policy.

    Args:
        s: Row index of the current state in ``Q``.
        Q: 2-D array of action values, indexed as ``Q[state, action]``.
        eps: Exploration probability. ``None`` (the default) falls back to the
            module-level ``epsilon``, which keeps ``choose_action(s, Q)``
            working exactly as before.

    Returns:
        The index of the chosen action.
    """
    if eps is None:
        eps = epsilon
    if np.random.random() < eps:
        # Random action; the action count comes from Q's column count rather
        # than from the global ``env``.
        return np.random.randint(0, Q.shape[1])
    # Greedy action; ties resolve to the lowest index (np.argmax behaviour).
    return np.argmax(Q[s, :])

In [None]:
# SARSA on FrozenLake: learn from (s, a, r, s_, a_) tuples generated by the
# same epsilon-greedy policy that is being improved.

for episode in range(1, episodes + 1):
    print(f"Episode {episode}/{episodes}")
    done = False
    s = env.reset()
    a = choose_action(s, Q)

    while not done:
        s_, r, done, _ = env.step(a)
        a_ = choose_action(s_, Q)

        # TD error against the next state-action pair (s_, a_).
        td_error = r + gamma * Q[s_, a_] - Q[s, a]
        Q[s, a] += lr * td_error

        s, a = s_, a_

In [None]:
# Play one episode greedily with the learned Q table and tally the results.

total_reward = 0
step_count = 0
done = False
s = env.reset()

while not done:
    env.render()
    a = np.argmax(Q[s])            # greedy action, no exploration
    s_, r, done, _ = env.step(a)
    total_reward += r
    step_count += 1
    s = s_
    time.sleep(0.1)                # slow down so the rendering is watchable



In [None]:
# Report the tallies gathered during the greedy rollout.
for label, value in (
    ("Total steps: ", step_count),
    ("Total rewards: ", total_reward),
):
    print(label, value)

## DQN

In [None]:
import gym
from stable_baselines3 import DQN

# Train a DQN agent on CartPole, round-trip it through disk, then watch it play.
env = gym.make("CartPole-v0")

model = DQN("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=23000)

# Save the model, drop the in-memory copy and load it back to demonstrate
# saving and loading.
model.save("dqn_cartpole")
del model
model = DQN.load("dqn_cartpole")

episodes = 5
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        # deterministic=True is passed so the prediction is not sampled.
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward

    print(f'Episode:{episode} Score:{score}')

env.close()


### Activity: DQN on Acrobot Environment

In [None]:
import gym
from stable_baselines3 import DQN

# Exercise template: the runs of underscores below are blanks to be filled in.
# As written they are undefined names, so the cell will not run until each
# one is replaced with real code.
env = gym.make('Acrobot-v1')

# TODO: create the DQN model for `env`.
model = ___________________
# TODO: supply the training arguments.
model.learn(_____________________)

# Play `episodes` episodes with the trained model and average the scores.
episodes = 5
total_score = 0
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0 
    
    while not done:
        env.render()
        # TODO: ask the model for the action to take given `obs`.
        action, _state = __________________________________
        obs, reward, done, info = env.step(action)
        score+=reward
    print(f'Episode:{episode} Score:{score}')
    total_score += score
    
env.close()
# Mean score over the played episodes.
avg_score = total_score/episodes
print(f'Average Score:{avg_score}')

In [None]:
import gym
from stable_baselines3 import DQN

# Reference solution: train DQN on Acrobot, then average the score of a few
# rendered episodes.
env = gym.make('Acrobot-v1')

model = DQN('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=10000)

episodes = 5
total_score = 0
for episode in range(1, episodes + 1):
    obs, done, score = env.reset(), False, 0

    while not done:
        env.render()
        action, _state = model.predict(obs, deterministic=True)
        obs, reward, done, info = env.step(action)
        score += reward

    print(f'Episode:{episode} Score:{score}')
    total_score += score

env.close()

# Mean score over the played episodes.
avg_score = total_score / episodes
print(f'Average Score:{avg_score}')