In [37]:
# import the libraries 
import gym
import random
import numpy as np
from IPython.display import clear_output
%%time


random.seed(1234)



UsageError: Line magic function `%%time` not found.


In [38]:
# load the game enviroment and render 
street_map = gym.make("Taxi-v3").env 


In [39]:
# getting a random state space of the vehicle 
street_map.reset() # resets the random space to a new location 
street_map.render()

+---------+
|[34;1mR[0m:[43m [0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+



In [40]:
# getting the number of action and state spaces available 
print("Action Space recorded {}".format(street_map.action_space))
print("State Space available {}".format(street_map.observation_space))

Action Space recorded Discrete(6)
State Space available Discrete(500)


In [41]:
#creating a reward table with the number os state spaces generated (0-499)
street_map.P[499]

{0: [(1.0, 499, -1, False)],
 1: [(1.0, 399, -1, False)],
 2: [(1.0, 499, -1, False)],
 3: [(1.0, 479, -1, False)],
 4: [(1.0, 499, -10, False)],
 5: [(1.0, 499, -10, False)]}

# SOLVING WITHOUT Q LEARNING 

In [42]:
# We provided a constant state to ensure results are constant when doing the analysis

In [43]:
space_states = street_map.encode(2, 3,1, 0) # (taxi row, taxi column, passenger index, destination index)
street_map.s = space_states
street_map.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |B: |
+---------+



In [44]:
# Number of state spaces 
print("number of space_states:", space_states)

number of space_states: 264


In [45]:
# getting the reward table
street_map.P[264]


{0: [(1.0, 364, -1, False)],
 1: [(1.0, 164, -1, False)],
 2: [(1.0, 284, -1, False)],
 3: [(1.0, 244, -1, False)],
 4: [(1.0, 264, -10, False)],
 5: [(1.0, 264, -10, False)]}

In [46]:
# fucntion to solve 
# initializing all the varibles 
street_map.s=264
reward=0
penalty=0
transitions=[]
no_of_epoch=0
done=False

In [47]:
#funtion 
while not done:
    action = street_map.action_space.sample()
    state, reward, done, info = street_map.step(action)

    if reward == -10:
        penalty += 1
    
    # Put each rendered frame into dict for animation
    transitions.append({
        'transitions': street_map.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    no_of_epoch += 1
    
    
print("Number of episodes : {}".format(no_of_epoch))
print("Penalties: {}".format(penalty))
print("rewards : {}".format(reward))

Number of episodes : 191
Penalties: 58
rewards : 20


# With Q learning 

In [48]:
# creating a q table with 500 x 6 dimension of zeros
q_table=np.zeros([street_map.observation_space.n, street_map.action_space.n])

In [49]:
q_table

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [50]:

###Training the agent
# Hyperparameters

alpha = 0.1
gamma = 0.6
epsilon = 0.1


all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = street_map.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = street_map.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = street_map.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.



In [59]:
q_table[264]

array([ -2.47997539,  -2.4510224 ,  -2.4510224 ,  -2.48041061,
       -10.96191949, -11.09668622])

In [86]:
total_epochs, total_penalties = 0, 0
episodes = 10

for _ in range(episodes):
    state = street_map.reset()
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = street_map.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 10 episodes:
Average timesteps per episode: 13.9
Average penalties per episode: 0.0


In [88]:
print(f"Average rewards: {reward/episodes}")

Average rewards: 2.0


In [92]:
scores=[]
scores.append(reward)
scores

[20]

In [90]:
import pylab
from pylab import *
import cv2


def imag(score,episode):
    scores=[]
    episodes=[]
    average=[]
    scores.append(score)
    episodes.append(score)
    average.append(sum(scores[-50:]) / len(scores[-50:]))
    if str(episode)[-2:] == "00":# much faster than episode % 100
        pylab.plot(episodes, scores, 'b')
        pylab.plot(episodes, average, 'r')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Steps', fontsize=18)
        try:
            pylab.savefig('C:/Users/USER/Desktop'+".png")
        except OSError:
            pass

    return average[-1]

In [91]:
def imshow(image, rem_step=0):
    cv2.imshow(Model_name+str(rem_step), image[rem_step,...])
    if cv2.waitKey(25) & 0xFF == ord("q"):
        cv2.destroyAllWindows()
        return


# Deep Q network Implementation

In [102]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Embedding, Reshape
from keras.optimizers import Adam

In [103]:
street_map.reset()
street_map.step(street_map.action_space.sample())[0]

143

In [104]:
model_only_embedding = Sequential()
model_only_embedding.add(Embedding(500, 6, input_length=1))
model_only_embedding.add(Reshape((6,)))
print(model_only_embedding.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 1, 6)              3000      
_________________________________________________________________
reshape_8 (Reshape)          (None, 6)                 0         
Total params: 3,000
Trainable params: 3,000
Non-trainable params: 0
_________________________________________________________________
None


In [105]:
action_size = street_map.action_space.n
state_size = street_map.observation_space.n

In [106]:
model = Sequential()
model.add(Embedding(500, 10, input_length=1))
model.add(Reshape((10,)))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(action_size, activation='linear'))
print(model.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_9 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 50)                550       
_________________________________________________________________
dense_17 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_18 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_19 (Dense)             (None, 6)                 306       
Total params: 10,956
Trainable params: 10,956
Non-trainable params: 0
__________________________________________________

In [None]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()
dqn_only_embedding = DQNAgent(model=model, nb_actions=action_size, memory=memory, nb_steps_warmup=500, target_model_update=1e-2, policy=policy)
dqn_only_embedding.compile(Adam(lr=1e-3), metrics=['mae'])
dqn_only_embedding.fit(env, nb_steps=1000000, visualize=False, verbose=1, nb_max_episode_steps=99, log_interval=100000)

In [None]:
dqn_only_embedding.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=99)

# Advantage Actor-Critic (A2C) implementation

In [None]:
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import A2C

# Parallel environments
env = make_vec_env('CartPole-v1', n_envs=4)

model = A2C(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=25000)
model.save("a2c_cartpole")

del model # remove to demonstrate saving and loading

model = A2C.load("a2c_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

In [None]:
 def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        if str(episode)[-2:] == "00":# much faster than episode % 100
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.savefig(self.path+".png")
            except OSError:
                pass

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow(self.Model_name+str(rem_step), image[rem_step,...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
            return