In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gym
import random
import itertools
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Dynamic GPU memory allocation: let TensorFlow grow its GPU memory usage on
# demand instead of reserving everything up front. Memory growth has to be
# configured identically on every visible GPU, and the list is empty on
# CPU-only machines, so iterate rather than indexing gpus[0].
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)
print(tf.__version__)
env = gym.make('CartPole-v0')

2.2.0


In [3]:
#Cartpole Specific Model
def create_cartmodel(input_shape, no_of_actions, lr):
    """
    Build and compile the Q-network: a small fully connected model that maps
    a state vector to one Q-value per action.

    Input:
    input_shape - Number of features in a single state vector
    no_of_actions - Number of discrete actions (size of the output layer)
    lr - Learning rate for the Adam optimizer

    Returns:
    Compiled tf.keras Sequential model trained with mean squared error
    """
    model = tf.keras.Sequential([
        Dense(64, input_shape = (input_shape, ), activation = tf.nn.relu),
        Dense(32, activation = tf.nn.relu),
        # Linear output: Q-values are unbounded regression targets.
        Dense(no_of_actions)
    ])
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(optimizer = Adam(learning_rate = lr), loss = 'mean_squared_error')
    return model

In [111]:
class DQL:
    """
    Deep Q-Learning agent with an experience replay memory and an
    epsilon-greedy behaviour policy.

    The agent only relies on the following duck-typed interfaces:
    env   - reset(), step(action) -> (next_state, reward, done, info),
            render(), action_space.n
    model - predict(states) -> array of shape (batch, n_actions),
            fit(x, y, epochs, verbose) -> object with history['loss']
    """

    def __init__(self, env, model, epsilon = 1, decay_rate = 0.99, min_epsilon = 0.1, discount_factor = 1, name = 'Default'):
        """
        Function which initializes all the values and parameters for the Deep Q Learning

        Input:
        env - Open AI Environment
        model - Q learning model specific to the environment
        epsilon - Probability of taking a random (exploratory) action (Default = 1)
        decay_rate - Epsilon decay rate, applied once per episode (Default = 0.99)
        min_epsilon - Minimum value of epsilon (Default = 0.1)
        discount_factor - Gamma value for RL (Default = 1)
        name - Model's Name (Default = 'Default')

        Returns:
        DQL Object
        """
        self.name = name
        self.env = env
        self.model = model
        self.epsilon = epsilon
        self.decay_rate = decay_rate
        self.discount_factor = discount_factor
        self.min_epsilon = min_epsilon
        self.er = []          # experience replay: (state, action, reward, next_state, done)
        self.rewards = []     # total reward per episode, filled in by learn()
        self.mse = []         # summed training loss per episode, filled in by learn()
        self.batch_size = 0   # set by learn()

    def epsilon_policy(self, state):
        '''
        Function which picks an action with the epsilon-greedy policy given a specific state

        Input:
        state - The current state of the environment

        Returns:
        Index of the chosen action: a uniformly random action with
        probability epsilon, otherwise the action with the highest predicted Q-value
        '''

        if random.random() < self.epsilon:
            # Explore over the whole action space instead of assuming two actions.
            return random.randrange(self.env.action_space.n)

        q_values = self.model.predict(np.asarray(state).reshape(1, -1))[0]
        return int(np.argmax(q_values))

    def plot(self, rewards):
        '''
        Placeholder for reward plotting; not implemented yet.

        Input:
        rewards - Sequence of rewards to plot (currently unused)

        Returns:
        0
        '''
        return 0

    def prepopulate_er(self):
        '''
        Function which fills the experience replay memory with batch_size
        transitions gathered by following the current epsilon policy, so that
        the first network update has enough samples to draw from.
        '''
        state = self.env.reset()
        for _ in range(self.batch_size):

            action = self.epsilon_policy(state)
            next_state, reward, done, _info = self.env.step(action)
            self.er.append((state, action, reward, next_state, done))

            # Start a fresh episode once the current one terminates.
            state = self.env.reset() if done else next_state

    def update_network(self):
        '''
        Function which performs one training step on a random minibatch drawn
        from the experience replay memory.

        Returns:
        Training loss of the step, rounded to 3 decimals

        Raises:
        ValueError (from random.sample) if the memory holds fewer than
        batch_size transitions
        '''

        samples = random.sample(self.er, self.batch_size)
        states = np.array([s[0] for s in samples])
        actions = np.array([s[1] for s in samples], dtype = int)
        rewards = np.array([s[2] for s in samples], dtype = float)
        next_states = np.array([s[3] for s in samples])
        dones = np.array([s[4] for s in samples], dtype = bool)

        # Start from the network's own predictions so that actions which were
        # NOT taken contribute zero error; starting from zeros would wrongly
        # pull their Q-values towards 0 on every update.
        q_targets = np.array(self.model.predict(states), dtype = float)

        # One batched forward pass for all next states; terminal transitions
        # do not bootstrap from the next state.
        next_q = np.max(self.model.predict(next_states), axis = 1)
        bootstrap = np.where(dones, 0.0, next_q)
        q_targets[np.arange(len(samples)), actions] = rewards + self.discount_factor * bootstrap

        hist = self.model.fit(states, q_targets, epochs = 1, verbose = False)
        return round(hist.history['loss'][0], 3)

    def learn(self, noe, batch_size, render = True, log_actions = True):
        '''
        Function which trains the model with Deep Q Learning

        Input:
        noe - Number of episodes to train for
        batch_size - Minibatch size for every network update
        render - Render the environment at every step (Default = True)
        log_actions - Print "Left"/"Right" for every step; the labels are
                      CartPole specific (Default = True)

        Per-episode total rewards and summed losses are appended to
        self.rewards and self.mse.
        '''

        self.batch_size = batch_size
        self.prepopulate_er()
        print(len(self.er))

        for i in range(noe):
            tot_reward = 0
            tot_loss = 0
            if self.epsilon > self.min_epsilon:
                # Clamp so the decay never undershoots the configured floor.
                self.epsilon = max(self.min_epsilon, self.epsilon * self.decay_rate)

            state = self.env.reset()
            for t in itertools.count():
                if render:
                    self.env.render()

                action = self.epsilon_policy(state)
                if log_actions:
                    print("Left" if action == 0 else "Right")

                next_state, reward, done, _info = self.env.step(action)
                self.er.append((state, action, reward, next_state, done))
                loss = self.update_network()
                tot_reward += reward
                tot_loss += loss
                if done:
                    break
                state = next_state

            self.rewards.append(tot_reward)
            self.mse.append(tot_loss)
            print("Episode : {} Reward: {} Loss: {} Epsilon: {}".format(i, tot_reward, round(tot_loss, 3), round(self.epsilon, 3)))


In [109]:
# Size the network from the environment itself (4 state features and
# 2 actions for CartPole) rather than hard-coding the dimensions.
model = create_cartmodel(env.observation_space.shape[0], env.action_space.n, 0.001)
dql = DQL(env, model)
try:
    dql.learn(200, 32)
finally:
    # Always release the environment, even when training is interrupted
    # (e.g. KeyboardInterrupt), so no separate cleanup cell is needed.
    env.close()

32
Right
Left
Left
Right
Left
Left
Left
Left
Left
Left
Left
Right
Right
Episode : 0 Reward: 13.0 Loss: 7.279 Epsilon: 0.6361854860638709
Left
Left
Right
Left
Left
Left
Left
Left
Left
Left
Left
Left
Episode : 1 Reward: 12.0 Loss: 8.85 Epsilon: 0.5639051904523876
Left
Right
Left
Left
Right
Left
Left
Right
Left
Left
Right
Left
Left
Left
Left
Left
Episode : 2 Reward: 16.0 Loss: 17.772 Epsilon: 0.4801414565714212
Left
Right
Right
Left
Left
Left
Right
Left
Right
Right
Right
Left
Left
Left
Left
Right
Left
Right
Left
Left
Left
Left
Episode : 3 Reward: 22.0 Loss: 39.958 Epsilon: 0.3848960788934847
Left
Right
Left
Left
Left
Left
Left
Left
Right
Left
Left
Left
Left
Episode : 4 Reward: 13.0 Loss: 28.906000000000002 Epsilon: 0.337754400898902
Left
Left
Left
Left
Left
Left
Left
Right
Episode : 5 Reward: 8.0 Loss: 19.147 Epsilon: 0.3116610814491425
Left
Left
Left
Left
Left
Left
Left
Left
Left
Episode : 6 Reward: 9.0 Loss: 25.363999999999997 Epsilon: 0.2847077732731954
Left
Right
Left
Left
Left
Left
L

KeyboardInterrupt: 

In [110]:
# Manual cleanup: the training cell above was interrupted (KeyboardInterrupt)
# before it reached its own env.close(), so close the environment here.
env.close()