In [26]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import gym
from collections import deque
import random
import tqdm

In [32]:
# --- Environment -----------------------------------------------------------
ENV_NAME = 'MountainCar-v0'
EPISODE = 1_000  # upper bound on the number of training episodes

# --- DQN hyper-parameters --------------------------------------------------
INITIAL_EPSILON = 0.5   # exploration rate at the start of training
FINAL_EPSILON = 0.01    # exploration rate floor
BATCH_SIZE = 32         # transitions per minibatch
REPLAY_SIZE = 10_000    # experience replay buffer size

# Single shared environment instance used by the cells below.
env = gym.make(ENV_NAME)

In [3]:
# Smoke test: show the observation format, then render 100 random steps.
state = env.reset()
print(state, state.shape, len(state))
try:
    for step in range(100):
        env.render()
        # env.step returns (observation, reward, done, info); only `done`
        # matters here, so the episode can be restarted instead of stepping
        # an environment that has already terminated.
        done = env.step(env.action_space.sample())[2]
        if done:
            env.reset()
finally:
    # Always release the render window, even if rendering fails mid-loop.
    env.close()

[-0.54811972  0.        ] (2,) 2


In [4]:
class ReplayBuffer:
    """Fixed-capacity FIFO store of (state, action, reward, next_state, done) transitions.

    Backed by a ``deque(maxlen=capacity)``, so once full the oldest transition
    is dropped on every ``put``.
    """

    def __init__(self, capacity=10000):  # experience replay buffer size
        self.buffer = deque(maxlen=capacity)

    def put(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.buffer.append([state, action, reward, next_state, done])

    def sample(self, batch_size=None):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Args:
            batch_size: number of transitions to draw. When omitted, the
                notebook-level ``BATCH_SIZE`` constant is used (looked up at
                call time, so the config cell must have been run).

        Returns:
            Tuple ``(states, actions, rewards, next_states, done)`` of numpy
            arrays; ``states`` and ``next_states`` are reshaped to
            ``(batch_size, -1)``.

        Raises:
            ValueError: from ``random.sample`` if the buffer holds fewer than
                ``batch_size`` transitions.
        """
        if batch_size is None:
            batch_size = BATCH_SIZE
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, done = map(np.asarray, zip(*batch))
        states = states.reshape(batch_size, -1)
        next_states = next_states.reshape(batch_size, -1)
        return states, actions, rewards, next_states, done

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [24]:
class Network:
    """Keras Q-network with an epsilon-greedy action policy.

    Wraps a small MLP that maps a flat state vector to one Q-value per
    discrete action, and tracks the exploration rate ``epsilon``, which is
    decayed on every call to ``get_action``.
    """

    def __init__(self, state_dim, aciton_dim, epsilon=1.0, epsilon_min=None,
                 epsilon_decay=0.995):
        """
        Args:
            state_dim: length of the flat state vector fed to the model.
            aciton_dim: number of discrete actions (parameter spelling kept
                so existing keyword callers keep working).
            epsilon: initial exploration probability.
            epsilon_min: floor for ``epsilon``; defaults to the notebook-level
                ``FINAL_EPSILON`` constant.
            epsilon_decay: multiplicative decay applied per ``get_action`` call.
        """
        self.state_dim = state_dim
        self.action_dim = aciton_dim
        self.epsilon = epsilon
        self.epsilon_min = FINAL_EPSILON if epsilon_min is None else epsilon_min
        self.epsilon_decay = epsilon_decay

        self.model = self.create_model()

    def create_model(self, lr=0.005):
        """Build and compile the MLP: Dense(32, relu) -> Dense(24, relu) -> Dense(action_dim, linear).

        Args:
            lr: learning rate passed to the Adam optimizer.

        Returns:
            The compiled ``tf.keras.Sequential`` model (MSE loss).
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.Input((self.state_dim,)))
        model.add(layers.Dense(32, activation='relu'))
        model.add(layers.Dense(24, activation='relu'))
        model.add(layers.Dense(self.action_dim, activation='linear'))
        # No 'accuracy' metric: the outputs are regressed Q-values, not class
        # probabilities, so classification accuracy carries no meaning here.
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(lr))
        model.summary()
        return model

    def predict(self, state):
        """Return the model's Q-values for a batch of states, one row per state."""
        return self.model.predict(state, verbose=0)

    def get_action(self, state):
        """Pick an action epsilon-greedily and decay ``epsilon``.

        ``epsilon`` is multiplied by ``epsilon_decay`` and clamped at
        ``epsilon_min`` before the draw. With probability ``epsilon`` a
        uniformly random action is returned; otherwise the action with the
        largest predicted Q-value.

        Args:
            state: a single observation; reshaped to ``(1, state_dim)``.

        Returns:
            The chosen action index as a Python ``int``.
        """
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        if np.random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        # Only run the forward pass when the greedy branch is actually taken.
        state = np.reshape(state, [1, self.state_dim])
        q_value = self.predict(state)[0]
        return int(np.argmax(q_value))

    def train(self, states, targets):
        """Fit the model for one epoch on the given (states, targets) batch."""
        self.model.fit(states, targets, epochs=1, verbose=0)

In [30]:
class Agent:
    """DQN agent: an online network, a target network, and a replay buffer.

    ``train`` collects one episode at a time with the online network's
    epsilon-greedy policy, then (once the buffer is warm) fits the online
    network on replayed minibatches and syncs the target network.
    """

    def __init__(self, env):
        """
        Args:
            env: a gym environment with a discrete action space whose
                ``reset()`` returns an observation and whose ``step()``
                returns ``(next_state, reward, done, info)``.
        """
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.env = env
        self.gamma = 0.9  # discount factor used in the TD target

        # The attributes below and `remember` are not read by the training
        # loop in this class (exploration lives in Network, storage in
        # ReplayBuffer); they are kept so existing callers do not break.
        self.memory = deque(maxlen=3000)
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.train_batch = BATCH_SIZE

        self.model = Network(self.state_dim, self.action_dim)
        self.target_model = Network(self.state_dim, self.action_dim)
        self.target_update()
        # Capacity tied to REPLAY_SIZE so the default warm-up threshold in
        # `train` is always reachable.
        self.buffer = ReplayBuffer(capacity=REPLAY_SIZE)
        self.episode_rewards = []  # total reward per episode of the last `train` call

    def remember(self, state, action, reward, next_state, done):
        """Append a transition to ``self.memory`` (separate from the replay buffer)."""
        self.memory.append((state, action, reward, next_state, done))

    def target_update(self):
        """Copy the online network's weights into the target network."""
        weights = self.model.model.get_weights()
        self.target_model.model.set_weights(weights)

    def replay(self, n_batches=10):
        """Fit the online network on ``n_batches`` sampled minibatches.

        For each sampled transition the target for the taken action is
        ``reward + (1 - done) * gamma * Q_target(next_state, a*)``, where
        ``a*`` is the online network's argmax action for ``next_state``.

        Args:
            n_batches: number of minibatches to sample and fit on.
        """
        for _ in range(n_batches):
            states, actions, rewards, next_states, done = self.buffer.sample()
            batch_idx = np.arange(len(states))
            # NOTE(review): targets for the non-taken actions come from the
            # target network rather than the online network; kept as in the
            # original, confirm this is intended.
            targets = self.target_model.predict(states)
            best_next = np.argmax(self.model.predict(next_states), axis=1)
            next_q_values = self.target_model.predict(next_states)[batch_idx, best_next]
            not_done = 1.0 - np.asarray(done, dtype=np.float32)
            targets[batch_idx, actions] = rewards + not_done * next_q_values * self.gamma
            self.model.train(states, targets)

    def train(self, max_episodes=1000, min_buffer_size=None):
        """Run the training loop.

        Args:
            max_episodes: number of episodes to play.
            min_buffer_size: number of stored transitions required before
                ``replay`` is called; defaults to the notebook-level
                ``REPLAY_SIZE`` constant (i.e. wait until the buffer is full).

        Per-episode total rewards are stored in ``self.episode_rewards``.
        """
        if min_buffer_size is None:
            min_buffer_size = REPLAY_SIZE
        self.episode_rewards = []
        for ep in tqdm.trange(max_episodes):
            done, total_reward = False, 0
            state = self.env.reset()
            while not done:
                action = self.model.get_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.buffer.put(state, action, reward, next_state, done)
                total_reward += reward
                state = next_state
            self.episode_rewards.append(total_reward)
            if self.buffer.size() >= min_buffer_size:
                self.replay()
            self.target_update()
# Build the agent on the shared environment and train for the configured
# number of episodes.
agent = Agent(env)
agent.train(max_episodes=EPISODE)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_39 (Dense)             (None, 32)                96        
_________________________________________________________________
dense_40 (Dense)             (None, 24)                792       
_________________________________________________________________
dense_41 (Dense)             (None, 3)                 75        
Total params: 963
Trainable params: 963
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_42 (Dense)             (None, 32)                96        
_________________________________________________________________
dense_43 (Dense)             (None, 24)                792       
________________________________

In [None]:
# Render 100 steps of the learned policy, acting greedily on predicted Q-values.
state = env.reset()
try:
    for step in range(100):
        env.render()
        # Act greedily via predict(): get_action() is epsilon-greedy and
        # decays epsilon on every call, which is unwanted during evaluation.
        q_values = agent.target_model.predict(np.reshape(state, [1, -1]))[0]
        # Feed the new observation back in so the policy sees the current
        # state rather than the initial one on every step.
        state, reward, done, info = env.step(int(np.argmax(q_values)))
        if done:
            state = env.reset()
finally:
    # Always release the render window, even if a step fails.
    env.close()