In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gym
import random
import itertools

#dynamic memory allocation
gpus = tf.config.experimental.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)
print(tf.__version__)

2.2.0


In [2]:
env = gym.make('CartPole-v0')
env.reset()
for _ in range(200):
    env.render()
    env.step(env.action_space.sample())
env.close()

In [15]:
a = env.reset()
a.shape

(4,)

In [88]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
def create_model(input_shape, no_of_actions):
    model = tf.keras.Sequential([
        Dense(64, input_shape = (input_shape, ), activation = tf.nn.relu),
        Dense(32, activation = tf.nn.relu),
        Dense(no_of_actions)
    ])
    model.compile(optimizer = Adam(lr = 0.001), loss = 'mean_squared_error')
    return model

In [89]:
def epolicy(model, epsilon, nA, state):
    best_action = np.argmax(model.predict(state.reshape(1,-1)))
    probs = [epsilon/nA for i in range(nA)]
    probs[best_action] += 1.0-epsilon
    return probs 


In [128]:
def deep_ql(no_of_episodes, epsilon, env, batch_size, decay_rate, discount_factor):
    er = []
    model = create_model(4, 2)
    model.summary()
    print(model.output_shape)

    episode_length = []
    episode_reward = []
    state = env.reset()
    for i in range(batch_size):
        probs = epolicy(model, epsilon, 2, state)
        action = np.random.choice(np.arange(len(probs)), p = probs)
        print(action)
        next_state, reward, done, _ = env.step(action)
        er.append((state, action, reward, next_state, done))

        if done:
            state = env.reset()
        
        else:
            state = next_state

    for i in range(no_of_episodes):
        if i%50 == 0:
            print("Episode no : ", i)
        
        state = env.reset()
        for t in itertools.count():

            epsilon = epsilon**decay_rate
            if i%50 == 0:
                env.render()
            

            probs = epolicy(model, epsilon, 2, state)
            action = np.random.choice(np.arange(len(probs)), p = probs)
            onext_state, reward, odone, _ = env.step(action)
            er.append((state, action, reward, onext_state, odone))


            samples = random.sample(er, batch_size)
            States_tr = np.array([s[0] for s in samples])
            Q_tr = np.zeros((batch_size, 2))
            print(Q_tr.shape)
            for j, s in enumerate(samples):
                state, action, reward, next_state, done = s
                Q_tr[j][action] = (reward + np.invert(done) * discount_factor * np.max(model.predict(next_state.reshape(1, -1))))
            
            model.fit(States_tr, Q_tr, epochs = 1, verbose = False)

            if odone:
                break
            else:
                state = onext_state


In [129]:
deep_ql(10, 0.8, env, 32, 0.99, 1)

Model: "sequential_39"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_118 (Dense)            (None, 64)                320       
_________________________________________________________________
dense_119 (Dense)            (None, 32)                2080      
_________________________________________________________________
dense_120 (Dense)            (None, 2)                 66        
Total params: 2,466
Trainable params: 2,466
Non-trainable params: 0
_________________________________________________________________
(None, 2)
1
0
1
0
0
0
1
0
0
0
1
1
1
0
1
1
0
1
0
1
0
1
1
1
0
0
1
0
0
1
0
0
Episode no :  0
(32, 2)
1/1 - 0s - loss: 0.5487
(32, 2)
1/1 - 0s - loss: 0.5454
(32, 2)
1/1 - 0s - loss: 0.5354
(32, 2)
1/1 - 0s - loss: 0.5416
(32, 2)
1/1 - 0s - loss: 0.5348
(32, 2)
1/1 - 0s - loss: 0.5264
(32, 2)
1/1 - 0s - loss: 0.5254
(32, 2)
1/1 - 0s - loss: 0.5279
(32, 2)
1/1 - 0s - loss: 0.5322
(32

In [130]:
env.close()