# Deep Q-Learning

In [1]:
import gym
import numpy as np
from gym import wrappers
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Estabelecendo alguns parâmetros

In [2]:
# Hyperparameters shared by the cells below.
GAMMA = 0.9           # Discount factor applied to the bootstrapped next-state value
BATCH_SIZE = 64       # Number of transitions sampled from the replay buffer per update
MAX_EPISODES = 2_000  # Number of training episodes to run

## Ambiente
Eu usei o OpenAI Gym (https://gym.openai.com/). <br>
O agente foi treinado no ambiente CartPole-v1.

In [3]:
# Instantiate the CartPole-v1 environment used by every later cell.
env = gym.make('CartPole-v1')

# Show the observation space and the action space, one per line.
for label, space in (('States:', env.observation_space),
                     ('Actions:', env.action_space)):
    print(label, space)

States: Box(4,)
Actions: Discrete(2)


Renderizar um episódio com ações aleatórias para ver o comportamento do ambiente

In [4]:
# Put the environment back into its initial state before the demo episode.
env.reset()

# Play one episode with uniformly sampled actions, drawing every frame,
# and stop as soon as the environment reports the episode is over.
done = False
while True:
    if done:
        break
    env.render()
    random_action = env.action_space.sample()
    # Only the `done` flag is needed here; the other step outputs are discarded.
    _, _, done, _ = env.step(random_action)

env.close()

## Q-Network
Uma rede neural com duas camadas ocultas de 32 unidades cada e ativação ReLU (rectified linear unit).

A entrada da rede é um vetor com a mesma forma do espaço de observação, e
a saída contém um valor Q para cada ação possível.

In [5]:
# Q-network: two hidden ReLU layers of 32 units and one output unit per action.
# The input shape is taken from the environment's observation space.
model = Sequential([
    Dense(32, activation='relu', name='fc1', input_shape=env.observation_space.shape),
    Dense(32, activation='relu', name='fc2'),
    # No activation is given, so this layer outputs unbounded values (one per action).
    Dense(env.action_space.n, name='fc3'),
])
model.summary()

# Mean-squared error between predicted and target Q-values, optimised with Adam.
# NOTE(review): 'accuracy' is not a meaningful metric for this regression target;
# it is kept because the training cell unpacks two values from train_on_batch.
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
fc1 (Dense)                  (None, 32)                160       
_________________________________________________________________
fc2 (Dense)                  (None, 32)                1056      
_________________________________________________________________
fc3 (Dense)                  (None, 2)                 66        
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________


## Replay da experiência
Inicializa um buffer para armazenar transições passadas

In [6]:
# Replay buffer: a bounded deque holding at most 2000 transitions; once full,
# appending a new transition discards the oldest one.
buffer = deque(maxlen=2000)

# Warm-up: play random episodes until the buffer holds at least one full batch,
# so the training cell can sample from it right away.
while len(buffer) < BATCH_SIZE:
    state = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Each transition is (state, action, reward, next state, terminal flag).
        buffer.append((state, action, reward, new_state, done))
        # Advance the current state. Without this, every transition of the episode
        # would record the reset observation as its source state.
        state = new_state

## Treinamento

Aqui, a exploração começa com uma taxa de 100%, ou seja, no início todas as ações são aleatórias.
Ao final de cada episódio, diminuímos essa taxa multiplicando-a por um fator de decaimento, até que ela chegue a cerca de 1%.

In [None]:
# Epsilon-greedy schedule: start fully random and shrink the rate after each episode.
exploration_decay = 0.995     # Multiplicative decay applied once per episode
exploration_rate = 1.0        # Initial exploration rate (100% random actions)
min_exploration_rate = 0.01   # The exploration rate is never allowed below 1%

for episode in range(1, MAX_EPISODES + 1):
    state = env.reset()
    done = False
    score = 0.0

    # --- Play one episode, storing every transition in the replay buffer ---
    while not done:
        # env.render()  # Uncomment to show the graphical interface while training

        if np.random.rand() < exploration_rate:
            # Explore: take a random action.
            action = env.action_space.sample()
        else:
            # Exploit: take the action with the highest predicted Q-value.
            # state[None] adds a leading batch axis of size 1.
            action = np.argmax(model.predict(state[None]))

        new_state, reward, done, info = env.step(action)
        buffer.append((state, action, reward, new_state, done))

        state = new_state
        score += reward  # Accumulate the episode's total reward

    # Decay the exploration rate, clamped so it never drops below the floor
    # (multiplying only while above the floor could undershoot it).
    exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay)

    # --- Experience replay: one gradient step on a random batch per episode ---
    indexes = np.random.choice(len(buffer), BATCH_SIZE, replace=True)
    batch = [buffer[i] for i in indexes]
    states = np.array([item[0] for item in batch])
    actions = np.array([item[1] for item in batch])
    rewards = np.array([item[2] for item in batch])
    new_states = np.array([item[3] for item in batch])
    terminals = np.array([item[4] for item in batch], dtype=bool)

    # Start from the network's current Q(s, .) so that only the entry of the
    # action actually taken contributes to the loss.
    predictions = model.predict(states)

    # max_a' Q(s', a') for the whole batch in a single forward pass, instead of
    # one predict call per sample.
    next_q_max = model.predict(new_states).max(axis=1)

    # Deep Q-Learning targets:
    #   y_j = r_j                                  if s' is terminal
    #   y_j = r_j + gamma * max_a' Q(s', a')       otherwise
    targets = np.where(terminals, rewards, rewards + GAMMA * next_q_max)
    predictions[np.arange(len(batch)), actions] = targets

    # Train on the batch; the second returned value is the compiled metric, unused here.
    loss, _ = model.train_on_batch(states, predictions)

    print('Episode: {}, Score: {}'.format(episode, score))


Episode: 1, Score: 31.0
Episode: 2, Score: 14.0
Episode: 3, Score: 34.0
Episode: 4, Score: 21.0
Episode: 5, Score: 23.0
Episode: 6, Score: 12.0
Episode: 7, Score: 29.0
Episode: 8, Score: 9.0
Episode: 9, Score: 19.0
Episode: 10, Score: 22.0
Episode: 11, Score: 21.0
Episode: 12, Score: 24.0
Episode: 13, Score: 10.0
Episode: 14, Score: 14.0
Episode: 15, Score: 11.0
Episode: 16, Score: 17.0
Episode: 17, Score: 16.0
Episode: 18, Score: 12.0
Episode: 19, Score: 20.0
Episode: 20, Score: 17.0
Episode: 21, Score: 21.0
Episode: 22, Score: 38.0
Episode: 23, Score: 17.0
Episode: 24, Score: 16.0
Episode: 25, Score: 11.0
Episode: 26, Score: 47.0
Episode: 27, Score: 29.0
Episode: 28, Score: 16.0
Episode: 29, Score: 16.0
Episode: 30, Score: 33.0
Episode: 31, Score: 32.0
Episode: 32, Score: 13.0
Episode: 33, Score: 13.0
Episode: 34, Score: 18.0
Episode: 35, Score: 51.0
Episode: 36, Score: 23.0
Episode: 37, Score: 16.0
Episode: 38, Score: 10.0
Episode: 39, Score: 57.0
Episode: 40, Score: 15.0
Episode: 4

Episode: 315, Score: 171.0
Episode: 316, Score: 160.0
Episode: 317, Score: 152.0
Episode: 318, Score: 210.0
Episode: 319, Score: 186.0
Episode: 320, Score: 208.0
Episode: 321, Score: 241.0
Episode: 322, Score: 380.0
Episode: 323, Score: 236.0
Episode: 324, Score: 344.0
Episode: 325, Score: 241.0
Episode: 326, Score: 336.0
Episode: 327, Score: 201.0
Episode: 328, Score: 234.0
Episode: 329, Score: 376.0
Episode: 330, Score: 231.0
Episode: 331, Score: 222.0
Episode: 332, Score: 233.0
Episode: 333, Score: 207.0
Episode: 334, Score: 251.0
Episode: 335, Score: 320.0
Episode: 336, Score: 214.0
Episode: 337, Score: 220.0
Episode: 338, Score: 227.0
Episode: 339, Score: 167.0
Episode: 340, Score: 181.0
Episode: 341, Score: 166.0
Episode: 342, Score: 190.0
Episode: 343, Score: 183.0
Episode: 344, Score: 156.0
Episode: 345, Score: 153.0
Episode: 346, Score: 155.0
Episode: 347, Score: 145.0
Episode: 348, Score: 126.0
Episode: 349, Score: 142.0
Episode: 350, Score: 126.0
Episode: 351, Score: 221.0
E

Episode: 619, Score: 336.0
Episode: 620, Score: 437.0
Episode: 621, Score: 222.0
Episode: 622, Score: 298.0
Episode: 623, Score: 428.0
Episode: 624, Score: 256.0
Episode: 625, Score: 257.0
Episode: 626, Score: 325.0
Episode: 627, Score: 244.0
Episode: 628, Score: 231.0
Episode: 629, Score: 399.0
Episode: 630, Score: 217.0
Episode: 631, Score: 258.0
Episode: 632, Score: 252.0
Episode: 633, Score: 389.0
Episode: 634, Score: 245.0
Episode: 635, Score: 328.0
Episode: 636, Score: 277.0
Episode: 637, Score: 172.0
Episode: 638, Score: 336.0
Episode: 639, Score: 232.0
Episode: 640, Score: 398.0
Episode: 641, Score: 185.0
Episode: 642, Score: 180.0
Episode: 643, Score: 225.0
Episode: 644, Score: 247.0
Episode: 645, Score: 235.0
Episode: 646, Score: 211.0
Episode: 647, Score: 215.0
Episode: 648, Score: 213.0
Episode: 649, Score: 207.0
Episode: 650, Score: 241.0
Episode: 651, Score: 400.0
Episode: 652, Score: 272.0
Episode: 653, Score: 310.0
Episode: 654, Score: 265.0
Episode: 655, Score: 237.0
E

Episode: 923, Score: 500.0
Episode: 924, Score: 273.0
Episode: 925, Score: 474.0
Episode: 926, Score: 255.0
Episode: 927, Score: 268.0
Episode: 928, Score: 500.0
Episode: 929, Score: 500.0
Episode: 930, Score: 251.0
Episode: 931, Score: 293.0
Episode: 932, Score: 425.0
Episode: 933, Score: 500.0
Episode: 934, Score: 314.0
Episode: 935, Score: 354.0
Episode: 936, Score: 224.0
Episode: 937, Score: 225.0
Episode: 938, Score: 459.0
Episode: 939, Score: 234.0
Episode: 940, Score: 276.0
Episode: 941, Score: 274.0
Episode: 942, Score: 356.0
Episode: 943, Score: 417.0
Episode: 944, Score: 247.0
Episode: 945, Score: 285.0
Episode: 946, Score: 238.0
Episode: 947, Score: 500.0
Episode: 948, Score: 500.0
Episode: 949, Score: 270.0
Episode: 950, Score: 318.0
Episode: 951, Score: 500.0
Episode: 952, Score: 500.0
Episode: 953, Score: 412.0
Episode: 954, Score: 500.0
Episode: 955, Score: 305.0
Episode: 956, Score: 500.0
Episode: 957, Score: 255.0
Episode: 958, Score: 453.0
Episode: 959, Score: 500.0
E

Episode: 1219, Score: 239.0
Episode: 1220, Score: 240.0
Episode: 1221, Score: 243.0
Episode: 1222, Score: 230.0
Episode: 1223, Score: 235.0
Episode: 1224, Score: 241.0
Episode: 1225, Score: 195.0
Episode: 1226, Score: 241.0
Episode: 1227, Score: 273.0
Episode: 1228, Score: 180.0
Episode: 1229, Score: 265.0
Episode: 1230, Score: 263.0
Episode: 1231, Score: 260.0
Episode: 1232, Score: 217.0
Episode: 1233, Score: 199.0
Episode: 1234, Score: 207.0
Episode: 1235, Score: 230.0
Episode: 1236, Score: 269.0
Episode: 1237, Score: 218.0
Episode: 1238, Score: 183.0
Episode: 1239, Score: 189.0
Episode: 1240, Score: 199.0
Episode: 1241, Score: 170.0
Episode: 1242, Score: 222.0
Episode: 1243, Score: 211.0
Episode: 1244, Score: 250.0
Episode: 1245, Score: 190.0
Episode: 1246, Score: 236.0
Episode: 1247, Score: 256.0
Episode: 1248, Score: 255.0
Episode: 1249, Score: 218.0
Episode: 1250, Score: 360.0
Episode: 1251, Score: 188.0
Episode: 1252, Score: 234.0
Episode: 1253, Score: 261.0
Episode: 1254, Score

Episode: 1512, Score: 500.0
Episode: 1513, Score: 230.0
Episode: 1514, Score: 393.0
Episode: 1515, Score: 294.0
Episode: 1516, Score: 495.0
Episode: 1517, Score: 280.0
Episode: 1518, Score: 262.0
Episode: 1519, Score: 300.0
Episode: 1520, Score: 316.0
Episode: 1521, Score: 394.0
Episode: 1522, Score: 500.0
Episode: 1523, Score: 287.0
Episode: 1524, Score: 310.0
Episode: 1525, Score: 443.0
Episode: 1526, Score: 402.0
Episode: 1527, Score: 254.0
Episode: 1528, Score: 271.0
Episode: 1529, Score: 256.0
Episode: 1530, Score: 260.0
Episode: 1531, Score: 227.0
Episode: 1532, Score: 196.0
Episode: 1533, Score: 252.0
Episode: 1534, Score: 315.0
Episode: 1535, Score: 235.0
Episode: 1536, Score: 345.0
Episode: 1537, Score: 484.0
Episode: 1538, Score: 500.0
Episode: 1539, Score: 241.0
Episode: 1540, Score: 279.0
Episode: 1541, Score: 241.0
Episode: 1542, Score: 463.0
Episode: 1543, Score: 270.0
Episode: 1544, Score: 500.0
Episode: 1545, Score: 500.0
Episode: 1546, Score: 255.0
Episode: 1547, Score

Episode: 1805, Score: 500.0
Episode: 1806, Score: 375.0
Episode: 1807, Score: 500.0
Episode: 1808, Score: 500.0
Episode: 1809, Score: 500.0
Episode: 1810, Score: 444.0
Episode: 1811, Score: 245.0
Episode: 1812, Score: 376.0
Episode: 1813, Score: 500.0
Episode: 1814, Score: 340.0
Episode: 1815, Score: 325.0
Episode: 1816, Score: 500.0
Episode: 1817, Score: 500.0
Episode: 1818, Score: 275.0
Episode: 1819, Score: 500.0
Episode: 1820, Score: 264.0
Episode: 1821, Score: 347.0
Episode: 1822, Score: 258.0
Episode: 1823, Score: 236.0
Episode: 1824, Score: 326.0
Episode: 1825, Score: 191.0
Episode: 1826, Score: 212.0
Episode: 1827, Score: 272.0
Episode: 1828, Score: 194.0
Episode: 1829, Score: 265.0
Episode: 1830, Score: 222.0
Episode: 1831, Score: 216.0
Episode: 1832, Score: 264.0
Episode: 1833, Score: 254.0
Episode: 1834, Score: 278.0
Episode: 1835, Score: 269.0
Episode: 1836, Score: 323.0
Episode: 1837, Score: 408.0
Episode: 1838, Score: 259.0
Episode: 1839, Score: 221.0
Episode: 1840, Score

In [8]:
# Save the model to the file 'cart_pole.h5'
model.save('cart_pole.h5')

## Teste
Testando o modelo após o treinamento

In [25]:

# Run a single rendered episode, always taking the action the network
# scores highest (no random exploration), and report the total reward.
state = env.reset()
score = 0.0
done = False

while not done:
    env.render()
    # state[None] adds a leading batch axis of size 1 for predict().
    q_values = model.predict(state[None])
    action = np.argmax(q_values)
    state, reward, done, _ = env.step(action)
    score += reward

print(score)
    
    

281.0


In [27]:
# Close the environment once the test episode is over.
env.close()