In [1]:
pip install tensorflow==2.3.1 gym keras-rl2 gym[atari]



In [13]:
# Подключение библиотек
import gym 
import random
import matplotlib.pyplot as plt

import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D
from tensorflow.keras.optimizers import Adam

from rl.agents import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import LinearAnnealedPolicy, EpsGreedyQPolicy

In [14]:
# Check the installed TensorFlow version (the install cell pins 2.3.1)
import tensorflow as tf
tf.__version__

'2.3.1'

In [21]:
# Mount Google Drive at /content/drive (only available inside the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Load the environment
env = gym.make('SpaceInvaders-v0')
# Unpack the observation shape into the three values build_model() takes
height, width, channels = env.observation_space.shape
# Number of available actions; passed to build_model() and build_agent() below
actions = env.action_space.n

In [16]:
# Inspect the human-readable names of the available actions
env.unwrapped.get_action_meanings()

['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']

In [24]:
# Run 5 test episodes with a uniformly random policy to get a baseline score
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        # env.render()  # uncomment to watch the game (needs a display)
        # Sample from the environment's actual action count instead of a
        # hard-coded [0, 1, 2, 3, 4, 5] list, so the loop stays correct if
        # the environment (and therefore its action space) is changed.
        action = random.randrange(env.action_space.n)
        n_state, reward, done, info = env.step(action)
        score += reward
    print('Episode:{} Score:{}'.format(episode, score))
# NOTE: `env` is intentionally not closed here -- the training cells further
# down call dqn.fit(env, ...) on this same environment object.

Episode:1 Score:120.0
Episode:2 Score:90.0
Episode:3 Score:80.0
Episode:4 Score:125.0
Episode:5 Score:210.0


In [35]:
# Model-building function
def build_model(height, width, channels, actions, window_length=3):
    """Build the convolutional network used by the DQN agent.

    Args:
        height: Height of a single observation frame.
        width: Width of a single observation frame.
        channels: Number of channels of a single observation frame.
        actions: Width of the final linear layer (one output per action).
        window_length: Number of frames stacked along the leading input axis.
            Must equal the ``window_length`` of the replay memory that feeds
            the model. Defaults to 3, the value that was previously
            hard-coded in ``input_shape``.

    Returns:
        An uncompiled ``Sequential`` model whose input shape is
        ``(window_length, height, width, channels)``.
    """
    model = Sequential()
    model.add(Convolution2D(32, (8, 8), strides=(4, 4), activation='relu',
                            input_shape=(window_length, height, width, channels)))
    model.add(Convolution2D(64, (4, 4), strides=(2, 2), activation='relu'))
    model.add(Convolution2D(64, (3, 3), activation='relu'))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dense(256, activation='relu'))
    # Linear head: raw, unbounded outputs, one per action
    model.add(Dense(actions, activation='linear'))
    return model

In [36]:
# Build the model and print a layer-by-layer summary
model = build_model(
    height=height,
    width=width,
    channels=channels,
    actions=actions,
)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 3, 51, 39, 32)     6176      
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 3, 24, 18, 64)     32832     
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 3, 22, 16, 64)     36928     
_________________________________________________________________
flatten_2 (Flatten)          (None, 67584)             0         
_________________________________________________________________
dense_8 (Dense)              (None, 512)               34603520  
_________________________________________________________________
dense_9 (Dense)              (None, 256)               131328    
_________________________________________________________________
dense_10 (Dense)             (None, 6)                

In [38]:
# Agent-building function
def build_agent(model, actions, window_length=3, memory_limit=1000,
                anneal_steps=10000, warmup_steps=1000):
    """Assemble a dueling DQN agent around ``model``.

    Args:
        model: Keras model that the agent wraps.
        actions: Number of actions (passed to the agent as ``nb_actions``).
        window_length: Number of consecutive observations the replay memory
            stacks per sample; must match the leading dimension of the
            model's input shape. Defaults to 3.
        memory_limit: Capacity of the replay memory. Defaults to 1000.
        anneal_steps: Number of steps over which epsilon is linearly annealed
            from 1.0 down to 0.1. Defaults to 10000.
        warmup_steps: Value passed as ``nb_steps_warmup`` to the agent.
            Defaults to 1000.

    Returns:
        An uncompiled ``DQNAgent``; call ``.compile(optimizer)`` before use.

    All defaults reproduce the values that were previously hard-coded, so
    existing ``build_agent(model, actions)`` calls behave exactly as before.
    """
    # Epsilon-greedy exploration: eps goes 1.0 -> 0.1 during training, 0.2 at test time
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1.,
                                  value_min=.1, value_test=.2, nb_steps=anneal_steps)
    memory = SequentialMemory(limit=memory_limit, window_length=window_length)
    dqn = DQNAgent(model=model, memory=memory, policy=policy,
                   enable_dueling_network=True, dueling_type='avg',
                   nb_actions=actions, nb_steps_warmup=warmup_steps)
    return dqn

In [39]:
# Create the agent and compile it with the Adam optimizer.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias.
dqn = build_agent(model, actions)
dqn.compile(Adam(learning_rate=1e-4))

In [40]:
# Train the agent for 10,000 steps without rendering; verbose=2 logs one line per finished episode
dqn.fit(env, nb_steps=10000, visualize=False, verbose=2)


Training for 10000 steps ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
  521/10000: episode: 1, duration: 4.825s, episode steps: 521, steps per second: 108, episode reward: 140.000, mean reward:  0.269 [ 0.000, 30.000], mean action: 2.367 [0.000, 5.000],  loss: --, mean_q: --, mean_eps: --
 1362/10000: episode: 2, duration: 25.466s, episode steps: 841, steps per second:  33, episode reward: 210.000, mean reward:  0.250 [ 0.000, 30.000], mean action: 2.622 [0.000, 5.000],  loss: 9.590323, mean_q: 9.528542, mean_eps: 0.893710
 2042/10000: episode: 3, duration: 44.196s, episode steps: 680, steps per second:  15, episode reward: 150.000, mean reward:  0.221 [ 0.000, 30.000], mean action: 2.546 [0.000, 5.000],  loss: 1.148274, mean_q: 8.324891, mean_eps: 0.846865
 3039/10000: episode: 4, duration: 61.288s, episode steps: 997, steps per second:  16, episode reward: 290.000, mean reward:  0.291 [ 0.000, 30.000], mean a

<tensorflow.python.keras.callbacks.History at 0x7f09b77449e8>

In [41]:
# Train the same agent for another 10,000 steps; verbose=1 uses interval-style progress logging
dqn.fit(env, nb_steps=10000, verbose=1)


Training for 10000 steps ...
Interval 1 (0 steps performed)
done, took 547.533 seconds


<tensorflow.python.keras.callbacks.History at 0x7f09b7d73a58>