In [1]:
import gym
import gym.envs.atari as atari
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
np.set_printoptions(precision=3)

In [2]:
env = gym.make('BreakoutDeterministic-v0')

  and should_run_async(code)
  logger.warn(


Error: We're Unable to find the game "Breakout". Note: Gym no longer distributes ROMs. If you own a license to use the necessary ROMs for research purposes you can download them via `pip install gym[accept-rom-license]`. Otherwise, you should try importing "Breakout" via the command `ale-import-roms`. If you believe this is a mistake perhaps your copy of "Breakout" is unsupported. To check if this is the case try providing the environment variable `PYTHONWARNINGS=default::ImportWarning:ale_py.roms`. For more information see: https://github.com/mgbellemare/Arcade-Learning-Environment#rom-management

In [3]:
print(env.observation_space)
print(env.observation_space.shape)
print(np.min(env.observation_space.low))
print(np.max(env.observation_space.high))
print(env.action_space)

NameError: name 'env' is not defined

In [None]:
x = env.reset()
plt.imshow(x)
plt.show()

In [None]:
env.reset()
env.step(1)
env.render()
for i in range(150):
    a = 2+np.random.randint(2)
    env.step(a)
    env.render()
env.step(1)
for i in range(150):
    a = 2+np.random.randint(2)
    env.step(a)
    env.render()

In [None]:
env.close()

The 4 available actions in Breakout are as follows:
- 0 NOOP (no operation)
- 1 FIRE (press fire button)
- 2 RIGHT (move paddle right)
- 3 LEFT (move paddle left)

In [None]:
env.reset()
s, r, d, info = env.step(1)
print(s.shape)
print(r)
print(d)
print(info)

Whenever we press a button, the `step` function returns:
- the next screen, 
- a "reward" signal indicating how much we've won during this time step,
- a boolean indicating if we've lost,
- some extra information.

## Deep neural networks and Q-learning


In [None]:
from skimage.color import rgb2gray
from skimage.transform import resize

def process_screen(x):
    return 256*resize(rgb2gray(x), (110,84))[17:101,:]

y=process_screen(x)
plt.imshow(y, cmap="gray")
plt.show()
print(y.shape)

In [None]:
# stack the 4 last frames
z = np.stack([y,y,y,y],axis=-1)
print(z.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten

dqn = Sequential()
#1st layer
dqn.add(Conv2D(filters=16, kernel_size=(8,8), strides=4, activation="relu", input_shape=(84,84,4)))
#2nd layer
dqn.add(Conv2D(filters=32, kernel_size=(4,4), strides=2, activation="relu"))
dqn.add(Flatten())
#3rd layer
dqn.add(Dense(units=256, activation="relu"))
#output layer
dqn.add(Dense(units=4, activation="linear"))

dqn.compile(optimizer="rmsprop", loss="mean_squared_error")

#from keras.utils import plot_model
#plot_model(dqn, to_file="images/dqn_keras.png", show_shapes=True)

## Experience Replay


In [None]:
total_steps = 10000000
replay_memory_size = 100000
mini_batch_size = 32
gamma = 0.99

def epsilon(step):
    if step<1e6:
        return 1.-step*9e-7
    return .1

def clip_reward(r):
    rr=0
    if r>0:
        rr=1
    if r<0:
        rr=-1
    return rr

def greedy_action(network, x):
    Q = network.predict(np.array([x]))
    return np.argmax(Q)

def MCeval(network, trials, length, gamma):
    scores = np.zeros((trials))
    for i in range(trials):
        screen_x = process_screen(env.reset())
        stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
        x = np.stack(stacked_x, axis=-1)
        for t in range(length):
            a = greedy_action(network, x)
            raw_screen_y, r, d, _ = env.step(a)
            r = clip_reward(r)
            screen_y = process_screen(raw_screen_y)
            scores[i] = scores[i] + gamma**t * r
            if d==True:
                # restart episode
                screen_x = process_screen(env.reset())
                stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
                x = np.stack(stacked_x, axis=-1)
            else:
                # keep going
                screen_x = screen_y
                stacked_x.append(screen_x)
                x = np.stack(stacked_x, axis=-1)
    return np.mean(scores)


In [None]:
# A class for the replay memory
from collections import deque

class MemoryBuffer:
    "An experience replay buffer using numpy arrays"
    def __init__(self, length, screen_shape, action_shape):
        self.length = length
        self.screen_shape = screen_shape
        self.action_shape = action_shape
        shape = (length,) + screen_shape
        self.screens_x = np.zeros(shape, dtype=np.uint8) # starting states
        self.screens_y = np.zeros(shape, dtype=np.uint8) # resulting states
        shape = (length,) + action_shape
        self.actions = np.zeros(shape, dtype=np.uint8) # actions
        self.rewards = np.zeros((length,1), dtype=np.uint8) # rewards
        self.terminals = np.zeros((length,1), dtype=np.bool) # true if resulting state is terminal
        self.terminals[-1] = True
        self.index = 0 # points one position past the last inserted element
        self.size = 0 # current size of the buffer
    
    def append(self, screenx, a, r, screeny, d):
        self.screens_x[self.index] = screenx
        #plt.imshow(screenx)
        #plt.show()
        #plt.imshow(self.screens_x[self.index])
        #plt.show()
        self.actions[self.index] = a
        self.rewards[self.index] = r
        self.screens_y[self.index] = screeny
        self.terminals[self.index] = d
        self.index = (self.index+1) % self.length
        self.size = np.min([self.size+1,self.length])
    
    def stacked_frames_x(self, index):
        im_deque = deque(maxlen=4)
        pos = index % self.length
        for i in range(4): # todo
            im = self.screens_x[pos]
            im_deque.appendleft(im)
            test_pos = (pos-1) % self.length
            if self.terminals[test_pos] == False:
                pos = test_pos
        return np.stack(im_deque, axis=-1)
    
    def stacked_frames_y(self, index):
        im_deque = deque(maxlen=4)
        pos = index % self.length
        for i in range(4): # todo
            im = self.screens_y[pos]
            im_deque.appendleft(im)
            test_pos = (pos-1) % self.length
            if self.terminals[test_pos] == False:
                pos = test_pos
        return np.stack(im_deque, axis=-1)
    
    def minibatch(self, size):
        #return np.random.choice(self.data[:self.size], size=sz, replace=False)
        indices = np.random.choice(self.size, size=size, replace=False)
        x = np.zeros((size,)+self.screen_shape+(4,))
        y = np.zeros((size,)+self.screen_shape+(4,))
        for i in range(size):
            x[i] = self.stacked_frames_x(indices[i])
            y[i] = self.stacked_frames_y(indices[i])
        return x, self.actions[indices], self.rewards[indices], y, self.terminals[indices]

In [None]:
# initialize state and replay memory
screen_x = process_screen(env.reset())
stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
x = np.stack(stacked_x, axis=-1)
replay_memory = MemoryBuffer(replay_memory_size, (84, 84), (1,))
# initial state for evaluation
evaluation_period = 10000
Xtest = np.array([x])
nb_epochs = total_steps // evaluation_period
epoch=-1
scoreQ = np.zeros((nb_epochs))
scoreMC = np.zeros((nb_epochs))

# Deep Q-learning with experience replay
for step in range(total_steps):
    # evaluation
    if(step%10000 == 0):
        epoch = epoch+1
        # evaluation of initial state
        scoreQ[epoch] = np.mean(dqn.predict(Xtest).max(1))
        # roll-out evaluation
        scoreMC[epoch] = MCeval(network=dqn, trials=20, length=700, gamma=gamma)
    # action selection
    if np.random.rand() < epsilon(step):
        a = np.random.randint(env.action_space.n)
    else:
        a = greedy_action(dqn, x)
    # step
    raw_screen_y, r, d, _ = env.step(a)
    r = clip_reward(r)
    screen_y = process_screen(raw_screen_y)
    replay_memory.append(screen_x, a, r, screen_y, d)
    # train
    if step>mini_batch_size:
        X,A,R,Y,D = replay_memory.minibatch(mini_batch_size)
        QY = dqn.predict(Y)
        QYmax = QY.max(1).reshape((mini_batch_size,1))
        update = R + gamma * (1-D) * QYmax
        QX = dqn.predict(X)
        QX[np.arange(mini_batch_size), A.ravel()] = update.ravel()
        dqn.train_on_batch(x=X, y=QX)
    # prepare next transition
    if d==True:
        # restart episode
        screen_x = process_screen(env.reset())
        stacked_x = deque([screen_x, screen_x, screen_x, screen_x], maxlen=4)
        x = np.stack(stacked_x, axis=-1)
    else:
        # keep going
        screen_x = screen_y
        stacked_x.append(screen_x)
        x = np.stack(stacked_x, axis=-1)

In [None]:
from IPython.display import YouTubeVideo
YouTubeVideo("TmPfTpjtdgg")

## <a id="frozenlake"></a> Frozen lake
is a really easy game called FrozenLake. It is a discrete environment, easy to solve. It is provided in order to help you play around with the different concepts of Q-learning without any value function approximation.

In [5]:
pip install pygame

Collecting pygame
  Downloading pygame-2.1.2-cp38-cp38-win_amd64.whl (8.4 MB)
Installing collected packages: pygame
Successfully installed pygame-2.1.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import gym
import gym.envs.toy_text.frozen_lake as fl
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
np.set_printoptions(precision=3)

In [2]:
env = gym.make('FrozenLake-v1')
_=env.render()

  and should_run_async(code)


AttributeError: 'FrozenLakeEnv' object has no attribute 'lastaction'

In [4]:
help(fl.FrozenLakeEnv)

Help on class FrozenLakeEnv in module gym.envs.toy_text.frozen_lake:

class FrozenLakeEnv(gym.core.Env)
 |  FrozenLakeEnv(*args, **kwds)
 |  
 |  Frozen lake involves crossing a frozen lake from Start(S) to Goal(G) without falling into any Holes(H) by walking over
 |  the Frozen(F) lake. The agent may not always move in the intended direction due to the slippery nature of the frozen lake.
 |  
 |  
 |  ### Action Space
 |  The agent takes a 1-element vector for actions.
 |  The action space is `(dir)`, where `dir` decides direction to move in which can be:
 |  
 |  - 0: LEFT
 |  - 1: DOWN
 |  - 2: RIGHT
 |  - 3: UP
 |  
 |  ### Observation Space
 |  The observation is a value representing the agent's current position as
 |  current_row * nrows + current_col (where both the row and col start at 0).
 |  For example, the goal position in the 4x4 map can be calculated as follows: 3 * 4 + 3 = 15.
 |  The number of possible observations is dependent on the size of the map.
 |  For example, t

  and should_run_async(code)


In [3]:
print(env.action_space)
print(env.observation_space)
print(env.observation_space.n)
print("Reset: ", env.reset())
print("Step right:", env.step(fl.RIGHT))
env.render()
print("Second step right:", env.step(fl.RIGHT))
print("Third step right:", env.step(fl.RIGHT))
env.render()

  and should_run_async(code)


Discrete(4)
Discrete(16)
16
Reset:  0
Step right: (0, 0.0, False, {'prob': 0.3333333333333333})
Second step right: (1, 0.0, False, {'prob': 0.3333333333333333})
Third step right: (5, 0.0, True, {'prob': 0.3333333333333333})


Four utility functions and a bit of display help.

In [12]:
def to_s(row,col):
    return row*env.unwrapped.ncol+col

def to_row_col(s):
    col = s%env.unwrapped.ncol
    row = int((s-col)/env.unwrapped.ncol)
    return row,col

actions = {fl.LEFT: '\u2190', fl.DOWN: '\u2193', fl.RIGHT: '\u2192', fl.UP: '\u2191'}
print(actions)

def greedyQpolicy(Q):
    pi = np.zeros((env.observation_space.n),dtype=np.int)
    for s in range(env.observation_space.n):
        pi[s] = np.argmax(Q[s,:])
    return pi

def print_policy(pi):
    for row in range(env.unwrapped.nrow):
        for col in range(env.unwrapped.ncol):
            print(actions[pi[to_s(row,col)]], end='')
        print()
    return

{0: '←', 1: '↓', 2: '→', 3: '↑'}


In order to help you study the convergence of Q-learning, we provide an approximate value for $Q^*$.

In [13]:
Qstar = np.array([[ 0.068,  0.066,  0.066,  0.059],
 [ 0.039,  0.043,  0.04,   0.061],
 [ 0.074,  0.068,  0.072,  0.057],
 [ 0.039,  0.039,  0.033,  0.055],
 [ 0.091,  0.071,  0.064,  0.048],
 [ 0.,     0.,     0.,     0.   ],
 [ 0.112,  0.09,   0.112,  0.022],
 [ 0.,     0.,     0.,     0.   ],
 [ 0.071,  0.118,  0.102,  0.145],
 [ 0.157,  0.247,  0.204,  0.133],
 [ 0.299,  0.266,  0.225,  0.108],
 [ 0.,     0.,     0.,     0.   ],
 [ 0.,     0.,     0.,     0.   ],
 [ 0.188,  0.306,  0.38,   0.266],
 [ 0.395,  0.639,  0.615,  0.537],
 [ 0.,     0.,     0.,     0.   ]])

## Frozen Lake - Code

In [1]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

# Let's initialize the Q-function
Qql = np.zeros((16,4))
max_steps = 5000000
gamma = 0.9
alpha = 0.001

def epsilon_greedy(Q, s, epsilon):
    a = np.argmax(Q[s,:])
    if(np.random.rand()<=epsilon): # random action
        aa = np.random.randint(env.action_space.n-1)
        if aa==a:
            a=env.action_space.n-1
        else:
            a=aa
    return a

# Q-learning
count = np.zeros((env.observation_space.n,env.action_space.n)) # to track update frequencies
epsilon = 1
x = env.reset()
for t in range(max_steps):
    if((t+1)%1000000==0):
        epsilon = epsilon/2
    a = epsilon_greedy(Qql,x,epsilon)
    y,r,d,_ = env.step(a)
    Qql[x][a] = Qql[x][a] + alpha * (r+gamma*np.max(Qql[y][:])-Qql[x][a])
    count[x][a] += 1
    if d==True:
        x = env.reset()
    else:
        x=y

# Q-learning's final value function and policy

print("Max error:", np.max(np.abs(Qql-Qstar)))
print("Final epsilon:", epsilon)
pi_ql = greedyQpolicy(Qql)
print("Greedy Q-learning policy:")
print_policy(pi_ql)
print("Difference between pi_sarsa and pi_star (recall that there are several optimal policies):")
print(pi_ql-pi_star)
Qpi_ql, residuals = policy_Qeval_iter(pi_ql,1e-4,10000)
print("Max difference in value between pi_sarsa and pi_star:", np.max(np.abs(Qpi_ql-Qstar)))
print("Min difference in value between pi_sarsa and pi_star:", np.min(np.abs(Qpi_ql-Qstar)))

# Plot visitation frequencies map

count_map = np.zeros((env.unwrapped.nrow, env.unwrapped.ncol, env.action_space.n))
for a in range(env.action_space.n):
    for x in range(env.observation_space.n):
        row,col = to_row_col(x)
        count_map[row, col, a] = count[x,a]

fig, axs = plt.subplots(ncols=4)
for a in range(env.action_space.n):
    name = "a = " + actions[a]
    axs[a].set_title(name)
    axs[a].imshow(np.log(count_map[:,:,a]+1), interpolation='nearest')
    #print("a=", a, ":", sep='')
    #print(count_map[:,:,a])
plt.show()
env.render()


NameError: name 'env' is not defined

## <a id="tictactoe"></a> Deep Tic-Tac-Toe learning

Let's implement the DQN algorithm on a simple Tic-Tac-Toe game.

Note that this is not a Stochastic Optimal Control problem *per se*: it is an adversarial, two-player game (while stochastic optimal control is limited to one-player games). But maybe you can imagine a way of adapting Q-learning to this setting (think about how AlphaGo found better-than-human strategies at Go).

This exercice is thus a way to push the boundaries of what we have seen before.

###  The board game

This is a quite simple implementation. The board is a tuple of size 9 where each action refers to a position in the tuple. We store the status of the current player and who won the game.

The main classes and objects:

* player(state) 
* available_move(state)
* next_state(state, action, current_player)
* win(board, player)
* payoff(current_player, state) 
* play(state, action)

In [None]:
class oxo:
    def __init__(self):
        self.current_player = 1
        self.actions = [0, 1, 2, 3, 4, 5, 6, 7, 8]
        self.board = (0,0,0,0,0,0,0,0,0)
        self.nb_move = 0
        self.end_game = 0 # -1: null, O: running, 1: player 1 win, 2: player 2 win

    def play(self, state, action):
        self.current_player = self.player(state)
        stateList = list(self.board)
        stateList[action] = self.current_player
        self.board = tuple(stateList)
        self.nb_move += 1
        #print("nb move: ", self.nb_move)
        self.actions.remove(action)
        
        if self.asWin():
            self.end_game = self.current_player
        if self.nb_move == 9 and self.end_game == 0:
            self.end_game = -1

    def next_state(self, state, action, player):
        stateList = list(state)
        stateList[action] = player
        return tuple(stateList)

    def available_move(self, state):
        am = []
        i = 0;
        for x in state:
            if x == 0: 
                am += [i]
            i += 1
        return am

    def asWin(self):
        p = self.current_player
        b = self.board
        if (b[0] == b[1] == b[2] == p or b[3] == b[4] == b[5] == p or b[6] == b[7] == b[8] == p or
            b[0] == b[3] == b[6] == p or b[1] == b[4] == b[7] == p or b[2] == b[5] == b[8] == p or
            b[0] == b[4] == b[8] == p or b[2] == b[4] == b[6] == p):
            return True
        else: return False

    def win(self, b, p):
        if (b[0] == b[1] == b[2] == p or b[3] == b[4] == b[5] == p or b[6] == b[7] == b[8] == p or
            b[0] == b[3] == b[6] == p or b[1] == b[4] == b[7] == p or b[2] == b[5] == b[8] == p or
            b[0] == b[4] == b[8] == p or b[2] == b[4] == b[6] == p):
            return True
        else: return False
        

    # -1: running, 0; exaecquo, 1 player 1, 2 player 2    
    def payoff(self, p, b):
        nb_move = 0
        for i in b:
            if i != 0: nb_move += 1

        if self.win(b, 1):    return 1
        if self.win(b, 2):    return 2    
        if nb_move == 9: return 0        
        return -1

    def player(self, board):
        J1=0
        J2=0
        for i in board:
            if i==1: J1+=1 
            if i==2: J2+=1 
        if J1==J2: return 1
        return(2)

    def simulation(self):
        while self.end_game == 0:
            self.myPrint()
            print ("actions ", self.actions)
            action = int(input("Player %s: " % (self.current_player)))

            if action in self.actions: self.play(self.board, action)
            else: print ("wrong move, try again")

            if self.asWin(): 
                print("Player " , self.current_player , " win!!!")
                self.end_game = self.current_player

            if self.nb_move == 9 and self.end_game == 0: 
                print("No winner, No looser!")
                self.end_game == -1

            if self.current_player == 1: self.current_player = 2
            else: self.current_player = 1

    def myPrint(self):
        b = []
        for x in self.board:
            if x == 1: b.append('X')
            else: 
                if x == 2: b.append('O')
                else: b.append('.')
        print()
        print("     ", b[0] , "  " , b[1] , "  " , b[2], "       ", 0 , "  " , 1 , "  " , 2)
        print("     ", b[3] , "  " , b[4] , "  " , b[5], "  ->   ", 3 , "  " , 4 , "  " , 5)
        print("     ", b[6] , "  " , b[7] , "  " , b[8], "       ", 6 , "  " , 7 , "  " , 8)
        print()

### Can you implement an efficient DQN player?

We provide a test function below to let you play against your deep Q learning agent. It supposes `model` is a keras-style deep neural network. The second argument states if the AI plays first (`playerAI=1`) or second (`playerAI=1`).

In [None]:
def testAlgo(model, playerAI):
    loop = True
    while (loop):
        jeu = oxo()
        #switch first player at each round
        if playerAI==1:
            playerAI=2
        else:
            playerAI=1
        #while game still in progress
        while(jeu.end_game==0):
            state = jeu.board
            current_player = jeu.player(state)
            b = current_player
            if b == 1: b='X'
            else: b = 'O'
            
            if current_player == playerAI: 
                
                qval = model.predict(np.array(state).reshape(1, len(state)), batch_size=batchSize)
                print("State=", state)
                print("Actions:", jeu.actions)
                for i in range(len(qval[0])):
                    print("     Action:", i, "Q-value:", qval[0][i])
                qval_av_action = [-9999]*9
                for ac in jeu.actions:
                    qval_av_action[ac] = qval[0][ac]
                #print(qval_av_action)
                action = (np.argmax(qval_av_action))
                print("Action:", action)
            else:
                jeu.myPrint()
                print ("action ", jeu.actions, " current_player = ", b)
                action = int(input("Player %s: " % (current_player)))
        
            if action == 10:
                loop=False
                break 
            
            if action in jeu.actions: 
                jeu.play(state, action)
            else: 
                print ("----- > Wrong move, try again!")
                
            if jeu.asWin():
                if current_player == playerAI:
                    print("-------------------------------")
                    print("--------->  AI WINS <----------")
                    print("-------------------------------")
                else:
                    print("-------------------------------")
                    print("----------> YOU WIN <----------")
                    print("-------------------------------")
             
            if jeu.nb_move == 9 and jeu.end_game == 0:
                print("-------------------------------")
                print("---> No winner, No looser <----")
                print("-------------------------------")
            
            #clear_output(wait=True)


In [5]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import RMSprop, sgd
import numpy as np
import random
from IPython.display import clear_output

# The Deep Q network

model = Sequential()

model.add(Dense(150, init='lecun_uniform', input_shape=(9,)))
model.add(Activation('relu'))
model.add(Dropout(0.2)) 
model.add(Dense(150, init='lecun_uniform'))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(9, init='lecun_uniform'))
model.add(Activation('linear'))
model.compile(loss='mse', optimizer="rmsprop")

print(model.summary())

# Parameters for Q-learning

epochs = 1000
gamma = 0.99 # discount factor
epsilon = 1 # epsilon-greddy

update=0 
alpha=0.1 # learning rate
experience_replay=True
batchSize = 40 # mini batch size
buffer = 1000
replay = [] # init vector buffer
h=0 # current size of the vector buffer

# Initialize game

jeu = oxo()

# Learn

for i in range(epochs):
    jeu = oxo()
    state = jeu.board
    current_player = jeu.player(state)
    
    while(jeu.end_game==0):
        current_player = jeu.player(state)
        qval = model.predict(np.array(state).reshape(1,len(state)), batch_size=batchSize)
        if (random.random() < epsilon): # exploration exploitation strategy    
            rand = np.random.randint(0,len(jeu.actions))
            action = jeu.actions[rand]
        else: #choose best action from Q(s,a) values
            qval_av_action = [-9999]*9
            for ac in jeu.actions:
                qval_av_action[ac] = qval[0][ac]
            action = (np.argmax(qval_av_action))
        #Take action, observe new state S'
        #Observe reward
        jeu.play(state, action)
        new_state = jeu.board
        # choose new reward values
        reward = -5
        if jeu.payoff(current_player, new_state) == current_player:
            reward = 2
        if jeu.payoff(current_player, new_state) == 0:
            reward = 1
        if jeu.payoff(current_player, new_state) == -1:
            reward = 0
            
        if not experience_replay:
            #Get max_Q(S',a)
            newQ = model.predict(np.array(new_state).reshape(1,len(state)), batch_size=batchSize)
            maxQ = np.max(newQ)
            y = np.zeros((1,9))
            y[:] = qval[:]
            if reward != 0: #non-terminal state
                update = (reward + gamma * maxQ)
            else:
                update = reward
            y[0][action] = update #target output
            print("Game #: %s" % (i,))
            model.fit(np.array(state).reshape(1, len(state)), y, batch_size=batchSize, nb_epoch=1, verbose=1)
            state = new_state
            clear_output(wait=True)
            
        else:
            #Experience replay storage
            if (len(replay) < buffer): #if buffer not filled, add to it
                replay.append((state, action, reward, new_state))
            else: #if buffer full, overwrite old values
                if (h < (buffer-1)):
                    h += 1
                else:
                    h = 0
                replay[h] = (state, action, reward, new_state)
                #randomly sample our experience replay memory
            
            if(len(replay)>batchSize):
                minibatch = random.sample(replay, batchSize)
                X_train = []
                y_train = []
                for memory in minibatch:
                    #Get max_Q(S',a)
                    old_state, action, reward, new_state = memory
                    old_qval = model.predict(np.array(old_state).reshape(1,len(old_state)), batch_size=1)
                    newQ = model.predict(np.array(new_state).reshape(1,len(new_state)), batch_size=1)
                    maxQ = np.max(newQ)
                    y = old_qval[:]
                    if reward != 0: #non-terminal state
                        update = (reward + (gamma * maxQ))
                    else: #terminal state
                        update = reward
                    y[0][action] = update
                    X_train.append(np.array(old_state).reshape(len(old_state),))
                    y_train.append(np.array(y).reshape(9,))
    
                X_train = np.array(X_train)
                y_train = np.array(y_train)
                print("Game #: %s" % (i,))
                model.fit(X_train, y_train, batch_size=batchSize, nb_epoch=1, verbose=1)
                state = new_state
            clear_output(wait=True)
        # update exploitation / exploration strategy
        if epsilon > 0.1:
            epsilon -= (1.0/epochs)
    
        # save the model every 1000 epochs
        if i%1000 == 0:
            model.save("model_dql_oxo_dense.dqf")




ImportError: cannot import name 'RMSprop' from 'keras.optimizers' (C:\Users\Yu rui\AppData\Roaming\Python\Python38\site-packages\keras\optimizers.py)