In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense,Flatten,Reshape
from keras.layers.convolutional import Conv2D

# --- Q-learning / exploration hyperparameters ---
gamma = 0.99  # discount applied to the bootstrapped next-state value

# Epsilon-greedy schedule: epsilon starts at 1 and the training loop lowers it
# step by step, never below epsilon_min.
epsilon_min, epsilon_max = 0.1, 1.0
epsilon = 1
epsilon_interval = epsilon_max - epsilon_min  # total distance epsilon can decay

batch_size = 32  # transitions sampled from the replay memory per update
max_steps_per_episode = 200  # upper bound of the per-episode step range
# Width of the one-hot action mask (4096).
# presumably one index per (from-square, to-square) pair - confirm in board_conversion
num_actions = 64 * 64

In [2]:
from IPython.display import clear_output
# Optimisation objects: Adam with gradient-norm clipping and a Huber loss.
optimizer = keras.optimizers.Adam(learning_rate=2.5e-4, clipnorm=1.0)
loss_function = keras.losses.Huber()

# Progress counters used by the training loop.
running_reward, episode_count, frame_count, len_episodes = 0, 0, 0, 0

# Exploration schedule (in frames).
epsilon_random_frames = 50000  # below this frame count every move is random
epsilon_greedy_frames = 1e6  # divisor of the per-frame epsilon decrement

# Replay memory size and update cadence.
max_memory_length = 100000  # history length above which reset() trims
update_after_actions = 4  # a gradient step is taken every 4th frame
update_target_network = 100  # the target network is synced every 100th frame

iterations = 100  # number of episodes the training loop plays

In [9]:
import chess
from board_conversion import *
class ChessEnv():
    """Chess environment that also stores the DQN replay memory.

    Wraps a ``chess.Board`` and records every transition taken through
    :meth:`step` in five parallel lists (state, action index, next state,
    reward, done flag). :meth:`update_q_values` samples minibatches from them.

    Relies on names provided elsewhere in the notebook: ``translate_board``
    and ``move2num`` (star-imported from ``board_conversion``) and the globals
    ``gamma``, ``batch_size``, ``num_actions``, ``max_memory_length`` and
    ``model_target``.
    """

    def __init__(self):
        self.board = chess.Board()
        # Replay memory: index i of every list describes the same transition.
        self.action_history = []
        self.state_history = []
        self.state_next_history = []
        self.rewards_history = []
        self.done_history = []
        # Not written by this class; the training loop appends to it.
        self.episode_reward_history = []

    def translate_board(self):
        """Return the current board encoded by the module-level ``translate_board``."""
        return translate_board(self.board)

    def _trim_memory(self, max_length=None):
        """Drop the oldest transitions so that at most ``max_length`` remain.

        Args:
            max_length: maximum number of transitions to keep. ``None`` means
                the notebook-level ``max_memory_length``.
        """
        if max_length is None:
            max_length = max_memory_length
        excess = len(self.rewards_history) - max_length
        if excess <= 0:
            return
        # All five lists are trimmed by the same amount to stay index-aligned.
        for history in (
            self.rewards_history,
            self.state_history,
            self.state_next_history,
            self.action_history,
            self.done_history,
        ):
            del history[:excess]

    def reset(self):
        """Start a new game and return the encoded starting position.

        The replay memory is kept across games. It is cut back to the memory
        limit here, removing *all* excess entries: an episode can add many
        transitions, so dropping a single one per reset would not bound it.
        """
        self.board = chess.Board()
        self._trim_memory()
        return self.translate_board()

    def step(self, action):
        """Play ``action`` on the board and record the transition.

        Args:
            action: the move to push; it must be a key of ``move2num``.

        Returns:
            ``(state_next, reward, done)``: the encoded position after the
            move, 100 if the move gave checkmate (else 0), and whether the
            game is over.

        Raises:
            KeyError: if ``action`` is missing from ``move2num`` (assuming it
                is a plain mapping). Raised before the board or the memory is
                touched.
        """
        # Resolve the index first so an unknown move cannot leave the board
        # advanced while the replay memory is missing the transition.
        action_index = move2num[action]

        state = self.translate_board()
        self.board.push(action)
        state_next = self.translate_board()

        reward = 100 if self.board.is_checkmate() else 0
        done = bool(self.board.is_game_over())

        self.action_history.append(action_index)
        self.state_history.append(state)
        self.state_next_history.append(state_next)
        self.done_history.append(done)
        self.rewards_history.append(reward)
        return state_next, reward, done

    @staticmethod
    def _td_targets(rewards, next_q_values, dones, discount):
        """Compute one-step Q-learning targets as a float32 NumPy array.

        ``target = reward + discount * max_a Q_target(s', a) * (1 - done)``

        Terminal transitions keep their own reward and only lose the
        bootstrapped term. Replacing the whole target on ``done`` would erase
        the checkmate reward, which is only ever paid on a terminal move.
        """
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=np.float32)
        best_next = np.max(np.asarray(next_q_values, dtype=np.float32), axis=1)
        targets = rewards + discount * best_next * (1.0 - dones)
        return targets.astype(np.float32)

    def update_q_values(self, target_model=None):
        """Sample a minibatch and build the regression targets for it.

        Args:
            target_model: object whose ``.model.predict`` evaluates the next
                states. ``None`` means the notebook-level ``model_target``.

        Returns:
            ``(state_sample, masks, updated_q_values)``: the sampled states,
            a one-hot mask of the actions taken (width ``num_actions``) and
            the float32 target value per sample.
        """
        if target_model is None:
            target_model = model_target

        # Sampling is with replacement (np.random.choice default).
        indices = np.random.choice(len(self.done_history), size=batch_size)

        state_sample = np.array([self.state_history[i] for i in indices])
        state_next_sample = np.array([self.state_next_history[i] for i in indices])
        rewards_sample = [self.rewards_history[i] for i in indices]
        action_sample = [self.action_history[i] for i in indices]
        done_sample = [float(self.done_history[i]) for i in indices]

        # verbose=0 keeps Keras from printing a progress bar on every update.
        future_rewards = target_model.model.predict(state_next_sample, verbose=0)

        updated_q_values = tf.convert_to_tensor(
            self._td_targets(rewards_sample, future_rewards, done_sample, gamma),
            dtype=tf.float32,
        )

        masks = tf.one_hot(action_sample, num_actions)
        return state_sample, masks, updated_q_values
    
# Single shared environment; its replay-memory lists persist across episodes
# because reset() replaces the board but does not clear the histories.
env = ChessEnv()

In [34]:
# Scratch check: a dict whose 'white' entry is a list holding 0 and 1.
dictionary = {'white': [0, 1]}

In [10]:
class Q_model():
    """Convolutional Q-network over an 8x8x12 board encoding.

    ``self.model`` is a Keras model mapping a batch of ``(8, 8, 12)`` inputs
    to one Q-value per action index. :meth:`predict` and :meth:`explore` turn
    a score vector into a move through ``filter_legal_moves`` and ``num2move``,
    both star-imported from ``board_conversion``.

    Args:
        n_actions: number of action indices / network outputs (default 4096).
    """

    def __init__(self, n_actions=4096):
        self.n_actions = n_actions
        self.model = self.create_q_model()

    def create_q_model(self):
        """Build the network: three stride-2 convolutions and a dense head.

        Each 2x2 / stride-2 convolution halves the spatial size
        (8 -> 4 -> 2 -> 1), so the flattened feature vector has 256 entries.
        """
        input_layer = keras.Input(shape=(8, 8, 12))

        # ReLU between the layers: without a non-linearity the three
        # convolutions would collapse into a single linear map.
        x = Conv2D(filters=64, kernel_size=2, strides=(2, 2), activation='relu')(input_layer)
        x = Conv2D(filters=128, kernel_size=2, strides=(2, 2), activation='relu')(x)
        x = Conv2D(filters=256, kernel_size=2, strides=(2, 2), activation='relu')(x)
        x = Flatten()(x)

        # Linear head: the outputs are regressed onto reward + discounted value
        # targets, which a softmax (values in [0, 1] summing to 1) cannot fit.
        action = Dense(self.n_actions, activation='linear')(x)
        return keras.Model(inputs=input_layer, outputs=action)

    @staticmethod
    def _pick_legal_move(board, scores):
        """Return ``(move, action_index)`` for the best-scoring legal move.

        NOTE(review): ``filter_legal_moves`` is not visible in this notebook.
        The scores are shifted so every entry is >= 1 before filtering; the
        shift preserves their order and keeps a legal move on top even if the
        helper marks illegal moves with 0 - confirm against board_conversion.
        """
        scores = np.asarray(scores, dtype=np.float64)
        positive_scores = scores - scores.min() + 1.0
        action_space = filter_legal_moves(board, positive_scores)
        action = np.argmax(action_space, axis=None)
        return num2move[action], action

    def predict(self, env):
        """Choose the legal move with the highest predicted Q-value.

        Args:
            env: object exposing ``translate_board()`` and ``board``.

        Returns:
            ``(move, action)``: the ``num2move`` entry and its index.
        """
        state_tensor = tf.convert_to_tensor(env.translate_board())
        state_tensor = tf.expand_dims(state_tensor, 0)  # add the batch axis
        q_values = self.model(state_tensor, training=False)
        return self._pick_legal_move(env.board, q_values[0])

    def explore(self, env):
        """Choose a random legal move by ranking random scores.

        Returns:
            ``(move, action)``: the ``num2move`` entry and its index.
        """
        random_scores = np.random.randn(self.n_actions)
        return self._pick_legal_move(env.board, random_scores)
        
    
# Online network: updated by the training loop and used to pick greedy moves.
model = Q_model()
# Target network: evaluates next states when the regression targets are built;
# the training loop periodically overwrites its weights with the online ones.
model_target = Q_model()

In [16]:
# Main training loop: play `iterations` games, storing every transition in
# env's replay memory and periodically fitting the online network on it.
reward_window = 100  # number of recent episodes averaged into running_reward

for _ in range(iterations):
    state = np.array(env.reset())
    episode_reward = 0
    len_episodes += 1
    for timestep in range(1, max_steps_per_episode):
        frame_count += 1

        # Random legal move during the warm-up frames and afterwards with
        # probability epsilon; otherwise the greedy move of the online network.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            move, action = model.explore(env)
        else:
            move, action = model.predict(env)

        # Linear epsilon decay, floored at epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        state_next, reward, done = env.step(move)

        episode_reward += reward

        # Gradient step every `update_after_actions` frames once the memory
        # holds more than one batch.
        if frame_count % update_after_actions == 0 and len(env.done_history) > batch_size:
            state_sample, masks, updated_q_values = env.update_q_values()

            with tf.GradientTape() as tape:
                q_values = model.model(state_sample)
                # The one-hot mask keeps only the Q-value of the action taken.
                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)
                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.model.trainable_variables))

        if frame_count % update_target_network == 0:
            model_target.model.set_weights(model.model.get_weights())
            template = "running reward: {:.2f} at episode {}, frame count {}"
            print(template.format(running_reward, episode_count, frame_count))

        if done:
            break

    # Per-episode bookkeeping. The total is recorded once per episode (not
    # once per step) and running_reward is refreshed here, otherwise the log
    # line above would print its initial value forever.
    env.episode_reward_history.append(episode_reward)
    if len(env.episode_reward_history) > reward_window:
        del env.episode_reward_history[:-reward_window]
    running_reward = np.mean(env.episode_reward_history)

    episode_count += 1

running reward: 0.00 at episode 0, frame count 300
running reward: 0.00 at episode 1, frame count 400
running reward: 0.00 at episode 1, frame count 500
running reward: 0.00 at episode 2, frame count 600
running reward: 0.00 at episode 2, frame count 700
running reward: 0.00 at episode 3, frame count 800
running reward: 0.00 at episode 3, frame count 900
running reward: 0.00 at episode 4, frame count 1000


KeyboardInterrupt: 

In [8]:
# NOTE(review): `tf.version` is the version *module*, so this cell displays a
# module repr; `tf.version.VERSION` holds the version string.
tf.version

<module 'tensorflow._api.v2.version' from 'C:\\Users\\v_sim\\Miniconda3\\lib\\site-packages\\tensorflow\\_api\\v2\\version\\__init__.py'>