In [None]:
### Reinforcement learning
#Input = board (One-hot)
#Output = move
"""
Player functions that can be passed to play_game:
random_player - chooses randomly from the available moves
human_player - reads a move from the keyboard; it must be one of the listed available moves. Input example:
               "1, 0, 2, 0" moves the chess piece from coordinates 1,0 to coordinates 2,0
min_max_player - runs the min-max algorithm to choose the best available move
make_move - player trained with reinforcement learning
"""
import random
import itertools
import numpy as np


def _new_board():
    """Return a emprty chess board we can use for simulating a game.
    Returns:
        8x8 tuple of ints
    """
    return (('R', 'N', 'B', 'Q', 'K', 'B', 'N', 'R'),
            ('P', 'P', 'P', 'P', 'P', 'P', 'P', 'P'),
            (0, 0, 0, 0, 0, 0, 0, 0),
           (0, 0, 0, 0, 0, 0, 0, 0),
           (0, 0, 0, 0, 0, 0, 0, 0),
           (0, 0, 0, 0, 0, 0, 0, 0),
           ('p', 'p', 'p', 'p', 'p', 'p', 'p', 'p'),
           ('r', 'n', 'b', 'q', 'k', 'b', 'n', 'r'))


def apply_move(board_state, move):
    """Return a copy of ``board_state`` with ``move`` applied.

    The content of the source square is written to the destination square
    (replacing whatever was there) and the source square becomes 0. No chess
    rule is checked here; callers are expected to pass a move taken from
    ``available_moves``.

    Args:
        board_state (8x8 tuple of tuples): The position to start from. It is
            not modified.
        move (from_x, from_y, to_x, to_y): Source and destination squares,
            each coordinate in 0..7. Whole-number floats such as ``2.0`` are
            accepted and treated as the matching integer.
    Returns:
        8x8 tuple of tuples: The new position.
    Raises:
        ValueError: If ``move`` does not hold exactly four coordinates, or a
            coordinate is fractional or outside the board.
    """
    coordinates = []
    for value in move:
        index = int(value)
        # Reject fractional and off-board values instead of silently dropping
        # the piece (the old square-by-square scan did that for a bad target).
        if index != value or not 0 <= index < 8:
            raise ValueError("move %r is not on the 8x8 board" % (move,))
        coordinates.append(index)
    from_x, from_y, to_x, to_y = coordinates

    rows = [list(row) for row in board_state]
    chess_piece = rows[from_x][from_y]
    # Clear the source first so that a move onto its own square keeps the piece.
    rows[from_x][from_y] = 0
    rows[to_x][to_y] = chess_piece
    return tuple(tuple(row) for row in rows)


def is_white(board_state, x, y):
    """Return True when the square at row ``x``, column ``y`` holds a white (lower-case) piece."""
    return board_state[x][y] in ('p', 'r', 'n', 'b', 'q', 'k')
    
def is_black(board_state, x, y):
    """Return True when the square at row ``x``, column ``y`` holds a black (upper-case) piece."""
    return board_state[x][y] in ('P', 'R', 'N', 'B', 'Q', 'K')
    
# Piece letters of each colour: white is lower-case, black is upper-case.
_WHITE_PIECES = ('p', 'r', 'n', 'b', 'q', 'k')
_BLACK_PIECES = ('P', 'R', 'N', 'B', 'Q', 'K')

# (row delta, column delta) tables. The order inside each table decides the
# order in which moves are generated, so it is kept stable.
_ROYAL_STEPS = ((1, 1), (-1, -1), (1, -1), (-1, 1), (1, 0), (-1, 0), (0, -1), (0, 1))
_ROOK_STEPS = ((1, 0), (-1, 0), (0, -1), (0, 1))
_BISHOP_STEPS = ((1, 1), (-1, -1), (1, -1), (-1, 1))
_KNIGHT_STEPS = ((1, 2), (1, -2), (-1, 2), (-1, -2), (2, 1), (2, -1), (-2, 1), (-2, -1))

# lower-cased piece letter -> (step table, True when the piece slides)
_PIECE_MOVEMENT = {
    'k': (_ROYAL_STEPS, False),
    'q': (_ROYAL_STEPS, True),
    'r': (_ROOK_STEPS, True),
    'b': (_BISHOP_STEPS, True),
    'n': (_KNIGHT_STEPS, False),
}


def _pawn_moves(board_state, x, y, forward, start_row, enemy):
    """Yield the moves of the pawn on (x, y): single step, double step, then the two captures."""
    ahead = x + forward
    if not 0 <= ahead < 8:
        return
    if board_state[ahead][y] == 0:
        yield (x, y, ahead, y)
        # The double step needs BOTH squares in front to be empty.
        if x == start_row and board_state[ahead + forward][y] == 0:
            yield (x, y, ahead + forward, y)
    if y > 0 and board_state[ahead][y - 1] in enemy:
        yield (x, y, ahead, y - 1)
    if y < 7 and board_state[ahead][y + 1] in enemy:
        yield (x, y, ahead, y + 1)


def _piece_moves(board_state, x, y, steps, slides, own, enemy):
    """Yield the moves of a king, queen, rook, bishop or knight standing on (x, y)."""
    for dx, dy in steps:
        nx, ny = x + dx, y + dy
        while 0 <= nx < 8 and 0 <= ny < 8:
            target = board_state[nx][ny]
            if target in own:
                break  # blocked by a friendly piece
            yield (x, y, nx, ny)
            if not slides or target in enemy:
                break  # single-step piece, or the ray ends on a capture
            nx += dx
            ny += dy


def available_moves(board_state, side):
    """Generate every move the given side can make on ``board_state``.

    Only piece movement is modelled: pawns (single step, double step from the
    starting row, diagonal captures), kings, rooks, bishops, queens and
    knights. Check, castling, en passant and promotion are not handled, and a
    king may be captured like any other piece.

    Squares are visited row by row, and for each piece the destinations follow
    the fixed order of its step table, so the sequence of generated moves is
    deterministic.

    Args:
        board_state (8x8 tuple of tuples): The position to generate moves for.
        side (int): 1 for white (lower-case pieces, moving towards row 0),
            -1 for black (upper-case pieces, moving towards row 7). Any other
            value generates no moves.
    Returns:
        Generator of (from_x, from_y, to_x, to_y) tuples.
    """
    if side == 1:
        own, enemy = _WHITE_PIECES, _BLACK_PIECES
        forward, start_row = -1, 6
    elif side == -1:
        own, enemy = _BLACK_PIECES, _WHITE_PIECES
        forward, start_row = 1, 1
    else:
        return

    for x, y in itertools.product(range(8), range(8)):
        piece = board_state[x][y]
        if piece not in own:
            continue
        kind = piece.lower()
        if kind == 'p':
            yield from _pawn_moves(board_state, x, y, forward, start_row, enemy)
        else:
            steps, slides = _PIECE_MOVEMENT[kind]
            yield from _piece_moves(board_state, x, y, steps, slides, own, enemy)

                            

def has_winner(board_state):
    """Decide whether a side has won, which here means the enemy king is gone.

    Args:
        board_state (8x8 tuple of tuples): The position to inspect.
    Returns:
        int: 1 when no black king ('K') is on the board, otherwise -1 when no
            white king ('k') is on the board, otherwise 0.
    """
    squares = [board_state[x][y] for x in range(8) for y in range(8)]
    # The missing black king is tested first, so a board without any king
    # counts as a win for player one.
    if 'K' not in squares:
        return 1
    if 'k' not in squares:
        return -1
    return 0

def print_board(board_state,move):
    """Print the board row by row, highlighting the two squares of ``move``.

    The source and destination squares of ``move`` are written with the ANSI
    code ``1;33`` and all other squares with ``1;30``. Each row is followed by
    an empty line.

    Args:
        board_state: The position to print.
        move (from_x, from_y, to_x, to_y): The move whose squares are highlighted.
    """
    endpoints = ((move[0], move[1]), (move[2], move[3]))
    for i, _ in enumerate(board_state):
        for j in range(len(board_state[0])):
            if (i, j) in endpoints:
                text = "\033[1;33m%s" % board_state[i][j]
            else:
                text = "\033[1;30m%s" % board_state[i][j]
            print(text, end='  ')
        print("\n")

#converting board state to fen
from more_itertools import run_length


def convert_cell(value):
    """Map an empty square (0) to None and return any other cell unchanged."""
    return None if value == 0 else value


def convert_rank(rank):
    """Encode one board row as text.

    Consecutive equal cells are grouped with run-length encoding. A run of
    pieces is written as the piece letter repeated once per square, and a run
    of empty squares is written as the length of the run.

    Args:
        rank: One row of the board (piece letters and 0 for empty squares).
    Returns:
        str: The encoded row.
    """
    chunks = []
    for value, count in run_length.encode(map(convert_cell, rank)):
        if value:
            chunks.append(value * count)
        else:
            chunks.append(str(count))
    return ''.join(chunks)


def fen_from_board(board):
    """Return a FEN-style string for ``board``.

    The rows are encoded with ``convert_rank`` and joined with '/'. The
    trailing fields are always the constant ' w KQkq - 0 1'; they do not
    reflect whose turn it is or how many moves were played.
    """
    encoded_rows = [convert_rank(rank) for rank in board]
    return '/'.join(encoded_rows) + ' w KQkq - 0 1'

    
def play_game(plus_player_func, minus_player_func, log=0, max_game_length=100):
    """Play one game of chess, asking the two player functions for their moves.

    The plus player (side 1) moves first. The game ends when the side to move
    has no available move (draw), when a player returns a move that is not in
    the available moves (the other side wins), when ``has_winner`` reports a
    winner, or when ``max_game_length`` turns have been played (draw).

    Args:
        plus_player_func ((board_state, side) -> move): Chooses the moves of
            side 1. ``move`` is a (from_x, from_y, to_x, to_y) tuple.
        minus_player_func ((board_state, side) -> move): Chooses the moves of
            side -1.
        log (int): 0 prints nothing. Any non-zero value prints the messages
            about how the game ended; 1 additionally prints the FEN string and
            the board after every move, 2 prints the board as a numpy matrix.
        max_game_length (int): Number of turns after which the game is
            declared a draw.
    Returns:
        (int, list, int): ``(result, points, turn)``. ``result`` is 1 if the
            plus player won, -1 if the minus player won and 0 for a draw.
            ``points`` holds one entry per move of the plus player: the change
            of ``position_points`` caused by the move, or -100 for an illegal
            move. ``turn`` is the number of turns started.
    """
    board_state = _new_board()
    player_turn = 1
    turn = 0
    points = []
    while True:
        legal_moves = list(available_moves(board_state, player_turn))
        turn += 1

        if not legal_moves:
            # the side to move is stuck: draw
            if log:
                print("no moves left, game ended a draw")
            return 0, points, turn

        plus_to_move = player_turn > 0
        if plus_to_move:
            move = plus_player_func(board_state, 1)
        else:
            move = minus_player_func(board_state, -1)

        if move not in legal_moves:
            # an invalid move loses the game for the player who made it
            if log:
                print("illegal move ", move)
            if plus_to_move:
                points.append(player_turn * (-100))
            return -player_turn, points, turn

        score_before = position_points(board_state)
        board_state = apply_move(board_state, move)
        score_after = position_points(board_state)

        # only the plus player's moves are scored
        if plus_to_move:
            points.append(score_after - score_before)

        if log == 1:
            print(fen_from_board(board_state))
            print_board(board_state, move)
            print("___________")

        if log == 2:
            print(np.matrix(board_state))
            print("___________")

        winner = has_winner(board_state)
        if winner != 0:
            if log:
                print("we have a winner, side: %s" % player_turn)
            return winner, points, turn

        player_turn = -player_turn
        if turn == max_game_length:
            return 0, points, turn

def random_player(board_state, side):
    """Player function for ``play_game`` that picks one of the available moves at random.

    Args:
        board_state (8x8 tuple of tuples): The current position.
        side (int): The side this player is playing; only used to list the
            available moves.
    Returns:
        (from_x, from_y, to_x, to_y): The chosen move.
    """
    return random.choice(list(available_moves(board_state, side)))


def human_player(board_state,side):
    """Player function for ``play_game`` that reads the move from the keyboard.

    The available moves are printed once, then input is read until the user
    enters one of them. A move is typed as four comma-separated numbers, for
    example ``1, 0, 2, 0``; spaces around the numbers are optional.

    Args:
        board_state (8x8 tuple of tuples): The current position.
        side (int): The side this player is playing.
    Returns:
        (from_x, from_y, to_x, to_y): The move chosen by the user.
    """
    # The position does not change while we wait for input, so the legal
    # moves are generated only once.
    moves = list(available_moves(board_state,side))
    print(moves)
    while True:
        raw = input()
        try:
            move = tuple(int(part) for part in raw.split(','))
        except ValueError:
            # a typo must not crash the whole game: ask again
            print("could not read the move, expected four numbers such as: 1, 0, 2, 0")
            continue
        if move in moves:
            return move
        print("that move is not available, choose one from the list above")

def position_points(board_state):
    """Return the material balance of the position from the plus player's view.

    White (lower-case) pieces count positive and black (upper-case) pieces
    negative: pawn 1, rook 5, knight 3, bishop 3, queen 9, king 100.

    Args:
        board_state (8x8 tuple of tuples): The position to score.
    Returns:
        int: Sum of the piece values over all 64 squares.
    """
    piece_values = {
        'p': 1, 'r': 5, 'n': 3, 'b': 3, 'q': 9, 'k': 100,
        'P': -1, 'R': -5, 'N': -3, 'B': -3, 'Q': -9, 'K': -100,
    }
    total = 0
    for row in range(8):
        for column in range(8):
            total += piece_values.get(board_state[row][column], 0)
    return total
                    

if __name__ == '__main__':
    # example of playing a game
    # NOTE(review): policy_gradient_player is only defined further down in this
    # file (policy-gradient section), so this call needs that section to have
    # been executed first; on a fresh top-to-bottom run it looks like it would
    # raise NameError - confirm the intended execution order.
    play_game(policy_gradient_player, random_player, log=1)

In [None]:
#policy_gradient


import time
import collections
import numpy as np
import tensorflow as tf
# The network below is built with TF1-style placeholders and sessions, so eager execution is switched off.
tf.compat.v1.disable_eager_execution()
print(tf.__version__)

HIDDEN_NODES = (500, 500, 500)  # number of neurons in each of the three hidden layers
INPUT_NODES = 8 * 8 * 12  # 64 squares * one-hot vector length (12 piece types)
BATCH_SIZE = 100  # every how many games to do a parameter update
LEARN_RATE = 1e-4  # learning rate passed to the Adam optimizer
OUTPUT_NODES = 8 * 8 * 8 * 8 # one output per (from square, to square) pair: 64 * 64 possible moves
PRINT_RESULTS_EVERY_X = 100 # every how many games to print the results
WEIGHTS_SAVED=False # if True starts from previously stored weights, if False starts with random weights
ILLEGAL_MOVES_ALLOWED = True  # if True the network players return their first choice even when it is not a legal move
TRAIN_MODE = 2 # 0 - learning to make legal moves, 1 - learning to win the game, 2 - learning to win the game, reward = evaluation function

# One row per recorded position: the flattened one-hot board.
input_placeholder = tf.compat.v1.placeholder("float", shape=(None, INPUT_NODES))
# One reward per recorded position.
reward_placeholder = tf.compat.v1.placeholder("float", shape=(None,))
# One row per recorded position: the one-hot vector of the move that was played.
actual_move_placeholder = tf.compat.v1.placeholder("float", shape=(None, OUTPUT_NODES))

if (WEIGHTS_SAVED==False):
  hidden_weights_1 = tf.Variable(tf.random.truncated_normal((INPUT_NODES, HIDDEN_NODES[0]), stddev=1. / np.sqrt(INPUT_NODES)))   #random values from a normal distribution
  hidden_weights_2 = tf.Variable(
      tf.random.truncated_normal((HIDDEN_NODES[0], HIDDEN_NODES[1]), stddev=1. / np.sqrt(HIDDEN_NODES[0])))
  hidden_weights_3 = tf.Variable(
      tf.random.truncated_normal((HIDDEN_NODES[1], HIDDEN_NODES[2]), stddev=1. / np.sqrt(HIDDEN_NODES[1]))) 
  output_weights = tf.Variable(tf.random.truncated_normal((HIDDEN_NODES[-1], OUTPUT_NODES), stddev=1. / np.sqrt(OUTPUT_NODES)))

  bias_1 = tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[0],)))
  bias_2 = tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[1],)))
  bias_3 = tf.Variable(tf.constant(0.01, shape=(HIDDEN_NODES[2],)))
  bias_4 = tf.Variable(tf.constant(0.01, shape=(OUTPUT_NODES,)))


else:
    %store -r saved_weights
    hidden_weights_1 = tf.Variable(saved_weights[0])  #saved values
    hidden_weights_2 = tf.Variable(saved_weights[1])
    hidden_weights_3 = tf.Variable(saved_weights[2])
    output_weights = tf.Variable(saved_weights[3])

    bias_1 = tf.Variable(saved_weights[4])
    bias_2 = tf.Variable(saved_weights[5])
    bias_3 = tf.Variable(saved_weights[6])
    bias_4 = tf.Variable(saved_weights[7])
    
    #other neural network for saved player
    saved_input_placeholder = tf.compat.v1.placeholder("float", shape=(None, INPUT_NODES))

    %store -r saved_weights
    saved_hidden_weights_1 = tf.Variable(saved_weights[0])  #saved values
    saved_hidden_weights_2 = tf.Variable(saved_weights[1])
    saved_hidden_weights_3 = tf.Variable(saved_weights[2])
    saved_output_weights = tf.Variable(saved_weights[3])

    saved_bias_1 = tf.Variable(saved_weights[4])
    saved_bias_2 = tf.Variable(saved_weights[5])
    saved_bias_3 = tf.Variable(saved_weights[6])
    saved_bias_4 = tf.Variable(saved_weights[7])

    saved_hidden_layer_1 = tf.nn.relu(tf.matmul(saved_input_placeholder, saved_hidden_weights_1) + saved_bias_1)
    saved_hidden_layer_2 = tf.nn.relu(tf.matmul(saved_hidden_layer_1, saved_hidden_weights_2) + saved_bias_2)
    saved_hidden_layer_3 = tf.nn.relu(tf.matmul(saved_hidden_layer_2, saved_hidden_weights_3) + saved_bias_3)
    saved_output_layer = tf.nn.softmax(tf.matmul(saved_hidden_layer_3, saved_output_weights) + saved_bias_4)

hidden_layer_1 = tf.nn.relu(tf.matmul(input_placeholder, hidden_weights_1) + bias_1)
hidden_layer_2 = tf.nn.relu(tf.matmul(hidden_layer_1, hidden_weights_2) + bias_2)
hidden_layer_3 = tf.nn.relu(tf.matmul(hidden_layer_2, hidden_weights_3) + bias_3)
output_layer = tf.nn.softmax(tf.matmul(hidden_layer_3, output_weights) + bias_4)

#other neural network for saved player
saved_input_placeholder = tf.compat.v1.placeholder("float", shape=(None, INPUT_NODES))



# Training objective: for every recorded position, the probability the network
# gave to the move that was actually played (selected by the one-hot
# actual_move_placeholder), weighted by the reward of that position and summed
# over the batch. Adam minimises the negative, i.e. maximises this sum.
policy_gradient = tf.reduce_sum(input_tensor=tf.reshape(reward_placeholder, (-1, 1)) * actual_move_placeholder * output_layer)
train_step = tf.compat.v1.train.AdamOptimizer(LEARN_RATE).minimize(-policy_gradient)

sess = tf.compat.v1.Session()
# global_variables_initializer replaces the deprecated initialize_all_variables.
sess.run(tf.compat.v1.global_variables_initializer())


# Buffers filled while games are played and emptied after each parameter update.
board_states, actual_moves, rewards = [], [], []
episode_number = 1
# Results (1 / 0 / -1) of the most recent games, used for the printed win rate.
results = collections.deque()


def move_transform(move):
    """Decode a one-hot move vector into board coordinates.

    The position ``i`` of the single 1 in ``move`` is read as a base-8 number
    with the least significant digit first:
    ``i = x1 + 8 * y1 + 64 * x2 + 512 * y2``.

    Args:
        move: Sequence (list or numpy array) of length 8**4 containing a 1 at
            the chosen move.
    Returns:
        (int, int, int, int): ``(x1, y1, x2, y2)``, used by the callers as
            (from_x, from_y, to_x, to_y). All four values are plain ints.
    Raises:
        ValueError: If ``move`` contains no 1.
    """
    index = list(move).index(1)
    # divmod keeps everything in integer arithmetic; the earlier "/" division
    # turned three of the four coordinates into floats.
    index, x1 = divmod(index, 8)
    index, y1 = divmod(index, 8)
    index, x2 = divmod(index, 8)
    y2 = index % 8
    return x1, y1, x2, y2

def board_transform(board_state):
    """Turn a board into the one-hot representation fed to the neural network.

    Every square becomes a vector of length 12 with a 1 at the position of
    its piece in the order p, r, n, b, q, k, P, R, N, B, Q, K; an empty square
    becomes twelve zeros. The vectors of each row are then flattened.

    Args:
        board_state (8x8 tuple of tuples): The position to encode.
    Returns:
        list: 8 numpy arrays, one per board row, each of length 8 * 12.
    """
    piece_order = ['p', 'r', 'n', 'b', 'q', 'k', 'P', 'R', 'N', 'B', 'Q', 'K']
    encoded_rows = [list(row) for row in board_state]

    for row in range(8):
        for column in range(8):
            cell = board_state[row][column]
            vector = [0] * 12
            if cell in piece_order:
                vector[piece_order.index(cell)] = 1
                encoded_rows[row][column] = vector
            elif cell == 0:
                encoded_rows[row][column] = vector

    for row in range(8):
        encoded_rows[row] = np.ravel(encoded_rows[row])
    return encoded_rows

def make_move(board_state, side):
    """Player function that samples a move from the network's output distribution.

    The encoded position is appended to ``board_states`` and the sampled
    one-hot move to ``actual_moves``, so that the game can be used for
    training afterwards.

    When ``ILLEGAL_MOVES_ALLOWED`` is false, an illegal sample is discarded,
    its probability is set to 0, the distribution is renormalised and a new
    move is drawn. If every probability has been zeroed, the last sample is
    returned even though it is illegal.

    Args:
        board_state (8x8 tuple of tuples): The current position.
        side (int): The side this player is playing.
    Returns:
        The sampled move decoded by ``move_transform``.
    """
    flat_board = np.ravel(board_transform(board_state))
    board_states.append(flat_board)
    probability_of_actions = sess.run(output_layer, feed_dict={input_placeholder: [flat_board]})[0]
    legal_moves = None  # generated lazily, only once an illegal sample has to be rejected

    # keep sampling until the move is allowed in the current board state
    while True:
        try:
            move = np.random.multinomial(1, probability_of_actions)
        except ValueError:
            # because of rounding errors the probabilities sometimes sum to
            # slightly more than 1, so they are scaled down a little
            move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7))

        if ILLEGAL_MOVES_ALLOWED == True:
            break

        if legal_moves is None:
            legal_moves = list(available_moves(board_state, side))

        if move_transform(move) in legal_moves:
            break

        # never draw the same illegal move again: give it probability 0
        probability_of_actions[list(move).index(1)] = 0
        remaining = sum(probability_of_actions)
        if remaining == 0:
            break
        probability_of_actions = probability_of_actions * (1 / remaining)

    actual_moves.append(move)
    return move_transform(move)




def policy_gradient_player(board_state, side):
    """Player function that plays the move the network rates highest.

    Unlike ``make_move`` nothing is sampled: the move with the largest output
    probability is chosen. The encoded position is appended to
    ``board_states`` and the chosen one-hot move to ``actual_moves``.

    When ``ILLEGAL_MOVES_ALLOWED`` is false, an illegal choice gets its
    probability set to 0 and the next best move is tried. If every probability
    has been zeroed, the last choice is returned even though it is illegal.

    Args:
        board_state (8x8 tuple of tuples): The current position.
        side (int): The side this player is playing.
    Returns:
        The chosen move decoded by ``move_transform``.
    """
    board_state_flat = np.ravel(board_transform(board_state))
    board_states.append(board_state_flat)
    probability_of_actions = sess.run(output_layer, feed_dict={input_placeholder: [board_state_flat]})[0]

    def best_move_one_hot():
        # np.argmax returns the first index of the maximum, the same entry
        # list.index(max(...)) would find.
        one_hot = [0] * len(probability_of_actions)
        one_hot[int(np.argmax(probability_of_actions))] = 1
        return one_hot

    move = best_move_one_hot()

    if not ILLEGAL_MOVES_ALLOWED:
        # The position does not change inside this function, so the legal
        # moves are generated once instead of on every loop test.
        legal_moves = list(available_moves(board_state, side))
        while move_transform(move) not in legal_moves:
            # never pick the same illegal move again
            probability_of_actions[move.index(1)] = 0
            if not probability_of_actions.any():
                break
            # Rescaling the remaining probabilities would not change which
            # one is largest, so the next best move is taken directly.
            move = best_move_one_hot()

    actual_moves.append(move)
    return move_transform(move)

def saved_player(board_state, side):
    """Player function that samples a move from the network's output distribution.

    The encoded position is appended to ``board_states`` and the sampled
    one-hot move to ``actual_moves``.

    When ``ILLEGAL_MOVES_ALLOWED`` is false, an illegal sample is discarded,
    its probability is set to 0, the distribution is renormalised and a new
    move is drawn. If every probability has been zeroed, the last sample is
    returned even though it is illegal.

    NOTE(review): despite its name this function evaluates ``output_layer``
    through ``input_placeholder``, i.e. the network that is being trained, not
    a separate copy built from the stored weights. It also writes to the
    training buffers. Confirm whether that is intended.

    Args:
        board_state (8x8 tuple of tuples): The current position.
        side (int): The side this player is playing.
    Returns:
        The sampled move decoded by ``move_transform``.
    """
    board_state_temp = board_transform(board_state)
    board_state_flat = np.ravel(board_state_temp)
    board_states.append(board_state_flat)
    probability_of_actions = sess.run(output_layer, feed_dict={input_placeholder: [board_state_flat]})[0]

    # The position does not change inside this function, so the legal moves
    # are generated at most once instead of after every rejected sample.
    legal_moves = None

    # keep sampling until the move is allowed in the current board state
    while True:
        try:
            move = np.random.multinomial(1, probability_of_actions)
        except ValueError:
            # because of rounding errors the probabilities sometimes sum to
            # slightly more than 1, so they are scaled down a little
            move = np.random.multinomial(1, probability_of_actions / (sum(probability_of_actions) + 1e-7))

        if ILLEGAL_MOVES_ALLOWED == True:
            break

        if legal_moves is None:
            legal_moves = list(available_moves(board_state, side))

        if move_transform(move) in legal_moves:
            break

        # never draw the same illegal move again: give it probability 0
        probability_of_actions[list(move).index(1)] = 0
        remaining = sum(probability_of_actions)
        if remaining == 0:
            break
        probability_of_actions = probability_of_actions * (1 / remaining)

    actual_moves.append(move)
    return move_transform(move)


# Statistics collected for the printed report and for the plot cell.
turns=[]         # per-game count derived from the number of turns (TRAIN_MODE 0 only)
win_rates=[]     # one entry per report: share of wins among the recent games
game_lengths=[]  # one entry per report: average of `turns` (TRAIN_MODE 0 only)
seconds = time.time()  # start of the current reporting interval


# Training loop: plays games forever; there is no exit condition in the loop
# itself, so it runs until it is interrupted.
while True:

    # play_game returns (result, points, turn); the network plays side 1
    # against the random player.
    reward = play_game(make_move, random_player, log = 0)
    turn = reward[2]
    points = reward[1]
    reward = reward[0]
    # keep only the most recent PRINT_RESULTS_EVERY_X results
    results.append(reward)
    if len(results) > PRINT_RESULTS_EVERY_X:
        results.popleft()

    # make_move appended one board per move of the network, while rewards has
    # not been extended for this game yet, so the difference is the number of
    # moves the network made in the game that just ended.
    last_game_length = len(board_states) - len(rewards)


    #Learning to make legal moves
    #Reward = game length
    if TRAIN_MODE == 0:
        #the reward is the length of the game - the number of legal
        #rewards += ([last_game_length-1] * (last_game_length-1))
        # every move but the last gets +1, the last move gets -1
        # NOTE(review): this assumes the game ended with an illegal move of
        # the network; confirm for games that ended in another way.
        rewards += ([1] * (last_game_length-1))
        rewards += ([-1])
        turns+=([(turn-1)/2])
        
    
    #Learning to win the game
    #Reward = 1 if won -1 if lost
    if TRAIN_MODE == 1:
        # we scale here so winning quickly is better than winning slowly and losing slowly is better than losing quickly
        if(float(last_game_length) != 0) :
            reward /= float(last_game_length)
        rewards += ([reward] * last_game_length)
        
    
    #Reward = position evaluation
    if TRAIN_MODE == 2:
        # points holds one evaluation difference per move of the network
        print(points)
        rewards += (points)
    
    episode_number += 1
    
    # parameter update every BATCH_SIZE games
    if episode_number % BATCH_SIZE == 0:
        # centre the rewards; subtracting the numpy mean turns the list into an array
        normalized_rewards = rewards - np.mean(rewards)

        if TRAIN_MODE==0:
            print(rewards)
                
        # When all rewards are identical the standard deviation is 0: no
        # update is run and the buffers are NOT cleared, so their content
        # carries over into the next batch.
        if np.std(normalized_rewards)!=0:
            normalized_rewards /= np.std(normalized_rewards)

        
            sess.run(train_step, feed_dict={input_placeholder: board_states,
                                        reward_placeholder: normalized_rewards,
                                        actual_move_placeholder: actual_moves})

            # clear batches
            del board_states[:]
            del actual_moves[:]
            del rewards[:]

    # progress report every PRINT_RESULTS_EVERY_X games
    if episode_number % PRINT_RESULTS_EVERY_X == 0:
        print("episode: %s win_rate: %s                                     Time elapsed : %s" % (episode_number,results.count(1) / (PRINT_RESULTS_EVERY_X), round(time.time()-seconds,3) ))
        seconds = time.time()
        win_rates.append(results.count(1) / (PRINT_RESULTS_EVERY_X))
        if TRAIN_MODE == 0:
            print("avg game length: %s" % (sum(turns)/len(turns)))
            game_lengths.append(sum(turns)/len(turns))
            # start a new average for the next reporting interval
            turns=[]





In [None]:
#Saving the weights and biases

# Start with an empty list; it is replaced further down by the snapshot returned from save_weights.
saved_weights=[]

def save_weights(hidden_weights_1,hidden_weights_2,hidden_weights_3,output_weights,bias_1,bias_2,bias_3,bias_4):
    """Read the current values of the network parameters from the session.

    Each argument is evaluated with ``sess.run``.

    Returns:
        list: The eight evaluated values in the order hidden_weights_1,
            hidden_weights_2, hidden_weights_3, output_weights, bias_1,
            bias_2, bias_3, bias_4.
    """
    parameters = (
        hidden_weights_1, hidden_weights_2, hidden_weights_3, output_weights,
        bias_1, bias_2, bias_3, bias_4,
    )
    return [sess.run(parameter) for parameter in parameters]

# Take a snapshot of the current weights and biases of the network ...
saved_weights=save_weights(hidden_weights_1,hidden_weights_2,hidden_weights_3,output_weights,bias_1,bias_2,bias_3,bias_4)

# ... and persist it with the IPython %store magic; it is read back with
# "%store -r saved_weights" when WEIGHTS_SAVED is True.
%store saved_weights


In [None]:
#plot
import matplotlib.pyplot as plt

# Constant reference lines with the same length as the recorded win rates.
# They are only used by the commented-out plot calls below.
random_player_win_rates = [0.5860]*len(win_rates)
random_player_illegal_win_rates = [0.08479]*len(win_rates)

f = plt.figure() 
f.set_figwidth(8) 
f.set_figheight(5) 

# game_lengths gets one entry per printed report of the training loop and is
# only filled when TRAIN_MODE is 0.
# The labels are Hungarian: "Policy gradient player", y axis "number of legal
# moves per game", x axis "number of games (hundreds of games)".
plt.plot(game_lengths,  label="Policy gradient játékos")
#plt.plot(random_player_win_rates, 'g', label="Random játékos")
#plt.plot(random_player_illegal_win_rates,'r', label="Illegális random játékos")
plt.ylabel('szabályos lépések száma játékonként')
plt.xlabel('játékok száma (száz játék)')
#legend=plt.legend(loc='center right')



plt.show()

In [None]:
import sys

# Search depth that min_max_player passes to min_max_alpha_beta.
max_depth=4

def evaluate(board_state):
    """Score a position for the plus player by counting material.

    White (lower-case) pieces count positive and black (upper-case) pieces
    negative: pawn 1, rook 5, knight 3, bishop 3, queen 9, king 100.

    Args:
        board_state (8x8 tuple of tuples): The board state we are evaluating.
    Returns:
        int: Evaluated score of the position; positive is good for the plus
            player, negative is good for the minus player.
    """
    piece_values = dict(zip(
        ('p', 'r', 'n', 'b', 'q', 'k', 'P', 'R', 'N', 'B', 'Q', 'K'),
        (1, 5, 3, 3, 9, 100, -1, -5, -3, -3, -9, -100),
    ))
    return sum(
        piece_values.get(board_state[i][j], 0)
        for i in range(8)
        for j in range(8)
    )



def min_max(board_state, side, max_depth, evaluation_func=evaluate):
    """Run the min-max algorithm to find the best move for ``side``.

    Args:
        board_state (8x8 tuple of tuples): The board state we are evaluating.
        side (int): Either +1 or -1; +1 maximises the score, -1 minimises it.
        max_depth (int): How many plies to look ahead before the position is
            scored with ``evaluation_func``.
        evaluation_func (board_state -> int): Scores a position from the plus
            player's point of view. It is used at every depth of the search.
    Returns:
        (score, move): The min-max score and the move that achieves it.
            ``(0, None)`` is returned when ``side`` has no available move. A
            move that produces a winner is returned immediately with the
            score ``winner * 10000``.
    """
    best_score = None
    best_score_move = None

    moves = list(available_moves(board_state,side))
    if not moves:
        # this is a draw
        return 0, None

    for move in moves:
        new_board_state = apply_move(board_state, move)
        winner = has_winner(new_board_state)
        if winner != 0:
            return winner * 10000, move

        if max_depth <= 1:
            score = evaluation_func(new_board_state)
        else:
            # Hand the evaluation function down; otherwise a custom function
            # would only be honoured when max_depth is 1.
            score, _ = min_max(new_board_state, -side, max_depth - 1, evaluation_func)

        if side > 0:
            improved = best_score is None or score > best_score
        else:
            improved = best_score is None or score < best_score
        if improved:
            best_score = score
            best_score_move = move
    return best_score, best_score_move


def min_max_alpha_beta(board_state, side, max_depth, evaluation_func=evaluate, alpha=-sys.float_info.max,
                       beta=sys.float_info.max):
    """Run min-max with alpha-beta pruning to find the best move for ``side``.

    Args:
        board_state (8x8 tuple of tuples): The board state we are evaluating.
        side (int): Either +1 or -1; +1 maximises the score, -1 minimises it.
        max_depth (int): How many plies to look ahead before the position is
            scored with ``evaluation_func``.
        evaluation_func (board_state -> int): Scores a position from the plus
            player's point of view.
        alpha (float): Best score the maximising side is already assured of.
            Used by the recursion; callers normally leave the default.
        beta (float): Best score the minimising side is already assured of.
            Used by the recursion; callers normally leave the default.
    Returns:
        (score, move): The score (alpha for side +1, beta for side -1) and the
            move that produced it. ``(0, None)`` is returned when ``side`` has
            no available move. A move that produces a winner is returned
            immediately with the score ``winner * 10000``. ``move`` is None
            when no move improved on the incoming alpha/beta bound.
    """
    best_score_move = None
    moves = list(available_moves(board_state,side))
    if not moves:
        return 0, None

    for move in moves:
        new_board_state = apply_move(board_state, move)
        winner = has_winner(new_board_state)
        if winner != 0:
            return winner * 10000, move

        if max_depth <= 1:
            # Use the function the caller asked for; calling evaluate()
            # directly silently ignored the evaluation_func argument.
            score = evaluation_func(new_board_state)
        else:
            score, _ = min_max_alpha_beta(new_board_state, -side, max_depth - 1, evaluation_func, alpha, beta)

        if side > 0:
            if score > alpha:
                alpha = score
                best_score_move = move
        else:
            if score < beta:
                beta = score
                best_score_move = move
        # the other side already has a better alternative: stop searching
        if alpha >= beta:
            break

    return alpha if side > 0 else beta, best_score_move


def min_max_player(board_state, side):
    """Player function that prints the current evaluation and plays the alpha-beta move.

    The search runs ``max_depth`` plies deep (module-level setting).
    """
    print (evaluate(board_state))
    _, best_move = min_max_alpha_beta(board_state, side, max_depth)
    return best_move