In [5]:
import sys
import numpy as np
import random

In [1]:
class Node(object):
    """A named tree node carrying a value and an ordered list of child nodes.

    Children are looked up by name with a linear scan; the first child whose
    name matches wins.
    """

    def __init__(self,name="root",value=None):
        self.value = value
        self.name = name
        self.children = []

    def _child_named(self, name):
        """Return the first child called `name`, or None when there is none."""
        return next((kid for kid in self.children if kid.name == name), None)

    def _append_child(self, name, value):
        """Create a Node(name, value), attach it as the last child, return it."""
        fresh = Node(name, value)
        self.children.append(fresh)
        return fresh

    def add_or_update_child(self,name,value):
        """Overwrite the value of the child called `name`, creating it if absent.

        Returns the child node that now holds `value`.
        """
        existing = self._child_named(name)
        if existing is None:
            return self._append_child(name, value)
        existing.value = value
        return existing

    def find_or_create_child(self,name,value=None):
        """Return the child called `name`; create it with `value` if absent.

        An existing child is returned untouched (its value is NOT replaced).
        """
        existing = self._child_named(name)
        if existing is not None:
            return existing
        return self._append_child(name, value)

    def __str__(self, level=0):
        """Render the subtree as text: one "name:value" line per node,
        indented with one tab per depth level (`level` is this node's depth).
        """
        pieces = ["\t" * level, str(self.name), ":", str(self.value), "\n"]
        pieces.extend(kid.__str__(level + 1) for kid in self.children)
        return "".join(pieces)

In [14]:
class RLAgent(object):
    """Tic-tac-toe player that learns a value for every board it evaluates.

    Values live in `self.vals`, keyed by the flattened board rendered as a
    comma-separated string.  In parallel, the agent walks a `Node` tree
    (rooted at `self.root_node`) whose node names are the boards reached
    right after each of its own moves.

    Parameters
    ----------
    player_num : the tile this agent plays (1 or 2); stored as a string so it
        can be compared with the state strings from `state_for_game_board`.
    alpha : step size used by `update_val_for_game_board`.
    x_rate : probability of making an exploratory (non-greedy) move.
    """

    def __init__(self,player_num,alpha,x_rate):
        self.player_num = str(player_num)
        self.alpha = alpha
        self.x_rate = x_rate
        self.root_node = Node()
        self.curr_node = self.root_node
        self.vals = {}

    def key_for_game_board(self,gb):
        """Return the dictionary key for board `gb`: its flattened cells joined by commas."""
        return ",".join(str(x) for x in gb.flatten())

    def reset(self):
        """Move the tree cursor back to the root (called between games)."""
        self.curr_node = self.root_node

    def val_for_game_board(self,gb):
        """Return the stored value of `gb`, initialising it on first sight.

        Initial values: 1.0 if this agent has won on `gb`, 0.0 if the other
        player has won or the game is drawn, 0.5 for any other board.
        """
        key = self.key_for_game_board(gb)
        if key not in self.vals:
            state = state_for_game_board(gb)
            if state == self.player_num:
                val = 1.0
            elif state in ('1', '2') or state == "draw":
                val = 0.0
            else:
                val = 0.50
            self.vals[key] = val

        return self.vals[key]

    def update_val_for_game_board(self,gb,update_val):
        """Move the value of `gb` a fraction `alpha` of the way toward `update_val`.

        A board not seen before starts from 0.5.  Returns the new value.
        """
        key = self.key_for_game_board(gb)
        current = self.vals.get(key, 0.50)
        self.vals[key] = current + self.alpha*(update_val - current)

        return self.vals[key]

    def make_move(self,gb,move):
        """Return a copy of `gb` with this agent's tile placed at index `move`.

        `gb` itself is left unmodified.

        Raises
        ------
        RuntimeError : if the cell at `move` is not 0.  The exception message
            names the offending move.
        """
        new_gb = np.copy(gb)
        if new_gb[move] != 0.0:
            # Carry the diagnostic in the exception instead of a Python-2-only
            # print statement followed by a bare raise.
            raise RuntimeError("Invalid move {}".format(move))
        new_gb[move] = int(self.player_num)
        return new_gb

    def choose_move(self,gb,valid_moves):
        """Pick one of `valid_moves` for board `gb` and return the resulting board.

        With probability `x_rate` (and only when more than one move exists)
        the agent explores: it plays a uniformly random move other than the
        top-ranked one and performs no value update.  Otherwise it plays the
        top-ranked move, nudges the value of `gb` toward that move's value,
        and records the updated value on the current tree node.

        Either way the tree cursor then descends to the child named after the
        new board (created with value 0.0 if it did not exist).
        """
        move_vals = [(self.val_for_game_board(self.make_move(gb,move)),move) for move in valid_moves]
        # Stable sort: ties keep the order of `valid_moves`.
        move_vals.sort(key=lambda tup: tup[0],reverse=True)
        explore = random.random()
        if len(move_vals) > 1 and explore <= self.x_rate:
            move = random.choice(move_vals[1:])[1]
        else:
            best_val, move = move_vals[0]
            self.curr_node.value = self.update_val_for_game_board(gb,best_val)

        new_gb = self.make_move(gb,move)
        key = self.key_for_game_board(new_gb)
        self.curr_node = self.curr_node.find_or_create_child(key,0.0)

        return new_gb

def _player_from_spec(player, player_num):
    """Turn an "agent_<alpha>_<x_rate>" string into an RLAgent.

    Returns `(player, is_agent)`.  Anything that is not a string containing
    "agent" (e.g. "random", "human", or an already-built player object) is
    passed through unchanged with `is_agent` False.
    """
    if isinstance(player, str) and "agent" in player:
        _, alpha, x_rate = player.split("_")
        return RLAgent(player_num, float(alpha), float(x_rate)), True
    return player, False


def run_games(num_games,p1,p2,db):
    """Play `num_games` games between `p1` and `p2` and print a summary.

    Parameters
    ----------
    num_games : number of games to play.
    p1, p2 : player 1 / player 2.  A string of the form
        "agent_<alpha>_<x_rate>" is replaced by a fresh RLAgent; any other
        value is handed to `play_a_game` as is.
    db : debug flag forwarded to `play_a_game`.

    Prints the rates returned by `calculate_win_rates` and, for every player
    that was built from an agent spec, that agent's value tree.
    """
    print("Starting games")

    p1, p1_agent = _player_from_spec(p1, 1)
    p2, p2_agent = _player_from_spec(p2, 2)

    # Tally slots are filled by update_results via play_a_game.
    results = [0,0,0]
    for _ in range(num_games):
        results = play_a_game(results,p1,p2,db)

    results = calculate_win_rates(results)
    print("Results were:")
    print("\t({},{},{})".format(results[0],results[1],results[2]))
    if p1_agent:
        print("Agent 1 Q-Vals were:")
        print(p1.root_node)
    if p2_agent:
        print("Agent 2 Q-Vals were:")
        print(p2.root_node)


def play_a_game(res,p1,p2,db):
    """Play one game on a fresh 3x3 board and fold its outcome into `res`.

    Player 1 always moves first.  After each round (player 1's move and, if
    the game is still live, player 2's reply) the board is printed when `db`
    is truthy.  Both players are reset once the game is over.

    Returns whatever `update_results` returns for the final state and `res`.
    """
    game_board = np.zeros((3,3))

    state = "live"
    while state == "live":
        game_board = player_choose_move(1,p1,game_board)
        state = state_for_game_board(game_board)
        if state == "live":
            game_board = player_choose_move(2,p2,game_board)
            state = state_for_game_board(game_board)

        if db:
            # Parenthesised single-argument form: valid on Python 2 and 3.
            print(game_board)

    reset_player(p1)
    reset_player(p2)

    return update_results(state,res)

def state_for_game_board(gb):
    """Classify a 3x3 board.

    Returns "1" or "2" when that tile fills a whole row, column or diagonal,
    "draw" when nobody has won and no cell equals 0, and "live" otherwise.

    Rows and columns are scanned index by index (row i and column i
    together), testing tile 1 before tile 2 at each index; the two diagonals
    are examined afterwards in the same tile order.
    """
    marks = ((1.0, "1"), (2.0, "2"))

    for idx in range(3):
        for mark, label in marks:
            wanted = [mark] * 3
            if list(gb[idx]) == wanted or list(gb[:, idx]) == wanted:
                return label

    main_diag = [gb[n, n] for n in range(3)]
    anti_diag = [gb[2 - n, n] for n in range(3)]
    for mark, label in marks:
        wanted = [mark] * 3
        if main_diag == wanted or anti_diag == wanted:
            return label

    # No winner: the board is drawn exactly when it has no empty (0) cell left.
    has_empty_cell = (gb == np.zeros((3, 3))).any()
    return "live" if has_empty_cell else "draw"

def player_choose_move(tile,player,gb):
    """Let `player` make one move on `gb` and return the resulting board.

    The empty cells (value 0) of the 3x3 board are collected in row-major
    order as (row, col) tuples and handed to the mover:

    * "random" -> `random_choose_move`
    * "human"  -> `human_choose_move`
    * anything else is treated as an object with a
      `choose_move(gb, valid_moves)` method (`tile` is not passed to it).
    """
    open_cells = [(r, c) for r in range(3) for c in range(3) if gb[r, c] == 0]

    if player == "random":
        return random_choose_move(tile, gb, open_cells)
    if player == "human":
        return human_choose_move(tile, gb, open_cells)
    return player.choose_move(gb, open_cells)

def reset_player(player):
    """Call `player.reset()` unless the player is a plain string.

    String players ("random", "human") carry no state, so nothing is done
    for them.
    """
    if type(player) == str:
        return
    player.reset()

def random_choose_move(tile,gb,valid_moves):
    """Place `tile` on one of `valid_moves`, chosen uniformly at random.

    `gb` is modified in place and the same object is returned.
    """
    (picked,) = random.sample(valid_moves, 1)
    gb[picked] = tile
    return gb

def human_choose_move(tile,gb,valid_moves):
    """Prompt on stdin until the user names a move in `valid_moves`, then play it.

    The answer is read as text and parsed as comma-separated integers
    ("1,2"; surrounding parentheses and spaces are tolerated).  Text that
    cannot be parsed, or a move not in `valid_moves`, prints "Invalid move."
    and prompts again.

    `gb` is modified in place (the chosen cell is set to `tile`) and the same
    object is returned.
    """
    # Python 2's input() evaluates what the user types; read raw text there.
    # On Python 3 raw_input does not exist and input() already returns text.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        answer = read_line("Enter your move (x,y):")
        try:
            move = tuple(int(part) for part in answer.strip().strip("()").split(","))
        except ValueError:
            move = None  # unparsable text is handled like any other invalid move
        if move in valid_moves:
            break
        print("Invalid move.")

    gb[move] = tile

    return gb

def update_results(state,res):
    """Increment the tally slot in `res` that matches the final `state`.

    Slot layout: res[0] counts "1" (player 1 wins), res[1] counts "draw",
    res[2] counts "2" (player 2 wins).  Any other state leaves the tally
    untouched and prints a diagnostic.

    `res` is modified in place and also returned.
    """
    if state == "1":
        res[0] += 1
    elif state == "draw":
        res[1] += 1
    elif state == "2":
        res[2] += 1
    else:
        # Parenthesised single-argument form: valid on Python 2 and 3.
        print("invalid state {}".format(state))
    return res

def calculate_win_rates(res):
    """Convert a list of counts into fractions of their total.

    Returns a new list of floats, one per entry of `res`; `res` itself is
    not modified.  When the counts sum to zero (no games played) every rate
    is 0.0 instead of raising ZeroDivisionError.
    """
    total = float(sum(res))
    if total == 0:
        return [0.0 for _ in res]
    # A single pass is enough; normalising again would only add float noise.
    return [count / total for count in res]

In [15]:
# Seed the stdlib RNG before playing: the agent's exploration and the "random"
# opponent both draw from `random`, so an unseeded run prints different
# results on every execution.
random.seed(0)
run_games(100,"agent_0.75_0.05","random",False)

Starting games
Results were:
	(0.78,0.02,0.2)
Agent 1 Q-Vals were:
root:0.5
	1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0:0.5
		1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0:0.999877929688
			1.0,1.0,1.0,0.0,2.0,2.0,0.0,0.0,0.0:0.0
			1.0,1.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0:0.0
			1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,2.0:0.0
			1.0,1.0,2.0,1.0,2.0,0.0,0.0,0.0,0.0:0.9921875
				1.0,1.0,2.0,1.0,2.0,0.0,1.0,0.0,2.0:0.0
			1.0,1.0,1.0,0.0,2.0,0.0,2.0,0.0,0.0:0.0
		1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0:0.999877929688
			1.0,1.0,1.0,2.0,0.0,2.0,0.0,0.0,0.0:0.0
			1.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0:0.875
				1.0,1.0,2.0,2.0,1.0,0.0,0.0,1.0,2.0:0.0
			1.0,1.0,1.0,2.0,0.0,0.0,0.0,0.0,2.0:0.0
		1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0:0.5
			1.0,1.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0:0.0
			1.0,1.0,1.0,2.0,0.0,0.0,0.0,2.0,0.0:0.0
			1.0,1.0,1.0,0.0,2.0,0.0,0.0,2.0,0.0:0.0
			1.0,1.0,2.0,1.0,0.0,0.0,0.0,2.0,0.0:0.5
				1.0,1.0,2.0,1.0,1.0,0.0,2.0,2.0,0.0:0.96875
					1.0,1.0,2.0,1.0,1.0,2.0,2.0,2.0,1.0:0.0
		1.0,1.0,0.0,0.0,0.0,2.0,0.0,