In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice

In [2]:
# A board position: `x` holds the cells taken by X and `o` the cells taken by O.
# The typename matches the variable name so reprs read `State(...)`.
State = namedtuple('State', ['x', 'o'])

In [3]:
# 3x3 magic square stored row by row as a flat list: board cell i holds MAGIC[i].
MAGIC = [cell
         for row in ((2, 7, 6),
                     (9, 5, 1),
                     (4, 3, 8))
         for cell in row]

In [4]:
def print_board(pos):
    """Print the 3x3 board for ``pos``, one row per line.

    A cell shows 'X' if its MAGIC number is in ``pos.x``, 'O' if it is in
    ``pos.o``, and the MAGIC number itself otherwise. Every symbol is
    followed by a single space.
    """
    def symbol(cell):
        if cell in pos.x:
            return 'X'
        if cell in pos.o:
            return 'O'
        return cell

    for row_start in range(0, 9, 3):
        row = MAGIC[row_start:row_start + 3]
        # end=' \n' keeps the trailing space after the last symbol of the row.
        print(*(symbol(cell) for cell in row), end=' \n')

In [5]:
def win(elements):
    """Return True if any three distinct members of ``elements`` sum to 15."""
    for triple in combinations(elements, 3):
        if sum(triple) == 15:
            return True
    return False

def state_value(pos: State):
    """Score a position: 1 if ``pos.x`` wins, -1 if ``pos.o`` wins, else 0.

    X is checked first, so a position in which both sides satisfy ``win``
    is scored 1.
    """
    for marks, value in ((pos.x, 1), (pos.o, -1)):
        if win(marks):
            return value
    return 0

In [6]:
def generate_states():
    """Return the set of ``(State, player_to_move)`` pairs found by ``generate``.

    The search is started twice from the empty board, once with each
    player (0 and 1) moving first.
    """
    discovered = set()
    for first_player in (0, 1):
        generate(first_player, State(set(), set()), discovered)
    return discovered

def generate(player,state,discovered):
    """Recursively record every position reachable from ``state``.

    ``player`` is the index of the side to move (0 -> ``state.x``,
    1 -> ``state.o``). Each position is added to ``discovered`` as a
    ``(State of frozensets, player_to_move)`` pair. The recursion stops at
    positions where either side has already won.
    """
    if not state.x and not state.o:
        # The empty board is recorded together with whoever moves first.
        discovered.add((State(frozenset(), frozenset()), player))
    if win(state.x) or win(state.o):
        return
    opponent = 1 - player
    free_cells = set(range(1, 9 + 1)) - state.x - state.o
    for move in list(free_cells):
        # Work on copies so sibling branches do not see this move.
        marks = [set(state.x), set(state.o)]
        marks[player].add(move)
        successor = State(marks[0], marks[1])
        discovered.add((State(frozenset(successor.x), frozenset(successor.o)), opponent))
        generate(opponent, successor, discovered)


In [7]:
# Set of (State, player_to_move) pairs discovered by the exhaustive search.
possible_states = generate_states()

In [8]:
value_dictionary = defaultdict(float)   # V(state); states never written read as 0.0
policy_dict = defaultdict(int)          # policy(state) -> cell our agent plays

# Build a first random policy: (Key,Value) = (State,Action_to_perform).
for state, to_move in possible_states:
    if to_move:
        continue    # Mapping required only for states in which WE are to move
    available = list(set(range(1, 9+1)) - state.x - state.o)
    if not available:
        continue    # Full board: nothing left to play
    policy_dict[state] = choice(available)

In [9]:
def reward(state):
    """Reward of ``state``, delegated unchanged to ``state_value``."""
    outcome = state_value(state)
    return outcome

def possible_actions(state):
    """List every move open to X in ``state``.

    Returns a list of ``(action, resultant_state)`` tuples, one per free
    cell, where ``resultant_state`` is ``state`` with that cell added to X.
    """
    free_cells = list(set(range(1, 9+1)) - state.x - state.o)
    return [(cell, apply_action(state, cell)) for cell in free_cells]

def model_next_state(state):    #A reasonable model of our opponent
    """Return the list of states the modelled opponent (O) may move to.

    If a free cell completes a sum of 15 with two of O's cells, only that
    winning reply is returned. Otherwise, if a free cell would complete 15
    with two of X's cells, only that blocking reply is returned. Otherwise
    there is one state per free cell (empty list when the board is full).
    """
    free_cells = list(set(range(1, 9+1)) - state.x - state.o)

    def after_o_plays(cell):
        return State(frozenset(state.x), frozenset({*state.o, cell}))

    # O's own pairs are scanned first: winning takes priority over blocking.
    for marks in (state.o, state.x):
        for pair in combinations(marks, 2):
            needed = 15 - sum(pair)
            if needed in free_cells:
                return [after_o_plays(needed)]

    # No forced move: every free cell is a possible reply.
    return [after_o_plays(cell) for cell in free_cells]

def apply_action(state,action):
    """Return a new frozenset-based State with ``action`` added to X's cells."""
    return State(frozenset({*state.x, action}), frozenset(state.o))
    

In [10]:
teta = 0.001    # convergence threshold: policy_evaluation keeps sweeping while the largest value change in a sweep exceeds it
dr = .1     # discount rate applied to successor-state values; the author's experiments found that larger values yield a non-optimal policy

In [11]:
def _expected_return(after_state, dr):
    """Expected discounted return once our move has produced ``after_state``.

    ``after_state`` is the position right after X has played. The result is
    ``sum over s' of p(s') * (reward(s') + dr * V(s'))`` over the opponent
    replies returned by ``model_next_state``, each weighted equally.

    Terminal positions are handled explicitly:
    * if ``after_state`` already has a non-zero reward, the game is over and
      that reward is returned (the opponent gets no reply);
    * if the opponent has no reply (board full), the result is 0 (a draw);
    * a reply with non-zero reward ends the game, so no future value is
      bootstrapped from it.
    """
    outcome = reward(after_state)
    if outcome != 0:
        return outcome
    candidates = model_next_state(after_state)
    if not candidates:
        return 0
    weight = 1 / len(candidates)
    expected = 0
    for possible_state in candidates:
        immediate = reward(possible_state)
        # A non-zero reward means someone has won: nothing follows that state.
        future = 0 if immediate != 0 else value_dictionary[possible_state]
        expected += weight * (immediate + dr * future)
    return expected


def policy_evaluation(teta, dr):
    """Make ``value_dictionary`` consistent with the current ``policy_dict``.

    Sweeps over every state in ``policy_dict``, replacing its value in place
    with the expected return of the action the policy prescribes, and repeats
    until the largest change in a sweep is no greater than ``teta``.

    teta: termination threshold on the largest per-sweep value change.
    dr:   discount rate applied to successor-state values.
    """
    delta = 1
    while delta > teta:
        delta = 0
        for state in list(policy_dict.keys()):
            old_value = value_dictionary[state]
            action = policy_dict[state]
            new_value = _expected_return(apply_action(state, action), dr)
            value_dictionary[state] = new_value
            delta = max(delta, abs(old_value - new_value))


def policy_improvement(dr):
    """Make ``policy_dict`` greedy with respect to ``value_dictionary``.

    For each state the action with the highest expected return is stored
    (the first one found wins ties). Returns True when no state changed its
    action, i.e. the policy is stable.
    """
    policy_stable = True
    for state in list(policy_dict.keys()):
        old_action = policy_dict[state]
        max_expected = -10  # lower than any expected return computed here
        best_action = old_action
        for action, resultant_state in possible_actions(state):
            expected = _expected_return(resultant_state, dr)
            if expected > max_expected:
                best_action = action
                max_expected = expected
        policy_dict[state] = best_action    # Updating policy
        if old_action != best_action:
            policy_stable = False
    return policy_stable

In [12]:
# Policy iteration: alternate evaluation and improvement until no action changes.
stable = False
while not stable:
    policy_evaluation(teta,dr)    # Makes the value function consistent with the current policy
    stable = policy_improvement(dr) # Improves the policy with respect to the updated value function

In [13]:
def almost_random_policy(state):    #Our opponent, he plays random unless he can win in a move or we are going to win in a move
    """Pick the opponent's (O's) move in ``state``.

    Returns the free cell that completes a sum of 15 with two of O's cells
    if one exists; otherwise the free cell that would complete 15 with two
    of X's cells; otherwise a uniformly random free cell.
    """
    free_cells = set(range(1, 9+1)) - state.x - state.o
    # O's pairs come first: taking a win beats blocking one.
    for marks in (state.o, state.x):
        for pair in combinations(marks, 2):
            needed = 15 - sum(pair)
            if needed in free_cells:
                return needed
    return choice(list(free_cells))
    

In [14]:
def play_game(n_games):
    """Play ``n_games`` games between our policy and ``almost_random_policy``.

    The side that moves first is drawn at random for every game. Returns
    ``[agent_wins, opponent_wins]``; games that fill the board without a
    winner are counted for neither side.
    """
    scores = [0,0]
    for _ in range(n_games):
        board = State(set(), set())
        free_cells = set(range(1, 9+1))
        mover = choice([0,1])   # 0 = our agent (x), 1 = opponent (o)
        while free_cells:
            if mover:
                ply = almost_random_policy(board)   #Opponent
            else:
                # The policy is keyed by frozenset-based states.
                key = State(x=frozenset(board.x),o=frozenset(board.o))
                ply = policy_dict[key]  #Our agent is following his policy
            board[mover].add(ply)
            free_cells.remove(ply)
            if win(board[mover]):
                scores[mover] += 1
                break
            mover = 1 - mover
    return scores

In [15]:
stats = play_game(100000)   # [agent wins, opponent wins]; ties are not counted

# Guard the ratio: if every game ends in a tie there is nothing to divide by.
if stats[0] + stats[1]:
    print(f"Hero won {(stats[0]/(stats[1]+stats[0]))*100}% of no tie games")
else:
    print("Every game ended in a tie")

Hero won 100.0% of no tie games
