In [1]:
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm import tqdm

from tic_env import TictactoeEnv, OptimalPlayer

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import namedtuple, deque 
from random import random, sample, randint

In [3]:
from collections import defaultdict
from random import randint
from random import random

def act(q_values, grid, epsilon):
    """Choose a move epsilon-greedily.

    A number is drawn with `random()`; if it is below `epsilon` the move comes
    from `act_random(grid)`, otherwise from `act_optimal(q_values, grid)`.
    """
    explore = random() < epsilon
    if explore:
        return act_random(grid)
    return act_optimal(q_values, grid)

def act_random(grid):
    """Return one of the currently valid moves, selected with `randint`.

    Candidates are every `(x, y)` pair with `x` and `y` in `0..2` for which
    `valid_move(grid, (x, y))` is true, enumerated with `x` as the outer index.
    """
    candidates = []
    for x in range(3):
        for y in range(3):
            if valid_move(grid, (x, y)):
                candidates.append((x, y))

    pick = randint(0, len(candidates) - 1)
    return candidates[pick]



def act_optimal(q_values, grid):
    """Return the valid move whose Q-value for `grid` is largest.

    Valid moves are enumerated in the same order as in `act_random`; the index
    of the best one is obtained from `agrmax`.
    """
    candidates = [(x, y) for x in range(3) for y in range(3) if valid_move(grid, (x, y))]

    # Look the moves up one by one, in enumeration order.
    scores = []
    for candidate in candidates:
        scores.append(q_value(q_values, grid, candidate))

    return candidates[agrmax(scores)]

def q_value(q_values, grid, action):
    """Look up the Q-value of `action` in `grid`.

    The table is keyed by the pair `(grid_repr(grid), action)`.
    """
    key = (grid_repr(grid), action)
    return q_values[key]
    
    
def valid_move(grid, move):
    """Return whether `move` is playable, i.e. whether `grid[move]` equals 0."""
    cell = grid[move]
    return cell == 0

def agrmax(array):
    """Return the index of the largest entry of `array`, converted to a NumPy array first."""
    values = np.array(array)
    return values.argmax()

def grid_repr(grid):
    """Collapse a 9-cell grid into one number usable as a dictionary key.

    The grid is flattened to 9 entries, every entry is shifted by +1, and
    entry `i` is weighted by `3**i`; the weighted entries are summed.
    Presumably cells hold -1, 0 or 1 so the shifted values act as base-3
    digits -- confirm against the environment.
    """
    digits = grid.reshape(9) + 1
    weights = 3 ** np.arange(9)
    return np.dot(digits, weights)

def update_qvalue_end(q_values, state, action, next_state, reward, alpha=0.05, gamma=0.99):
    """Apply one Q-learning update to the table entry for `(state, action)`.

    Nothing is done when `state` or `action` is None. When `next_state` is
    None its value is taken as 0; otherwise it is the Q-value of the move
    `act_optimal` picks in `next_state`. The entry is moved by `alpha` times
    the difference between `reward + gamma * next value` and its current value.
    `q_values` is modified in place; nothing is returned.
    """
    if state is None or action is None:
        return

    current = q_value(q_values, state, action)

    if next_state is None:
        bootstrap = 0
    else:
        greedy_move = act_optimal(q_values, next_state)
        bootstrap = q_value(q_values, next_state, greedy_move)

    td_error = reward + gamma * bootstrap - current
    q_values[(grid_repr(state), action)] = current + alpha * td_error

In [4]:
def epsilon(n, n_star, epsilon_min = 0.1, epsilon_max = 0.8):
    """Exploration rate that decreases linearly in `n` and is floored at `epsilon_min`.

    Evaluates `max(epsilon_min, epsilon_max * (1 - n / n_star))` with
    `np.maximum`.
    """
    decayed = epsilon_max * (1 - n / n_star)
    return np.maximum(epsilon_min, decayed)

def compute_M(q_values, epsilon, n_games = 500):
    """Evaluate the greedy policy of `q_values` against an `OptimalPlayer`.

    `n_games` games are played against `OptimalPlayer(epsilon=epsilon)`. The
    Q-policy always moves with `act_optimal` (no exploration). For the first
    `n_games // 2` games the opponent holds 'O' and the Q-policy 'X'; from
    then on the symbols are swapped.

    Returns the sum of `env.reward(...)` for the Q-policy's symbol over all
    games, divided by `n_games` (presumably wins minus losses per game --
    the reward scale is defined by the environment).
    """
    env = TictactoeEnv()
    symbols = np.array(['X','O'])
    half = n_games // 2

    opponent = OptimalPlayer(epsilon=epsilon, player=symbols[1])
    score = 0

    for game in range(n_games):
        grid, _, __ = env.observe()

        # Exactly at the half-way game the opponent takes over 'X'.
        if game == half:
            opponent.player = symbols[0]

        for _turn in range(9):
            opponents_turn = env.current_player == opponent.player
            if opponents_turn:
                move = opponent.act(grid)
            else:
                move = act_optimal(q_values, grid)

            grid, done, _winner = env.step(move, print_grid=False)
            if not done:
                continue

            # Reward is read from the point of view of the Q-policy's symbol.
            learner_symbol = symbols[0 if half > game else 1]
            score += env.reward(player=learner_symbol)
            env.reset()
            break

    return score / n_games

In [5]:
def play_against_self_strategy(epsilon, n_games = 20000, average_length = 250.0):
    """Train a tabular Q-function by epsilon-greedy self-play and track its quality.

    Both symbols ('X' and 'O') pick their moves from, and write their updates
    to, one shared `q_values` table. Each symbol's previous (state, action) is
    updated with reward 0 when that symbol is to move again, and with
    `env.reward(symbol)` once the game has ended.

    Whenever the number of completed games is a multiple of `average_length`,
    the greedy policy is scored with `compute_M` against an opponent with
    epsilon 0.0 (stored in `M_opts`) and with epsilon 1.0 (stored in
    `M_rands`). The scoring happens after the finished game's terminal
    updates, so each checkpoint reflects every game played so far.

    Parameters
    ----------
    epsilon : float
        Exploration probability handed to `act` for every move. Inside this
        function the name shadows the module-level `epsilon` function.
    n_games : int
        Number of self-play games.
    average_length : float
        Number of games between two evaluations.

    Returns
    -------
    tuple of np.ndarray
        `(M_opts, M_rands)`, each with `int(n_games / average_length)` entries.
    """
    env = TictactoeEnv()
    q_values = defaultdict(lambda : 0)

    array_length = int(n_games/average_length)
    M_opts = np.empty(array_length, float)   # M_opt at each evaluation checkpoint
    M_rands = np.empty(array_length, float)  # M_rand at each evaluation checkpoint

    k = 0  # index of the next checkpoint to fill

    for i in tqdm(range(n_games)):
        grid, _, __ = env.observe()
        # The environment's source is not visible here: if it hands out its
        # live board, later steps would silently rewrite the state remembered
        # below. Snapshot the board so stored states stay what they were.
        grid = np.array(grid)

        # Previous (state, action) of the 'X' side and of the other side.
        last_state_1 = None
        last_action_1 = None
        last_state_2 = None
        last_action_2 = None

        for j in range(9):
            move = act(q_values, grid, epsilon)

            # The mover's previous action is updated with reward 0, using the
            # board it now faces as the next state.
            if env.current_player == 'X':
                update_qvalue_end(q_values, last_state_1, last_action_1, grid, 0)
                last_state_1 = grid
                last_action_1 = move
            else:
                update_qvalue_end(q_values, last_state_2, last_action_2, grid, 0)
                last_state_2 = grid
                last_action_2 = move

            grid, end, winner = env.step(move, print_grid=False)
            grid = np.array(grid)  # same aliasing guard as above

            if end:
                # Terminal updates first (rewards must be read before reset).
                update_qvalue_end(q_values, last_state_1, last_action_1, None, env.reward('X'))
                update_qvalue_end(q_values, last_state_2, last_action_2, None, env.reward('O'))
                env.reset()

                if np.mod(i+1, average_length) == 0:
                    # M_opt: greedy policy versus the epsilon=0.0 opponent.
                    M_opts[k] = compute_M(q_values, 0.0)
                    # M_rand: greedy policy versus the epsilon=1.0 opponent.
                    M_rands[k] = compute_M(q_values, 1.0)
                    k += 1

                break

    return M_opts, M_rands

In [None]:
# Exploration rates to compare; one complete self-play training run per value.
epsilons = np.array([0.0, 0.05, 0.1, 0.25, 0.5, 0.75, 1.0])
# Every run yields (M_opts, M_rands); the stacked array is indexed as
# [epsilon, metric, checkpoint].
result = np.array(list(map(play_against_self_strategy, epsilons)))

100%|█████████████████████████████████████| 20000/20000 [06:18<00:00, 52.81it/s]
 34%|████████████▍                        | 6740/20000 [02:16<01:28, 150.67it/s]

In [None]:
import pickle

# Persist both evaluation curves: result[:, 0, :] holds M_opt and
# result[:, 1, :] holds M_rand, one row per exploration rate.
filename = './save_question_7'
# The context manager closes the file even if pickling raises.
with open(filename, 'wb') as outfile:
    pickle.dump({'M_opts':result[:,0,:] ,'M_rands':result[:,1,:]}, outfile)