This is the gym (from https://github.com/aylint/gym-minesweeper).
Made some changes to get it working properly with smaller board sizes, also:
- 0,0 is always the first action and never a mine
- a random action will never be an uncovered cell

In [2]:
import sys
from six import StringIO
from random import randint

import numpy as np
import gym
from gym import spaces

# cell values, non-negatives indicate number of neighboring mines
MINE = -1
CLOSED = 9

def stringify(board):
  # int(bool()) instead of string; convert to tertiary
  # print(''.join([str(board[x][y]) for x in range(len(board)) for y in range(len(board))]))
  return ''.join([str(board[x][y]) if board[x][y] == 0 or board[x][y] == CLOSED else '1' for x in range(len(board)) for y in range(len(board))])


def is_new_move(my_board, x, y):
    return my_board[x, y] == CLOSED

def is_valid(size, x, y):
    return (x >= 0) & (x < size) & (y >= 0) & (y < size)


def is_win(my_board, num_mines):
    return np.count_nonzero(my_board == CLOSED) == 3


def is_mine(board, x, y):
    return board[x, y] == MINE


def place_mines(board_size, num_mines):
    mines_placed = 0
    board = np.zeros((board_size, board_size), dtype=int)
    while mines_placed < num_mines:
        rnd = randint(0, board_size * board_size)
        x = int(rnd / board_size)
        y = int(rnd % board_size)
        if is_valid(board_size, x, y) and not (x == 0 and y == 0):
            if not is_mine(board, x, y):
                board[x, y] = MINE
                mines_placed += 1
    return board


class MinesweeperDiscreetEnv(gym.Env):
    metadata = {"render.modes": ["ansi", "human"]}

    def __init__(self, board_size, num_mines):
        self.board_size = board_size
        self.num_mines = num_mines
        self.board = place_mines(board_size, num_mines)
        self.my_board = np.ones((board_size, board_size), dtype=int) * CLOSED
        self.num_actions = 0

#       -2 here? 
        self.observation_space = spaces.Box(low=-2, high=9,
                                            shape=(self.board_size, self.board_size), dtype=np.int)
        self.action_space = spaces.Discrete(self.board_size*self.board_size)
        self.valid_actions = np.ones((self.board_size * self.board_size), dtype=np.bool)

    def count_neighbour_mines(self, x, y):
        neighbour_mines = 0
        for _x in range(x - 1, x + 2):
            for _y in range(y - 1, y + 2):
                if is_valid(self.board_size, _x, _y):
                    if is_mine(self.board, _x, _y):
                        neighbour_mines += 1
        return neighbour_mines

    def open_neighbour_cells(self, my_board, x, y):
        for _x in range(x-1, x+2):
            for _y in range(y-1, y+2):
                if is_valid(self.board_size, _x, _y):
                    if is_new_move(my_board, _x, _y):
                        my_board[_x, _y] = self.count_neighbour_mines(_x, _y)
                        if my_board[_x, _y] == 0:
                            my_board = self.open_neighbour_cells(my_board, _x, _y)
        return my_board

    def get_next_state(self, state, x, y):
        my_board = state
        game_over = False
        if is_mine(self.board, x, y):
            my_board[x, y] = MINE
            game_over = True
        else:
            my_board[x, y] = self.count_neighbour_mines(x, y)
            if my_board[x, y] == 0:
                my_board = self.open_neighbour_cells(my_board, x, y)
        self.my_board = my_board
        return my_board, game_over

    def randomAction(self, state):
      action = 0
      while state[int(action / self.board_size)][action % self.board_size] != CLOSED:
        action = self.action_space.sample()
      return action

    def reset(self):
        self.my_board = np.ones((self.board_size, self.board_size), dtype=int) * CLOSED
        self.board = place_mines(self.board_size, self.num_mines)
        self.num_actions = 0
        self.valid_actions = np.ones((self.board_size * self.board_size), dtype=bool)

        return self.my_board

    def step(self, action):
        state = self.my_board
        x = int(action / self.board_size)
        y = int(action % self.board_size)

        next_state, reward, done, info = self.next_step(state, x, y)
        self.my_board = next_state
        self.num_actions += 1
        self.valid_actions = (next_state.flatten() == CLOSED)
        info['valid_actions'] = self.valid_actions
        info['num_actions'] = self.num_actions
        return next_state, reward, done, info

    def next_step(self, state, x, y):
        my_board = state
        if not is_new_move(my_board, x, y):
            # print("repeat", my_board, x, y)
            return my_board, -0.2, False, {}
        while True:
            state, game_over = self.get_next_state(my_board, x, y)
            if not game_over:
                if is_win(state, self.num_mines):
                    return state, 1, True, {}
                else:
                    return state, 0.2, False, {}
            else:
                return state, -1, True, {}

    def render(self, mode='human'):
        outfile = StringIO() if mode == 'ansi' else sys.stdout
        s = stringify(self.my_board)
        outfile.write(s)
        if mode != 'human':
            return outfile

For board size 4x4, 3 mines, there are (4^2)(3+2)^(4^2) = 2.441e12 possible state-action pairs in the q-table

In [9]:
import random
import timeit

BOARD_SIZE = 4
NUM_MINES = 3

start_t = timeit.default_timer()

env = MinesweeperDiscreetEnv(board_size=BOARD_SIZE, num_mines=NUM_MINES)

total_episodes = 1000000000   # Total episodes
learning_rate = 0.7           # Learning rate
gamma = 0.1                   # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 10e-8            # Exponential decay rate for exploration prob

qtable = {}

# List of rewards
rewards = []
total_steps = 0
win, loss = 0, 0
score = 0 #tracking completion

print('10000\tcumul. reward\tavg. steps\t|qtable|\twin rate\tepsilon\t\tavg completion')

for episode in range(10000*10):
    
    # Reset the environment
    state = env.reset()
    state_str = stringify(state)

    # Is this state seen? If not, add it to qtable and initialize the action array to 0
    if not state_str in qtable: 
        qtable[state_str] = np.zeros(BOARD_SIZE * BOARD_SIZE)
    
    step = 0
    done = False
    total_rewards = 0
    
    # loop until game over
    while(True):
        total_steps += 1
        state_str = stringify(state)

        action = np.argmax(qtable[state_str]) if random.uniform(0,1) > epsilon else env.randomAction(state)

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)
        new_state_str = stringify(new_state)

        total_rewards += reward
        state = new_state
                
        if reward == 1: win +=1
        if reward == -1: loss += 1

        if not new_state_str in qtable: 
          qtable[new_state_str] = np.zeros(BOARD_SIZE * BOARD_SIZE)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]   
        qtable[state_str][action] += learning_rate * (reward + gamma * np.max(qtable[new_state_str]) - qtable[state_str][action])

        if done: 
          score += np.count_nonzero((state != 9) & (state != -1)) / (BOARD_SIZE * BOARD_SIZE - NUM_MINES)
          break

    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode) 
    rewards.append(total_rewards)

    if (episode % 10000 == 0):
      print('{:}\t{:8.6f}\t{:8.6f}\t{:8}\t{:8.6f}\t{:8.6f}\t{:8.6f}'.format(
        int(episode/10000),
        sum(rewards)/(episode+1),
        (total_steps/(episode+1)),
        sum(map(lambda k: len(k), qtable.items())),
        win/(episode+1),
        epsilon,
        score/(episode+1)
      ))


stop = timeit.default_timer()
print("time" ,str(stop-start_t))


10000	cumul. reward	avg. steps	|qtable|	win rate	epsilon		avg completion
0	-0.800000	2.000000	       6	0.000000	1.000000	0.615385
1	-0.338746	3.714129	   12770	0.059294	0.999010	0.581973
2	-0.335833	3.729864	   18616	0.059247	0.998022	0.583905
3	-0.327729	3.754642	   23288	0.060798	0.997034	0.587014
4	-0.326852	3.754956	   26734	0.061198	0.996048	0.587151
5	-0.329557	3.743385	   29478	0.060999	0.995062	0.585613
6	-0.329995	3.749004	   31900	0.060249	0.994078	0.585149
7	-0.329955	3.750546	   34060	0.060128	0.993094	0.585394
8	-0.329956	3.749903	   35872	0.060224	0.992112	0.585661
9	-0.330090	3.749003	   37442	0.060266	0.991130	0.585806
time 82.38725410599994
