In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import numpy as np
from tqdm import tqdm
from collections import deque

from scipy.ndimage import convolve

import time

import random

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
# Start a Chrome session and open the Minesweeper page that the rest of the
# notebook drives. 'chromedriver' is passed as a bare name; replace it with the
# full path to the chromedriver executable if it cannot be found that way.
# NOTE(review): newer Selenium 4 releases may no longer accept the driver path
# as a positional argument - confirm against the installed Selenium version.
driver = webdriver.Chrome('chromedriver')
url = 'https://minesweeperonline.com/#beginner'  # beginner board of minesweeperonline.com
driver.get(url)

In [None]:
class Minesweeper():
    """Environment wrapper around a 9x9 Minesweeper board shown in a browser.

    The board is read from the driver's page source and encoded in
    ``cell_state``, a float32 array of shape (9, 9, 11) with at most one
    channel set per cell:

    * channels 0-8: the square's class starts with ``open`` and ends in that digit
    * channel 9: the square's class is ``blank``
    * channel 10: the square's class starts with ``bombflagged``

    Squares with any other class keep an all-zero vector.

    Attributes set while playing:
        end, win, lose: 0/1 flags derived from the face element's class.
        reward_array: (9, 9) integer array of per-cell reward hints.
        action: one-hot encoding of the last clicked cell.
    """

    def __init__(self, driver):
        """Store the driver and start a fresh game.

        Args:
            driver: object exposing ``find_element(by, value)`` and
                ``page_source`` (a Selenium WebDriver in this notebook).
        """
        self.driver = driver

        # Defaults first: reset() calls update(), which then overwrites these
        # with what the page actually shows.
        self.end = 0
        self.win = 0
        self.lose = 0

        self.reset()

    def _click(self, element_id):
        """Click the page element whose HTML id is ``element_id``."""
        # "id" is the locator-strategy string behind Selenium's By.ID; going
        # through find_element() also works on Selenium releases that no
        # longer provide find_element_by_id().
        self.driver.find_element("id", element_id).click()

    def update_action(self, action):
        """Click one cell and score the move.

        Args:
            action: flat cell index in [0, 81); converted to (row, col) with
                ``np.unravel_index(action, (9, 9))``.

        Returns:
            Tuple ``(state, reward, reward_array, end, win, lose)`` where
            ``state`` is a copy of ``cell_state`` after the click and
            ``reward_array`` is the (9, 9) reward map computed from the board
            as it was *before* the click, adjusted at the clicked cell.

            Scalar reward: 50 on a win, -20 on any other game end, 5 for
            opening a blank cell whose neighbourhood is not flagged by the
            all-blank heuristic below, 0 for a blank cell that is flagged by
            it, and -5 for clicking a cell that was not blank.
        """
        self.action = np.zeros((81))
        self.action[action] = 1

        row, col = np.unravel_index(action, (9, 9))

        # Channel index per cell, taken before the click changes the board.
        labels = np.argmax(self.cell_state, axis=-1)

        # Sum of the labels of the 8 neighbours (zero padding outside the board).
        kernel = np.array([[1, 1, 1],
                           [1, 0, 1],
                           [1, 1, 1]])
        surrounding_sum = convolve(labels, kernel, mode='constant', cval=0.0)

        # -10 where the neighbour sum is a multiple of 9 and at least 27.
        # Presumably this targets cells whose neighbours are all blank
        # (label 9 each; a corner has 3 neighbours) - TODO confirm intent.
        isolated_penalty = (surrounding_sum % 9 == 0) * (surrounding_sum >= 27) * -10

        self.reward_array = np.where(labels == 9, 10, -10) + isolated_penalty

        # Was the chosen cell still unopened before the click?
        blocked = self.cell_state[row, col, 9] == 1

        self._click(f"{row + 1}_{col + 1}")
        self.update()

        if self.end:
            if self.win:
                self.reward_array[row, col] += 25
                reward = 50
            else:
                self.reward_array[row, col] -= 15
                reward = -20
        elif blocked:
            if isolated_penalty[row, col] == 0:
                self.reward_array[row, col] += 5
                reward = 5
            else:
                reward = 0
        else:
            self.reward_array[row, col] = -20
            reward = -5

        # Return a copy so later board updates cannot alter a state the caller
        # has stored (for example in a replay memory).
        return self.cell_state.copy(), reward, self.reward_array, self.end, self.win, self.lose

    def update(self):
        """Re-read the page and rebuild ``cell_state`` and the end/win/lose flags.

        Returns:
            Always 1.

        Raises:
            RuntimeError: if the page source cannot be read from the driver.
        """
        try:
            html_content = self.driver.page_source
        except Exception as e:
            # Raise instead of terminating the interpreter so the caller (or
            # the notebook) sees a normal traceback.
            raise RuntimeError("could not read the page source from the browser") from e

        self.soup = BeautifulSoup(html_content, 'html.parser')
        self.game = (
            self.soup.find("html").find("body").find("table").find("tbody")
            .find("tr").find("td").find("div")
            .find("div", id="center-column")
            .find('div', id="game-container")
            .find('div', id='game')
        )
        self.game1 = self.game.find_all(class_="square")

        # The face element's class encodes the game status.
        self.end = 0 if self.game.find("div", class_="facesmile") else 1
        self.win = 1 if self.game.find("div", class_="facewin") else 0
        self.lose = 1 if self.game.find("div", class_="facedead") else 0

        # Build into a fresh array and rebind, so arrays (or views) handed out
        # earlier keep the board they were created from.
        cell_state = np.zeros((9, 9, 11), dtype=np.float32)
        for i in range(9):
            for j in range(9):
                # NOTE(review): assumes the first 81 "square" elements are the
                # visible board in row-major order - confirm against the page.
                label = self.game1[i * 9 + j]["class"][-1]
                if label == "blank":
                    cell_state[i, j, 9] = 1
                elif label.startswith("open"):
                    cell_state[i, j, int(label[-1])] = 1
                elif label.startswith("bombflagged"):
                    cell_state[i, j, 10] = 1
                # Any other class: keep this cell all-zero and carry on with
                # the remaining cells instead of abandoning the scan.
        self.cell_state = cell_state

        return 1

    def reset(self):
        """Start a new game and open one randomly chosen cell.

        Clicks the element with id ``face``, clicks a uniformly random cell,
        refreshes the board, and initialises ``reward_array`` (shape (9, 9):
        2 for cells still blank, -10 otherwise).

        Returns:
            Always 1.
        """
        self._click("face")

        self.cell_state = np.zeros((9, 9, 11), dtype=np.float32)
        self.update()

        # One-hot (9, 9) marker of the randomly chosen opening cell.
        action = np.zeros(81, dtype=np.float32)
        action[np.random.randint(81)] = 1
        self.action = np.reshape(action, (9, 9))
        cell_ij = np.unravel_index(np.argmax(self.action), self.action.shape)

        self._click(f"{cell_ij[0] + 1}_{cell_ij[1] + 1}")
        self.update()

        # Same (9, 9) layout as the reward map produced by update_action().
        labels = np.argmax(self.cell_state, axis=-1)
        self.reward_array = np.where(labels == 9, 2, -10)

        return 1


In [None]:
def create_dqn_model(input_size, output_size):
    """Build and compile the fully connected Q-value network.

    Architecture: Dense(1024, relu) -> Dense(256, tanh) -> Dense(output_size,
    linear), compiled with mean-squared-error loss and the Adam optimizer at a
    learning rate of 0.001.

    Args:
        input_size: length of the flattened state vector fed to the network.
        output_size: number of outputs (one value per action).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(1024, input_dim=input_size, activation='relu'))
    model.add(Dense(256, activation='tanh'))
    model.add(Dense(output_size, activation='linear'))
    # `learning_rate` is the documented keyword; the old `lr` alias is
    # deprecated and is not reliably honoured by newer Keras releases.
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model

In [None]:
class DQNAgent:
    """Epsilon-greedy agent with a small replay memory and a Keras-style model.

    The model only needs ``predict(x, verbose=0)`` returning an array of shape
    (1, action_size) and ``fit(x, y, epochs=1, verbose=0)``.
    """

    def __init__(self, state_size, action_size, gamma, epsilon, epsilon_decay, epsilon_min, model):
        """Create the agent.

        Args:
            state_size: length of the flattened state vector.
            action_size: number of discrete actions (one per board cell).
            gamma: discount factor; stored, but replay() currently trains on
                the immediate reward only.
            epsilon: initial probability of choosing a random action.
            epsilon_decay: factor applied to epsilon after each replay() call.
            epsilon_min: epsilon is no longer decayed once it is at or below
                this value.
            model: network to use; when None a new one is built with
                create_dqn_model(state_size, action_size).
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=100)
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        if model is None:
            self.model = create_dqn_model(state_size, action_size)
        else:
            self.model = model

    @staticmethod
    def _snapshot(value):
        """Return a private copy of an ndarray; pass anything else through."""
        return value.copy() if isinstance(value, np.ndarray) else value

    def remember(self, state, action, reward, reward_array, next_state, end):
        """Append one transition to the replay memory (oldest entries drop out).

        The state arrays are copied so that a caller (or environment) that
        later reuses its buffers cannot change what was stored.
        """
        self.memory.append((
            self._snapshot(state), action, reward, reward_array,
            self._snapshot(next_state), end,
        ))

    def act(self, state):
        """Pick an action: random with probability epsilon, otherwise greedy.

        Args:
            state: model input of shape (1, state_size).

        Returns:
            Integer action index in [0, action_size).
        """
        if np.random.rand() <= self.epsilon:
            return np.random.randint(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def replay(self, batch_size):
        """Fit the model on ``batch_size`` randomly sampled transitions.

        Does nothing while the memory holds fewer than ``batch_size`` entries.
        For each sampled transition the training target is the model's current
        prediction with two overrides: every cell whose per-cell argmax label
        is 0 is set to -10, and the taken action is set to the observed reward.
        Epsilon is decayed once per call.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, reward_array, next_state, end in minibatch:
            # The bootstrapped term (gamma * max Q(next_state)) is not applied,
            # so the target is the immediate reward for terminal and
            # non-terminal transitions alike; no prediction on next_state is
            # needed.
            target = reward

            # One label per cell: index of the largest channel of that cell.
            labels = np.argmax(np.reshape(state, (self.action_size, -1)), axis=-1)
            # Boolean mask, so exactly the label-0 cells are overwritten. An
            # integer 0/1 array here would be read as *indices* and only ever
            # touch outputs 0 and 1.
            zero_label_cells = labels == 0

            target_f = self.model.predict(state, verbose=0)
            target_f[0][zero_label_cells] = -10
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:

if __name__ == "__main__":
    # Train the agent by playing games in the live browser session.
    simulator = Minesweeper(driver)

    # state size 9*9*11, 81 actions, gamma 0.95, epsilon 0.9 decayed by 0.95
    # per replay down to 0.15; None lets the agent build its own network.
    agent = DQNAgent(9*9*11, 9*9, 0.95, 0.9, 0.95, 0.15, None)

    batch_size = 16
    episodes = 200
    replay_counter = 0

    # Later cells refer to the trained network by this name.
    model_save = agent.model

    for episode in range(episodes):
        simulator.reset()
        # Copy: np.reshape can return a view of the simulator's own buffer,
        # which would then change underneath us when the board is refreshed.
        state = np.reshape(simulator.cell_state, [1, 9 * 9 * 11]).copy()
        for epoch in range(30):
            replay_counter += 1
            action = agent.act(state)
            next_state, reward, reward_array, end, win, lose = simulator.update_action(action)
            # A game that ended without a win is scored -10; a win keeps the
            # reward reported by the simulator.
            if end and not win:
                reward = -10
            next_state = np.reshape(next_state, [1, (9*9*11)]).copy()
            agent.remember(state, action, reward, reward_array, next_state, end)
            state = next_state
            if end:
                print(f"Episode: {episode} out of {episodes}\tEpoch: {epoch}\tEpsilon: {agent.epsilon:.2}\tTraining iters: {replay_counter//20}")
                break
            # Train on a sampled minibatch every 20 environment steps.
            if len(agent.memory) > batch_size and replay_counter % 20 == 0:
                agent.replay(batch_size)
        # Periodic checkpoint.
        if episode % 10 == 0:
            model_save.save("minesweeper_dqn_model.h5")

    # Final checkpoint, so the episodes after the last periodic save are kept.
    model_save.save("minesweeper_dqn_model.h5")




In [None]:
# Optional: uncomment to load the saved network from disk instead of using the
# `model_save` object left behind by the training cell.
# model_save  = tf.keras.models.load_model('minesweeper_dqn_model.h5')

# Constructing Minesweeper calls reset(), which starts a new game in the
# browser and opens one randomly chosen cell.
gamer = Minesweeper(driver)

In [None]:
# Re-read the board from the browser and score every cell with the network.
gamer.update()
res = model_save.predict(gamer.cell_state.reshape(1, -1))[0]

# Flat index of the highest-scoring cell (used by the next cell), shown as (row, col).
index = res.argmax()
print(np.unravel_index(index, (9, 9)))

# Predicted values laid out as the 9x9 board, one decimal per entry.
print('\n'.join(
    '   '.join(format(value, '.1f') for value in scores)
    for scores in res.reshape(9, 9)
))


In [None]:
# Click the cell selected in the previous cell (`index` is a flat cell index)
# and collect the resulting state, reward, reward map and end/win/lose flags.
next_state, reward, reward_array, end, win, lose = gamer.update_action(index)