In [2]:

!pip install keras
!pip install keras-rl2
!pip install tensorflow==2.14.0

from keras.models import Sequential
from keras.layers import Dense, Flatten


from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.optimizers.legacy import Adam



In [3]:
import numpy as np
import random
from gym import Env, spaces

class TicTacToeEnv(Env):
    """Tic-tac-toe on a flattened 9-cell board, exposed through the gym ``Env`` API.

    Cell values: 0 = empty, 1 = player 1, 2 = player 2. Player 1 moves first
    and the mover alternates after every non-terminal move, so each call to
    ``step`` places the mark of ``current_player``.

    Rewards returned by ``step``:
        -10  stepping after the game ended, or choosing an occupied cell
             (the episode ends)
        +10  the move completes a line for the player who made it
         +1  the board is full and nobody won
          0  any other move
    """

    def __init__(self):
        super().__init__()
        self.action_space = spaces.Discrete(9)  # one action per board cell
        # Flattened 3x3 board; every cell holds a value in [0, 2].
        self.observation_space = spaces.Box(0, 2, (9,), dtype=int)
        self.reset()

    def reset(self):
        """Clear the board, give the turn to player 1 and return the observation."""
        self.board = np.zeros(9, dtype=int)  # 1-D board
        self.done = False
        self.current_player = 1
        # Return a copy so callers never alias the internal, mutable board.
        return self.board.copy()

    def step(self, action):
        """Place ``current_player``'s mark on cell ``action``.

        Returns:
            ``(observation, reward, done, info)``; ``info`` is always an
            empty dict. See the class docstring for the reward values.
        """
        if self.done or self.board[action] != 0:
            # Penalty for an invalid move; the episode is over from here on.
            self.done = True
            return self.board.copy(), -10, True, {}

        self.board[action] = self.current_player
        if self.check_winner(self.current_player):
            self.done = True  # remember the terminal state so later steps are rejected
            return self.board.copy(), 10, True, {}  # win
        if not (self.board == 0).any():
            self.done = True
            return self.board.copy(), 1, True, {}  # draw

        # Hand the turn to the other player (1 <-> 2).
        self.current_player = 3 - self.current_player
        return self.board.copy(), 0, False, {}

    def render(self, mode='human'):
        """Print the board as three rows of space-separated cell values."""
        board = self.board.reshape(3, 3)
        print("\n".join([" ".join(map(str, row)) for row in board]))
        print()

    def check_winner(self, player):
        """Return True if ``player`` fills a whole row, column or diagonal."""
        owned = self.board.reshape(3, 3) == player
        return bool(
            owned.all(axis=1).any()               # any full row
            or owned.all(axis=0).any()            # any full column
            or np.diag(owned).all()               # main diagonal
            or np.diag(np.fliplr(owned)).all()    # anti-diagonal
        )

# Create the environment
env = TicTacToeEnv()

# Exercise the environment for 10 episodes using randomly sampled actions
for episode in range(10):
    state = env.reset()  # start every episode from an empty board
    while True:
        action = env.action_space.sample()  # random cell index
        next_state, reward, done, info = env.step(action)  # advance the game
        env.render()  # show the board after the move
        if done:
            print(f"Fim do episódio {episode + 1}")
            break


0 0 0
0 0 0
0 0 1

0 0 0
0 0 0
2 0 1

0 0 0
1 0 0
2 0 1

0 0 0
1 0 0
2 2 1

0 0 0
1 0 0
2 2 1

Fim do episódio 1
0 0 0
0 1 0
0 0 0

2 0 0
0 1 0
0 0 0

2 0 0
0 1 0
1 0 0

2 0 2
0 1 0
1 0 0

2 0 2
0 1 0
1 0 1

2 2 2
0 1 0
1 0 1

Fim do episódio 2
0 0 1
0 0 0
0 0 0

0 0 1
2 0 0
0 0 0

0 0 1
2 0 1
0 0 0

0 0 1
2 0 1
0 0 0

Fim do episódio 3
1 0 0
0 0 0
0 0 0

1 0 0
0 2 0
0 0 0

1 0 0
0 2 0
0 0 0

Fim do episódio 4
0 0 0
0 0 0
0 0 1

0 0 0
0 0 0
2 0 1

0 0 0
0 0 0
2 0 1

Fim do episódio 5
0 0 0
0 0 1
0 0 0

0 0 0
0 0 1
0 2 0

0 0 0
0 0 1
1 2 0

0 0 0
2 0 1
1 2 0

0 0 0
2 0 1
1 2 1

0 0 0
2 2 1
1 2 1

0 1 0
2 2 1
1 2 1

0 1 0
2 2 1
1 2 1

Fim do episódio 6
0 0 1
0 0 0
0 0 0

0 0 1
0 2 0
0 0 0

1 0 1
0 2 0
0 0 0

1 0 1
0 2 0
0 0 0

Fim do episódio 7
0 0 0
0 0 1
0 0 0

2 0 0
0 0 1
0 0 0

2 0 0
0 0 1
1 0 0

2 0 0
2 0 1
1 0 0

2 1 0
2 0 1
1 0 0

2 1 0
2 0 1
1 2 0

2 1 0
2 0 1
1 2 1

2 1 0
2 0 1
1 2 1

Fim do episódio 8
0 0 1
0 0 0
0 0 0

0 0 1
0 0 0
0 2 0

0 0 1
0 0 0
0 2 0

Fim do episódio 9
0 

In [4]:
# Construção do modelo
def build_model(input_shape, action_space):
    """Build the Q-network: Flatten -> Dense(64) -> Dense(64) -> Dense(action_space).

    Args:
        input_shape: tuple with the shape of one observation.
        action_space: number of actions, i.e. the width of the output layer.

    Returns:
        An uncompiled ``Sequential`` model with one linear output per action.
    """
    net = Sequential()
    # The leading 1 adds an extra axis in front of the observation shape
    # (presumably the window_length=1 used by the agent's memory - confirm).
    # Original author's note: this input layout needs fixing for the ESP target.
    net.add(Flatten(input_shape=(1,) + input_shape))
    for _ in range(2):
        net.add(Dense(64, activation='relu'))
    net.add(Dense(action_space, activation='linear'))  # Q(s, a) per action
    return net

# Build the network from the environment's observation shape and action count,
# then print its layer summary.
model = build_model(
    input_shape=env.observation_space.shape,
    action_space=env.action_space.n,
)
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 9)                 0         
                                                                 
 dense (Dense)               (None, 64)                640       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 9)                 585       
                                                                 
Total params: 5385 (21.04 KB)
Trainable params: 5385 (21.04 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# Replay memory and exploration policy
memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()

# Create and compile the DQN agent
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(optimizer=Adam(), metrics=['mae'])


In [23]:
# Train the agent for 150000 environment steps, logging one line per episode.
# NOTE(review): the agent chooses every move in TicTacToeEnv (both marks), and
# the environment pays +10 to whichever mark completes a line - confirm this
# self-play reward scheme is intended.
# The call is the last expression of the cell, so its return value is displayed.
dqn.fit(env, nb_steps=150000, visualize=False, verbose=2)

Training for 150000 steps ...


  updates=self.state_updates,


      5/150000: episode: 1, duration: 0.278s, episode steps:   5, steps per second:  18, episode reward: -10.000, mean reward: -2.000 [-10.000,  0.000], mean action: 4.800 [0.000, 8.000],  loss: --, mae: --, mean_q: --
     10/150000: episode: 2, duration: 0.009s, episode steps:   5, steps per second: 561, episode reward: -10.000, mean reward: -2.000 [-10.000,  0.000], mean action: 4.800 [0.000, 8.000],  loss: --, mae: --, mean_q: --


  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  updates=self.state_updates,
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_

     14/150000: episode: 3, duration: 1.283s, episode steps:   4, steps per second:   3, episode reward: -10.000, mean reward: -2.500 [-10.000,  0.000], mean action: 5.000 [0.000, 8.000],  loss: 9.314233, mae: 0.333115, mean_q: 0.352299
     17/150000: episode: 4, duration: 0.073s, episode steps:   3, steps per second:  41, episode reward: -10.000, mean reward: -3.333 [-10.000,  0.000], mean action: 5.333 [4.000, 8.000],  loss: 10.518156, mae: 0.334749, mean_q: 0.280068
     20/150000: episode: 5, duration: 0.044s, episode steps:   3, steps per second:  68, episode reward: -10.000, mean reward: -3.333 [-10.000,  0.000], mean action: 5.333 [4.000, 8.000],  loss: 8.910376, mae: 0.301563, mean_q: 0.239767
     25/150000: episode: 6, duration: 0.078s, episode steps:   5, steps per second:  64, episode reward: -10.000, mean reward: -2.000 [-10.000,  0.000], mean action: 3.800 [0.000, 8.000],  loss: 13.008311, mae: 0.392807, mean_q: 0.221060


  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)
  batch_idxs = np.random.random_integers(low, high - 1, size=size)


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
 115581/150000: episode: 16847, duration: 0.047s, episode steps:   5, steps per second: 106, episode reward: 20.000, mean reward:  4.000 [ 0.000, 20.000], mean action: 2.800 [0.000, 5.000],  loss: 0.218422, mae: 13.676763, mean_q: 20.203373
 115587/150000: episode: 16848, duration: 0.055s, episode steps:   6, steps per second: 109, episode reward: 20.000, mean reward:  3.333 [ 0.000, 20.000], mean action: 4.500 [1.000, 8.000],  loss: 0.315888, mae: 13.819595, mean_q: 20.111082
 115594/150000: episode: 16849, duration: 0.063s, episode steps:   7, steps per second: 112, episode reward: -10.000, mean reward: -1.429 [-10.000,  0.000], mean action: 3.286 [0.000, 7.000],  loss: 0.316160, mae: 14.097364, mean_q: 20.033827
 115598/150000: episode: 16850, duration: 0.044s, episode steps:   4, steps per second:  91, episode reward: -10.000, mean reward: -2.500 [-10.000,  0.000], mean action: 4.250 [0.000, 7.000],  loss: 0.2

<keras.src.callbacks.History at 0x7bdc1d9db5b0>

In [None]:
# Testar o agente
#dqn.test(env, nb_episodes=3, visualize=True)

In [5]:
#model.save('my_model_1')
# Load the model weights (load_model is no longer supported).
# NOTE(review): hard-coded absolute path - presumably a mounted Google Drive
# folder in Colab; confirm it exists before running this cell.
model.load_weights( "/content/drive/MyDrive/mod")


<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x78f1972244c0>

In [9]:
# Função para jogar contra o modelo DQN treinado
def play_game(model):
    """Play an interactive tic-tac-toe game: the model moves first, the human second.

    The model plays mark 1 and the human plays mark 2, alternating until the
    environment reports the game is over.

    Args:
        model: object whose ``predict`` accepts an array of shape (1, 1, 9)
            and returns one score per board cell (9 values in total).
    """
    env = TicTacToeEnv()
    state = env.reset()
    done = False

    while not done:
        # --- AI turn (player 1) ---
        print("AI play/Jogada da IA:")
        state_flat = state.reshape(1, 1, 9)  # batch of one observation
        q_values = np.array(model.predict(state_flat), dtype=float).reshape(-1)
        # Never pick an occupied cell: the environment would treat it as an
        # invalid move and end the game, which used to be reported as a draw.
        q_values[state != 0] = -np.inf
        action = int(np.argmax(q_values))
        print(f"AI chose the position / IA escolheu a posição {action}")
        state, reward, done, info = env.step(action)
        env.render()
        if done:
            print("AI win / IA ganhou!" if reward == 10 else "Draw/Empate!")
            break

        # --- Human turn (player 2) ---
        # Keep asking until the move is legal, so the AI does not get an
        # extra turn after a rejected human move.
        action = _ask_human_move(state)
        state, reward, done, info = env.step(action)
        if done:
            env.render()
            print("you win / Você ganhou!" if reward == 10 else "Draw / Empate!")
            break


def _ask_human_move(state):
    """Prompt until the human enters the index of an empty cell and return it."""
    while True:
        raw = input("Choose your move / Escolha sua jogada (0-8): ")
        try:
            action = int(raw)
        except ValueError:
            action = -1  # not a number: rejected by the range check below
        # The explicit lower bound also rejects negative indices, which would
        # otherwise wrap around and address a cell from the end of the board.
        if 0 <= action < len(state) and state[action] == 0:
            return action
        print("Jogada inválida! Tente novamente.")



# Assumes the model is already trained and available in this session.
# Call the function with the loaded model to start an interactive game:
play_game(model)


AI play/Jogada da IA:
AI chose the position / IA escolheu a posição 0
1 0 0
0 0 0
0 0 0

Choose your move / Escolha sua jogada (0-8): 3
AI play/Jogada da IA:
AI chose the position / IA escolheu a posição 4
1 0 0
2 1 0
0 0 0



KeyboardInterrupt: Interrupted by user

In [19]:
import shutil  # was misspelled as `shutilr`, which raises ModuleNotFoundError

# Zip the directory /content/my_model_1 into modelo_1.zip.
# NOTE(review): presumably that directory is what the commented-out
# model.save('my_model_1') would write - confirm it exists before running.
# The call is the last expression of the cell, so its return value is displayed.
shutil.make_archive('modelo_1', 'zip', '/content/my_model_1')

'/content/minha_pasta.zip'