## Importar librerias

In [2]:
import tensorflow as tf      
import numpy as np           
from vizdoom import *        

import random                
import time                  
from skimage import transform

from collections import deque
import matplotlib.pyplot as plt

import warnings # This ignore all the warning messages that are normally printed during the training because of skiimage
warnings.filterwarnings('ignore') 

# Desactivar la ejecución ansiosa
tf.compat.v1.disable_eager_execution()

# Restablecer el gráfico
tf.compat.v1.reset_default_graph()

2024-09-22 13:38:06.287746: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-22 13:38:06.299118: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-22 13:38:06.302704: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-22 13:38:06.312910: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Configurar Ambiente


In [3]:

def create_environment():
    game = DoomGame()
    
    # Load the correct configuration
    game.load_config("basic.cfg")
    
    # Load the correct scenario (in our case basic scenario)
    game.set_doom_scenario_path("basic.wad")
    
    game.init()
    
    # Here our possible actions
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]
    
    return game, possible_actions

def test_environment():
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.init()
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]

    episodes = 10
    for i in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(actions)
            print(action)
            reward = game.make_action(action)
            print ("\treward:", reward)
            time.sleep(0.02)
        print ("Result:", game.get_total_reward())
        time.sleep(2)
    game.close()

In [4]:
game, possible_actions = create_environment()


### Pre procesamiento


In [5]:

def preprocess_frame(frame):
    # Verifica si hay NaN en el frame y reemplázalos por 0
    if np.isnan(frame).any():
        frame = np.nan_to_num(frame, nan=0.0)

    # Verifica que la imagen tenga dimensiones adecuadas (2D o 3D)
    if frame.ndim == 3:
        frame = frame[..., 0]  # Si tiene más de 2 dimensiones, selecciona un solo canal

    # Verifica que el frame tenga el tamaño esperado
    if frame.shape[0] < 40 or frame.shape[1] < 40:
        print("La imagen de entrada es demasiado pequeña para procesar")
        return np.zeros((84, 84)) 

    # Recortar la pantalla (ajustar para quitar partes irrelevantes)
    cropped_frame = frame[30:-10, 30:-30]

    # Normalizar los valores de los píxeles
    normalized_frame = cropped_frame / 255.0

    # Verificar si hay NaN después de la normalización
    if np.isnan(normalized_frame).any():
        normalized_frame = np.nan_to_num(normalized_frame, nan=0.0)

    # Asegurar que el frame tenga dimensiones válidas antes del redimensionado
    if normalized_frame.size == 0 or normalized_frame.shape[0] == 0 or normalized_frame.shape[1] == 0:
        raise ValueError("La imagen después de la normalización está vacía o tiene dimensiones no válidas")

    # Redimensionar el frame a 84x84 usando antialiasing
    preprocessed_frame = transform.resize(normalized_frame, (84, 84), anti_aliasing=True)
    print("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA")

    return preprocessed_frame

### APILAR FRAMES


In [6]:
stack_size = 4 # We stack 4 frames

# Initialize deque with zero-images one array for each image
stacked_frames  =  deque([np.zeros((84,84), dtype=int) for i in range(stack_size)], maxlen=4) 

def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess frame
    frame = preprocess_frame(state)
    
    if is_new_episode:
        # Clear our stacked_frames
        stacked_frames = deque([np.zeros((84,84), dtype=int) for i in range(stack_size)], maxlen=4)
        
        # Because we're in a new episode, copy the same frame 4x
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        stacked_frames.append(frame)
        
        # Stack the frames
        stacked_state = np.stack(stacked_frames, axis=2)
        
    else:
        # Append frame to deque, automatically removes the oldest frame
        stacked_frames.append(frame)

        # Build the stacked state (first dimension specifies different frames)
        stacked_state = np.stack(stacked_frames, axis=2) 
    
    return stacked_state, stacked_frames

## Configurar Hiperparametros

In [7]:
### MODEL HYPERPARAMETERS
state_size = [84,84,4]      # Our input is a stack of 4 frames hence 84x84x4 (Width, height, channels) 
action_size = game.get_available_buttons_size()              # 3 possible actions: left, right, shoot
learning_rate =  0.0002      # Alpha (aka learning rate)

### TRAINING HYPERPARAMETERS
total_episodes = 500        # Total episodes for training
max_steps = 100              # Max possible steps in an episode
batch_size = 64             

# Exploration parameters for epsilon greedy strategy
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01            # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Q learning hyperparameters
gamma = 0.95               # Discounting rate

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size   # Number of experiences stored in the Memory when initialized for the first time
memory_size = 1000000          # Number of experiences the Memory can keep

### MODIFY THIS TO FALSE IF YOU JUST WANT TO SEE THE TRAINED AGENT
training = True

## TURN THIS TO TRUE IF YOU WANT TO RENDER THE ENVIRONMENT
episode_render = False

## Definir nuestra Deep Q-learning Neural Network


In [8]:
import tensorflow as tf

class DQNetwork(tf.keras.Model):
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        super(DQNetwork, self).__init__(name=name)
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        # Definir capas
        self.conv1 = tf.keras.layers.Conv2D(
            filters=32,
            kernel_size=[8, 8],
            strides=[4, 4],
            padding="valid",
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            name="conv1"
        )
        
        self.conv1_batchnorm = tf.keras.layers.BatchNormalization(
            epsilon=1e-5,
            name='batch_norm1'
        )
        
        self.conv2 = tf.keras.layers.Conv2D(
            filters=64,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding="valid",
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            name="conv2"
        )
        
        self.conv2_batchnorm = tf.keras.layers.BatchNormalization(
            epsilon=1e-5,
            name='batch_norm2'
        )

        self.conv3 = tf.keras.layers.Conv2D(
            filters=128,
            kernel_size=[4, 4],
            strides=[2, 2],
            padding="valid",
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            name="conv3"
        )

        self.conv3_batchnorm = tf.keras.layers.BatchNormalization(
            epsilon=1e-5,
            name='batch_norm3'
        )
        
        self.flatten = tf.keras.layers.Flatten()
        
        self.fc = tf.keras.layers.Dense(
            units=512,
            activation=tf.nn.elu,
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            name="fc1"
        )
        
        self.output_layer = tf.keras.layers.Dense(
            units=3, 
            activation=None,
            kernel_initializer=tf.keras.initializers.GlorotUniform(),
            name="output"
        )

        # Optimizer
        self.optimizer = tf.keras.optimizers.RMSprop(learning_rate=self.learning_rate)

    def call(self, inputs, training=False):
        # Flujo de datos a través de las capas
        x = self.conv1(inputs)
        x = self.conv1_batchnorm(x, training=training)
        x = tf.nn.elu(x)
        
        x = self.conv2(x)
        x = self.conv2_batchnorm(x, training=training)
        x = tf.nn.elu(x)
        
        x = self.conv3(x)
        x = self.conv3_batchnorm(x, training=training)
        x = tf.nn.elu(x)
        
        x = self.flatten(x)
        x = self.fc(x)
        output = self.output_layer(x)
        
        return output
    
    def compute_loss(self, actions, target_Q, output):
        # Q es el valor Q predicho
        Q = tf.reduce_sum(tf.multiply(output, actions), axis=1)
        
        # La pérdida es la diferencia entre nuestros valores Q predichos y Q_target
        loss = tf.reduce_mean(tf.square(target_Q - Q))
        return loss

    def train_step(self, inputs, actions, target_Q):
        with tf.GradientTape() as tape:
            output = self(inputs, training=True)
            loss = self.compute_loss(actions, target_Q, output)
        
        gradients = tape.gradient(loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        return loss


In [9]:
# Reset the graph
#tf.reset_default_graph()
tf.compat.v1.reset_default_graph()


# Instantiate the DQNetwork
DQNetwork = DQNetwork(state_size, action_size, learning_rate)

## Definir priorizaciónm de experiencias 

In [10]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

In [21]:
# Instantiate memory
memory = Memory(max_size = memory_size)

# Render the environment
game.new_episode()

for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        # First we need a state
        state = game.get_state().screen_buffer
        state, stacked_frames = stack_frames(stacked_frames, state, True)
    
    # Random action
    action = random.choice(possible_actions)
    
    # Get the rewards
    reward = game.make_action(action)
    
    # Look if the episode is finished
    done = game.is_episode_finished()
    
    # If we're dead
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Start a new episode
        game.new_episode()
        
        # First we need a state
        state = game.get_state().screen_buffer
        print(f"Dimensiones del estado: {state.shape}")  # Esto imprimirá las dimensiones del estado

        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        # Get the next state
        next_state = game.get_state().screen_buffer
        next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Our state is now the next_state
        state = next_state

La imagen de entrada es demasiado pequeña para procesar


## Entrenar el modelo 

In [26]:
"""
This function will do the part
With ϵ select a random action atat, otherwise select at=argmaxaQ(st,a)
"""
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    ## EPSILON GREEDY STRATEGY
    # Choose action a from state s using epsilon greedy.
    ## First we randomize a number
    exp_exp_tradeoff = np.random.rand()

    # Here we'll use an improved version of our epsilon greedy strategy used in Q-learning notebook
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    
    if (explore_probability > exp_exp_tradeoff):
        # Make a random action (exploration)
        action = random.choice(possible_actions)
        
    else:
        # Get action from Q-network (exploitation)
        # Estimate the Qs values state
        #Qs = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: state.reshape((1, *state.shape))})
        Qs = DQNetwork(np.expand_dims(state, axis=0))  # Usamos el modelo directamente

        
        # Take the biggest Q value (= the best action)
        choice = np.argmax(Qs)
        action = possible_actions[int(choice)]
                
    return action, explore_probability

In [28]:
# Inicializar decay_step al principio del entrenamiento
decay_step = 0

# Si estás en modo de entrenamiento
if training:
    for episode in range(total_episodes):
        step = 0
        episode_rewards = []

        # Iniciar un nuevo episodio
        game.new_episode()
        state = game.get_state().screen_buffer

        # Apilar los frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)

        while step < max_steps:
            step += 1
            decay_step += 1  # Aumenta el contador para reducir epsilon en la exploración

            # Predecir la acción a tomar
            action, explore_probability = predict_action(explore_start, explore_stop, decay_rate, decay_step, state, possible_actions)

            # Tomar la acción en el juego
            reward = game.make_action(action)
            done = game.is_episode_finished()
            episode_rewards.append(reward)

            if done:
                next_state = np.zeros((84, 84), dtype=int)
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                total_reward = np.sum(episode_rewards)
                print(f'Episodio: {episode}, Recompensa total: {total_reward}, Pérdida: {loss:.4f}, Probabilidad de exploración: {explore_probability:.4f}')
                memory.add((state, action, reward, next_state, done))
                step = max_steps
            else:
                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                memory.add((state, action, reward, next_state, done))
                state = next_state

            # Aprendizaje...
            # (El resto de tu código)


La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para procesar
La imagen de entrada es demasiado pequeña para p

NotImplementedError: Cannot convert a symbolic tf.Tensor (DQNetwork_1/output_1/Add:0) to a numpy array. This error may indicate that you're trying to pass a Tensor to a NumPy call, which is not supported.

In [None]:
!pip install tensorflow