# Implémentation d'un algorithme de Deep Q-Learning autour du problème de gestion de la circulation sur un carrefour automobile

Prérequis : connaissances en Q-Learning (sous-domaine de l'apprentissage par renforcement / Reinforcement Learning) et en Deep Learning (réseaux de neurones profonds) ; package [CityFlow](https://github.com/cityflow-project/CityFlow) installé dans un environnement Anaconda dédié, sous un système d'exploitation Linux.

In [25]:
import cityflow 
import os 
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

### Initialisation de l'environnement 

In [54]:
# Current working directory; the CityFlow config path below is built from it.
path = os.getcwd()
# Bare expression: shown as the notebook cell's output.
path

'/home/vintel38/MyCityFlow'

In [55]:
# Simulator configuration is expected at <cwd>/examples/config.json.
config_path=os.path.join(path,"examples","config.json")
# Create the CityFlow engine from that configuration, using one thread.
eng = cityflow.Engine(config_path, thread_num=1)

### Initialisation du buffer et de ses opérations possibles

In [49]:
from collections import deque

# Experience replay memory. Each entry is expected to be a 4-tuple
# (state, action, next_state, reward); once 2000 entries are stored,
# appending a new one drops the oldest.
replay_buffer = deque(maxlen=2000)


def sample_buffer(batch_size):
    """Draw a random mini-batch of experiences from ``replay_buffer``.

    Indices are drawn uniformly *with replacement*, so the same experience
    may appear more than once in a batch.

    Args:
        batch_size: Number of experiences to draw.

    Returns:
        A tuple ``(states, actions, next_states, rewards)`` of NumPy arrays,
        each with ``batch_size`` entries along the first axis.

    Raises:
        ValueError: If the replay buffer is empty.
    """
    if not replay_buffer:
        raise ValueError("cannot sample from an empty replay buffer")
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    # An experience has exactly 4 fields; iterating over 5 (as before) indexed
    # past the end of every tuple and could never unpack into 4 names.
    states, actions, next_states, rewards = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(4)
    ]
    return states, actions, next_states, rewards


def fill_buffer(experiences):
    """Append one experience to ``replay_buffer``.

    Args:
        experiences: A single ``(state, action, next_state, reward)`` tuple.
            Despite the plural name, it is stored as ONE buffer entry.
    """
    replay_buffer.append(experiences)
    
## Idea: shuffle replay_buffer with numpy.random.shuffle as an alternative to sampling random indices

In [None]:
# Bare expression: displays the current contents of the replay buffer.
replay_buffer

In [40]:
# Scratch check of fill_buffer: push the integers 0..49 into the replay buffer.
# NOTE(review): these entries are bare ints, not (state, action, next_state,
# reward) tuples, so sample_buffer() cannot index into them. Clear the buffer
# (replay_buffer.clear()) before any real training run.
for i in range(50):
    fill_buffer(i)

### Initialisation de l'agent et de ses opérations possibles

In [53]:
def create_DNN(n_inputs, n_outputs):
    """Build the fully connected network used to approximate Q-values.

    The network is 64 ReLU units -> 32 ReLU units -> ``n_outputs`` linear
    units. The returned model is not compiled.

    Args:
        n_inputs: Input shape passed to the first layer as ``input_shape``
            (e.g. ``[128]``).
        n_outputs: Number of units in the linear output layer.

    Returns:
        A ``tf.keras`` Sequential model.
    """
    hidden_1 = keras.layers.Dense(64, activation="relu", input_shape=n_inputs)
    hidden_2 = keras.layers.Dense(32, activation="relu")
    # No activation on the output layer: it emits raw values.
    output = keras.layers.Dense(n_outputs)
    return tf.keras.models.Sequential([hidden_1, hidden_2, output])

def epsilon_policy(state, epsilon, n_outputs):
    """Choose an action with an epsilon-greedy rule.

    A uniform random number in [0, 1) is drawn; if it is below ``epsilon`` a
    random action is returned, otherwise the action with the highest
    predicted Q-value.

    Args:
        state: Current observation; a batch axis is prepended before it is
            passed to the module-level ``model``.
        epsilon: Exploration threshold compared against the random draw.
        n_outputs: Number of available actions.

    Returns:
        The index of the selected action.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(n_outputs)

    # Greedy branch: relies on the global ``model`` defined elsewhere in the notebook.
    batched_state = state[np.newaxis]
    q_values = model.predict(batched_state)
    return np.argmax(q_values[0])
    
def _waiting_counts(eng):
    """Return the per-lane waiting-vehicle counts of ``eng`` as a 1-D array."""
    # CityFlow returns a {lane_id: count} mapping; only the counts are kept.
    # NOTE(review): assumes the lane order is the same on every call — confirm.
    return np.array(list(eng.get_lane_waiting_vehicle_count().values()))


def play_step(eng, state, epsilon, n_outputs, intersection_id=None):
    """Play one simulation step and store the transition in the replay buffer.

    Args:
        eng: CityFlow engine to advance.
        state: Current observation, passed to ``epsilon_policy``.
        epsilon: Exploration threshold forwarded to ``epsilon_policy``.
        n_outputs: Number of available actions.
        intersection_id: Optional id of the intersection to control. When
            given, the chosen action is applied as its traffic-light phase
            before the step. When omitted, the action is only recorded and
            does not influence the simulation.
            NOTE(review): CityFlow requires ``rl_traffic_light`` to be enabled
            in the config for phase changes to take effect — confirm.

    Returns:
        ``(state, action, next_state, reward)`` where ``next_state`` is the
        array of per-lane waiting-vehicle counts after the step and
        ``reward`` is the decrease in total waiting vehicles over the step.
    """
    action = epsilon_policy(state, epsilon, n_outputs)
    if intersection_id is not None:
        # int(): the policy may return a NumPy integer.
        eng.set_tl_phase(intersection_id, int(action))

    # The counts are queried on the engine; the previous free-function call
    # referenced a name that does not exist.
    waiting_before = _waiting_counts(eng).sum()
    eng.next_step()
    next_state = _waiting_counts(eng)
    reward = waiting_before - next_state.sum()

    # Call fill_buffer; the previous `fill_buffer = (...)` only rebound a
    # local name, so no experience was ever stored.
    fill_buffer((state, action, next_state, reward))
    return state, action, next_state, reward

In [47]:
# Global Q-network read by epsilon_policy: input shape [128], 5 outputs.
# NOTE(review): presumably one output per selectable action — confirm that 5
# matches the number of actions used when calling epsilon_policy/play_step.
model=create_DNN([128], 5)

In [45]:
# Scratch check: draws one random integer from {0, 1, 2}.
np.random.randint(3)

0

### Procédure d'entraînement 