In [88]:
import spacy 
import numpy as np
import emb_spacy
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation, Bidirectional, GRU
from keras.metrics import Accuracy, Precision, Recall, F1Score, MeanSquaredError




from emb_spacy import get_embedding
from input import count_examples_and_max_length, pad_sentences_from_file
import label

### Lecture des données 

In [89]:
# Read the raw training corpus, one string per line.
with open("train_corpus", "r", encoding="utf-8") as file:
    data = list(file)

# Number of examples and length of the longest sentence, as computed by the
# project helper from the raw lines.
num_examples, MAX_SEQ_SIZE = count_examples_and_max_length(data)

# Report the corpus dimensions.
print("Nombre d'exemples :", num_examples)
print("Taille maximale de la phrase :", MAX_SEQ_SIZE)

# Word matrix and label matrix for the corpus, both sized with MAX_SEQ_SIZE.
vec_word, sortie = pad_sentences_from_file("train_corpus", MAX_SEQ_SIZE)
print("vecteur : ", vec_word.shape)
# Shapes observed on the last run:
#   vec_word.shape : (3945, 41)
#   sortie.shape   : (3945, 41)

Nombre d'exemples : 3945
Taille maximale de la phrase : 41
vecteur :  (3945, 41)


### Création des vecteurs d'entrée et de sortie

In [90]:
# Build the model input: one 300-dimensional vector per (sentence, position).
# 300 is hard-coded and must match the length of what get_embedding returns.
# NOTE(review): a token equal to the string '0' is truthy and therefore gets
# embedded, while write_predictions_to_file treats '0' as the padding marker --
# confirm which padding value pad_sentences_from_file actually produces.
entree = np.zeros((num_examples, MAX_SEQ_SIZE, 300))
for row, tokens in enumerate(vec_word):
    for col, token in enumerate(tokens):
        entree[row, col] = get_embedding(token) if token else np.zeros(300)

print("entree shape : ", entree.shape)
print("sortie shape : ", sortie.shape)

entree shape :  (3945, 41, 300)
sortie shape :  (3945, 41)


In [91]:
# Label inventory extracted from the ATIS training file.
labels = label.extract_label("atis.train")

# Map each label to the vector returned by label.get_vector_from_label
# (used further down as that label's one-hot encoding).
label_one_hot_dict = {
    tag: label.get_vector_from_label(tag)
    for tag in labels
}

# Sizes reused when building the models.
nbLabels = len(labels)
embedding_size = len(entree[0, 0])

tailleDictionnaire = emb_spacy.get_size_dict()

In [92]:
# One-hot encode the training targets: one vector per (sentence, position).
# Positions whose label is the string '0' keep an all-zero vector.
sortie_one_hot = np.zeros((sortie.shape[0], sortie.shape[1], len(label_one_hot_dict['O'])), dtype=int)
zero_vec = np.zeros(len(label_one_hot_dict['O']), dtype=int)
for i in range(sortie.shape[0]):
    for j in range(sortie.shape[1]):
        # Deliberately named `tag`, not `label`: at cell level, assigning to
        # `label` would rebind the imported `label` module and break any later
        # (re-)execution of label.extract_label / label.get_vector_from_label.
        tag = sortie[i, j]
        if tag == '0':
            sortie_one_hot[i, j] = zero_vec
        else:
            sortie_one_hot[i, j] = label_one_hot_dict[tag]

### Modèle LSTM

#### Configuration du modèle

In [93]:

config = {
    'hidden_size': 128,   # number of units in the recurrent layer
    'dropout_rate': 0.5,  # dropout rate applied to the recurrent outputs
    'nb_labels': nbLabels
}

# Model input: one (MAX_SEQ_SIZE, embedding_size) matrix per sentence.
# The batch dimension is not part of `shape`; Keras adds it implicitly.
input_layer = Input(shape=(MAX_SEQ_SIZE, embedding_size), dtype='float32')

# Unidirectional LSTM returning one hidden state per time step
# (return_sequences=True), since a label is predicted for every position.
# Bidirectional alternative, kept for reference:
# X = Bidirectional(LSTM(units=config['hidden_size'], return_sequences=True))(input_layer)
X = LSTM(units=config['hidden_size'], return_sequences=True)(input_layer)
X = Dropout(config['dropout_rate'])(X)

# Project every time step onto the label set, then normalise with a softmax.
X = Dense(units=config['nb_labels'])(X)
X = Activation(activation='softmax')(X)
print('config ok')

# Build the model.
model = Model(inputs=input_layer, outputs=X)
print("create ok")

# Compile. categorical_crossentropy (not the sparse variant) is used because
# the targets are one-hot vectors rather than integer class indices.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', Precision(), Recall()])
print("compil ok")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

config ok
create ok
compil ok


None


### Entraînement du modèle LSTM
- x : entrée de forme nb_exemples x MAX_SEQ_SIZE x embedding_size
- y : sortie encodée en one-hot, de forme nb_exemples x MAX_SEQ_SIZE x nb_labels
- batch_size : nombre d'échantillons utilisés à chaque itération lors de l'entraînement
    - un batch_size plus grand accélère l'entraînement mais demande plus de mémoire GPU
    - un batch_size plus petit ralentit l'entraînement mais peut améliorer la convergence du modèle
- epochs : nombre de passes sur l'ensemble des données d'entraînement
- validation_split : fraction des données d'entraînement réservée à la validation

In [94]:
# Train the LSTM tagger on the one-hot targets, holding out 20% of the
# examples for validation.
model.fit(
    x=entree,
    y=sortie_one_hot,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)
print('training ok')

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 149ms/step - accuracy: 0.1659 - loss: 0.6732 - precision_4: 0.8403 - recall_4: 0.2006 - val_accuracy: 0.9548 - val_loss: 0.2175 - val_precision_4: 0.9327 - val_recall_4: 0.7463
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 126ms/step - accuracy: 0.3911 - loss: 0.2078 - precision_4: 0.9515 - recall_4: 0.7769 - val_accuracy: 0.9700 - val_loss: 0.1573 - val_precision_4: 0.9575 - val_recall_4: 0.8352
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 135ms/step - accuracy: 0.4583 - loss: 0.1415 - precision_4: 0.9734 - recall_4: 0.8402 - val_accuracy: 0.9654 - val_loss: 0.1320 - val_precision_4: 0.9608 - val_recall_4: 0.8500
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 147ms/step - accuracy: 0.4232 - loss: 0.1182 - precision_4: 0.9745 - recall_4: 0.8624 - val_accuracy: 0.9739 - val_loss: 0.1169 - val_precision_4: 0.9624 - val_recall_4:

### Modèle GRU

#### Configuration du modèle

In [95]:

config = {
    'hidden_size': 128,   # number of units in the recurrent layer
    'dropout_rate': 0.5,  # dropout rate applied to the recurrent outputs
    'nb_labels': nbLabels
}

# Model input: one (MAX_SEQ_SIZE, embedding_size) matrix per sentence.
# The batch dimension is not part of `shape`; Keras adds it implicitly.
input_layer = Input(shape=(MAX_SEQ_SIZE, embedding_size), dtype='float32')

# Unidirectional GRU returning one hidden state per time step
# (return_sequences=True), since a label is predicted for every position.
X = GRU(units=config['hidden_size'], return_sequences=True)(input_layer)
X = Dropout(config['dropout_rate'])(X)

# Project every time step onto the label set, then normalise with a softmax.
X = Dense(units=config['nb_labels'])(X)
X = Activation(activation='softmax')(X)
print('config ok')

# Build the model.
model_GRU = Model(inputs=input_layer, outputs=X)
print("create ok")

# Compile. categorical_crossentropy (not the sparse variant) is used because
# the targets are one-hot vectors rather than integer class indices.
model_GRU.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', Precision(), Recall()])
print("compil ok")
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_GRU.summary()

config ok
create ok
compil ok


None


#### Entrainement du modèle

In [96]:
# Train the GRU tagger with the same data and hyper-parameters as the LSTM.
model_GRU.fit(
    x=entree,
    y=sortie_one_hot,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
)
print('training ok')

Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 127ms/step - accuracy: 0.1466 - loss: 0.6987 - precision_5: 0.7676 - recall_5: 0.2708 - val_accuracy: 0.5356 - val_loss: 0.2301 - val_precision_5: 0.9401 - val_recall_5: 0.7162
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 115ms/step - accuracy: 0.3210 - loss: 0.2369 - precision_5: 0.9355 - recall_5: 0.7364 - val_accuracy: 0.9368 - val_loss: 0.1873 - val_precision_5: 0.9536 - val_recall_5: 0.8019
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 111ms/step - accuracy: 0.3584 - loss: 0.1954 - precision_5: 0.9502 - recall_5: 0.7785 - val_accuracy: 0.9468 - val_loss: 0.1620 - val_precision_5: 0.9608 - val_recall_5: 0.8336
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 118ms/step - accuracy: 0.3505 - loss: 0.1648 - precision_5: 0.9633 - recall_5: 0.8106 - val_accuracy: 0.9575 - val_loss: 0.1468 - val_precision_5: 0.9619 - val_recall_5:

### Prétraitement des données de test

In [97]:
# Read the raw test corpus, one string per line.
with open("test_corpus", "r", encoding="utf-8") as file:
    data_test = list(file)

num_examples_test, MAX_SEQ_SIZE_TEST = count_examples_and_max_length(data_test)

# The training-time MAX_SEQ_SIZE (not MAX_SEQ_SIZE_TEST) is passed on purpose:
# the models were built with an input of length MAX_SEQ_SIZE.
vec_word_test, label_test_real = pad_sentences_from_file("test_corpus", MAX_SEQ_SIZE)

print("Nombre d'exemples :", num_examples_test)
print("Taille maximale de la phrase :", MAX_SEQ_SIZE_TEST)

# Build the test input: one 300-dimensional vector per (sentence, position).
# NOTE(review): a token equal to the string '0' is truthy and therefore gets
# embedded, while write_predictions_to_file treats '0' as the padding marker --
# confirm which padding value pad_sentences_from_file actually produces.
test_data_input = np.zeros((num_examples_test, MAX_SEQ_SIZE, 300))
for row, tokens in enumerate(vec_word_test):
    for col, token in enumerate(tokens):
        test_data_input[row, col] = get_embedding(token) if token else np.zeros(300)

Nombre d'exemples : 1033
Taille maximale de la phrase : 32


In [98]:
# Sanity check: print the shapes of the test inputs, the test labels and the
# training inputs, in that order.
for array in (test_data_input, label_test_real, entree):
    print(array.shape)

(1033, 41, 300)
(1033, 41)
(3945, 41, 300)


In [99]:
# One-hot encode the reference test labels: one vector per (sentence, position).
# Positions whose label is the string '0' keep an all-zero vector.
label_test_real_one_hot = np.zeros((label_test_real.shape[0], label_test_real.shape[1], len(label_one_hot_dict['O'])), dtype=int)
zero_vec = np.zeros(len(label_one_hot_dict['O']), dtype=int)
for i in range(label_test_real.shape[0]):
    for j in range(label_test_real.shape[1]):
        # Deliberately named `tag`, not `label`: at cell level, assigning to
        # `label` would rebind the imported `label` module and break any later
        # (re-)execution of label.extract_label / label.get_vector_from_label.
        tag = label_test_real[i, j]
        if tag == '0':
            label_test_real_one_hot[i, j] = zero_vec
        else:
            label_test_real_one_hot[i, j] = label_one_hot_dict[tag]

print(label_test_real_one_hot.shape)

(1033, 41, 81)


### Prédictions

#### Avec le modèle LSTM

In [100]:
# Run the LSTM model on the test inputs and show the shape of its output.
test_predictions = model.predict(x=test_data_input)

print(np.shape(test_predictions))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
(1033, 41, 81)


In [101]:
# Index of the highest-scoring class at every position.
predicted_labels_index = test_predictions.argmax(axis=-1)
# Turn class indices into label strings by indexing `labels`.
# NOTE(review): assumes index k of the model output corresponds to labels[k],
# i.e. that label.get_vector_from_label follows the order of `labels` -- confirm.
predicted_labels = np.array(
    [[labels[k] for k in row] for row in predicted_labels_index]
)

In [102]:
# Evaluate the LSTM model on the test set; the returned values are discarded,
# only the progress-bar output is of interest.
_ = model.evaluate(x=test_data_input, y=label_test_real_one_hot)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.9827 - loss: 0.0830 - precision_4: 0.9681 - recall_4: 0.8913


In [103]:
def write_predictions_to_file(file_path, vec_word_test, predicted_labels, label_test_real=None, exam=False):
    """Write per-token predictions to a UTF-8 text file, one token per line.

    Args:
        file_path: Destination path. Missing parent directories are created.
        vec_word_test: Iterable of sentences, each an iterable of word strings.
            A word equal to '0' marks the end of the sentence (padding).
        predicted_labels: Iterable of predicted-label sequences, aligned with
            ``vec_word_test``.
        label_test_real: Iterable of reference-label sequences, aligned with
            ``vec_word_test``. Required when ``exam`` is False; a reference
            label equal to '0' also ends the sentence.
        exam: When False, each line is ``word<TAB>real<TAB>predicted`` and
            every sentence is followed by a line of '=' characters. When True,
            each line is ``word<TAB>predicted`` and every sentence is followed
            by an empty line.

    Raises:
        ValueError: If ``exam`` is False and ``label_test_real`` is None.
    """
    import os

    # Validate before opening: opening in 'w' mode truncates an existing file,
    # so a bad call must not be allowed to destroy previous results.
    if not exam and label_test_real is None:
        raise ValueError("label_test_real is required when exam is False")

    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding="utf-8") as file:
        if exam:
            for words, sentence_predictions in zip(vec_word_test, predicted_labels):
                for word, predicted_label in zip(words, sentence_predictions):
                    if word == '0':
                        break
                    # ljust(8) pads short words and leaves longer ones intact.
                    file.write(f"{word.ljust(8)}\t{predicted_label}\n")
                file.write("\n")
        else:
            for words, real_labels, sentence_predictions in zip(vec_word_test, label_test_real, predicted_labels):
                for word, real_label, predicted_label in zip(words, real_labels, sentence_predictions):
                    if real_label == '0' or word == '0':
                        break
                    file.write(f"{word.ljust(8)}\t{real_label.ljust(10)}\t{predicted_label}\n")
                file.write("===========================================================\n")

In [104]:
# Pass the label arrays by keyword: positionally, the third parameter of
# write_predictions_to_file is predicted_labels and the fourth is
# label_test_real, so the previous positional call swapped the two columns.
write_predictions_to_file(
    "results/predictions_LSTM.txt",
    vec_word_test,
    predicted_labels=predicted_labels,
    label_test_real=label_test_real,
)

#### Avec le modèle GRU

In [105]:
# Run the GRU model on the test inputs and show the shape of its output.
test_predictions_GRU = model_GRU.predict(x=test_data_input)

print(np.shape(test_predictions_GRU))

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
(1033, 41, 81)


In [106]:
# Index of the highest-scoring class at every position (GRU model).
predicted_labels_index_GRU = test_predictions_GRU.argmax(axis=-1)
# Turn class indices into label strings by indexing `labels`.
# NOTE(review): assumes index k of the model output corresponds to labels[k] -- confirm.
predicted_labels_GRU = np.array(
    [[labels[k] for k in row] for row in predicted_labels_index_GRU]
)

In [107]:
# Evaluate the GRU model on the test set; the returned values are discarded,
# only the progress-bar output is of interest.
_ = model_GRU.evaluate(x=test_data_input, y=label_test_real_one_hot)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9774 - loss: 0.1122 - precision_5: 0.9650 - recall_5: 0.8796


In [108]:
# Pass the label arrays by keyword: positionally, the third parameter of
# write_predictions_to_file is predicted_labels and the fourth is
# label_test_real, so the previous positional call swapped the two columns.
write_predictions_to_file(
    "results/predictions_GRU.txt",
    vec_word_test,
    predicted_labels=predicted_labels_GRU,
    label_test_real=label_test_real,
)

# PARTIE EXAMEN

#### Configuration des données à prédire

In [109]:
# Read the raw exam file, one string per line.
with open("atis.test.talil.txt", "r", encoding="utf-8") as file:
    data_exam = list(file)

num_examples_exam, MAX_SEQ_SIZE_EXAM = count_examples_and_max_length(data_exam)

# exam=True: a single value (the word matrix) is returned here, no labels.
# The training-time MAX_SEQ_SIZE is passed because the models were built with
# an input of that length.
vec_word_exam = pad_sentences_from_file("atis.test.talil.txt", MAX_SEQ_SIZE, exam=True)

print("Nombre d'exemples :", num_examples_exam)
print("Taille maximale de la phrase :", MAX_SEQ_SIZE_EXAM)

# Build the exam input: one 300-dimensional vector per (sentence, position).
# NOTE(review): a token equal to the string '0' is truthy and therefore gets
# embedded, while write_predictions_to_file treats '0' as the padding marker --
# confirm which padding value pad_sentences_from_file actually produces.
exam_data_input = np.zeros((num_examples_exam, MAX_SEQ_SIZE, 300))
for row, tokens in enumerate(vec_word_exam):
    for col, token in enumerate(tokens):
        exam_data_input[row, col] = get_embedding(token) if token else np.zeros(300)

Nombre d'exemples : 893
Taille maximale de la phrase : 29


In [110]:
# Display the shape of the exam input tensor as a sanity check.
exam_data_input.shape

(893, 41, 300)

#### Prédictions avec LSTM

In [111]:
# Run the LSTM model on the exam inputs.
exam_predictions = model.predict(x=exam_data_input)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step


In [112]:
# Index of the highest-scoring class at every position (LSTM, exam data).
# Note: this rebinds predicted_labels_index / predicted_labels, which held the
# LSTM test-set predictions until now.
predicted_labels_index = exam_predictions.argmax(axis=-1)
# Turn class indices into label strings by indexing `labels`.
predicted_labels = np.array(
    [[labels[k] for k in row] for row in predicted_labels_index]
)

In [113]:
# Exam mode: no reference labels, so only words and predictions are written.
write_predictions_to_file(
    "results/predictions_LSTM_EXAM.txt",
    vec_word_exam,
    predicted_labels=predicted_labels,
    exam=True,
)

#### Prédictions avec GRU

In [114]:
# Run the GRU model on the exam inputs.
exam_predictions_GRU = model_GRU.predict(x=exam_data_input)

[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


In [115]:
# Index of the highest-scoring class at every position (GRU, exam data).
# Note: this rebinds predicted_labels_index / predicted_labels, which held the
# LSTM exam predictions until now.
predicted_labels_index = exam_predictions_GRU.argmax(axis=-1)
# Turn class indices into label strings by indexing `labels`.
predicted_labels = np.array(
    [[labels[k] for k in row] for row in predicted_labels_index]
)

In [116]:
# Exam mode: no reference labels, so only words and predictions are written.
write_predictions_to_file(
    "results/predictions_GRU_EXAM.txt",
    vec_word_exam,
    predicted_labels=predicted_labels,
    exam=True,
)