# UE4 : Entrainement de modèles de reconnaissance d’entités nommées (NER)


### Install package :

In [1]:
import json 
import tensorflow as tf
import numpy as np
import pandas as pd
import datasets
import keras_tuner
from datasets import list_datasets, load_dataset
from tf2crf import CRF, ModelWithCRFLoss, ModelWithCRFLossDSCLoss
from sklearn.metrics import confusion_matrix# ConfusionMatrixDisplay
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm
 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


### Load dataset :

In [2]:
corpusCasM2 = datasets.load_dataset('corpusCasM2')

Using custom data configuration default
Reusing dataset corpus_cas_m2 (C:\Users\nvgioi\.cache\huggingface\datasets\corpus_cas_m2\default\1.0.0\1e18a23d323119b28ca96ca83baeaf676e58908f30122e75a54225833f284766)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 500.24it/s]


In [3]:
corpusCasM2

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 8305
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2545
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2122
    })
})

In [4]:
train = corpusCasM2['train']
print(train[0])
print(len(train))

{'id': '647_000', 'tokens': ['En', '1988', ',', 'F.G.', ',', 'un', 'homme', 'de', '58', 'ans', ',', 'tabagique', 'ancien', ',', 'a', 'présenté', 'une', 'hématurie', 'totale', 'en', 'rapport', 'avec', 'une', 'tumeur', 'vésicale', 'unique', ',', 'paraméatique', 'gauche', ',', 'sans', 'anomalie', 'urographique', '.'], 'ner_tags': [12, 0, 12, 12, 12, 12, 12, 12, 0, 1, 12, 4, 5, 12, 12, 12, 12, 4, 5, 12, 12, 12, 12, 4, 5, 5, 5, 5, 5, 12, 12, 4, 5, 12]}
8305


In [5]:
test = corpusCasM2['test']
print(test[0])
print(len(test))

{'id': '743_000', 'tokens': ['Nous', 'rapportons', 'l’', 'observation', 'd’', 'un', 'patient', 'de', '38', 'ans', ',', 'sans', 'antécédents', 'pathologiques', 'particuliers', ',', 'ayant', 'consulté', 'pour', 'des', 'lombalgies', 'chroniques', 'associées', 'à', 'de', 'signes', 'obstructifs', 'et', 'irritatifs', 'du', 'bas', 'appareil', 'urinaire', 'et', 'un', 'épisode', 'd’', 'hématurie', ',', 'le', 'tout', 'évoluant', 'depuis', '06', 'mois', '.'], 'ner_tags': [12, 12, 12, 12, 12, 12, 12, 12, 0, 1, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 4, 5, 12, 12, 12, 4, 5, 12, 4, 4, 5, 5, 5, 12, 12, 12, 12, 4, 12, 12, 12, 12, 2, 3, 3, 12]}
2545


In [6]:
validation = corpusCasM2['validation']
print(validation[0])
print(len(validation))

{'id': '730_000', 'tokens': ['Une', 'jeune', 'femme', 'de', '28', 'ans', 'rencontre', 'un', 'homme', 'de', '41', 'ans', 'dans', 'un', 'club', 'de', 'sport', '.'], 'ner_tags': [12, 12, 12, 12, 0, 1, 12, 12, 12, 12, 0, 1, 12, 12, 12, 12, 12, 12]}
2122


### X_train, X_test, X_val : list of words 

In [7]:
# words tokens dans la variable X
X_train = [ex['tokens'] for ex in train]
    
X_test = [ex['tokens'] for ex in test]

print(X_train[0])
print(len(X_train))
print(X_test[0])
print(len(X_test))

['En', '1988', ',', 'F.G.', ',', 'un', 'homme', 'de', '58', 'ans', ',', 'tabagique', 'ancien', ',', 'a', 'présenté', 'une', 'hématurie', 'totale', 'en', 'rapport', 'avec', 'une', 'tumeur', 'vésicale', 'unique', ',', 'paraméatique', 'gauche', ',', 'sans', 'anomalie', 'urographique', '.']
8305
['Nous', 'rapportons', 'l’', 'observation', 'd’', 'un', 'patient', 'de', '38', 'ans', ',', 'sans', 'antécédents', 'pathologiques', 'particuliers', ',', 'ayant', 'consulté', 'pour', 'des', 'lombalgies', 'chroniques', 'associées', 'à', 'de', 'signes', 'obstructifs', 'et', 'irritatifs', 'du', 'bas', 'appareil', 'urinaire', 'et', 'un', 'épisode', 'd’', 'hématurie', ',', 'le', 'tout', 'évoluant', 'depuis', '06', 'mois', '.']
2545


In [8]:
X_val = [ex['tokens'] for ex in validation]

print(X_val[0])
print(len(X_val))

['Une', 'jeune', 'femme', 'de', '28', 'ans', 'rencontre', 'un', 'homme', 'de', '41', 'ans', 'dans', 'un', 'club', 'de', 'sport', '.']
2122


### Y_train, Y_test, Y_val : list of tags

In [9]:
# ner_tags dans la variable Y

Y_train = [ex['ner_tags'] for ex in train]

Y_test = [ex['ner_tags'] for ex in test]

print(Y_train[0])
print(len(Y_train))
print(Y_test[0])
print(len(Y_test))

[12, 0, 12, 12, 12, 12, 12, 12, 0, 1, 12, 4, 5, 12, 12, 12, 12, 4, 5, 12, 12, 12, 12, 4, 5, 5, 5, 5, 5, 12, 12, 4, 5, 12]
8305
[12, 12, 12, 12, 12, 12, 12, 12, 0, 1, 12, 12, 2, 12, 12, 12, 12, 12, 12, 12, 4, 5, 12, 12, 12, 4, 5, 12, 4, 4, 5, 5, 5, 12, 12, 12, 12, 4, 12, 12, 12, 12, 2, 3, 3, 12]
2545


In [10]:
Y_val = [ex['ner_tags'] for ex in validation]

print(Y_val[0])
print(len(Y_val))

[12, 12, 12, 12, 0, 1, 12, 12, 12, 12, 0, 1, 12, 12, 12, 12, 12, 12]
2122


In [11]:
# number of tags with tag "O"
n_tags = len(set(sum(Y_train, [])))
n_tags

13

In [12]:
# number of tags without tag "O" : 
cat_vocab = 12


In [13]:
# "B-date","I-date","B-duration","I-duration","B-problem","I-problem","B-treatment","I-treatment","B-test","I-test",
# "B-frequency","I-frequency","O"

#Y_train_id = Y_train = tag2idx
#Y_test_id = Y_test

In [14]:
#Y_train_id = np.array([map(float, line.split()) for line in Y_train])

### tokenize X_train, X_test, X_val

In [15]:
# longueur maximale des tokens dans dataset X_train, variable max_len : length of the longest sequence

maxlen = max([len(s) for s in X_train])
maxlen


347

In [16]:
# Créer le tokenizer
num_w = 20000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words= num_w)

In [17]:
# Entrainer le tokenizer sur le train set 

tokenizer.fit_on_texts(X_train)

In [18]:
tokenizer.word_index

{'de': 1,
 '.': 2,
 ',': 3,
 'une': 4,
 'la': 5,
 'et': 6,
 'à': 7,
 'l’': 8,
 'le': 9,
 'un': 10,
 'a': 11,
 '(': 12,
 ')': 13,
 'd’': 14,
 'en': 15,
 'des': 16,
 '\n\n': 17,
 'du': 18,
 'par': 19,
 '-': 20,
 'avec': 21,
 "l'": 22,
 'été': 23,
 'les': 24,
 '/': 25,
 '\n': 26,
 'est': 27,
 'était': 28,
 'au': 29,
 "d'": 30,
 'pour': 31,
 'il': 32,
 'dans': 33,
 'x': 34,
 'ans': 35,
 'sans': 36,
 'mg': 37,
 'patient': 38,
 'examen': 39,
 'après': 40,
 ':': 41,
 'patiente': 42,
 'l': 43,
 'figure': 44,
 'jour': 45,
 'gauche': 46,
 'pas': 47,
 'sur': 48,
 'mois': 49,
 '2': 50,
 '1': 51,
 'son': 52,
 'deux': 53,
 'qui': 54,
 'avait': 55,
 'n’': 56,
 'traitement': 57,
 '3': 58,
 'on': 59,
 'droit': 60,
 'que': 61,
 'cm': 62,
 'elle': 63,
 'étaient': 64,
 'ont': 65,
 'droite': 66,
 'rénale': 67,
 'masse': 68,
 'tumeur': 69,
 'clinique': 70,
 'bilan': 71,
 'réalisée': 72,
 'sont': 73,
 'échographie': 74,
 'ne': 75,
 'depuis': 76,
 'plus': 77,
 '4': 78,
 'ce': 79,
 ';': 80,
 'ml': 81,
 'état':

In [19]:
# Transformer les textes en vecteurs numeriques à l'aide du tokenizer

X_train_seq = tokenizer.texts_to_sequences(X_train)

X_test_seq = tokenizer.texts_to_sequences(X_test)

In [20]:
X_val_seq = tokenizer.texts_to_sequences(X_val)

In [21]:
print(X_train_seq[0])
print(X_train[0])

[15, 3232, 3, 7186, 3, 10, 208, 1, 1736, 35, 3, 1597, 1359, 3, 11, 219, 4, 174, 192, 15, 338, 21, 4, 69, 114, 1274, 3, 7187, 46, 3, 36, 380, 3905, 2]
['En', '1988', ',', 'F.G.', ',', 'un', 'homme', 'de', '58', 'ans', ',', 'tabagique', 'ancien', ',', 'a', 'présenté', 'une', 'hématurie', 'totale', 'en', 'rapport', 'avec', 'une', 'tumeur', 'vésicale', 'unique', ',', 'paraméatique', 'gauche', ',', 'sans', 'anomalie', 'urographique', '.']


In [22]:
print(X_test_seq[0])
print(X_test[0])

[168, 2091, 8, 1268, 14, 10, 38, 1, 634, 35, 3, 36, 116, 489, 521, 3, 226, 358, 31, 16, 1250, 1767, 559, 7, 1, 179, 5387, 6, 8589, 18, 753, 868, 117, 6, 10, 475, 14, 174, 3, 9, 413, 232, 76, 2560, 49, 2]
['Nous', 'rapportons', 'l’', 'observation', 'd’', 'un', 'patient', 'de', '38', 'ans', ',', 'sans', 'antécédents', 'pathologiques', 'particuliers', ',', 'ayant', 'consulté', 'pour', 'des', 'lombalgies', 'chroniques', 'associées', 'à', 'de', 'signes', 'obstructifs', 'et', 'irritatifs', 'du', 'bas', 'appareil', 'urinaire', 'et', 'un', 'épisode', 'd’', 'hématurie', ',', 'le', 'tout', 'évoluant', 'depuis', '06', 'mois', '.']


In [23]:
print(X_val_seq[0])
print(X_val[0])

[4, 378, 304, 1, 821, 35, 13293, 10, 208, 1, 3770, 35, 33, 10, 1, 2]
['Une', 'jeune', 'femme', 'de', '28', 'ans', 'rencontre', 'un', 'homme', 'de', '41', 'ans', 'dans', 'un', 'club', 'de', 'sport', '.']


## Pad the obtained sequences so that they all have the same size (cf the `pad_sequences` function)

In [24]:
# Padding des sequences X_train_seq , Y_train

pad_X_train_seq  = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq,dtype='int32',
                                                             maxlen = maxlen, 
                                                             padding='post', value = num_w - 1)   #padding value = numwords - 1 ?
pad_Y_train = tf.keras.preprocessing.sequence.pad_sequences(Y_train,dtype='int32',
                                                             maxlen = maxlen, 
                                                             padding='post', value = 12)  #padding  #12 = tag "O"



In [25]:
pad_X_train_seq.shape, pad_Y_train.shape

((8305, 347), (8305, 347))

In [26]:
# Padding des sequences X_test_seq , Y_test

pad_X_test_seq  = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq,dtype='int32',
                                                             maxlen = maxlen,  #maxlen verifie  value = numwords - 1 ? 
                                                             padding='post', value = num_w - 1)
pad_Y_test  = tf.keras.preprocessing.sequence.pad_sequences(Y_test,dtype='int32',
                                                             maxlen = maxlen,  #maxlen verifie
                                                             padding='post', value = 12)  

In [27]:
pad_X_test_seq.shape, pad_Y_test.shape

((2545, 347), (2545, 347))

In [28]:
# Padding des sequences X_val_seq , Y_val

pad_X_val_seq  = tf.keras.preprocessing.sequence.pad_sequences(X_val_seq,dtype='int32',
                                                             maxlen = maxlen,  #maxlen verifie  value = numwords - 1 ? 
                                                             padding='post', value = num_w - 1)
pad_Y_val  = tf.keras.preprocessing.sequence.pad_sequences(Y_val,dtype='int32',
                                                             maxlen = maxlen,  #maxlen verifie
                                                             padding='post', value = 12)

In [29]:
pad_X_val_seq.shape, pad_Y_val.shape

((2122, 347), (2122, 347))

In [30]:
# Verifier les longueurs maximauxs du X et Y 

print(len(pad_X_train_seq[0]))
print(len(pad_Y_train[0]))

print(len(pad_X_train_seq))
print(len(pad_Y_train))

print(len(pad_X_test_seq[0]))
print(len(pad_Y_test[0]))

print(len(pad_X_test_seq))
print(len(pad_Y_test)) 

print(len(pad_X_val_seq[0]))
print(len(pad_Y_val[0]))

#print(pad_X_test_seq[0])

347
347
8305
8305
347
347
2545
2545
347
347


In [31]:
#explain here

input_dim = max([max(s) for s in X_train_seq]) + 1
output_dim = 32
input_length = maxlen

print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)


input_dim:  14004 
output_dim:  32 
input_length:  347 
n_tags:  13


### Build the model layout

In [32]:
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam

In [33]:
opt = Adam(learning_rate=9e-5)

In [34]:
class MyHyperModel(keras_tuner.HyperModel):
    def __init__(self, input_length, num_w, n_tags):
        self.input_length = input_length
        self.num_w = num_w
        self.n_tags = n_tags
        
    def build(self, hp):
        opt = Adam(learning_rate=9e-5)
        inputs = Input(shape=(self.input_length,), dtype='int32')
        output = Embedding(self.num_w + 1, output_dim=hp.Int("output_dim", min_value=32, max_value=128, step=32))(inputs)
        output = Bidirectional(LSTM(hp.Int("output_Bidirectional", min_value=32, max_value=128, step=32), return_sequences=True))(output)
        output = TimeDistributed(Dense(16, 
                                       activation=hp.Choice("activation_Dense", ["softmax", "sigmoid", "relu", "elu"])))(output)
        crf = CRF(self.n_tags)
        output= crf(output)
        basemodel = Model(inputs, output)
        model = ModelWithCRFLoss(basemodel, sparse_target=True)
        model.compile(optimizer=opt)
        return model


In [35]:
tuner = keras_tuner.RandomSearch(
    hypermodel=MyHyperModel(input_length, num_w, n_tags),
    objective=keras_tuner.Objective("val_val_accuracy", direction="min"),
    max_trials=30,
    overwrite=True)

In [36]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_val_accuracy",
    min_delta=0,
    patience=10,
    verbose=0,
    mode="max",
    baseline=None,
    restore_best_weights=False,
)

In [37]:
tuner.search(pad_X_train_seq, np.array(pad_Y_train), callbacks=[callback], verbose=1, epochs=5000, 
             validation_data=(pad_X_val_seq, np.array(pad_Y_val)))

Trial 8 Complete [06h 05m 49s]
val_val_accuracy: 0.9841322302818298

Best val_val_accuracy So Far: 0.0014911710750311613
Total elapsed time: 18h 32m 53s

Search: Running Trial #9

Value             |Best Value So Far |Hyperparameter
128               |128               |output_dim
96                |64                |output_Bidirectional
sigmoid           |softmax           |activation_Dense

Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5

KeyboardInterrupt: 

In [38]:
best_hps= tuner.get_best_hyperparameters(1)[0]
output_dim = best_hps.get('output_dim')
output_dim

128

In [39]:
activation_Dense = best_hps.get('activation_Dense')
activation_Dense

'softmax'

In [40]:
# Evaluate the model with best model
tuner.get_best_models()[0].evaluate(pad_X_test_seq, np.array(pad_Y_test))



[None, None]

In [41]:
predictions = tuner.get_best_models()[0].predict(pad_X_test_seq)



In [None]:
# # import tensorflow.keras.layers as kl

# # Créer un réseau à base de LSTM avec au minimum:
# # Embedding
# # Dropout
# # LSTM
# # Dropout
# # Classifieur
# model = tf.keras.models.Sequential()

# model.add(kl.Embedding(NUMWORDS, 30, input_length = MAXLEN))
# model.add(kl.Dropout(0.2))
# model.add(kl.Bidirectional(kl.LSTM(64)))
# model.add(kl.Dropout(0.2))
# model.add(kl.Dense(cat_vocab+1, activation="softmax"))

# # Compiler le modèle 

# model.compile(loss="sparse_categorical_crossentropy", 
#               optimizer="adam", 
#               metrics=['accuracy'])

# # Afficher le summary du modèle

# model.summary()

In [None]:
# # Fitter le modèle
# model.fit( X_train_seq, Y_train_id, batch_size=128, epochs = 5)

In [None]:
# # Evaluer le modèle
# model.evaluate(X_test_seq, Y_test_id)