In [1]:
import pandas as pd 
import numpy as np
import tensorflow as tf 
import tensorflow.keras as keras
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import plot_model

In [2]:
import os
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [3]:
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")
val = pd.read_csv("val.csv")

start = "<start>"#<start>"
stop = '<stop>'#"<end>"
padding = '<pad>'#"<PAD>"





In [4]:
train

Unnamed: 0,RawText,ICD10
0,Thrombose veineuse profonde cuisse gauche,I802
1,Hémiplégie post-traumatique,S099
2,Masculinisation avec hyperplasie surrénale,E250
3,Hyperammoniémie cérébrale,E722
4,Fistule artérioveineuse congénitale périphériq...,Q257
...,...,...
181758,Prématurité 32 SA,P073
181759,Rétinopathie E14.3 malnutrition E12.3,H360
181760,Métastase pariétale,C798
181761,Lésion cutanée de la pinta,A673


In [4]:
train["label"] = train.ICD10.apply(lambda x: [start]+ list(x)+[stop])
val["label"] = val.ICD10.apply(lambda x: [start]+ list(x)+[stop])
test["label"] = test.ICD10.apply(lambda x: [start]+ list(x)+[stop])

Unnamed: 0,RawText,ICD10,label
0,Thrombose veineuse profonde cuisse gauche,I802,"[<start>, I, 8, 0, 2, <stop>]"
1,Hémiplégie post-traumatique,S099,"[<start>, S, 0, 9, 9, <stop>]"
2,Masculinisation avec hyperplasie surrénale,E250,"[<start>, E, 2, 5, 0, <stop>]"
3,Hyperammoniémie cérébrale,E722,"[<start>, E, 7, 2, 2, <stop>]"
4,Fistule artérioveineuse congénitale périphériq...,Q257,"[<start>, Q, 2, 5, 7, <stop>]"
...,...,...,...
181758,Prématurité 32 SA,P073,"[<start>, P, 0, 7, 3, <stop>]"
181759,Rétinopathie E14.3 malnutrition E12.3,H360,"[<start>, H, 3, 6, 0, <stop>]"
181760,Métastase pariétale,C798,"[<start>, C, 7, 9, 8, <stop>]"
181761,Lésion cutanée de la pinta,A673,"[<start>, A, 6, 7, 3, <stop>]"


In [5]:
max_input_seq = max(train["label"].str.len().max(),val["label"].str.len().max(),test["label"].str.len().max())

y_train = pad_sequences(train["label"], maxlen = max_input_seq,dtype=object, padding='post', truncating='post', value= padding)
y_val   = pad_sequences(val["label"]  , maxlen = max_input_seq,dtype=object, padding='post', truncating='post', value= padding)
y_test  = pad_sequences(test["label"] , maxlen = max_input_seq,dtype=object, padding='post', truncating='post', value= padding)



In [6]:
train["label"] = [" ".join(x) for x in y_train]
val["label"]   = [" ".join(x) for x in y_val]
test["label"]  = [" ".join(x) for x in y_test]



In [7]:
train

Unnamed: 0,RawText,ICD10,label
0,Thrombose veineuse profonde cuisse gauche,I802,<start> I 8 0 2 <stop>
1,Hémiplégie post-traumatique,S099,<start> S 0 9 9 <stop>
2,Masculinisation avec hyperplasie surrénale,E250,<start> E 2 5 0 <stop>
3,Hyperammoniémie cérébrale,E722,<start> E 7 2 2 <stop>
4,Fistule artérioveineuse congénitale périphériq...,Q257,<start> Q 2 5 7 <stop>
...,...,...,...
181758,Prématurité 32 SA,P073,<start> P 0 7 3 <stop>
181759,Rétinopathie E14.3 malnutrition E12.3,H360,<start> H 3 6 0 <stop>
181760,Métastase pariétale,C798,<start> C 7 9 8 <stop>
181761,Lésion cutanée de la pinta,A673,<start> A 6 7 3 <stop>


In [8]:
MAX_TOKENS = 10000
HIDDEN_DIM = 32 
NB_CAR = 26 + 10 + 3 

source_vec_layer = keras.layers.TextVectorization(max_tokens = MAX_TOKENS)
target_vec_layer = keras.layers.TextVectorization(max_tokens = NB_CAR)

source_vec_layer.adapt(pd.concat([train["RawText"], val["RawText"], test["RawText"]]))
target_vec_layer.adapt(pd.concat([train["label"], val["label"], test["label"]]))



In [9]:
target_y = train["label"].str.slice(len(start)+1)
val_y =    val["label"].str.slice(len(start)+1)
decoder_input = train["label"].str.slice(stop=-len(stop)-1)
val_decoder_input = val["label"].str.slice(stop=-len(stop)-1)

encoded_target_y = target_vec_layer(target_y)
encoded_val_y = target_vec_layer(val_y)

In [None]:
train

In [10]:
class Encoder(keras.Model):
    def __init__(self, HIDDEN_DIM, MAX_TOKENS, source_vec_layer, ):
        super(Encoder, self).__init__()
        self.source_vec_layer = source_vec_layer
        self.embedding_layer = keras.layers.Embedding(
                input_dim =  MAX_TOKENS,
                output_dim =  HIDDEN_DIM,
                mask_zero = True,
                                   )
        self.encoder_lstm = keras.layers.LSTM(
            HIDDEN_DIM,
            dropout = 0.1,
            return_state = True 
                                    )
        self.bidirectional = keras.layers.Bidirectional(self.encoder_lstm)

    def call(self, input):
        outputs = self.source_vec_layer(input)
        outputs = self.embedding_layer(outputs)
        _, forward_h, forward_c, backward_h, backward_c = self.bidirectional(outputs)
        bd_state_h = keras.layers.Concatenate()([forward_h, backward_h])
        bd_state_c = keras.layers.Concatenate()([forward_c, backward_c])
        encoder_states = [bd_state_h, bd_state_c]
        
        return _, encoder_states
        

class Decoder(keras.Model):
    def __init__(self, HIDDEN_NIM, NB_CAR, target_vec_layer):
        super(Decoder, self).__init__()
        
        self.target_vec_layer = target_vec_layer
        self.masking_layer = keras.layers.Masking(mask_value=target_vec_layer(padding))
        self.embdding_layer = keras.layers.Embedding(input_dim =NB_CAR,output_dim =  2* HIDDEN_DIM ,mask_zero = True,)
        self.decoder_lstm = keras.layers.LSTM(
            HIDDEN_DIM * 2,
            dropout = 0.1,
            return_state= True, 
            return_sequences= True,
        )
        self.dense_layer = keras.layers.Dense(NB_CAR, activation='softmax')

    def call(self, input, encoder_states):
        outputs = self.target_vec_layer(input)
        outputs = self.masking_layer(outputs)
        outputs = self.embdding_layer(outputs)
        decoder_outputs, state_h , state_c = self.decoder_lstm(outputs,initial_state=encoder_states)
        outputs = self.dense_layer(decoder_outputs)
        return outputs, state_h , state_c
        
class Seq2seq(keras.Model):
    def __init__(self, encoder, decoder):
        super(Seq2seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder 
    def call(self, inputs):
        src, trg = inputs
        _, encoder_states = self.encoder(src)
        outputs, _, _ = self.decoder(trg, encoder_states)
        return outputs


        

In [11]:
print(target_y[0])
print(encoded_target_y[0,:])

I 8 0 2 <stop>
tf.Tensor([15  4  5  8  2], shape=(5,), dtype=int64)


In [12]:
enc = Encoder(HIDDEN_DIM, MAX_TOKENS, source_vec_layer)
dec = Decoder(HIDDEN_DIM, NB_CAR, target_vec_layer)
mod = Seq2seq(enc, dec)
mod.compile(optimizer='adam', loss='sparse_categorical_crossentropy',  metrics = ['accuracy'])

mod.fit([train["RawText"],decoder_input], 
          encoded_target_y,
          batch_size = 256,
          epochs=40, 
          #validation_split= 0.2,
          validation_data = ([val["RawText"], val_decoder_input], encoded_val_y)
       )

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1e2710e34c0>

In [19]:
vocab = target_vec_layer.get_vocabulary()

In [21]:
vocab

['',
 '[UNK]',
 'stop',
 'start',
 '8',
 '0',
 '9',
 '1',
 '2',
 '4',
 '3',
 '7',
 '5',
 '6',
 'c',
 'i',
 'pad',
 't',
 's',
 'q',
 'z',
 'k',
 'g',
 'j',
 'd',
 'm',
 'n',
 'r',
 'e',
 'a',
 'o',
 'l',
 'p',
 'f',
 'h',
 'b',
 'y',
 'x',
 'w']

In [11]:

def Decoder_test(input_seq,vocab) :
    _, states_values = enc(tf.constant([input_seq]))
    target_seq = tf.constant(["<start>"])
    stop = False
    decoder_seq = "<start>"
    decoder_nb = 1
    #print(states_values)
    while not stop :
        #dec_inptuts = [target_seq] + states_values
        output_tokens, h, c = dec(target_seq,states_values) 
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = vocab[sampled_token_index]
        decoder_seq = decoder_seq + " "+ sampled_char
        decoder_nb = decoder_nb +1 
        if (sampled_char == '<stop>' or decoder_nb > 5):
            stop = True
        target_seq = tf.constant([sampled_char])  
        states_values = [h,c]
    return decoder_seq

In [12]:
for k in range(100):
    inp = test["RawText"][k]
    decoder_seq = Decoder_test(inp,vocab)
    #print(decoder_seq)
    print(test["label"][k])
    print(decoder_seq)
    print("")

<start> R 5 2 9 <stop>
<start> r 2 2 4 stop

<start> S 3 5 2 <stop>
<start> s 3 5 1 stop

<start> S 2 7 0 <stop>
<start> j 9 3 9 stop

<start> B 9 0 9 <stop>
<start> b 9 0 9 stop

<start> A 0 7 1 <stop>
<start> a 0 7 1 stop

<start> P 7 0 4 <stop>
<start> p 7 0 2 stop

<start> P 1 5 2 <stop>
<start> p 3 5 3 stop

<start> O 8 7 3 <stop>
<start> i 8 2 8 stop

<start> I 4 9 5 <stop>
<start> r 0 0 1 stop

<start> C 7 6 0 <stop>
<start> c 7 6 0 stop

<start> M 5 4 3 <stop>
<start> z 9 0 4 stop

<start> B 6 7 9 <stop>
<start> b 6 7 7 stop

<start> N 2 8 8 <stop>
<start> n 1 8 1 stop

<start> H 2 1 2 <stop>
<start> h 3 1 8 stop

<start> D 4 3 1 <stop>
<start> d 4 3 1 stop

<start> F 4 8 9 <stop>
<start> f 4 8 9 stop

<start> G 4 0 3 <stop>
<start> g 4 1 8 stop

<start> O 8 7 3 <stop>
<start> i 8 2 8 stop

<start> O 6 4 0 <stop>
<start> o 6 6 0 stop

<start> I 8 9 1 <stop>
<start> i 8 9 1 stop

<start> G 9 6 9 <stop>
<start> g 9 3 8 stop

<start> X 6 8 <stop> <pad>
<start> x 6 1 stop pad

<sta