In [7]:
import json
import nltk
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import KeyedVectors
import os
from nltk.tokenize import word_tokenize

from src.core import file_manager as fm

In [5]:
# def read_annotated_df_with_old_embeddings(embedding_name, actor='patient', variation='without_others_intent/k100_without_sentences_higher_than_median'):
#     df = fm.read_json_of_dir(
#         fm.filename_from_data_dir(
#             f'embeddings/{embedding_name}/text_emb_{actor}.json.old'),
#         lines=True
#     )

#     file_name = fm.filename_from_data_dir(
#         f'output/{actor}/{variation}/{embedding_name}/annotated_sentences.csv'
#     )

#     df_annotated = pd.read_csv(file_name)

#     df_merged = pd.merge(df_annotated, df, on='txt', how='left')

#     return df_merged


# df_old = read_annotated_df_with_old_embeddings('flair_pt', variation='k100')
# df = fm.read_annotated_df_with_embeddings('flair_pt', variation='k100')
# df['embeddings'].equals(df_old['embeddings'])

# df_old = read_annotated_df_with_old_embeddings('bert_pt', variation='k100')
# df = fm.read_annotated_df_with_embeddings('bert_pt', variation='k100')
# df['embeddings'].equals(df_old['embeddings'])

# del df
# del df_old

In [6]:
# Load the annotated sentences together with their 'flair_pt' embeddings
# through the project's file-manager helper.
df = fm.read_annotated_df_with_embeddings('flair_pt')

# Preview the first rows.
df.head(5)

Unnamed: 0,txt,annotated_txt,intent,tokens,embeddings
0,Tô muito nervosa Marcela com tudo isso,Tô muito nervosa Marcela com tudo isso,inform_symptoms,"[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.0005851125, -0.0007871192, 0.0001037749000..."
1,O meu problema é psicológico nem dormi es...,O meu problema é psicológico nem dormi es...,inform_symptoms,"[O, meu, problema, é, psicológico, nem, dormi,...","[[0.0023417221, -0.0006897012, 4.2738000000000..."
2,Tem algum remédio que a pessoa tome para se a...,Tem algum remédio que a pessoa tome para se a...,inform_symptoms,"[Tem, algum, remédio, que, a, pessoa, tome, pa...","[[0.0014347894, -0.0003659404, -1.178180000000..."
3,Tá certo,Tá certo,greeting,"[Tá, certo]","[[0.0047551189, -0.010857887600000001, 0.00047..."
4,Tá bom,Tá bom,greeting,"[Tá, bom]","[[0.0004357887, -0.0107698599, 0.0004520991, 0..."


## Sentences

In [18]:
import pandas as pd
from flair.embeddings import DocumentPoolEmbeddings, FlairEmbeddings, StackedEmbeddings
from flair.data import Sentence

# Forward and backward Flair language models (the 'pt-*' model identifiers).
flair_embedding_forward = FlairEmbeddings('pt-forward')
flair_embedding_backward = FlairEmbeddings('pt-backward')
# Two wrappers over the same pair of models; get_word_embeddings applies both
# to every sentence and then reads the per-token embeddings.
document_pool_embeddings = DocumentPoolEmbeddings([flair_embedding_forward, flair_embedding_backward])
stacked_embeddings = StackedEmbeddings([flair_embedding_forward, flair_embedding_backward])


def get_word_embeddings(txt):
  """Embed `txt` with Flair and return one embedding per token.

  `txt` is converted with `str()` and wrapped in a Flair `Sentence`. The
  notebook-level `document_pool_embeddings` and `stacked_embeddings` are
  applied to it, in that order. The result is a list holding, for each token
  of the sentence, `token.embedding` converted with `.tolist()`.
  """
  flair_sentence = Sentence(str(txt))

  for embedder in (document_pool_embeddings, stacked_embeddings):
    embedder.embed(flair_sentence)

  return [tok.embedding.tolist() for tok in flair_sentence.tokens]

In [20]:
# Case-sensitive vocabulary: token -> embedding of its first occurrence.
vocab = {}

for row_index, row in df.iterrows():
  word_embeddings = get_word_embeddings(row['txt'])
  if row_index % 1000 == 0:
    print(f'It was processed: {row_index} lines....')

  # The inner counter has its own name so it no longer overwrites the row
  # index used by the progress message above.
  for token_position, token in enumerate(row['tokens']):
    if token not in vocab:
      word_embedding = np.array(word_embeddings[token_position])

      # All-zero vectors carry no information, so they are reported and skipped.
      if np.any(word_embedding):
        vocab[token] = word_embedding
      else:
        print(f'The token {token} has an embedding with just zeros')


n_source_words = len(vocab)

print(n_source_words)

It was processed: 0 lines....
It was processed: 1000 lines....
It was processed: 2000 lines....
It was processed: 3000 lines....
It was processed: 4000 lines....
It was processed: 5000 lines....
It was processed: 6000 lines....
It was processed: 7000 lines....
It was processed: 8000 lines....
6285


In [21]:
# Group the case-sensitive vocabulary keys by their lowercase form.
keys_lower = {}

for original_key in vocab:
  keys_lower.setdefault(original_key.lower(), []).append(original_key)


# Keep only the lowercase forms that occur with more than one casing.
keys_count = {lowered: variants for lowered, variants in keys_lower.items() if len(variants) > 1}

# Report every casing whose embedding differs from the first casing seen.
for variants in keys_count.values():
  first_key, *other_keys = variants

  for key_to_compare in other_keys:
    if not (vocab[first_key] == vocab[key_to_compare]).all():
      print(f'There is a diference between: {first_key} and {key_to_compare} ')

There is a diference between: Tô and tô 
There is a diference between: muito and Muito 
There is a diference between: com and Com 
There is a diference between: tudo and Tudo 
There is a diference between: isso and Isso 
There is a diference between: O and o 
There is a diference between: meu and Meu 
There is a diference between: é and É 
There is a diference between: nem and Nem 
There is a diference between: dormi and Dormi 
There is a diference between: estou and Estou 
There is a diference between: Tem and tem 
There is a diference between: algum and Algum 
There is a diference between: remédio and Remédio 
There is a diference between: que and Que 
There is a diference between: a and A 
There is a diference between: para and Para 
There is a diference between: se and Se 
There is a diference between: Tá and tá 
There is a diference between: certo and Certo 
There is a diference between: bom and Bom 
There is a diference between: Obrigada and obrigada 
There is a diference between

### Rebuild the vocabulary

In [42]:
# Lowercased vocabulary: token.lower() -> embedding of its first occurrence.
vocab = {}
# Embedding dimensionality, read from the first token of the first sentence.
embedding_size = len(get_word_embeddings(df.loc[0, 'txt'])[0])

# Reserved first entry (index 0): an all-zero vector. Index 0 is also what
# create_x returns for out-of-vocabulary tokens and what pad_sequences pads with.
vocab['unknown'] = np.zeros(embedding_size)

for row_index, row in df.iterrows():
  word_embeddings = get_word_embeddings(row['txt'])

  # The inner counter has its own name so it does not shadow the row index.
  for token_position, token in enumerate(row['tokens']):
    token_lower = token.lower()

    # Only the first occurrence of a lowercased token is kept, so the
    # list -> array conversion is skipped for tokens already registered.
    if token_lower in vocab:
      continue

    word_embedding = np.array(word_embeddings[token_position])

    # Tokens whose embedding is all zeros are left out of the vocabulary.
    if np.any(word_embedding):
      vocab[token_lower] = word_embedding


n_source_words = len(vocab)

print(n_source_words)

5499


In [25]:
# Number of sentences per intent label.
df['intent'].value_counts()

inform_symptoms    6229
request_inform     1143
greeting           1040
inform_medicine     428
Name: intent, dtype: int64

In [43]:
# Token count of the longest sentence; used later as the padded sequence length.
max_len = df['tokens'].apply(len).max()
print(max_len)
# max_len = 700  # at least 75% of the dataset is no longer than this value

147


In [46]:
def create_embedding_matrix(vocab, n_words, emb_size):
    """Stack the vectors of ``vocab`` into an ``(n_words, emb_size)`` matrix.

    Row ``i`` receives the vector stored under the ``i``-th key of ``vocab``
    (in the order given by ``vocab.keys()``).

    Args:
        vocab: Mapping from token to its embedding vector (anything assignable
            to a row of length ``emb_size``). A value of ``None`` leaves the
            corresponding row filled with zeros.
        n_words: Number of rows of the matrix. Keys beyond the first
            ``n_words`` are ignored; if ``vocab`` has fewer keys, the
            remaining rows stay zero.
        emb_size: Number of columns, i.e. the embedding dimensionality.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_words, emb_size)`` created with
        ``np.zeros`` and filled row by row.
    """
    embedding_matrix = np.zeros((n_words, emb_size))

    for index, word in enumerate(vocab.keys()):
        if index >= n_words:
            # Indexes only grow, so every remaining key is out of range too.
            break

        embedding_vector = vocab[word]
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

    return embedding_matrix

## Save the vectors in a new matrix

In [55]:
# Stack the vocabulary vectors into the weight matrix used by the Embedding layer.
embedding_matrix = create_embedding_matrix(
    vocab=vocab,
    n_words=n_source_words,
    emb_size=embedding_size,
)
embedding_matrix.shape

(5499, 4096)

In [56]:
# Token -> position of the token in `vocab` (same order as the rows of `embedding_matrix`).
vocab_key_2_index = dict(zip(vocab.keys(), range(len(vocab))))

In [58]:
# Quick check of the container types built in the cells above.
type(embedding_matrix), type(vocab), type(vocab_key_2_index)

(numpy.ndarray, dict, dict)

## Padding

In [59]:
def create_x(tokens, key_2_index=None, unknown_index=0):
    """Map a sequence of tokens to their vocabulary indexes.

    Each token is lowercased before the lookup. Tokens missing from the
    mapping are encoded as ``unknown_index``.

    Args:
        tokens: Iterable of strings (one tokenised sentence).
        key_2_index: Mapping from lowercased token to index. When omitted, the
            notebook-level ``vocab_key_2_index`` is used, which keeps existing
            one-argument calls working.
        unknown_index: Index returned for out-of-vocabulary tokens. Defaults
            to 0, the position of the reserved ``'unknown'`` entry.

    Returns:
        A list of integer indexes, one per input token, in the same order.
    """
    if key_2_index is None:
        # Backward-compatible default: the mapping built earlier in the notebook.
        key_2_index = vocab_key_2_index

    return [key_2_index.get(token.lower(), unknown_index) for token in tokens]


In [60]:
# Encode every tokenised sentence as a list of vocabulary indexes (new column).
df['token_indexes'] = df['tokens'].apply(create_x)
df.head()

Unnamed: 0,txt,annotated_txt,intent,tokens,embeddings,token_indexes
0,Tô muito nervosa Marcela com tudo isso,Tô muito nervosa Marcela com tudo isso,inform_symptoms,"[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.0005851125, -0.0007871192, 0.0001037749000...","[1, 2, 3, 4, 5, 6, 7]"
1,O meu problema é psicológico nem dormi es...,O meu problema é psicológico nem dormi es...,inform_symptoms,"[O, meu, problema, é, psicológico, nem, dormi,...","[[0.0023417221, -0.0006897012, 4.2738000000000...","[8, 9, 10, 11, 12, 13, 14, 15, 16]"
2,Tem algum remédio que a pessoa tome para se a...,Tem algum remédio que a pessoa tome para se a...,inform_symptoms,"[Tem, algum, remédio, que, a, pessoa, tome, pa...","[[0.0014347894, -0.0003659404, -1.178180000000...","[17, 18, 19, 20, 21, 22, 23, 24, 25, 26]"
3,Tá certo,Tá certo,greeting,"[Tá, certo]","[[0.0047551189, -0.010857887600000001, 0.00047...","[27, 28]"
4,Tá bom,Tá bom,greeting,"[Tá, bom]","[[0.0004357887, -0.0107698599, 0.0004520991, 0...","[27, 29]"


In [61]:
# Bring every index sequence to exactly `max_len` entries: shorter ones are
# filled at the end with 0, longer ones are cut at the end.
X = pad_sequences(
    df['token_indexes'],
    maxlen=max_len,
    padding='post',
    truncating='post',
    value=0,
)
# One-hot encode the intent labels (one column per distinct intent).
Y = pd.get_dummies(df['intent']).values
X.shape, Y.shape

((8840, 147), (8840, 4))

## Create train and test sets

In [62]:
# Hold out 20% of the rows for testing, stratified on the labels, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
    stratify=Y,
)
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

(7072, 147) (7072, 4)
(1768, 147) (1768, 4)


## LSTM Model

In [63]:
# Number of output classes = number of one-hot columns in Y.
num_labels = Y.shape[1]
# Vocabulary size; equals the number of rows of `embedding_matrix`.
MAX_NB_WORDS = n_source_words
# EMBEDDING_DIM = 50
EMBEDDING_DIM = embedding_size

model = Sequential()
# Embedding layer initialised with the pre-computed vectors.
# NOTE(review): the layer is not frozen (model.summary() reports zero
# non-trainable params), so the vectors are updated during training — confirm
# this is intended.
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1], weights=[embedding_matrix]))
# model.add(Embedding(X.shape[0], X.shape[1], input_length=X.shape[1], weights=[X]))
# Bidirectional LSTM with 64 units per direction (128 outputs in the summary).
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.1))
# One softmax probability per intent.
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Optimisation functions usually calculate the gradient, i.e. the partial derivative of the loss function with respect to the weights,
# and the weights are modified in the opposite direction of the calculated gradient.

In [64]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 147, 4096)         22523904  
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              2130432   
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 4)                 516       
                                                                 
Total params: 24,654,852
Trainable params: 24,654,852
Non-trainable params: 0
_________________________________________________________________


In [65]:
epochs = 20
batch_size = 64

# Early stopping watches the validation loss (patience 3, min_delta 1e-4).
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=3)

# 10% of the training rows are set aside by Keras as the validation split.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
    callbacks=[early_stopping],
)

Epoch 1/20


2022-06-24 18:22:02.903687: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154140672 exceeds 10% of free system memory.
2022-06-24 18:22:02.963048: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154140672 exceeds 10% of free system memory.
2022-06-24 18:22:02.963096: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154140672 exceeds 10% of free system memory.
2022-06-24 18:22:02.978826: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154140672 exceeds 10% of free system memory.
2022-06-24 18:22:03.641406: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 154140672 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


## Save the model and vocabulary

In [71]:
# Directory that receives the trained model, the vocabulary and the metadata.
working_dir = fm.filename_from_data_dir('output/lstm_models/patient/flair_pt')

In [72]:
# Persist the trained network next to the vocabulary and metadata files.
print('saving the model')
model_path = f'{working_dir}/model.h5'
model.save(model_path)

saving the model


In [73]:
print('saving the vocabullary')

# Serialise first, so a failure here cannot leave a truncated file behind.
vocabulary_json = json.dumps(vocab_key_2_index)

# The context manager closes the handle even if the write raises.
with open(f'{working_dir}/vocabullary.json', "w") as file:
    file.write(vocabulary_json)

saving the vocabullary


In [75]:
print('saving the metadata')

# Intent names in the column order of pd.get_dummies, the same call that built Y.
intents = pd.get_dummies(df['intent']).columns.tolist()

metadata = {
  'intents': intents,
  # Kept as a string, exactly as before, so existing readers of this file
  # keep working.
  'vector_length': str(max_len)
}

# Build the JSON text before opening the file, so a failure above cannot leave
# an empty or truncated metadata file behind.
metadata_json = json.dumps(metadata)

# The context manager closes the handle even if the write raises.
with open(f'{working_dir}/metadata.json', "w") as file:
    file.write(metadata_json)

saving the metadata


## Evaluate the model

In [76]:
# Score the held-out split; the first two entries of the result are printed
# as loss and accuracy.
accr = model.evaluate(X_test, Y_test)
report_template = 'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'
print(report_template.format(*accr))

Test set
  Loss: 0.297
  Accuracy: 0.926
