In [1]:
import json
import nltk
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import KeyedVectors
import os
from nltk.tokenize import word_tokenize

from src.core import file_manager as fm

2022-06-19 17:38:48.209844: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-19 17:38:48.209862: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Sentences

In [2]:
df = fm.read_annotated_df_with_embeddings('bert_pt')

count_before = df.txt.count()

In [3]:
df = df.loc[df.apply(lambda x: (len(x['tokens']) == len(x['word_embeddings'])) , axis=1)]

rows_delted = count_before - df.txt.count()

print(f'There was deleted {rows_delted} rows with bad representation')

There was deleted 21 rows with bad representation


In [4]:
vocab = {}

for index, row in df.iterrows():
  for index, token in enumerate(row['tokens']):
    if token not in vocab:
      vocab[token] = row['word_embeddings'][index]

n_source_words = len(vocab)

print(n_source_words)

6231


In [5]:
df['intent'].value_counts()

inform_symptoms    5474
request_inform     1278
greeting           1175
inform_medicine     889
Name: intent, dtype: int64

In [6]:
max_len = df['tokens'].apply(lambda x : len(x)).max()
print(max_len)
# max_len = 700 #75% do dataset pelo menos tem no maximo esse valor de tamanho

103


In [7]:
def create_embedding_matrix(vocab, n_words, emb_size):
    
    pret_embedding = {}
    embedding_matrix = np.zeros((n_words, emb_size))
    count = 0

    # Dicionário com todos os ids e palavras do embedding pré-treinado
    # for index, word in enumerate(vocab.index_to_key):
    for index, word in enumerate(vocab.keys()):
      pret_embedding[word] = index

    # Construindo a embedding_matrix do embedding pré-treinado
    for item in pret_embedding.items():
      if item[1] < n_words:
        count += 1
        embedding_vector = vocab[item[0]]
        if embedding_vector is not None:
          embedding_matrix[item[1]] = embedding_vector
      
    return embedding_matrix

## Save the vectors  in a new Matrix

In [8]:
# embedding_size = 300
embedding_size = len(df.loc[0].word_embeddings[0])
embedding_matrix = create_embedding_matrix(vocab, n_source_words, embedding_size)
embedding_matrix.shape

(6231, 768)

In [9]:
vocab_key_2_index =  {key: index for index, key in enumerate(vocab.keys())}

## Padding

In [10]:
# Função que cria o X
def create_x(tokens):
    list_x = []
    for token in tokens:
      if(token in vocab_key_2_index):
        list_x.append(vocab_key_2_index[token])
      
    return list_x


In [11]:
df['token_indexes'] = df['tokens'].apply(lambda tokens : create_x(tokens))
df.head(5)

Unnamed: 0,txt,annotated_txt,intent,embeddings,tokens,word_embeddings,token_indexes
0,Tô muito nervosa Marcela com tudo isso,Tô muito nervosa Marcela com tudo isso,inform_symptoms,"[[-0.17518604000000002, -0.38116214, 0.1568575...","[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.10563439000000001, -0.6496529600000001, 0....","[0, 1, 2, 3, 4, 5, 6]"
1,Com muito medo,Com muito medo,inform_symptoms,"[[0.40117407, -0.22014676, 0.1467775, 0.185312...","[Com, muito, medo]","[[0.5744509, -0.58911455, 0.11015732600000001,...","[7, 1, 8]"
2,Não,Não,greeting,"[[0.0857483, -0.41115219999999997, 0.28060192,...",[Não],"[[0.0857483, -0.41115219999999997, 0.28060192,...",[9]
3,Tem algum remédio que a pessoa tome para se a...,Tem algum remédio que a pessoa tome para se a...,request_inform,"[[-0.018409189, -0.17020282, 0.273364780000000...","[Tem, algum, remédio, que, a, pessoa, tome, pa...","[[-0.026540479000000002, -0.44967073, 0.199957...","[10, 11, 12, 13, 14, 15, 16, 17, 18, 19]"
4,Tá certo,Tá certo,greeting,"[[0.2476745, -0.31360537, 0.040109605, 0.37392...","[Tá, certo]","[[0.29746079999999997, -0.3857502, 0.25457925,...","[20, 21]"


In [12]:
X = pad_sequences(maxlen=max_len, sequences=df['token_indexes'], value=0, padding='post', truncating='post')
Y = pd.get_dummies(df['intent']).values
(X.shape, Y.shape)

((8816, 103), (8816, 4))

## Create train and test

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42,stratify=Y)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(7052, 103) (7052, 4)
(1764, 103) (1764, 4)


## LSTM Model

In [14]:
num_labels = Y.shape[1]
MAX_NB_WORDS = n_source_words
# EMBEDDING_DIM = 50
EMBEDDING_DIM = embedding_size

model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1], weights=[embedding_matrix]))
# model.add(Embedding(X.shape[0], X.shape[1], input_length=X.shape[1], weights=[X]))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.1))
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#Optimisation functions usually calculate the gradient i.e. the partial derivative of loss function with respect to weights, 
#and the weights are modified in the opposite direction of the calculated gradient.

2022-06-19 17:39:11.658851: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-19 17:39:11.659963: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-19 17:39:11.660013: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-06-19 17:39:11.660068: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-06-19 17:39:11.660100: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 103, 768)          4785408   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              426496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 4)                 516       
                                                                 
Total params: 5,212,420
Trainable params: 5,212,420
Non-trainable params: 0
_________________________________________________________________


In [16]:
path_model = fm.filename_from_data_dir(f'output/patient/lstm_models/bert_pt.h5')
#model.load_weights(path_model)

In [17]:
epochs = 20
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1, 
                    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


In [18]:
working_dir = fm.filename_from_data_dir(f'output/lstm_models/patient/bert_pt')

In [19]:
print('saving the model')
model.save(f'{working_dir}/model.h5')

saving the model


In [20]:
print('saving the vocabullary')

file = open(f'{working_dir}/vocabullary.json',"w")

file.write(json.dumps(vocab_key_2_index))

file.close()

saving the vocabullary


In [21]:
print('saving the metadata')

file = open(f'{working_dir}/metadata.json',"w")

intents = pd.get_dummies(df['intent']).columns.tolist()

metadata = {
  'intents': intents,
  'vector_length': str(max_len)
}

file.write(json.dumps(metadata))

file.close()

saving the metadata


## Evaluate the model

In [22]:
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.227
  Accuracy: 0.926
