In [1]:
import nltk
import pandas as pd
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, LSTM, Bidirectional, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.callbacks import ModelCheckpoint, EarlyStopping
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import KeyedVectors
import os
from nltk.tokenize import word_tokenize

from src.core import file_manager as fm

2022-06-16 17:50:11.999340: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-16 17:50:11.999355: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def build_vocabulary_from_df(dataframe):
  keyed_vectors = {}

  for row_index, row in dataframe.iterrows():
    for index, token in enumerate(row['tokens']):
      if token not in keyed_vectors:
          keyed_vectors[token] = row['word_embeddings'][index]
  return Vocabulary(keyed_vectors)

      
class Vocabulary:
  def __init__(self, keyed_vectors):
    self.keyed_vectors = keyed_vectors
    self.keys_to_index =  self.get_keys_to_index()
    self.embedding_matrix = self.build_embedding_matrix()

  def get_keys_to_index(self):
    return {key: index for index, key in enumerate(self.keyed_vectors.keys())}

  def build_embedding_matrix(self):        
    first_key_of_keyed_vectors = list(self.keyed_vectors.keys())[0]
    emb_size = len(self.keyed_vectors[first_key_of_keyed_vectors])
    n_words = len(self.keyed_vectors)
    
    pret_embedding = {}
    embedding_matrix = np.zeros((n_words, emb_size))
    count = 0

    # Dicionário com todos os ids e palavras do embedding pré-treinado
    # for index, word in enumerate(vocab.index_to_key):
    for index, word in enumerate(self.keys_to_index.keys()):
      pret_embedding[word] = index

    # Construindo a embedding_matrix do embedding pré-treinado
    for item in pret_embedding.items():
      if item[1] < n_words:
        count += 1
        embedding_vector = self.keys_to_index[item[0]]
        if embedding_vector is not None:
          embedding_matrix[item[1]] = embedding_vector
      
    return embedding_matrix

  def get_x_representation(self, tokens):
    list_x = []
    for token in tokens:
      if(token in self.keys_to_index):
        list_x.append(self.keys_to_index[token])
      
    return list_x

  

In [12]:
def filter_tokens_with_good_representation(df):
  return df.loc[df.apply(lambda x: (len(x['tokens']) == len(x['word_embeddings'])) , axis=1)]


def get_tokens_not_present_in_vocabullary(df, vocabulary):
  tokens_with_bad_representation = df[df.apply(lambda x: (len(x['tokens']) != len(x['word_embeddings'])) , axis=1)]

  tokens_not_present_in_vocab = []
  for row_index, row in tokens_with_bad_representation.iterrows():
    for token in row['tokens']:
      if token not in vocabulary.keyed_vectors.keys():
        tokens_not_present_in_vocab.append(token)

  return set(tokens_not_present_in_vocab)

In [13]:
def build_model(X, Y, embedding_matrix):
  vocabulary_lenth, embedding_dimensionality = embedding_matrix.shape
  
  model = Sequential()
  model.add(Embedding(vocabulary_lenth, embedding_dimensionality, input_length=X.shape[1], weights=[embedding_matrix]))
  model.add(Bidirectional(LSTM(64)))
  model.add(Dropout(0.1))
  model.add(Dense(Y.shape[1], activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [16]:
def prepare_data_for_train_and_test(embedding_name):
  print('Reading file')
  df_original = fm.read_annotated_df_with_embeddings(embedding_name)

  count_before = df_original.txt.count()
  print('Applying filter')
  df = filter_tokens_with_good_representation(df_original)

  rows_delted = count_before - df.txt.count()
  if rows_delted:
    print(f'There was deleted {rows_delted} rows with bad representation')

  print('building vocabulary')
  vocabulary = build_vocabulary_from_df(df)

  tokens_not_present_in_vocab = get_tokens_not_present_in_vocabullary(df_original, vocabulary)
  if tokens_not_present_in_vocab:
    print(f'There are {len(tokens_not_present_in_vocab)} tokens not present in the vocabulary')

  print('Creating data for train and test')
  tokens_max_len = df['tokens'].apply(lambda x : len(x)).max()
  
  # df['token_indexes'] = df['tokens'].apply(vocabulary.get_x_representation)
  df['token_indexes'] = df.loc[:, 'tokens'].apply(vocabulary.get_x_representation)

  X = pad_sequences(maxlen=tokens_max_len, sequences=df['token_indexes'], value=0, padding='post', truncating='post')
  Y = pd.get_dummies(df['intent']).values

  return vocabulary, X, Y


def train_model(embedding_name, vocabulary, X, Y):
  print('Building model')
  model = build_model(X, Y, vocabulary.embedding_matrix)
  model.summary()

  X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20, random_state = 42,stratify=Y)

  print('Trainning model')
  history = model.fit(X_train, Y_train, epochs=20, batch_size=64,validation_split=0.1, 
                      callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  print('Saving model')
  path_model = fm.filename_from_data_dir(f'output/patient/lstm_models/{embedding_name}.h5')
  model.save(path_model)

  print('Evaluating the model')
  accuracy = model.evaluate(X_test,Y_test)
  print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accuracy[0],accuracy[1]))

In [15]:
embedding_name = 'bert_pt'

vocabulary, X, Y = prepare_data_for_train_and_test(embedding_name)

Reading file
Applying filter
There was deleted 21 rows with bad representation
building vocabulary
There are 84 tokens not present in the vocabulary
Creating data for train and test


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['token_indexes'] = df.loc[:, 'tokens'].apply(vocabulary.get_x_representation)


In [18]:
train_model(embedding_name, vocabulary, X, Y)

Building model


2022-06-16 17:54:29.560355: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-16 17:54:29.561100: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-16 17:54:29.561272: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2022-06-16 17:54:29.561397: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2022-06-16 17:54:29.561581: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Co

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 103, 768)          4785408   
                                                                 
 bidirectional (Bidirectiona  (None, 128)              426496    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 4)                 516       
                                                                 
Total params: 5,212,420
Trainable params: 5,212,420
Non-trainable params: 0
_________________________________________________________________
Trainning model
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/

In [19]:
embedding_name = 'glove'

vocabulary, X, Y = prepare_data_for_train_and_test(embedding_name)

Reading file
Applying filter
building vocabulary
Creating data for train and test


In [20]:
train_model(embedding_name, vocabulary, X, Y)

Building model
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 155, 300)          2145000   
                                                                 
 bidirectional_1 (Bidirectio  (None, 128)              186880    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 4)                 516       
                                                                 
Total params: 2,332,396
Trainable params: 2,332,396
Non-trainable params: 0
_________________________________________________________________
Trainning model
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Ep

In [36]:
#model.load_weights(path_model)

## Using the model

In [93]:
def print_pred_real_label(index):
  #new_narrative = [train_df['RELATO'][index]]
  #seq = tokenizer.texts_to_sequences(new_narrative)
  #padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
  pred = model.predict(X[index-1:index])
  labels = df['intent'].to_list()
  print(np.argmax(pred))
  print(pred, labels[np.argmax(pred)])

In [40]:
index = 10

pred = model.predict(X[index-1:index])

pred

array([[1.2611243e-05, 3.7041116e-06, 9.9998200e-01, 1.6522043e-06]],
      dtype=float32)

In [41]:
np.argmax(pred)

2

In [21]:
vocabulary.embedding_matrix.shape

(7150, 300)

In [22]:
X.shape

(8900, 155)