In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules (such as
# the project's `src` package) are reloaded before each cell runs.
%load_ext autoreload

%autoreload 2

In [2]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Dense
from keras.callbacks import EarlyStopping
import numpy as np
import os
import random as rn
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf


from src.core import file_manager as fm

2022-07-07 21:17:09.410625: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-07 21:17:09.410638: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
def apply_seed(verbosity=True, seed=42):
  """Seed every random number generator this notebook relies on.

  Args:
    verbosity: When truthy, print a short notice before seeding.
    seed: Integer seed written to `PYTHONHASHSEED` and passed to NumPy,
      Python's `random` module and TensorFlow. Defaults to 42, the value
      that used to be hard-coded.
  """
  if verbosity:
    print('Applying seed')
  # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so setting it
  # here cannot change string hashing in the already-running kernel; it is
  # only inherited by processes started afterwards.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  rn.seed(seed)
  tf.random.set_seed(seed)

In [4]:
def build_model(embedding_dim, num_labels, hidden_units=64, dropout_rate=0.1,
                hidden_activation='softmax'):
  """Build and compile the feed-forward intent classifier.

  The network is Dense(hidden_units) -> Dropout(dropout_rate) ->
  Dense(num_labels, softmax), compiled with categorical cross-entropy, the
  Adam optimizer and an accuracy metric.

  Args:
    embedding_dim: Length of each input embedding vector.
    num_labels: Number of units in the output layer (one per class).
    hidden_units: Width of the hidden layer. Defaults to 64.
    dropout_rate: Rate of the dropout layer after the hidden layer.
      Defaults to 0.1.
    hidden_activation: Activation of the hidden layer. Defaults to
      'softmax', the value that used to be hard-coded.

  Returns:
    The compiled `Sequential` model.
  """
  # NOTE(review): softmax is an unusual activation for a hidden layer ('relu'
  # is the common choice). It stays the default so existing results and saved
  # models are unaffected; pass `hidden_activation` to experiment.
  model = Sequential()
  model.add(Dense(hidden_units, activation=hidden_activation, input_shape=(embedding_dim,)))
  model.add(Dropout(dropout_rate))
  model.add(Dense(num_labels, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [5]:
def run_pipeline(embedding_name, epochs=20, batch_size=64, test_size=0.3):
  """Train, evaluate and save the intent classifier for one embedding type.

  Steps: seed the RNGs, load the annotated sentences with their embeddings,
  one-hot encode the `intent` column, split into train/test, fit the model
  from `build_model` with early stopping on `val_loss`, print the test loss
  and accuracy, and save the model to
  `output/neural_models/patient/<embedding_name>/model.h5` (resolved through
  `fm.filename_from_data_dir`).

  Args:
    embedding_name: Name of the embedding type; passed to
      `fm.read_annotated_df_with_embeddings` and used in the output path.
    epochs: Maximum number of training epochs. Defaults to 20.
    batch_size: Training batch size. Defaults to 64.
    test_size: Fraction of the data held out for the test set. Defaults to 0.3.
  """
  apply_seed()
  print('loading df')

  df = fm.read_annotated_df_with_embeddings(embedding_name)

  # Each `embeddings` cell is indexed with [0] to obtain the vector itself.
  X = np.array(df['embeddings'].map(lambda x: np.array(x[0])).to_list())
  Y = pd.get_dummies(df['intent']).values

  X_train, X_test, Y_train, Y_test = train_test_split(
      X, Y, test_size=test_size, random_state=42)

  model = build_model(X.shape[1], Y.shape[1])

  print('training model')
  model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1, verbose=0,
            callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  accr = model.evaluate(X_test, Y_test)
  print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

  path_model = fm.filename_from_data_dir(f'output/neural_models/patient/{embedding_name}/model.h5')
  # Saving to an .h5 file can fail when the target folder does not exist yet
  # (e.g. the first run for a new embedding), so create it beforehand.
  model_dir = os.path.dirname(path_model)
  if model_dir:
    os.makedirs(model_dir, exist_ok=True)
  model.save(path_model)

In [7]:
# Train, evaluate and save the classifier built on the `bert_pt` embeddings.
run_pipeline('bert_pt')

Applying seed
loading df
training model
Test set
  Loss: 0.289
  Accuracy: 0.916


In [8]:
# Train, evaluate and save the classifier built on the `flair_pt` embeddings.
run_pipeline('flair_pt')

Applying seed
loading df
training model
Test set
  Loss: 0.258
  Accuracy: 0.927


In [9]:
# Train, evaluate and save the classifier built on the `glove` embeddings.
run_pipeline('glove')

Applying seed
loading df
training model
Test set
  Loss: 0.187
  Accuracy: 0.967


In [11]:
# Train, evaluate and save the classifier built on the `lasbe` embeddings.
run_pipeline('lasbe')

Applying seed
loading df
training model
Test set
  Loss: 0.188
  Accuracy: 0.973


In [12]:
# Train, evaluate and save the classifier built on the `use` embeddings.
run_pipeline('use')

Applying seed
loading df
training model
Test set
  Loss: 0.177
  Accuracy: 0.978


### Load the manually labelled data and evaluate the saved models on it

In [44]:
# Integer id of every intent label, numbered in the order the labels are listed.
# NOTE(review): presumably these ids have to match the one-hot column order
# that `pd.get_dummies(df['intent'])` produced at training time — verify.
intent_indexes_dict = {
  intent: index
  for index, intent in enumerate(
    ['greeting', 'inform_medicine', 'inform_symptoms', 'request_inform']
  )
}

In [50]:
# Load the manually labelled sentences that serve as the validation sample.
anottated_manual_path = fm.filename_from_data_dir('output/patient/manual_label/sentences_with_label_manual.csv')

anottated_manual_df = pd.read_csv(anottated_manual_path)

# Keep the first 1000 rows whose intent is not 'others'. `.copy()` turns the
# slice into an independent frame, so adding a column below neither triggers
# pandas' SettingWithCopyWarning nor depends on view/copy semantics.
data_to_valid = anottated_manual_df[anottated_manual_df['intent'] != 'others'][:1000].copy()

# Intents that are missing from `intent_indexes_dict` map to NaN here.
data_to_valid['intent_index'] = data_to_valid['intent'].map(intent_indexes_dict)

data_to_valid

Unnamed: 0,txt,intent,intent_index
1,Meu nariz tá entupido né mas não tá me incomod...,inform_symptoms,2
3,"Boa noite, Valéria. Desde sexta que estou com ...",inform_symptoms,2
4,estava espirrando muito,inform_symptoms,2
5,Dipirona não vai tratar,inform_medicine,1
6,E como a tossi desaparece? Sem tomar remédio?,request_inform,3
...,...,...,...
1858,ainda com sensação de fraqueza e cansaço leve ...,inform_symptoms,2
1860,Há 3 dias começou a tosse,inform_symptoms,2
1863,So calafrio,inform_symptoms,2
1868,Mas a garganta continuar com um pouco de pus,inform_symptoms,2


In [51]:
# Number of validation sentences per intent.
data_to_valid['intent'].value_counts()

inform_symptoms    600
request_inform     195
greeting           106
inform_medicine     99
Name: intent, dtype: int64

In [24]:
# Manually assigned intent ids as a NumPy array, in the row order of `data_to_valid`.
correct_label_manual = data_to_valid['intent_index'].to_numpy()

# Show the shape to confirm how many labels were extracted.
correct_label_manual.shape

(1000,)

In [52]:
def read_df_embeddings(embedding_name, actor='patient'):
  """Load the annotated sentences of `actor` with their embeddings attached.

  Args:
    embedding_name: Folder name of the embedding type under `embeddings/`.
    actor: Whose sentences to load; selects both the embeddings file
      (`text_emb_<actor>.json`) and the annotated-sentences CSV
      (`output/<actor>/annotated_sentences.csv`). Defaults to 'patient'.

  Returns:
    A DataFrame with the columns `txt` and `embeddings`.
  """
  df_embeddings = fm.read_json_of_dir(
      fm.filename_from_data_dir(
          f'embeddings/{embedding_name}/text_emb_{actor}.json'),
      lines=True
  )

  # Honour `actor` here as well: this path used to be fixed to 'patient', so
  # any other actor had its embeddings paired with the patient sentences.
  annotated_sentences = pd.read_csv(
      fm.filename_from_data_dir(f'output/{actor}/annotated_sentences.csv'))

  # The assignment aligns on the index; this assumes both files list the
  # sentences in the same order — TODO confirm.
  annotated_sentences['embeddings'] = df_embeddings['embeddings']

  return annotated_sentences[['txt', 'embeddings']]

def get_validation_data_with_embeddings(embedding_name):
  """Attach the `embedding_name` embeddings to the validation sentences.

  Args:
    embedding_name: Name of the embedding type, passed to `read_df_embeddings`.

  Returns:
    The notebook-level `data_to_valid` frame with an extra `embeddings`
    column: one row per row of `data_to_valid`, in the same order. Sentences
    without a matching text in the embeddings frame get NaN in that column.
  """
  df_with_embeddings = read_df_embeddings(embedding_name)

  # A text that occurs more than once in the embeddings frame would multiply
  # the matching validation rows in the merge; keep one embedding per text so
  # the result stays row-aligned with `data_to_valid`.
  unique_embeddings = df_with_embeddings.drop_duplicates(subset='txt')

  return pd.merge(data_to_valid, unique_embeddings, on='txt', how='left',
                  validate='many_to_one')

def run_validation_pipeline(embedding_name):
  """Score the saved `embedding_name` model on the manually labelled sample.

  Loads the validation sentences with their embeddings, loads the model saved
  at `output/neural_models/patient/<embedding_name>/model.h5`, predicts one
  intent id per sentence and prints the share of correct predictions.

  Args:
    embedding_name: Name of the embedding type whose model is evaluated.

  Raises:
    ValueError: If a validation sentence has no embedding.
  """
  print('Loading validation data....')
  df_with_embeddings = get_validation_data_with_embeddings(embedding_name)

  missing = int(df_with_embeddings['embeddings'].isna().sum())
  if missing:
    # Fail with a clear message instead of a TypeError from indexing NaN below.
    raise ValueError(f'{missing} validation sentence(s) have no {embedding_name} embedding')

  x_validation = np.array(df_with_embeddings['embeddings'].map(lambda x: np.array(x[0])).to_list())
  # Read the labels from the same frame as the features so both stay
  # row-aligned, instead of relying on the notebook-level
  # `correct_label_manual`, which may come from an earlier, stale cell run.
  expected_intents = df_with_embeddings['intent_index'].to_numpy()

  print(f'The embedding: {embedding_name} has a dimensionality of: {x_validation.shape[1]}')

  print('Loading model....')
  path_model = fm.filename_from_data_dir(f'output/neural_models/patient/{embedding_name}/model.h5')
  model = load_model(path_model)

  print('Running pridictions....')
  predictions = model.predict(x_validation)
  # Predicted class id = index of the largest score in each prediction row.
  intent_predicteds = np.argmax(predictions, axis=1)

  accuracy = np.equal(intent_predicteds, expected_intents).sum() / len(expected_intents)

  print(f'\nThe accuracy is {accuracy}')

In [53]:
# Evaluate the saved `bert_pt` model on the manually labelled sample.
run_validation_pipeline('bert_pt')

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.928


In [57]:
# Evaluate the saved `flair_pt` model on the manually labelled sample.
run_validation_pipeline('flair_pt')

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running pridictions....

The accuracy is 0.835


In [56]:
# Evaluate the saved `glove` model on the manually labelled sample.
run_validation_pipeline('glove')

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running pridictions....

The accuracy is 0.702


In [54]:
# Evaluate the saved `lasbe` model on the manually labelled sample.
run_validation_pipeline('lasbe')

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.917


In [55]:
# Evaluate the saved `use` model on the manually labelled sample.
run_validation_pipeline('use')

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running pridictions....

The accuracy is 0.885


In [None]:
# NOTE(review): this function is also defined in an earlier cell of the
# notebook; whichever cell runs last wins, so keep a single copy.
def read_df_embeddings(embedding_name, actor='patient'):
  """Load the annotated sentences of `actor` with their embeddings attached.

  Args:
    embedding_name: Folder name of the embedding type under `embeddings/`.
    actor: Whose sentences to load; selects both the embeddings file
      (`text_emb_<actor>.json`) and the annotated-sentences CSV
      (`output/<actor>/annotated_sentences.csv`). Defaults to 'patient'.

  Returns:
    A DataFrame with the columns `txt` and `embeddings`.
  """
  df_embeddings = fm.read_json_of_dir(
      fm.filename_from_data_dir(
          f'embeddings/{embedding_name}/text_emb_{actor}.json'),
      lines=True
  )

  # Honour `actor` here as well: this path used to be fixed to 'patient', so
  # any other actor had its embeddings paired with the patient sentences.
  annotated_sentences = pd.read_csv(
      fm.filename_from_data_dir(f'output/{actor}/annotated_sentences.csv'))

  # The assignment aligns on the index; this assumes both files list the
  # sentences in the same order — TODO confirm.
  annotated_sentences['embeddings'] = df_embeddings['embeddings']

  return annotated_sentences[['txt', 'embeddings']]

# NOTE(review): this function is also defined in an earlier cell of the
# notebook; whichever cell runs last wins, so keep a single copy.
def get_validation_data_with_embeddings(embedding_name):
  """Attach the `embedding_name` embeddings to the validation sentences.

  Args:
    embedding_name: Name of the embedding type, passed to `read_df_embeddings`.

  Returns:
    The notebook-level `data_to_valid` frame with an extra `embeddings`
    column: one row per row of `data_to_valid`, in the same order. Sentences
    without a matching text in the embeddings frame get NaN in that column.
  """
  df_with_embeddings = read_df_embeddings(embedding_name)

  # A text that occurs more than once in the embeddings frame would multiply
  # the matching validation rows in the merge; keep one embedding per text so
  # the result stays row-aligned with `data_to_valid`.
  unique_embeddings = df_with_embeddings.drop_duplicates(subset='txt')

  return pd.merge(data_to_valid, unique_embeddings, on='txt', how='left',
                  validate='many_to_one')

# NOTE(review): this function is also defined in an earlier cell of the
# notebook; whichever cell runs last wins, so keep a single copy.
def run_validation_pipeline(embedding_name):
  """Score the saved `embedding_name` model on the manually labelled sample.

  Loads the validation sentences with their embeddings, loads the model saved
  at `output/neural_models/patient/<embedding_name>/model.h5`, predicts one
  intent id per sentence and prints the share of correct predictions.

  Args:
    embedding_name: Name of the embedding type whose model is evaluated.

  Raises:
    ValueError: If a validation sentence has no embedding.
  """
  print('Loading validation data....')
  df_with_embeddings = get_validation_data_with_embeddings(embedding_name)

  missing = int(df_with_embeddings['embeddings'].isna().sum())
  if missing:
    # Fail with a clear message instead of a TypeError from indexing NaN below.
    raise ValueError(f'{missing} validation sentence(s) have no {embedding_name} embedding')

  x_validation = np.array(df_with_embeddings['embeddings'].map(lambda x: np.array(x[0])).to_list())
  # Read the labels from the same frame as the features so both stay
  # row-aligned, instead of relying on the notebook-level
  # `correct_label_manual`, which may come from an earlier, stale cell run.
  expected_intents = df_with_embeddings['intent_index'].to_numpy()

  print(f'The embedding: {embedding_name} has a dimensionality of: {x_validation.shape[1]}')

  print('Loading model....')
  path_model = fm.filename_from_data_dir(f'output/neural_models/patient/{embedding_name}/model.h5')
  model = load_model(path_model)

  print('Running pridictions....')
  predictions = model.predict(x_validation)
  # Predicted class id = index of the largest score in each prediction row.
  intent_predicteds = np.argmax(predictions, axis=1)

  accuracy = np.equal(intent_predicteds, expected_intents).sum() / len(expected_intents)

  print(f'\nThe accuracy is {accuracy}')

In [None]:
  # NOTE(review): scratch copy of the first steps of `run_pipeline` (seed, load,
  # build X/Y, split), kept in a cell of its own. It reads `embedding_name`
  # from the kernel namespace instead of receiving it as an argument, so it
  # only runs after a cell that defines that name.
  apply_seed()
  print('loading df')
  
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  # Each `embeddings` cell is indexed with [0] to obtain the vector itself.
  X = np.array(df['embeddings'].map(lambda x: np.array(x[0])).to_list())
  Y = pd.get_dummies(df['intent']).values

  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [25]:
# Sanity check: compare the number of annotated sentences in the CSV with the
# rows returned by `fm.read_annotated_df_with_embeddings`, and count the rows
# whose `embeddings` value is null.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError.
# `embedding_name` is empty, which leaves an empty segment in the path built
# below — presumably a real embedding name belongs here; TODO confirm.
embedding_name = ''
actor = 'patient'
# variation='without_others_intent/k100_without_sentences_higher_than_median'
# The trailing slash adds a second '/' once the value is interpolated below.
variation='k100/'
    

file_name = fm.filename_from_data_dir(
        f'output/{actor}/{variation}/{embedding_name}/annotated_sentences.csv'
    )

df_annotated = pd.read_csv(file_name)

df_with_embeddings = fm.read_annotated_df_with_embeddings(embedding_name, variation=variation)

# Displayed tuple: (sentences in the CSV, rows with embeddings attached, rows with a null embedding).
df_annotated['txt'].count(), df_with_embeddings['txt'].count(), df_with_embeddings[df_with_embeddings['embeddings'].isnull()]['txt'].count()

FileNotFoundError: [Errno 2] No such file or directory: '/home/valmir/dev/python/intent_classifier/data/output/patient/k100//annotated_sentences.csv'