In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Dense
from keras.callbacks import EarlyStopping
import numpy as np
import os
import random as rn
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf


from src.core import file_manager as fm

2022-07-22 08:50:37.762900: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-22 08:50:37.762914: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
def apply_seed(verbosity=True):
  if verbosity:
    print('Applying seed')
  SEED = 42
  os.environ['PYTHONHASHSEED'] = str(SEED)
  np.random.seed(SEED)
  rn.seed(SEED)
  tf.random.set_seed(SEED)

In [4]:
def build_model(embedding_dim, num_labels):
  model = Sequential()
  model.add(Dense(64, activation='softmax', input_shape=(embedding_dim,)))
  model.add(Dropout(0.1))
  model.add(Dense(num_labels, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [5]:
def save_df(df_data, embedding_name, suffix, actor = 'patient'):
  path_output = fm.filename_from_data_dir(f'output/neural_models/{actor}/{embedding_name}/data_{suffix}.csv')
  
  df_data.drop('embeddings', axis=1).to_csv(path_output, index=False)

def prepare_train_test_data(embedding_name, actor='patient'):
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

  save_df(df_train, embedding_name, suffix='train', actor=actor)
  save_df(df_test, embedding_name, suffix='test', actor=actor)

  X_train = np.array(df_train['embeddings'].map(lambda x: np.array(x[0])).to_list())
  Y_train = pd.get_dummies(df_train['intent']).values

  X_test = np.array(df_test['embeddings'].map(lambda x: np.array(x[0])).to_list())
  Y_test = pd.get_dummies(df_test['intent']).values

  return X_train, X_test, Y_train, Y_test

In [6]:
def run_pipeline(embedding_name, actor='patient'):
  apply_seed()
  
  X_train, X_test, Y_train, Y_test = prepare_train_test_data(embedding_name=embedding_name, actor=actor)

  model = build_model(X_train.shape[1], Y_train.shape[1])

  print('training model')
  history = model.fit(X_train, Y_train, epochs=20, batch_size=64,validation_split=0.1, verbose=0,
                      callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

  accr = model.evaluate(X_test,Y_test)
  print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))
  
  path_model = fm.filename_from_data_dir(f'output/neural_models/{actor}/{embedding_name}/model.h5')
  # print(f'Saving the model at: {path_model}')
  model.save(path_model)

In [35]:
run_pipeline('bert_pt')

Applying seed
training model
Test set
  Loss: 0.289
  Accuracy: 0.916


In [36]:
run_pipeline('flair_pt')

Applying seed
training model
Test set
  Loss: 0.258
  Accuracy: 0.927


In [37]:
run_pipeline('glove')

Applying seed
training model
Test set
  Loss: 0.187
  Accuracy: 0.967


In [38]:
run_pipeline('lasbe')

Applying seed
training model
Test set
  Loss: 0.188
  Accuracy: 0.973


In [39]:
run_pipeline('use')

Applying seed
training model
Test set
  Loss: 0.177
  Accuracy: 0.978


### Read data and use in test

In [7]:
intent_indexes_dict = {
  'greeting': 0,
  'inform_medicine': 1,
  'inform_symptoms': 2,
  'request_inform': 3,
}

In [59]:
anottated_manual_path = fm.filename_from_data_dir('output/patient/manual_label/not_used_sentences_with_label_manual.csv')

anottated_manual_df = pd.read_csv(anottated_manual_path)

data_to_valid = anottated_manual_df.loc[anottated_manual_df['intent'] != 'others']

data_to_valid['intent_index'] = data_to_valid.loc[:,'intent'].map(intent_indexes_dict)

data_to_valid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_to_valid['intent_index'] = data_to_valid.loc[:,'intent'].map(intent_indexes_dict)


Unnamed: 0,txt,intent,intent_index
5,Tem algum remedio pra tipo aperto no peito,request_inform,3
9,Cansada,inform_symptoms,2
13,Aí continuo tonando de 6 em 6 horas,inform_medicine,1
15,Onde posso fzr,request_inform,3
22,"Fraqueza, não estar, falta de at",inform_symptoms,2
...,...,...,...
988,Sucos gelados não pode?,request_inform,3
992,Desde já grata,greeting,0
995,Ele me relatou que os batimentos cardíacos del...,inform_symptoms,2
996,Melhor q já me atendeu aqui foi vc obg,greeting,0


In [9]:
anottated_manual_path = fm.filename_from_data_dir('output/patient/manual_label/sentences_with_label_manual.csv')
anottated_manual_path = fm.filename_from_data_dir('output/patient/manual_label/untrained_sentences_with_label_manual.csv')

anottated_manual_df = pd.read_csv(anottated_manual_path)

data_to_valid = anottated_manual_df[anottated_manual_df['intent'] != 'others'][:100]

data_to_valid['intent_index'] = data_to_valid['intent'].map(intent_indexes_dict)

data_to_valid

Unnamed: 0,txt,intent,intent_index
1,"Sim,Dipirona 1g e vitamina c",inform_medicine,1
2,Ate hoje nao sinto cheiro de nada ... Por msis...,inform_symptoms,2
5,A dor já tem uns dois dias que sinto,inform_symptoms,2
8,Um pouco de coriza,inform_symptoms,2
11,Ok.fico muito agradecida.,greeting,0
...,...,...,...
296,"Tomei hoje, dipirona pela manha e agora as 17:00",inform_medicine,1
300,As vezes há a dificuldade para falar tbm,inform_symptoms,2
301,"Ainda sobre a nimesulida, ele tomou apenas 01 ...",inform_medicine,1
303,No momento o único sintoma que tenho é a resp...,inform_symptoms,2


In [60]:
data_to_valid = pd.concat(
  [data_to_valid[data_to_valid['intent'] ==  intent][:25] for intent in data_to_valid.intent.unique()],
  ignore_index=True
  )

# data_to_valid = data_to_valid[:200]

In [10]:
data_to_valid['intent'].value_counts()

inform_symptoms    56
request_inform     20
inform_medicine    14
greeting           10
Name: intent, dtype: int64

In [11]:
correct_label_manual = data_to_valid['intent_index'].to_numpy()

correct_label_manual.shape

(100,)

In [12]:
def read_df_embeddings(embedding_name, actor='patient'):
  df_embeddings = fm.read_json_of_dir(
      fm.filename_from_data_dir(
          f'embeddings/{embedding_name}/text_emb_{actor}.json'),
      lines=True
  )

  annotated_sentences = pd.read_csv(fm.filename_from_data_dir('output/patient/annotated_sentences.csv'))

  annotated_sentences['embeddings'] = df_embeddings['embeddings']

  return annotated_sentences[['txt', 'embeddings']]

def get_validation_data_with_embeddings(embedding_name):
    df_with_embeddings = read_df_embeddings(embedding_name)

    df_merged = pd.merge(data_to_valid, df_with_embeddings, on='txt', how='left')

    return df_merged

def run_validation_pipeline(embedding_name):
  print('Loading validation data....')
  df_with_embeddings = get_validation_data_with_embeddings(embedding_name)
  x_validation = np.array(df_with_embeddings['embeddings'].map(lambda x: np.array(x[0])).to_list())

  print(f'The embedding: {embedding_name} has a dimensionality of: {x_validation.shape[1]}')

  print('Loading model....')
  path_model = fm.filename_from_data_dir(f'output/neural_models/patient/{embedding_name}/model.h5')
  model = load_model(path_model)

  print('Running pridictions....')
  predictions = model.predict(x_validation)
  intent_predicteds = np.array([np.argmax(prediction) for prediction in predictions])


  correct_predict_manual = np.equal(intent_predicteds, correct_label_manual).sum()
  accuracy = correct_predict_manual / len(correct_label_manual)

  print(f'\nThe accuracy is {accuracy}')

#### with not seen data weighted

In [None]:
run_validation_pipeline('bert_pt')

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.74


In [None]:
run_validation_pipeline('flair_pt')

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running pridictions....

The accuracy is 0.61


In [None]:
run_validation_pipeline('glove')

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running pridictions....

The accuracy is 0.36


In [None]:
run_validation_pipeline('lasbe')

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.81


In [None]:
run_validation_pipeline('use')

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running pridictions....

The accuracy is 0.74


#### with not seen data not weighted

In [63]:
run_validation_pipeline('bert_pt')

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.84


In [64]:
run_validation_pipeline('flair_pt')

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running pridictions....

The accuracy is 0.71


In [65]:
run_validation_pipeline('glove')

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running pridictions....

The accuracy is 0.545


In [66]:
run_validation_pipeline('lasbe')

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.865


In [67]:
run_validation_pipeline('use')

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running pridictions....

The accuracy is 0.82


#### not trained data

In [14]:
run_validation_pipeline('bert_pt')

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.89


In [15]:
run_validation_pipeline('flair_pt')

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running pridictions....

The accuracy is 0.83


In [16]:
run_validation_pipeline('glove')

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running pridictions....

The accuracy is 0.6


In [17]:
run_validation_pipeline('lasbe')

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.89


In [18]:
run_validation_pipeline('use')

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running pridictions....

The accuracy is 0.91


#### other experiments

In [53]:
# run_validation_pipeline('bert_pt')

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.928


In [57]:
# run_validation_pipeline('flair_pt')

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running pridictions....

The accuracy is 0.835


In [56]:
# run_validation_pipeline('glove')

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running pridictions....

The accuracy is 0.702


In [54]:
# run_validation_pipeline('lasbe')

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running pridictions....

The accuracy is 0.917


In [55]:
# run_validation_pipeline('use')

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running pridictions....

The accuracy is 0.885


#### sentence intersections

In [133]:
df = fm.read_annotated_df_with_embeddings('bert_pt')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

479

In [134]:
df = fm.read_annotated_df_with_embeddings('flair_pt')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

448

In [135]:
df = fm.read_annotated_df_with_embeddings('glove')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

403

In [136]:
df = fm.read_annotated_df_with_embeddings('lasbe')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

450

In [137]:
df = fm.read_annotated_df_with_embeddings('use')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

382

In [44]:
def describe_data_intersection(embedding_name):
  apply_seed(verbosity=False)
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  X = np.array(df['embeddings'].map(lambda x: np.array(x[0])).to_list())
  Y = pd.get_dummies(df['intent']).values
  X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

  data = df[['correct_txt', 'intent', 'embeddings']]

  data_train, data_test = train_test_split(data, test_size=0.3, random_state=42)

  if not (X_train == np.array(data_train['embeddings'].map(lambda x: np.array(x[0])).to_list())).all():
    print('The train data it isn\'t equal...')

  count_examples_in_train_test_data = data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()
  count_examples_in_train_data = data_to_valid[data_to_valid['txt'].isin(data_train['correct_txt'])]['txt'].count()

  # count_examples_in_data = data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

  clustering_labels = data_train[data_train['correct_txt'].isin(data_to_valid['txt'])].sort_values('correct_txt')['intent'].to_numpy()

  manual_labels = data_to_valid[data_to_valid['txt'].isin(data_train['correct_txt'])].sort_values('txt')['intent'].to_numpy()

  count_correct_labels = np.equal(clustering_labels, manual_labels).sum()

  print(f'The data has {len(X)} sentences and {len(X_train)} examples in the train data')
  print(f'The total of sentences of manual anotation in the train/test data is {count_examples_in_train_test_data}')
  print(f'There is {count_examples_in_train_data} examples of manual anotation in the train data')
  print(f'There is {count_correct_labels} examples of manual anotation with correct label in the train data')

In [45]:
describe_data_intersection('bert_pt')

The data has 8837 sentences and 6185 examples in the train data
The total of sentences of manual anotation in the train/test data is 479
There is 317 examples of manual anotation in the train data
There is 298 examples of manual anotation with correct label in the train data


In [46]:
describe_data_intersection('flair_pt')

The data has 8840 sentences and 6188 examples in the train data
The total of sentences of manual anotation in the train/test data is 448
There is 314 examples of manual anotation in the train data
There is 278 examples of manual anotation with correct label in the train data


In [47]:
describe_data_intersection('glove')

The data has 8900 sentences and 6230 examples in the train data
The total of sentences of manual anotation in the train/test data is 403
There is 282 examples of manual anotation in the train data
There is 225 examples of manual anotation with correct label in the train data


In [48]:
describe_data_intersection('lasbe')

The data has 8007 sentences and 5604 examples in the train data
The total of sentences of manual anotation in the train/test data is 450
There is 312 examples of manual anotation in the train data
There is 290 examples of manual anotation with correct label in the train data


In [49]:
describe_data_intersection('use')

The data has 6928 sentences and 4849 examples in the train data
The total of sentences of manual anotation in the train/test data is 382
There is 277 examples of manual anotation in the train data
There is 257 examples of manual anotation with correct label in the train data


In [50]:
# df_annotated[df_annotated['txt'].str.contains('\'') | df_annotated['txt'].str.contains('"')]['txt'].to_numpy()