In [None]:
%load_ext autoreload

%autoreload 2

In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import numpy as np
import os
import random as rn
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
import tensorflow as tf


from src.core import file_manager as fm

In [None]:
def apply_seed(verbosity=True, seed=42):
  """Seed the random number generators used by this notebook.

  Seeds NumPy, the stdlib ``random`` module and TensorFlow, and exports
  ``PYTHONHASHSEED`` to the environment.

  Parameters
  ----------
  verbosity : bool
      When true, print a short notice before seeding.
  seed : int
      Value handed to every generator. Defaults to 42, the value that used
      to be hard-coded here, so existing calls behave as before.
  """
  if verbosity:
    print('Applying seed')
  # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
  # assignment only reaches child processes, not the already-running kernel.
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)
  rn.seed(seed)
  tf.random.set_seed(seed)

In [None]:
def build_model(embedding_dim, num_labels, hidden_units=64, dropout_rate=0.1,
                hidden_activation='softmax'):
  """Build and compile the feed-forward intent classifier.

  The network is Dense(hidden_units) -> Dropout(dropout_rate) ->
  Dense(num_labels, softmax), compiled with categorical cross-entropy,
  the Adam optimizer and accuracy as the tracked metric.

  Parameters
  ----------
  embedding_dim : int
      Length of one input embedding vector.
  num_labels : int
      Number of output classes (width of the final softmax layer).
  hidden_units : int
      Width of the hidden layer. Defaults to 64 (the original value).
  dropout_rate : float
      Dropout applied after the hidden layer. Defaults to 0.1 (the original value).
  hidden_activation : str
      Activation of the hidden layer. Defaults to 'softmax' so that models
      built without this argument are unchanged.

  Returns
  -------
  The compiled Sequential model.
  """
  # NOTE(review): softmax on a hidden layer is unusual ('relu' is the common
  # choice). The default is kept because the metrics recorded in this notebook
  # were produced with it; pass hidden_activation explicitly to experiment.
  model = Sequential()
  model.add(Dense(hidden_units, activation=hidden_activation, input_shape=(embedding_dim,)))
  model.add(Dropout(dropout_rate))
  model.add(Dense(num_labels, activation='softmax'))
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

In [139]:
def save_df(df_data, embedding_name, suffix, actor = 'patient'):
  """Write df_data to CSV without its 'embeddings' column.

  The destination is resolved through fm.filename_from_data_dir from
  'output/neural_models/{actor}/{embedding_name}/data_{suffix}.csv'.
  The frame passed in is not modified; the row index is not written.
  """
  relative_csv = f'output/neural_models/{actor}/{embedding_name}/data_{suffix}.csv'
  target_csv = fm.filename_from_data_dir(relative_csv)

  without_vectors = df_data.drop(columns=['embeddings'])
  without_vectors.to_csv(target_csv, index=False)

def prepare_train_test_data(embedding_name, actor='patient'):
  """Load the annotated sentences, split them 70/30 and build model inputs.

  Both splits are written to disk through save_df before being converted.

  Parameters
  ----------
  embedding_name : str
      Name handed to fm.read_annotated_df_with_embeddings and used in the
      output paths.
  actor : str
      Forwarded to save_df for the output paths.

  Returns
  -------
  X_train, X_test : np.ndarray
      One row per sentence, taken from the first element of each
      'embeddings' entry.
  Y_train, Y_test : np.ndarray
      One-hot intent matrices. Both have one column per intent found in the
      full data set, in the same column order.
  labels : np.ndarray
      The class indices 0 .. n_intents - 1.
  """
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)

  save_df(df_train, embedding_name, suffix='train', actor=actor)
  save_df(df_test, embedding_name, suffix='test', actor=actor)

  # Encode both splits against ONE category list taken from the full frame.
  # Encoding each split on its own drops the column of any intent absent from
  # that split, which shifts the class indices and breaks the argmax comparison.
  # pd.Categorical orders categories the same way pd.get_dummies does.
  intent_categories = pd.Categorical(df['intent']).categories

  def to_features(frame):
    return np.array(frame['embeddings'].map(lambda x: np.array(x[0])).to_list())

  def to_one_hot(frame):
    return pd.get_dummies(pd.Categorical(frame['intent'], categories=intent_categories)).values

  X_train, Y_train = to_features(df_train), to_one_hot(df_train)
  X_test, Y_test = to_features(df_test), to_one_hot(df_test)

  labels = np.arange(len(intent_categories))

  return X_train, X_test, Y_train, Y_test, labels

In [172]:
def compute_metrics(y_true, y_pred, labels):
  """Score predictions against the reference labels.

  Precision, recall and F1 are computed with average='weighted' over
  `labels`; accuracy is computed over all samples.

  Returns a dict with the keys 'precision', 'recall', 'f1' and 'accuracy'.
  """
  weighted = {'average': 'weighted', 'labels': labels}

  return {
    'precision': metrics.precision_score(y_true, y_pred, **weighted),
    'recall': metrics.recall_score(y_true, y_pred, **weighted),
    'f1': metrics.f1_score(y_true, y_pred, **weighted),
    'accuracy': metrics.accuracy_score(y_true, y_pred),
  }

In [196]:
def run_pipeline(embedding_name, actor='patient'):
  """Train, save and score the intent classifier for one embedding.

  Steps: seed the RNGs, build the train/test arrays, fit the model with
  early stopping on the validation loss, save it as
  'output/neural_models/{actor}/{embedding_name}/model.h5' (resolved through
  fm.filename_from_data_dir), then score it on the held-out split.

  Returns the dict produced by compute_metrics.
  """
  apply_seed()

  X_train, X_test, Y_train, Y_test, labels = prepare_train_test_data(
    embedding_name=embedding_name,
    actor=actor,
  )

  n_features, n_classes = X_train.shape[1], Y_train.shape[1]
  model = build_model(n_features, n_classes)

  print('training model')
  # Stop once val_loss has failed to improve by at least 1e-4 for 3 epochs.
  stop_on_plateau = EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)
  model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.1,
    verbose=0,
    callbacks=[stop_on_plateau],
  )

  model.save(fm.filename_from_data_dir(f'output/neural_models/{actor}/{embedding_name}/model.h5'))

  # Collapse one-hot rows / probability rows to class indices.
  y_true = np.argmax(Y_test, axis=1)
  y_pred = np.argmax(model.predict(X_test), axis=1)

  return compute_metrics(y_true, y_pred, labels)

In [197]:
run_pipeline('bert_pt')

Applying seed
training model


{'precision': 0.9153824130442597,
 'recall': 0.916289592760181,
 'f1': 0.9152942105448022,
 'accuracy': 0.916289592760181}

In [198]:
run_pipeline('flair_pt')

Applying seed
training model


{'precision': 0.9277105620580804,
 'recall': 0.9268476621417798,
 'f1': 0.9235201521431274,
 'accuracy': 0.9268476621417798}

In [199]:
run_pipeline('glove')

Applying seed
training model


{'precision': 0.9672050536104059,
 'recall': 0.9670411985018726,
 'f1': 0.9660648683486441,
 'accuracy': 0.9670411985018726}

In [200]:
run_pipeline('lasbe')

Applying seed
training model


{'precision': 0.9724620632678047,
 'recall': 0.9725343320848939,
 'f1': 0.9724436745946483,
 'accuracy': 0.9725343320848939}

In [201]:
run_pipeline('use')

Applying seed
training model


{'precision': 0.977776164703149,
 'recall': 0.9778739778739779,
 'f1': 0.9778047029432841,
 'accuracy': 0.9778739778739779}

### Read the manually labelled data and use it for testing

In [202]:
def get_data_to_valid(variation='not_used_sentences_with_label_manual'):
  """Load a manually labelled CSV and drop the sentences labelled 'others'.

  The file is 'output/patient/manual_label/{variation}.csv', resolved through
  fm.filename_from_data_dir. An 'intent_index' column is added by mapping
  'intent' through the notebook-level intent_indexes_dict. The mapping runs
  before the filter, so intents missing from the dict yield NaN in that column.
  """
  csv_path = fm.filename_from_data_dir(f'output/patient/manual_label/{variation}.csv')

  labelled = pd.read_csv(csv_path)
  labelled = labelled.assign(intent_index=labelled['intent'].map(intent_indexes_dict))

  is_known_intent = labelled['intent'] != 'others'
  return labelled.loc[is_known_intent]

def get_weighted_data(df_data, length_for_intent = 25):
  """Keep at most `length_for_intent` rows per intent.

  Intents are visited in order of first appearance. For each one, the leading
  rows carrying that intent are kept, and the pieces are stacked into a new
  frame with a fresh 0..n-1 index.
  """
  per_intent_rows = []
  for intent_name in df_data['intent'].unique():
    rows_for_intent = df_data.loc[df_data['intent'] == intent_name]
    per_intent_rows.append(rows_for_intent.iloc[:length_for_intent])

  return pd.concat(per_intent_rows, ignore_index=True)

In [280]:
def read_df_embeddings(embedding_name, actor='patient'):
  """Return the annotated sentences with their embeddings attached.

  Reads 'embeddings/{embedding_name}/text_emb_{actor}.json' (JSON lines) and
  'output/patient/annotated_sentences.csv', copies the 'embeddings' column
  from the former onto the latter, and returns only 'txt' and 'embeddings'.
  """
  embeddings_path = fm.filename_from_data_dir(f'embeddings/{embedding_name}/text_emb_{actor}.json')
  embeddings_frame = fm.read_json_of_dir(embeddings_path, lines=True)

  # NOTE(review): this path always points at 'patient', whatever `actor` is — confirm.
  sentences_path = fm.filename_from_data_dir('output/patient/annotated_sentences.csv')
  sentences = pd.read_csv(sentences_path)

  # The assignment aligns on the row index. NOTE(review): presumably both files
  # list the same sentences in the same order — verify, since a mismatch
  # would silently produce NaN or misassigned embeddings.
  sentences['embeddings'] = embeddings_frame['embeddings']

  return sentences[['txt', 'embeddings']]

def get_validation_data_with_embeddings(embedding_name):
  """Attach embeddings to the notebook-level `data_to_valid` frame.

  Performs a left join on 'txt', so every row of data_to_valid is kept;
  sentences without a match get a missing value in 'embeddings'.
  """
  embeddings_by_text = read_df_embeddings(embedding_name)

  return data_to_valid.merge(embeddings_by_text, on='txt', how='left')

def run_validation_pipeline(embedding_name, correct_labels=None, labels=None):
  """Score a saved model on the manually labelled validation sentences.

  Parameters
  ----------
  embedding_name : str
      Selects both the embeddings to load and the saved model
      ('output/neural_models/patient/{embedding_name}/model.h5').
  correct_labels : array-like, optional
      Ground-truth class index per validation row. When omitted, the
      'intent_index' column of the merged validation frame is used, which
      keeps labels and embeddings row-aligned.
  labels : array-like, optional
      Class indices handed to compute_metrics. When omitted, the
      notebook-level `intent_indexes` array is used.

  Returns
  -------
  The dict produced by compute_metrics.

  Raises
  ------
  ValueError
      If any validation sentence has no embedding after the left merge.
  """
  print('Loading validation data....')
  df_with_embeddings = get_validation_data_with_embeddings(embedding_name)

  # The left merge leaves a missing value for sentences without an embedding;
  # fail with a clear message instead of a TypeError inside the lambda below.
  missing_embeddings = df_with_embeddings['embeddings'].isna()
  if missing_embeddings.any():
    raise ValueError(
      f'{int(missing_embeddings.sum())} validation sentences have no '
      f'{embedding_name} embedding'
    )

  x_validation = np.array(df_with_embeddings['embeddings'].map(lambda x: np.array(x[0])).to_list())

  # Defaults keep the single-argument calls in later cells working.
  if correct_labels is None:
    correct_labels = df_with_embeddings['intent_index'].to_numpy()
  if labels is None:
    labels = intent_indexes

  print(f'The embedding: {embedding_name} has a dimensionality of: {x_validation.shape[1]}')

  print('Loading model....')
  path_model = fm.filename_from_data_dir(f'output/neural_models/patient/{embedding_name}/model.h5')
  model = load_model(path_model)

  print('Running predictions....')
  predictions = model.predict(x_validation)
  intent_predicts = np.argmax(predictions, axis=1)

  return compute_metrics(correct_labels, intent_predicts, labels)

In [204]:
# Integer id for each intent name; get_data_to_valid uses this mapping to fill
# the 'intent_index' column.
# NOTE(review): the ids presumably have to match the column order that
# pd.get_dummies produced for the training intents — confirm.
intent_indexes_dict = dict(
  greeting=0,
  inform_medicine=1,
  inform_symptoms=2,
  request_inform=3,
)

# The ids alone, as an array.
intent_indexes = np.array([*intent_indexes_dict.values()])

#### With unseen data (weighted)

In [277]:
# Load the manually labelled sentences (get_data_to_valid already drops 'others').
data_to_valid = get_data_to_valid()
print(data_to_valid['intent'].value_counts())

print('\nData weighted:')
# Keep at most 30 sentences per intent. This rebinds the global `data_to_valid`,
# which get_validation_data_with_embeddings reads when it is called later.
data_to_valid = get_weighted_data(data_to_valid, 30)
print(data_to_valid['intent'].value_counts())

# Ground-truth class ids, in the row order of data_to_valid.
correct_labels = data_to_valid['intent_index'].to_numpy()

correct_labels.shape

inform_symptoms    102
request_inform      47
inform_medicine     33
greeting            30
Name: intent, dtype: int64

Data weighted:
request_inform     30
inform_symptoms    30
inform_medicine    30
greeting           30
Name: intent, dtype: int64


(120,)

In [282]:
run_validation_pipeline('bert_pt', correct_labels, intent_indexes)

Loading validation data....
The embedding: bert_pt has a dimensionality of: 768
Loading model....
Running predictions....


{'precision': 0.8372586532963892,
 'recall': 0.7583333333333333,
 'f1': 0.7502747620015597,
 'accuracy': 0.7583333333333333}

In [283]:
run_validation_pipeline('flair_pt', correct_labels, intent_indexes)

Loading validation data....
The embedding: flair_pt has a dimensionality of: 4096
Loading model....
Running predictions....


{'precision': 0.8138513513513513,
 'recall': 0.6083333333333333,
 'f1': 0.557632528761561,
 'accuracy': 0.6083333333333333}

In [284]:
run_validation_pipeline('glove', correct_labels, intent_indexes)

Loading validation data....
The embedding: glove has a dimensionality of: 300
Loading model....
Running predictions....


  _warn_prf(average, modifier, msg_start, len(result))


{'precision': 0.48044871794871796,
 'recall': 0.35833333333333334,
 'f1': 0.28138474295190713,
 'accuracy': 0.35833333333333334}

In [285]:
run_validation_pipeline('lasbe', correct_labels, intent_indexes)

Loading validation data....
The embedding: lasbe has a dimensionality of: 768
Loading model....
Running predictions....


{'precision': 0.8605144855144855,
 'recall': 0.825,
 'f1': 0.8289745552676587,
 'accuracy': 0.825}

In [286]:
run_validation_pipeline('use', correct_labels, intent_indexes)

Loading validation data....
The embedding: use has a dimensionality of: 512
Loading model....
Running predictions....


{'precision': 0.8415325670498084,
 'recall': 0.7416666666666667,
 'f1': 0.7491805813234386,
 'accuracy': 0.7416666666666667}

#### With unseen data (weighted)

In [None]:
run_validation_pipeline('bert_pt')

In [None]:
run_validation_pipeline('flair_pt')

In [None]:
run_validation_pipeline('glove')

In [None]:
run_validation_pipeline('lasbe')

In [None]:
run_validation_pipeline('use')

#### With unseen data (not weighted)

In [None]:
run_validation_pipeline('bert_pt')

In [None]:
run_validation_pipeline('flair_pt')

In [None]:
run_validation_pipeline('glove')

In [None]:
run_validation_pipeline('lasbe')

In [None]:
run_validation_pipeline('use')

#### not trained data

In [None]:
run_validation_pipeline('bert_pt')

In [None]:
run_validation_pipeline('flair_pt')

In [None]:
run_validation_pipeline('glove')

In [None]:
run_validation_pipeline('lasbe')

In [None]:
run_validation_pipeline('use')

#### other experiments

In [None]:
# run_validation_pipeline('bert_pt')

In [None]:
# run_validation_pipeline('flair_pt')

In [None]:
# run_validation_pipeline('glove')

In [None]:
# run_validation_pipeline('lasbe')

In [None]:
# run_validation_pipeline('use')

#### sentence intersections

In [None]:
df = fm.read_annotated_df_with_embeddings('bert_pt')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

In [None]:
df = fm.read_annotated_df_with_embeddings('flair_pt')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

In [None]:
df = fm.read_annotated_df_with_embeddings('glove')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

In [None]:
df = fm.read_annotated_df_with_embeddings('lasbe')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

In [None]:
df = fm.read_annotated_df_with_embeddings('use')

data_to_valid[data_to_valid['txt'].isin(df['correct_txt'])]['txt'].count()

In [None]:
def describe_data_intersection(embedding_name):
  """Report how much of the manually labelled data overlaps the training data.

  Reproduces the 70/30 split used for training (same test_size and
  random_state), then prints:
    * the size of the full data set and of the train split,
    * how many sentences of the notebook-level `data_to_valid` appear
      anywhere in the annotated data,
    * how many of them appear in the train split,
    * for how many of those the training label equals the manual label.

  Nothing is returned; the figures are printed.
  """
  apply_seed(verbosity=False)
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  # Splitting the frame is enough: the row selection depends only on the
  # number of rows and random_state, not on which columns are present.
  data_train, _ = train_test_split(df[['correct_txt', 'intent']], test_size=0.3, random_state=42)

  in_train_test = data_to_valid['txt'].isin(df['correct_txt'])
  in_train = data_to_valid['txt'].isin(data_train['correct_txt'])

  count_examples_in_train_test_data = data_to_valid.loc[in_train_test, 'txt'].count()
  count_examples_in_train_data = data_to_valid.loc[in_train, 'txt'].count()

  # Pair each manual label with its training label by joining on the sentence
  # text. Sorting both sides and comparing position by position misaligns (or
  # fails to broadcast) as soon as a sentence is duplicated on either side.
  # The train side is de-duplicated so every manual sentence is counted once.
  train_labels = data_train.drop_duplicates('correct_txt')
  paired = pd.merge(
    data_to_valid.loc[in_train, ['txt', 'intent']],
    train_labels,
    left_on='txt',
    right_on='correct_txt',
    how='inner',
    suffixes=('_manual', '_clustering'),
  )
  count_correct_labels = int((paired['intent_manual'] == paired['intent_clustering']).sum())

  print(f'The data has {len(df)} sentences and {len(data_train)} examples in the train data')
  print(f'The total of sentences of manual anotation in the train/test data is {count_examples_in_train_test_data}')
  print(f'There is {count_examples_in_train_data} examples of manual anotation in the train data')
  print(f'There is {count_correct_labels} examples of manual anotation with correct label in the train data')

In [None]:
describe_data_intersection('bert_pt')

In [None]:
describe_data_intersection('flair_pt')

In [None]:
describe_data_intersection('glove')

In [None]:
describe_data_intersection('lasbe')

In [None]:
describe_data_intersection('use')

In [None]:
# df_annotated[df_annotated['txt'].str.contains('\'') | df_annotated['txt'].str.contains('"')]['txt'].to_numpy()