In [1]:
import json
import keras
from keras.preprocessing.sequence import pad_sequences
import numpy as np

import sparknlp
from sparknlp.annotator import Tokenizer as SparkTokenizer
from sparknlp.base import DocumentAssembler

from src.core import file_manager as fm

2022-06-19 17:33:34.385984: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-19 17:33:34.386000: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Start the Spark session through Spark NLP. `Predictor.get_tokens` reads this
# module-level `spark` name, so this cell must run before any prediction cell.
spark = sparknlp.start(spark32=True)

22/06/19 17:33:36 WARN Utils: Your hostname, notavel resolves to a loopback address: 127.0.1.1; using 192.168.0.7 instead (on interface wlo1)
22/06/19 17:33:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /home/valmir/.ivy2/cache
The jars for the packages stored in: /home/valmir/.ivy2/jars
com.johnsnowlabs.nlp#spark-nlp-spark32_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7dd38078-7248-4425-abdc-9c7383df8ed8;1.0
	confs: [default]


:: loading settings :: url = jar:file:/home/valmir/dev/python/intent_classifier/venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found com.johnsnowlabs.nlp#spark-nlp-spark32_2.12;3.4.2 in central
	found com.typesafe#config;1.4.1 in central
	found org.rocksdb#rocksdbjni;6.5.3 in central
	found com.amazonaws#aws-java-sdk-bundle;1.11.603 in central
	found com.github.universal-automata#liblevenshtein;3.0.0 in central
	found com.google.code.findbugs#annotations;3.0.1 in central
	found net.jcip#jcip-annotations;1.0 in central
	found com.google.code.findbugs#jsr305;3.0.1 in central
	found com.google.protobuf#protobuf-java-util;3.0.0-beta-3 in central
	found com.google.protobuf#protobuf-java;3.0.0-beta-3 in central
	found com.google.code.gson#gson;2.3 in central
	found it.unimi.dsi#fastutil;7.0.12 in central
	found org.projectlombok#lombok;1.16.8 in central
	found org.slf4j#slf4j-api;1.7.21 in central
	found com.navigamez#greex;1.0 in central
	found dk.brics.automaton#automaton;1.11-8 in central
	found org.json4s#json4s-ext_2.12;3.7.0-M11 in central
	found joda-time#joda-time;2.10.10 in central
	found org.joda#joda-con

In [3]:
class Predictor:
  """Predict the intent of a sentence with a saved Keras model.

  The constructor loads three artifacts from
  ``output/lstm_models/patient/<embedding_name>`` (resolved through
  ``fm.filename_from_data_dir``): ``model.h5``, ``vocabullary.json`` and
  ``metadata.json``.

  Attributes:
    embedding_name: name of the embedding sub-directory to load from.
    working_dir: directory that holds the three artifacts.
    model: object returned by ``keras.models.load_model``.
    vocabullary: parsed ``vocabullary.json``; used as a token -> index mapping.
    metadata: parsed ``metadata.json``.
    intents: ``metadata['intents']``; indexed by the position of the model output.
    vector_length: ``int(metadata['vector_length'])``; length inputs are padded/truncated to.
  """

  def __init__(self, embedding_name):
    """Load model, vocabulary and metadata for ``embedding_name``."""
    self.embedding_name = embedding_name
    self.working_dir = fm.filename_from_data_dir(f'output/lstm_models/patient/{self.embedding_name}')
    self.model = self.load_lstm_model()
    self.vocabullary = self.load_vocabullary()
    self.metadata = self.load_metadata()
    self.intents = self.metadata['intents']
    self.vector_length = int(self.metadata['vector_length'])

  def _load_json(self, filename):
    """Parse and return the JSON file ``filename`` located in ``working_dir``."""
    # The context manager closes the handle even if parsing fails; the encoding
    # is explicit so decoding does not depend on the machine's locale.
    with open(f'{self.working_dir}/{filename}', 'r', encoding='utf-8') as file:
      return json.load(file)

  def load_lstm_model(self):
    """Return the Keras model stored as ``model.h5`` in ``working_dir``."""
    return keras.models.load_model(f'{self.working_dir}/model.h5')

  def load_vocabullary(self):
    """Return the parsed content of ``vocabullary.json`` in ``working_dir``."""
    return self._load_json('vocabullary.json')

  def load_metadata(self):
    """Return the parsed content of ``metadata.json`` in ``working_dir``."""
    return self._load_json('metadata.json')

  def get_tokens(self, text):
    """Tokenize ``text`` with Spark NLP and return the tokens of the first row.

    Relies on the module-level ``spark`` session created in an earlier cell.
    """
    spark_df = spark.createDataFrame([[text]]).toDF("text")

    doc_df = DocumentAssembler().setInputCol("text").setOutputCol("document").transform(spark_df)

    # NOTE(review): the tokenizer is re-created and re-fitted on every call.
    token_df = SparkTokenizer().setInputCols(["document"]).setOutputCol("token").fit(doc_df).transform(doc_df)

    return token_df.select('token.result').toPandas().loc[0]['result']

  def convert_text_to_input_model(self, tokens):
    """Map each token to its vocabulary index; unknown tokens map to 0."""
    return [self.vocabullary.get(token, 0) for token in tokens]

  def print_intent_probabilities(self, prediction):
    """Print every intent with its probability, highest first.

    Args:
      prediction: model output; only row 0 is read, one value per intent.
    """
    print('This is the distribuition of the prediction by intent:')
    probabilities = prediction[0]
    intent_probabilities = {intent: probabilities[index] for index, intent in enumerate(self.intents)}

    for intent, probability in sorted(intent_probabilities.items(), key=lambda item: -item[1]):
      print(f'{intent} = {(probability *100):.4f}%')

  def print_pred_real_label(self, text, print_probabilities=False):
    """Print the intent predicted for ``text`` and its probability.

    Args:
      text: sentence to classify.
      print_probabilities: when true, also print the full per-intent distribution.
    """
    tokens = self.get_tokens(text)

    indexes_vector = self.convert_text_to_input_model(tokens)

    # Pad (or cut) at the end so the single-row batch has exactly vector_length columns.
    padding_vector = pad_sequences(
      maxlen=self.vector_length, sequences=[indexes_vector], value=0, padding='post', truncating='post'
    )

    prediction = self.model.predict(padding_vector)

    # The batch holds one sentence, so pick the best class from row 0 explicitly
    # instead of taking argmax over the flattened 2-D output.
    probabilities = prediction[0]
    index_label_predicted = int(np.argmax(probabilities))

    predicted_label = self.intents[index_label_predicted]
    percentage = f'{(probabilities[index_label_predicted] *100):.4f}'

    print(f'The intent predicted was: {predicted_label} with: {percentage}%\n')

    if print_probabilities:
      self.print_intent_probabilities(prediction)

### Using GloVe predictor

In [None]:
# Load the model, vocabulary and metadata stored under the 'glove' embedding directory.
predictor_glove = Predictor('glove')

In [5]:
# Input (Portuguese): "I have a fever and a headache"
predictor_glove.print_pred_real_label('Eu estou com febre e dor de cabeça')

                                                                                

The intent predicted was: inform_symptoms with: 99.9562%



In [6]:
# Input (Portuguese): "Hello, good morning, thank you very much!"
predictor_glove.print_pred_real_label('Olá bom dia, muito obrigado!')

The intent predicted was: greeting with: 99.9407%



In [7]:
# Input (Portuguese): "I would like to know which medicine I should take for severe chest pain?"
predictor_glove.print_pred_real_label('Gostaria de saber qual remédio, eu devo tomar para fortes dores no peito?')

The intent predicted was: request_inform with: 81.4230%



In [8]:
# Input (Portuguese): "I am taking Tylenol and paracetamol"
predictor_glove.print_pred_real_label('Estou tomando Tylenol e paracetamol')

The intent predicted was: inform_medicine with: 99.4024%



### Using BERT predictor

In [9]:
# Load the model, vocabulary and metadata stored under the 'bert_pt' embedding directory.
predictor_bert_pt = Predictor('bert_pt')

In [10]:
# Input (Portuguese): "I have a fever and a headache"
predictor_bert_pt.print_pred_real_label('Eu estou com febre e dor de cabeça')

The intent predicted was: inform_symptoms with: 99.9995%



In [11]:
# Input (Portuguese): "Hello, good morning, thank you very much!"
predictor_bert_pt.print_pred_real_label('Olá bom dia, muito obrigado!')

The intent predicted was: greeting with: 99.9946%



In [12]:
# Input (Portuguese): "I would like to know which medicine I should take for severe chest pain?"
predictor_bert_pt.print_pred_real_label('Gostaria de saber qual remédio, eu devo tomar para fortes dores no peito?')

The intent predicted was: request_inform with: 99.9879%



In [13]:
# Input (Portuguese): "I am taking Tylenol and paracetamol"
predictor_bert_pt.print_pred_real_label('Estou tomando Tylenol e paracetamol')

The intent predicted was: inform_symptoms with: 75.7815%

