In [1]:
import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *

from src.core import file_manager  as fm

In [None]:
# Start the Spark NLP session that the Spark cells below use as `spark`.
# NOTE(review): `spark32=True` presumably selects the Spark 3.2 build — confirm it is
# still accepted by the installed sparknlp version.
spark = sparknlp.start(spark32=True)

In [5]:
# Sample sentence for the single-row DataFrame built below.
text = 'Peter Parker is a nice guy and lives in New York'

# One row, one column; the column is renamed to "text".
spark_df = spark.createDataFrame([[text]]).toDF("text")

# truncate=False prints the full sentence instead of a shortened cell.
spark_df.show(truncate=False)

+------------------------------------------------+
|text                                            |
+------------------------------------------------+
|Peter Parker is a nice guy and lives in New York|
+------------------------------------------------+



In [6]:
# documentAssembler = DocumentAssembler().setInputCol("txt").setOutputCol("document")

# sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

# tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

In [34]:
# text = 'Peter Parker (Spiderman) is a nice guy and lives in New York but has no e-mail!'
def get_tokens(text):
    """Tokenize one string with Spark NLP and return the tokens of that row.

    The string is placed in a one-row DataFrame (column ``text``), assembled
    into a ``document`` column and tokenized into a ``token`` column. The
    ``result`` field of the token annotations is collected through pandas and
    the value for the first (only) row is returned.

    Relies on the notebook-level ``spark`` session.
    """
    input_df = spark.createDataFrame([[text]]).toDF("text")

    assembler = DocumentAssembler().setInputCol("text").setOutputCol("document")
    documents = assembler.transform(input_df)

    # The Tokenizer has to be fitted before it can transform.
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    tokenized = tokenizer.fit(documents).transform(documents)

    token_results = tokenized.select('token.result').toPandas()
    return token_results.loc[0]['result']

In [35]:
get_tokens('Eu te amo, gata!')

['Eu', 'te', 'amo', ',', 'gata', '!']

In [3]:
# Raw `txt` column -> document -> sentences -> tokens.
documentAssembler = DocumentAssembler().setInputCol("txt").setOutputCol("document")

sentence_detector = SentenceDetector().setInputCols(["document"]).setOutputCol("sentence")

tokenizer = Tokenizer().setInputCols(["sentence"]).setOutputCol("token")

# Pretrained "glove_6B_300" word vectors (language code "xx").
# `pretrained` is invoked on the class itself, so no throwaway WordEmbeddingsModel
# instance is constructed just to reach the loader.
word_embedding = WordEmbeddingsModel.pretrained("glove_6B_300", "xx") \
    .setInputCols("document", "token") \
    .setOutputCol("embeddings")

# Pool the word vectors of each document into one vector using the AVERAGE strategy.
embeddings_sentenses = SentenceEmbeddings().setInputCols(['document', 'embeddings']) \
    .setOutputCol('sentence_embeddings').setPoolingStrategy('AVERAGE')

stages = [documentAssembler, sentence_detector, tokenizer, word_embedding, embeddings_sentenses]

use_clf_pipeline = Pipeline(stages=stages)

# Fit on a one-row frame holding an empty string in `txt`.
# NOTE(review): presumably none of the stages learns from data and the fit only
# serves to obtain a PipelineModel — confirm.
use_pipeline_model = use_clf_pipeline.fit(spark.createDataFrame([[""]]).toDF("txt"))

glove_6B_300 download started this may take some time.
Approximate size to download 426,2 MB
[ | ]glove_6B_300 download started this may take some time.
Approximate size to download 426,2 MB
Download done! Loading the resource.
[OK!]


In [4]:
# Location of the annotated-sentences CSV, resolved through fm.filename_from_data_dir.
path_annotated_sentences = fm.filename_from_data_dir('output/patient/annotated_sentences.csv')

# Read the CSV with a header row, keep only `txt`, and cap the frame at ten rows.
data = (
    spark.read
    .option("header", True)
    .csv(path_annotated_sentences)
    .select('txt')
    .limit(10)
)

data.show(truncate=False)

+--------------------------------------------------------------+
|txt                                                           |
+--------------------------------------------------------------+
|Tô  muito nervosa Marcela com tudo isso                       |
|O meu problema  é  psicológico    nem dormi estou  conseguindo|
|Com muito medo                                                |
|Sim                                                           |
|A 3 meses atrás  quando o meu esposo faleceu                  |
|Não                                                           |
|Tem algum remédio  que a pessoa tome para se acalmar          |
|Pra  ficar tranquila                                          |
|Tá  certo                                                     |
|Tá  bom                                                       |
+--------------------------------------------------------------+



In [5]:


# output_columns = ['txt', 'token', 'embeddings', 'sentence_embeddings']
# df_embeddings = use_pipeline_model.transform(self.annotated_sentences).select(output_columns)

# Run the fitted pipeline over the loaded sentences and keep two rows for inspection.
result = use_pipeline_model.transform(data).limit(2)

# Column names present after the transform (input column plus one per pipeline stage).
print(result.columns)

result.show()

['txt', 'document', 'sentence', 'token', 'embeddings', 'sentence_embeddings']
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|                 txt|            document|            sentence|               token|          embeddings| sentence_embeddings|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|Tô  muito nervosa...|[{document, 0, 38...|[{document, 0, 38...|[{token, 0, 1, Tô...|[{word_embeddings...|[{sentence_embedd...|
|O meu problema  é...|[{document, 0, 61...|[{document, 0, 61...|[{token, 0, 0, O,...|[{word_embeddings...|[{sentence_embedd...|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+



In [6]:
# Columns to display/export: the raw text plus fields pulled out of the annotation columns.
# The list mixes plain column names (str) with Column expressions.
output_columns = [
  'txt',
  result.token.result.alias('tokens'),  # `result` field of the token annotations
  result.embeddings.embeddings.alias('word_embeddings'),  # `embeddings` field of the word-embedding annotations
  'sentence_embeddings.embeddings'  # selected by name without an alias, so the output column is called `embeddings`
]

result.select(output_columns).show()

+--------------------+--------------------+--------------------+--------------------+
|                 txt|              tokens|     word_embeddings|          embeddings|
+--------------------+--------------------+--------------------+--------------------+
|Tô  muito nervosa...|[Tô, muito, nervo...|[[0.14694, -0.524...|[[0.08082773, -0....|
|O meu problema  é...|[O, meu, problema...|[[-0.0018332, -0....|[[0.14869732, -0....|
+--------------------+--------------------+--------------------+--------------------+



In [104]:
# Target path for the JSON export, resolved through fm.filename_from_data_dir.
dir_file = fm.filename_from_data_dir('output/glove_spark.json')

# Spark's default save mode raises when the target already exists, which made this
# cell fail on every re-run. Overwrite the previous export explicitly instead.
result.select(output_columns).write.mode("overwrite").json(dir_file)

In [106]:
# Load the export written to `dir_file` back through the project helper.
# NOTE(review): `lines=True` presumably means one JSON object per line, matching what
# Spark's JSON writer produces — confirm against fm.read_json_of_dir.
df = fm.read_json_of_dir(dir_file, lines=True)

df

Unnamed: 0,txt,tokens,word_embeddings,embeddings
0,Tô muito nervosa Marcela com tudo isso,"[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.14694000000000002, -0.5241800000000001, -0...","[[0.08082773, -0.035568573, -0.048247125, 0.06..."
1,O meu problema é psicológico nem dormi es...,"[O, meu, problema, é, psicológico, nem, dormi,...","[[-0.0018331999999999999, -0.3367, -0.13084, 0...","[[0.14869732, -0.14268143, -0.074482225, 0.045..."


In [1]:
import pandas as pd

from src.core import file_manager as fm

In [11]:
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.data import Sentence

In [12]:
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings, WordEmbeddings, StackedEmbeddings
from flair.data import Sentence

# Flair embeddings loaded by the names 'pt-forward' and 'pt-backward'.
flair_embedding_forward = FlairEmbeddings('pt-forward')
flair_embedding_backward = FlairEmbeddings('pt-backward')
# Two embedders built from the same forward/backward pair. get_sentence_embedding reads
# `sentence.embedding` after the first one and `token.embedding` after the second.
document_pool_embeddings = DocumentPoolEmbeddings([flair_embedding_forward, flair_embedding_backward])
stacked_embeddings = StackedEmbeddings([flair_embedding_forward, flair_embedding_backward])


def get_sentence_embedding(txt):
  """Embed ``txt`` with the notebook-level flair embedders.

  ``txt`` is coerced with ``str`` and wrapped in a flair ``Sentence``, which is
  passed first to ``document_pool_embeddings`` and then to ``stacked_embeddings``.

  Returns a dict with three keys:
    'tokens'          -- text of every token in the sentence
    'word_embeddings' -- every token's embedding converted to a list
    'embeddings'      -- one-element list holding the sentence embedding as a list
  """
  flair_sentence = Sentence(str(txt))

  # Document-level pooling first, then the stacked per-token embedder.
  for embedder in (document_pool_embeddings, stacked_embeddings):
    embedder.embed(flair_sentence)

  token_texts = []
  token_vectors = []
  for token in flair_sentence.tokens:
    token_texts.append(token.text)
    token_vectors.append(token.embedding.tolist())

  sentence_vector = flair_sentence.embedding.tolist()

  return {
    'tokens': token_texts,
    'word_embeddings': token_vectors,
    'embeddings': [sentence_vector],
  }

In [7]:
# Load the annotated sentences CSV (path resolved through fm.filename_from_data_dir) into pandas.
df_annotated = pd.read_csv(fm.filename_from_data_dir('output/patient/annotated_sentences.csv'))

# Preview the first ten rows.
df_annotated.head(10)

Unnamed: 0,txt,annotated_txt
0,Tô muito nervosa Marcela com tudo isso,Tô muito nervosa Marcela com tudo isso
1,O meu problema é psicológico nem dormi es...,O meu problema é psicológico nem dormi es...
2,Com muito medo,Com muito medo
3,Sim,Sim
4,A 3 meses atrás quando o meu esposo faleceu,A 3 meses atrás quando o meu esposo faleceu
5,Não,Não
6,Tem algum remédio que a pessoa tome para se a...,Tem algum remédio que a pessoa tome para se a...
7,Pra ficar tranquila,Pra ficar tranquila
8,Tá certo,Tá certo
9,Tá bom,Tá bom


In [124]:
# Place the annotated sentences and the embedding columns side by side
# (column-wise concat, rows aligned on the index).
# NOTE(review): `df_data` is not assigned in any cell visible in this notebook, so on a
# fresh kernel this cell raises NameError. Presumably it was built by applying
# get_sentence_embedding to the sentences — confirm and restore the cell that creates it.
df = pd.concat([df_annotated, df_data], axis=1)

df

Unnamed: 0,tokens,word_embeddings,embeddings
0,"[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.0035118244122713804, -0.005027011502534151...","[[0.0005851125461049378, -0.000787119206506758..."
1,"[O, meu, problema, é, psicológico, nem, dormi,...","[[-0.00020597137336153537, -0.0036803667899221...","[[0.0023417221382260323, -0.000689701177179813..."
2,"[Com, muito, medo]","[[-0.00020977245003450662, -0.0014247068902477...","[[-0.0010116567136719823, -0.00071526312967762..."
3,[Sim],"[[-0.02557721920311451, 0.0003712331235874444,...","[[-0.02557721920311451, 0.0003712331235874444,..."
4,"[A, 3, meses, atrás, quando, o, meu, esposo, f...","[[-0.00011327546963002533, -0.0050820899195969...","[[-0.0020755650475621223, -0.00134362292010337..."


Unnamed: 0,txt,annotated_txt,tokens,word_embeddings,embeddings
0,Tô muito nervosa Marcela com tudo isso,Tô muito nervosa Marcela com tudo isso,"[Tô, muito, nervosa, Marcela, com, tudo, isso]","[[0.0035118244122713804, -0.005027011502534151...","[[0.0005851125461049378, -0.000787119206506758..."
1,O meu problema é psicológico nem dormi es...,O meu problema é psicológico nem dormi es...,"[O, meu, problema, é, psicológico, nem, dormi,...","[[-0.00020597137336153537, -0.0036803667899221...","[[0.0023417221382260323, -0.000689701177179813..."
2,Com muito medo,Com muito medo,"[Com, muito, medo]","[[-0.00020977245003450662, -0.0014247068902477...","[[-0.0010116567136719823, -0.00071526312967762..."
3,Sim,Sim,[Sim],"[[-0.02557721920311451, 0.0003712331235874444,...","[[-0.02557721920311451, 0.0003712331235874444,..."
4,A 3 meses atrás quando o meu esposo faleceu,A 3 meses atrás quando o meu esposo faleceu,"[A, 3, meses, atrás, quando, o, meu, esposo, f...","[[-0.00011327546963002533, -0.0050820899195969...","[[-0.0020755650475621223, -0.00134362292010337..."


In [102]:
text = 'Estou tossindo, com febre e dor de cabeça, trysdas'

In [103]:
r = get_sentence_embedding(text)

In [104]:
len(r['tokens'])

11

In [None]:
r['word_embeddings'][-1]

In [65]:
# Scratch check: embed a single sentence with the stacked flair embeddings.
# NOTE(review): the output saved with this cell is
# "AttributeError: 'Sentence' object has no attribute 'tokenizer'", i.e. the cell did not
# run cleanly in the recorded session. It also re-creates embedders that an earlier
# cell already builds under the same names.
flair_embedding_forward = FlairEmbeddings('pt-forward')
flair_embedding_backward = FlairEmbeddings('pt-backward')

stacked_embeddings = StackedEmbeddings([flair_embedding_forward, flair_embedding_backward])

sentence = Sentence('Estou com muita febre e dor de cabeça, o que posso fazer para melhorar?')

stacked_embeddings.embed(sentence)


# for token in sentence:
#     print(token.text,'=', len(token.embedding))

# sentence.tokens[0].text

AttributeError: 'Sentence' object has no attribute 'tokenizer'

In [48]:

help(sentence)

Help on Sentence in module flair.data object:

class Sentence(DataPoint)
 |  Sentence(text: Union[str, List[str]], use_tokenizer: Union[bool, flair.data.Tokenizer] = True, language_code: str = None, start_position: int = 0)
 |  
 |  A Sentence is a list of tokens and is used to represent a sentence or text fragment.
 |  
 |  Method resolution order:
 |      Sentence
 |      DataPoint
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __copy__(self)
 |  
 |  __getitem__(self, subscript)
 |  
 |  __init__(self, text: Union[str, List[str]], use_tokenizer: Union[bool, flair.data.Tokenizer] = True, language_code: str = None, start_position: int = 0)
 |      Class to hold all meta related to a text (tokens, predictions, language code, ...)
 |      :param text: original string (sentence), or a list of string tokens (words)
 |      :param use_tokenizer: a custom tokenizer (default is :class:`SpaceTokenizer`)
 |          more advanced options are :class:`SegTokTokenizer` to use seg

In [11]:
embeddings = get_sentence_embedding('Eu estou com febre')

In [12]:
len(embeddings)

1

In [12]:
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.model_selection import train_test_split

from src.core import file_manager as fm

In [4]:
# Intent label -> integer class id; ids follow the order of the names listed here.
intent_indexes_dict = {
  intent: index
  for index, intent in enumerate(
    ('greeting', 'inform_medicine', 'inform_symptoms', 'request_inform')
  )
}

In [29]:
# Manually labelled sentences; the captured output shows the columns `txt` and `intent`.
path = fm.filename_from_data_dir('output/patient/sentences_with_label_manual_without_others.csv')

df_data_to_valid = pd.read_csv(path)

df_data_to_valid

Unnamed: 0,txt,intent
0,Meu nariz tá entupido né mas não tá me incomod...,inform_symptoms
1,"Boa noite, Valéria. Desde sexta que estou com ...",inform_symptoms
2,estava espirrando muito,inform_symptoms
3,Dipirona não vai tratar,inform_medicine
4,E como a tossi desaparece? Sem tomar remédio?,request_inform
...,...,...
589,Ele corre riscos?,request_inform
590,Unico sintoma so a tosse,inform_symptoms
591,Garganta doendo um pouco,inform_symptoms
592,Mas estava apouco com muito frio tomei dipiron...,inform_symptoms


In [5]:
def get_acurracy(embedding_name, test_size=0.3, random_state=42, max_iter=10000):
  """Train an SVM intent classifier on one embedding set and return its test accuracy.

  Args:
    embedding_name: name handed to ``fm.read_annotated_df_with_embeddings``; the
      returned frame must provide the columns ``embeddings`` and ``intent``.
    test_size: share of rows held out for evaluation (default 0.3).
    random_state: seed of the train/test split (default 42).
    max_iter: iteration cap given to ``svm.SVC`` (default 10000).

  Returns:
    Fraction of held-out rows whose predicted intent id equals the true one.

  Raises:
    ValueError: when the frame contains an intent that ``intent_indexes_dict``
      does not map. Such rows would otherwise become NaN labels.
  """
  df = fm.read_annotated_df_with_embeddings(embedding_name)

  labels = df['intent'].map(intent_indexes_dict)
  unmapped = labels.isna()
  if unmapped.any():
    unknown = sorted(df.loc[unmapped, 'intent'].astype(str).unique())
    raise ValueError(f"intents missing from intent_indexes_dict: {unknown}")

  # Element 0 of every `embeddings` entry is used as the feature vector
  # (assumes each entry is a one-element list wrapping the sentence vector — TODO confirm).
  X = np.array(df['embeddings'].map(lambda x: np.array(x[0])).to_list())
  y = labels.to_numpy()

  X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state
  )

  clf = svm.SVC(max_iter=max_iter, decision_function_shape='ovo')
  clf.fit(X_train, y_train)

  predict = clf.predict(X_test)

  correct_predict = np.equal(predict, y_test).sum()

  return correct_predict / len(y_test)

In [31]:
# Inline run of the SVM evaluation for the 'glove' embeddings. The statements mirror the
# body of get_acurracy, but leave df, X/y, the splits and clf available at notebook level
# (`df` is used again by the merge cell further down).
df = fm.read_annotated_df_with_embeddings('glove')

# Element 0 of every `embeddings` entry becomes the feature vector; intent names become integer ids.
X = np.array(df['embeddings'].map(lambda x: np.array(x[0])).to_list())
y = df['intent'].map(intent_indexes_dict).to_numpy()

# Hold out 30% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


clf = svm.SVC(max_iter=10000, decision_function_shape='ovo')
clf.fit(X_train, y_train)

predict = clf.predict(X_test)

# lin_clf = svm.LinearSVC()
# lin_clf.fit(X_train, y_train)
# predict = lin_clf.predict(X_test)

# Accuracy = number of matching predictions divided by the size of the test split.
correct_predict = np.equal(predict, y_test).sum()

acuracy = correct_predict / len(y_test)

acuracy

In [None]:
# Left join on `txt`: every manually labelled sentence is kept, with the columns of `df`
# attached wherever the text matches.
df_merged = df_data_to_valid.merge(df, on='txt', how='left')

df_merged

In [6]:
get_acurracy('bert_pt')

0.9487179487179487

In [7]:
get_acurracy('glove')


0.9775280898876404

In [8]:
get_acurracy('flair_pt')

0.9536199095022625

In [9]:
get_acurracy('lasbe')

0.9858510195588848

In [10]:
get_acurracy('use')

0.9894179894179894