In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import pandas as pd
import numpy as np

from src.ml import pipeline
from src.core import file_manager

In [3]:
df_annotated = pipeline.get_annotated_df('use')

df_annotated.head()

Unnamed: 0,txt,label,distance,intent,annotated_txt
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garg...
1,Estou preocupado pois tenho asma,55,0.339475,inform_symptoms,Estou preocupado pois tenho [asma](SINTOMA)
2,Só ontem,10,0.259325,inform,Só ontem
3,Sim,12,0.09996,inform,Sim
4,Chenil e aerodin spray,54,0.700803,inform_medicine,Chenil e aerodin spray


In [4]:
df_embeddings = pipeline.get_embedding_dfs('use')

dict_embeddings = pipeline.generate_dict_embedding_text(df_embeddings)

In [5]:
df_annotated['label_index'] = df_annotated['intent'].map(pipeline.dict_labels)

df_annotated['embeddings'] = df_annotated['txt'].map(dict_embeddings)

In [6]:
print(pipeline.dict_labels)

{'inform': 1, 'inform_symptoms': 2, 'inform_medicine': 3, 'greeting': 4, 'request_inform': 5}


In [7]:
higher_distances_df = df_annotated[df_annotated['distance'] > 0.25]

higher_distances_df.head(2)

Unnamed: 0,txt,label,distance,intent,annotated_txt,label_index,embeddings
1,Estou preocupado pois tenho asma,55,0.339475,inform_symptoms,Estou preocupado pois tenho [asma](SINTOMA),2,"[-0.008343049, 0.03336755, -0.0464416560000000..."
2,Só ontem,10,0.259325,inform,Só ontem,1,"[0.06786111, -0.05606858, -0.0013129263, -0.03..."


In [8]:
df_to_predict = pipeline.generate_df_from_x_y(
    x_data=higher_distances_df['embeddings'].to_numpy(),
    y_data=higher_distances_df['label_index'].to_numpy()
)


df_to_predict.head(2)

Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511,label
0,-0.008343,0.033368,-0.046442,-0.052166,-0.091297,-0.001843,0.033507,0.027626,-0.082742,0.012627,...,-0.069063,0.066489,0.016825,0.01951,0.005963,0.065603,-0.042441,0.007935,0.070579,2
1,0.067861,-0.056069,-0.001313,-0.030145,-0.092034,-0.000798,0.052008,0.061108,-0.091092,-0.012032,...,-0.049187,0.080142,0.014424,0.043665,-0.016288,0.010631,-0.091025,0.029442,-0.074986,1


In [9]:
import h2o

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.15" 2022-04-19; OpenJDK Runtime Environment (build 11.0.15+10-Ubuntu-0ubuntu0.22.04.1); OpenJDK 64-Bit Server VM (build 11.0.15+10-Ubuntu-0ubuntu0.22.04.1, mixed mode, sharing)
  Starting server from /home/valmir/dev/python/intent_classifier/venv/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpuj4xwjmu
  JVM stdout: /tmp/tmpuj4xwjmu/h2o_valmir_started_from_python.out
  JVM stderr: /tmp/tmpuj4xwjmu/h2o_valmir_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,00 secs
H2O_cluster_timezone:,America/Fortaleza
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_valmir_jl0w66
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.777 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [10]:
model_path = file_manager.filename_from_data_dir('output/h2o/models/StackedEnsemble_AllModels_1_AutoML_1_20220428_200646')

leader_model = h2o.load_model(model_path)

leader_model

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_1_AutoML_1_20220428_200646

No model summary for this model

ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 3.082669827511523e-06
RMSE: 0.001755753350420133

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.003141757357402852
RMSE: 0.05605138140494712

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.996224,0.003082,0.994889,0.998352,0.998291,0.991379,0.998208
1,auc,,0.0,,,,,
2,err,0.003776,0.003082,0.005111,0.001647,0.001709,0.008621,0.001792
3,err_count,2.2,1.788854,3.0,1.0,1.0,5.0,1.0
4,logloss,0.014103,0.007704,0.01332,0.016439,0.011068,0.025356,0.00433
5,max_per_class_error,0.027263,0.015578,0.019231,0.023256,0.009346,0.034483,0.05
6,mean_per_class_accuracy,0.991418,0.006044,0.991334,0.995349,0.998131,0.982275,0.99
7,mean_per_class_error,0.008582,0.006044,0.008666,0.004651,0.001869,0.017725,0.01
8,mse,0.003058,0.002393,0.004068,0.001898,0.001874,0.006746,0.000706
9,null_deviance,1393.2993,61.210255,1407.1559,1467.1542,1398.3177,1397.0126,1296.8563




In [12]:
hf_preds = leader_model.predict(h2o.H2OFrame(df_to_predict.drop('label', axis=1)))

predicts = hf_preds.as_data_frame().predict.to_numpy()

correct_label = df_to_predict.label.to_numpy()

print(len(predicts), len(correct_label))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
21627 21627


In [13]:
correct_predict = np.equal(predicts, correct_label).sum()

correct_predict / len(correct_label)

0.7781014472649929

In [14]:
pipeline.dict_labels

{'inform': 1,
 'inform_symptoms': 2,
 'inform_medicine': 3,
 'greeting': 4,
 'request_inform': 5}

In [15]:
pipeline.get_inverted_dict_labels()

{1: 'inform',
 2: 'inform_symptoms',
 3: 'inform_medicine',
 4: 'greeting',
 5: 'request_inform'}

In [16]:
higher_distances_df['old_intent'] = higher_distances_df['intent']
higher_distances_df['label_index_predict'] = predicts

higher_distances_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  higher_distances_df['old_intent'] = higher_distances_df['intent']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  higher_distances_df['label_index_predict'] = predicts


Unnamed: 0,txt,label,distance,intent,annotated_txt,label_index,embeddings,old_intent,label_index_predict
1,Estou preocupado pois tenho asma,55,0.339475,inform_symptoms,Estou preocupado pois tenho [asma](SINTOMA),2,"[-0.008343049, 0.03336755, -0.046441656000000005, -0.052166402, -0...",inform_symptoms,2
2,Só ontem,10,0.259325,inform,Só ontem,1,"[0.06786111, -0.05606858, -0.0013129263, -0.030144615000000003, -0...",inform,1
4,Chenil e aerodin spray,54,0.700803,inform_medicine,Chenil e aerodin spray,3,"[0.018875174, 0.024116818, 0.035090007, -0.11278464, -0.0551823080...",inform_medicine,2
5,As duas da tarde,10,0.347314,inform,As duas da tarde,1,"[0.05065007, -0.018381953, 0.015174725000000002, -0.008380645, -0....",inform,1
6,Estou um pouco ofegante,7,0.28515,inform,Estou um pouco ofegante,1,"[0.04968314, -0.024636334000000003, -0.020438543, -0.010502479, -0...",inform,1


In [17]:
higher_distances_df['intent'] = higher_distances_df['label_index_predict'].map(pipeline.get_inverted_dict_labels())

higher_distances_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  higher_distances_df['intent'] = higher_distances_df['label_index_predict'].map(pipeline.get_inverted_dict_labels())


Unnamed: 0,txt,label,distance,intent,annotated_txt,label_index,embeddings,old_intent,label_index_predict
1,Estou preocupado pois tenho asma,55,0.339475,inform_symptoms,Estou preocupado pois tenho [asma](SINTOMA),2,"[-0.008343049, 0.03336755, -0.046441656000000005, -0.052166402, -0...",inform_symptoms,2
2,Só ontem,10,0.259325,inform,Só ontem,1,"[0.06786111, -0.05606858, -0.0013129263, -0.030144615000000003, -0...",inform,1
4,Chenil e aerodin spray,54,0.700803,inform_symptoms,Chenil e aerodin spray,3,"[0.018875174, 0.024116818, 0.035090007, -0.11278464, -0.0551823080...",inform_medicine,2
5,As duas da tarde,10,0.347314,inform,As duas da tarde,1,"[0.05065007, -0.018381953, 0.015174725000000002, -0.008380645, -0....",inform,1
6,Estou um pouco ofegante,7,0.28515,inform,Estou um pouco ofegante,1,"[0.04968314, -0.024636334000000003, -0.020438543, -0.010502479, -0...",inform,1


In [22]:
desired_columns = ['txt','label','distance','intent','annotated_txt']

In [29]:
higher_distances_df[desired_columns]

Unnamed: 0,txt,label,distance,intent,annotated_txt
1,Estou preocupado pois tenho asma,55,0.339475,inform_symptoms,Estou preocupado pois tenho [asma](SINTOMA)
2,Só ontem,10,0.259325,inform,Só ontem
4,Chenil e aerodin spray,54,0.700803,inform_symptoms,Chenil e aerodin spray
5,As duas da tarde,10,0.347314,inform,As duas da tarde
6,Estou um pouco ofegante,7,0.285150,inform,Estou um pouco ofegante
...,...,...,...,...,...
25789,"Bom dia, Amanda",24,0.309552,greeting,"Bom dia, Amanda"
25790,Gostaria de fazer uma rápida consulta,51,0.428821,request_inform,Gostaria de fazer uma rápida consulta
25791,"Sou asmática e tenho crise alérgica (sinusite), quais são os sinto...",56,0.318162,request_inform,"Sou asmática e tenho crise alérgica ([sinusite](SINTOMA)), quais s..."
25792,Entendi. Não senti falta de ar. Mas ficarei atenta..,37,0.255014,inform_symptoms,Entendi. Não senti [falta de ar](SINTOMA). Mas ficarei atenta..


In [19]:
closer_distances_df = df_annotated[df_annotated['distance'] <= 0.25]

closer_distances_df.head()

Unnamed: 0,txt,label,distance,intent,annotated_txt,label_index,embeddings
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garganta](SINTOMA),2,"[-0.05788306, 0.042827144000000004, -0.03576142, -0.07932205, -0.0..."
3,Sim,12,0.09996,inform,Sim,1,"[0.11549624, -0.010514308, 0.025507968000000002, -0.02144403, -0.0..."
7,Não,18,0.191976,inform,Não,1,"[0.10408107, -0.02228124, 0.00076653576, -0.012209455000000001, -0..."
12,Ok,4,0.15104,inform,Ok,1,"[0.1262432, -0.01299934, 0.064029716, -0.007820629, -0.08764656, 0..."
14,Tá certo,4,0.159767,inform,Tá certo,1,"[0.12865065, -0.020153655000000003, 0.03307859, -0.06811492, 0.010..."


In [24]:
closer_distances_df[desired_columns]

Unnamed: 0,txt,label,distance,intent,annotated_txt
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garganta](SINTOMA)
3,Sim,12,0.099960,inform,Sim
7,Não,18,0.191976,inform,Não
12,Ok,4,0.151040,inform,Ok
14,Tá certo,4,0.159767,inform,Tá certo
...,...,...,...,...,...
25754,nem tosse,40,0.237049,inform_symptoms,nem [tosse](SINTOMA)
25775,Sinto muita dor nas costas,59,0.121369,inform_symptoms,Sinto muita [dor](SINTOMA) nas costas
25780,A dor nas costas aumentou sinto muita dor,59,0.160749,inform_symptoms,A [dor](SINTOMA) nas costas aumentou sinto muita [dor](SINTOMA)
25781,A dor de cabeça diminui um pouco,8,0.166069,inform_symptoms,A [dor de cabeça](SINTOMA) diminui um pouco


In [32]:
result_df = pd.concat([closer_distances_df[desired_columns], higher_distances_df[desired_columns]], axis=0)

result_df.describe()

Unnamed: 0,label,distance
count,25795.0,25795.0
mean,31.000582,0.386161
std,17.526264,0.129077
min,0.0,0.07444
25%,16.0,0.288405
50%,31.0,0.382536
75%,46.0,0.478938
max,59.0,0.834035


In [33]:
result_df.count()

txt              25795
label            25795
distance         25795
intent           25795
annotated_txt    25795
dtype: int64

In [35]:
from pathlib import Path

output_dir = Path(file_manager.filename_from_data_dir('output/h2o/use'))

output_dir.mkdir(parents=True, exist_ok=True)

In [36]:
result_df.to_csv(file_manager.filename_from_data_dir('output/h2o/use/annotated_sentences.csv'))