In [None]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
%load_ext autoreload

# Mode 2: reload all imported modules before each cell execution.
%autoreload 2

In [22]:
import h2o
import numpy as np
import pandas as pd

from src.core import file_manager
from src.ml import pipeline

In [23]:
# Load the annotated sentence frame for the 'use' variant.
# Later cells read its 'txt', 'distance' and 'intent' columns.
# NOTE(review): 'use' presumably selects the Universal Sentence Encoder
# embeddings — confirm against src.ml.pipeline.
df_annotated = pipeline.get_annotated_df('use')

In [24]:
# Load the embedding frame(s) for the same 'use' variant as the annotated data.
df_embeddings = pipeline.get_embedding_dfs('use')

# Build a lookup from the embeddings; a later cell applies it with
# Series.map to the 'txt' column.
# NOTE(review): presumably keyed by the exact sentence text — confirm in
# pipeline.generate_dict_embedding_text.
dict_embeddings = pipeline.generate_dict_embedding_text(df_embeddings)

In [25]:
# Show the label-name -> integer mapping; a later cell uses it to encode the
# manually assigned labels into 'label_index'.
print(pipeline.dict_labels)

{'inform': 1, 'inform_symptoms': 2, 'inform_medicine': 3, 'greeting': 4, 'request_inform': 5}


In [37]:
# Parameters of the manual-review sample, named so they are easy to find and tune.
MIN_DISTANCE = 0.25        # keep rows whose 'distance' is strictly above this
N_REVIEW_SAMPLES = 1000    # number of sentences drawn for manual labelling
SAMPLE_SEED = 42           # fixed seed so the same rows are drawn on every run

# Keep only the rows above the distance threshold.
is_high_distance = df_annotated['distance'] > MIN_DISTANCE
higher_distances_df = df_annotated[is_high_distance]

# Draw a reproducible random subset of those rows.
random_higher_distances_df = higher_distances_df.sample(
    n=N_REVIEW_SAMPLES,
    random_state=SAMPLE_SEED,
)

random_higher_distances_df.head()

Unnamed: 0,txt,label,distance,intent,annotated_txt
453,Sobre a febre quero entender,13,0.300878,inform_symptoms,Sobre a [febre](SINTOMA) quero entender
7389,Mais isso já faz um mês e meio,27,0.517859,inform,Mais isso já faz um mês e meio
10612,E pra tosse?,40,0.256707,inform_symptoms,E pra [tosse](SINTOMA)?
998,Quando eu viro para o outro lado da cama acelera,2,0.384203,inform,Quando eu viro para o outro lado da cama acelera
11519,Quero saber o que devemos tomar,26,0.437573,request_inform,Quero saber o que devemos tomar


In [48]:
# Export only the sentence text of the sampled rows (no index column) so it
# can be labelled by hand outside the notebook.
sentences_to_classify_path = file_manager.filename_from_data_dir('output/sentences_to_classify.csv')
sentences_to_classify = random_higher_distances_df['txt']
sentences_to_classify.to_csv(sentences_to_classify_path, index=False)

In [58]:
# Read the manually classified sentences back from the data directory.
classified_path = file_manager.filename_from_data_dir('output/sentences_classified.csv')
df_classified_manual = pd.read_csv(classified_path)

# Encode the manual label names as integers using the pipeline's label mapping.
manual_label_names = df_classified_manual['label']
df_classified_manual['label_index'] = manual_label_names.map(pipeline.dict_labels)

# Attach each sentence's embedding by looking its text up in dict_embeddings.
# NOTE(review): if dict_embeddings is a plain dict, sentences missing from it
# become NaN here — worth verifying before training/predicting.
manual_sentences = df_classified_manual['txt']
df_classified_manual['embeddings'] = manual_sentences.map(dict_embeddings)

# Preview without the columns that are not needed for this analysis.
hidden_columns = ['correct_label', 'used_intents', 'intents']
df_classified_manual.drop(columns=hidden_columns).head()

Unnamed: 0,txt,label,label_index,embeddings
0,Sobre a febre quero entender,request_inform,5,"[-0.022668906000000003, -0.061208144000000006, 0.003735794, 0.0071..."
1,Mais isso já faz um mês e meio,inform,1,"[-0.03546224, -0.008965515, -0.018063422000000003, 0.010098209, -0..."
2,E pra tosse?,request_inform,5,"[0.022191908, -0.029259719000000003, -0.019401137000000002, -0.029..."
3,Quando eu viro para o outro lado da cama acelera,inform,1,"[-0.072670385, -0.039203867, -0.04282206, -0.0323417, -0.113387555..."
4,Quero saber o que devemos tomar,request_inform,5,"[0.027665587000000002, -0.0006697571, 0.022551859, -0.010470708, -..."


In [59]:
# Attach the manual labels to the sampled rows.
# The values are copied positionally (as a NumPy array), so this relies on the
# classified CSV having the same rows in the same order as the sample.
manual_labels = df_classified_manual['label'].to_numpy()
random_higher_distances_df['manual_label'] = manual_labels

# Side-by-side preview of the pipeline intent and the manual label.
comparison_columns = ['txt', 'intent', 'manual_label', 'distance']
random_higher_distances_df[comparison_columns].head()

Unnamed: 0,txt,intent,manual_label,distance
453,Sobre a febre quero entender,inform_symptoms,request_inform,0.300878
7389,Mais isso já faz um mês e meio,inform,inform,0.517859
10612,E pra tosse?,inform_symptoms,request_inform,0.256707
998,Quando eu viro para o outro lado da cama acelera,inform,inform,0.384203
11519,Quero saber o que devemos tomar,request_inform,request_inform,0.437573


In [60]:
# Rows where the pipeline-assigned intent agrees with the manual label.
# A vectorised column comparison replaces the row-wise `apply(axis=1)`:
# it builds the same boolean mask without a Python-level loop over rows.
intent_matches_manual = (
    random_higher_distances_df['intent'] == random_higher_distances_df['manual_label']
)

# Named `agreements` because the `==` filter keeps the matching rows,
# not the ones that differ.
agreements = random_higher_distances_df[intent_matches_manual]

# Summary statistics of the numeric columns for the matching rows.
print(agreements.describe())

agreements[['txt', 'intent', 'manual_label', 'distance']].head()

            label    distance
count  676.000000  676.000000
mean    30.334320    0.412518
std     17.305723    0.105340
min      0.000000    0.250969
25%     16.000000    0.325190
50%     30.000000    0.402756
75%     44.000000    0.482611
max     59.000000    0.725928


Unnamed: 0,txt,intent,manual_label,distance
7389,Mais isso já faz um mês e meio,inform,inform,0.517859
998,Quando eu viro para o outro lado da cama acelera,inform,inform,0.384203
11519,Quero saber o que devemos tomar,request_inform,request_inform,0.437573
18323,Tendo todos os ciodados,inform,inform,0.468946
12288,Estou preocupada com meu esposo . Hoje ele passou o dia todo deita...,inform_symptoms,inform_symptoms,0.444651


In [61]:
# Rows where the pipeline-assigned intent disagrees with the manual label.
# A vectorised column comparison replaces the row-wise `apply(axis=1)`:
# it builds the same boolean mask without a Python-level loop over rows.
intent_differs_from_manual = (
    random_higher_distances_df['intent'] != random_higher_distances_df['manual_label']
)
differences = random_higher_distances_df[intent_differs_from_manual]

# Summary statistics of the numeric columns for the mismatching rows.
print(differences.describe())

# Inspect up to 50 disagreements to see which intents get confused.
differences[['txt', 'intent', 'manual_label', 'distance']].head(50)

            label    distance
count  324.000000  324.000000
mean    35.716049    0.444295
std     17.722131    0.111606
min      0.000000    0.253143
25%     21.000000    0.351125
50%     37.000000    0.429323
75%     54.000000    0.526704
max     59.000000    0.743197


Unnamed: 0,txt,intent,manual_label,distance
453,Sobre a febre quero entender,inform_symptoms,request_inform,0.300878
10612,E pra tosse?,inform_symptoms,request_inform,0.256707
18911,São quantos dias,inform,request_inform,0.292614
19868,Não . Tomei logo um remedio,inform_medicine,inform,0.323028
23046,Na terça,greeting,inform,0.28637
20273,Nessa doença,inform_medicine,inform,0.527291
3467,2 dias no máximo ..ainda tenho o antibiótico pra tomar,inform_medicine,inform,0.361323
12188,Estes problemas de fígado,inform_medicine,inform_symptoms,0.562368
6237,MuitA,greeting,inform,0.365427
4901,Ontem estava febril,inform,inform_symptoms,0.393517


In [63]:
# Inputs for the prediction frame: one embedding per manually classified
# sentence, and the matching integer-encoded manual label.
manual_embeddings = df_classified_manual['embeddings'].to_numpy()
manual_label_indices = df_classified_manual['label_index'].to_numpy()

# Let the pipeline helper assemble the features and labels into one frame.
df_to_predict_manual = pipeline.generate_df_from_x_y(
    x_data=manual_embeddings,
    y_data=manual_label_indices,
)

df_to_predict_manual.head(2)

Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511,label
0,-0.022669,-0.061208,0.003736,0.007147,-0.082954,0.050146,-0.00265,0.031844,-0.054984,0.034181,...,-0.028898,0.012946,0.003666,-0.037578,-0.031985,0.037607,-0.044444,-0.021451,0.090896,5
1,-0.035462,-0.008966,-0.018063,0.010098,-0.010659,-0.001407,-0.000123,-0.000145,-0.044095,0.04053,...,-0.02989,-0.016296,0.071277,-0.046813,0.001874,0.001296,-0.100866,0.077414,-0.056887,1


In [16]:
# Connect to the H2O cluster (h2o.init starts a local one if none is running).
# `h2o` is imported in the notebook's import cell together with the other
# dependencies, so a fresh "Restart & Run All" surfaces a missing package early.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 21 mins
H2O_cluster_timezone:,America/Fortaleza
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,15 days
H2O_cluster_name:,H2O_from_python_valmir_rpfn0h
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.442 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [17]:
# Location of the saved H2O model inside the project's data directory.
# NOTE(review): the model id is hard-coded to one specific AutoML run; update it
# when the model is retrained.
model_path = file_manager.filename_from_data_dir('output/h2o/models/StackedEnsemble_AllModels_1_AutoML_1_20220428_200646')

# Load the saved model into the connected H2O cluster.
leader_model = h2o.load_model(model_path)

# Display the model details and its stored training / cross-validation metrics.
leader_model

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_1_AutoML_1_20220428_200646

No model summary for this model

ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 3.082669827511523e-06
RMSE: 0.001755753350420133

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.003141757357402852
RMSE: 0.05605138140494712

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.996224,0.003082,0.994889,0.998352,0.998291,0.991379,0.998208
1,auc,,0.0,,,,,
2,err,0.003776,0.003082,0.005111,0.001647,0.001709,0.008621,0.001792
3,err_count,2.2,1.788854,3.0,1.0,1.0,5.0,1.0
4,logloss,0.014103,0.007704,0.01332,0.016439,0.011068,0.025356,0.00433
5,max_per_class_error,0.027263,0.015578,0.019231,0.023256,0.009346,0.034483,0.05
6,mean_per_class_accuracy,0.991418,0.006044,0.991334,0.995349,0.998131,0.982275,0.99
7,mean_per_class_error,0.008582,0.006044,0.008666,0.004651,0.001869,0.017725,0.01
8,mse,0.003058,0.002393,0.004068,0.001898,0.001874,0.006746,0.000706
9,null_deviance,1393.2993,61.210255,1407.1559,1467.1542,1398.3177,1397.0126,1296.8563




In [64]:
# Upload the prediction frame to the H2O cluster and score it with the loaded model.
hf_to_predict_manual = h2o.H2OFrame(df_to_predict_manual)
hf_preds_manual = leader_model.predict(hf_to_predict_manual)

# Bring the predictions back into pandas and keep the 'predict' column.
df_preds_manual = hf_preds_manual.as_data_frame()
predicts_manual = df_preds_manual['predict'].to_numpy()

# Manual (reference) labels, taken from the same frame that was scored.
correct_label_manual = df_to_predict_manual['label'].to_numpy()

# Both arrays should have the same length before they are compared.
print(len(predicts_manual), len(correct_label_manual))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
1000 1000


In [65]:
# Element-wise agreement between the model's predictions and the manual labels.
is_correct_manual = np.equal(predicts_manual, correct_label_manual)

# Number of sentences the model classified the same way as the annotator.
correct_predict_manual = is_correct_manual.sum()

# Accuracy of the model against the manual labels.
n_manual = len(correct_label_manual)
correct_predict_manual / n_manual

0.727