In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
from sklearn.model_selection import train_test_split

import numpy as np

from src.ml import pipeline
from src.core import file_manager

In [3]:
df_annotated = pipeline.get_annotated_df('use')

In [4]:
df_embeddings = pipeline.get_embedding_dfs('use')

dict_embeddings = pipeline.generate_dict_embedding_text(df_embeddings)

In [5]:
print(pipeline.dict_labels)

{'inform': 1, 'inform_symptoms': 2, 'inform_medicine': 3, 'greeting': 4, 'request_inform': 5}


In [6]:
df_closer_sentences = df_annotated[df_annotated['distance'] <= 0.25]

df_closer_sentences.head()

Unnamed: 0,txt,label,distance,intent,annotated_txt
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garg...
3,Sim,12,0.09996,inform,Sim
7,Não,18,0.191976,inform,Não
12,Ok,4,0.15104,inform,Ok
14,Tá certo,4,0.159767,inform,Tá certo


In [7]:
df_closer_sentences['embeddings'] = df_closer_sentences['txt'].map(dict_embeddings)

df_closer_sentences['label_index'] = df_closer_sentences['intent'].map(pipeline.dict_labels)

df_closer_sentences.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_closer_sentences['embeddings'] = df_closer_sentences['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_closer_sentences['label_index'] = df_closer_sentences['intent'].map(pipeline.dict_labels)


Unnamed: 0,txt,label,distance,intent,annotated_txt,embeddings,label_index
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garg...,"[-0.05788306, 0.042827144000000004, -0.0357614...",2
3,Sim,12,0.09996,inform,Sim,"[0.11549624, -0.010514308, 0.02550796800000000...",1
7,Não,18,0.191976,inform,Não,"[0.10408107, -0.02228124, 0.00076653576, -0.01...",1
12,Ok,4,0.15104,inform,Ok,"[0.1262432, -0.01299934, 0.064029716, -0.00782...",1
14,Tá certo,4,0.159767,inform,Tá certo,"[0.12865065, -0.020153655000000003, 0.03307859...",1


In [8]:
X = df_closer_sentences['embeddings'].to_numpy()
y = df_closer_sentences['label_index'].to_numpy()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(len(X_train))

{label: (y_train == label).sum() for label in set(y_train)}

2917


{1: 469, 2: 1725, 3: 117, 4: 401, 5: 205}

In [10]:
print(len(y_test))

{label: (y_test == label).sum() for label in set(y_test)}

1251


{1: 175, 2: 764, 3: 36, 4: 189, 5: 87}

In [11]:
df_train = pipeline.generate_df_from_x_y(X_train, y_train)
df_test = pipeline.generate_df_from_x_y(X_test, y_test)

df_test = df_test.drop(['label'], axis=1)

In [12]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.15" 2022-04-19; OpenJDK Runtime Environment (build 11.0.15+10-Ubuntu-0ubuntu0.22.04.1); OpenJDK 64-Bit Server VM (build 11.0.15+10-Ubuntu-0ubuntu0.22.04.1, mixed mode, sharing)
  Starting server from /home/valmir/dev/python/intent_classifier/venv/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpr6vcch9l
  JVM stdout: /tmp/tmpr6vcch9l/h2o_valmir_started_from_python.out
  JVM stderr: /tmp/tmpr6vcch9l/h2o_valmir_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Fortaleza
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,16 days
H2O_cluster_name:,H2O_from_python_valmir_don3uv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.777 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [13]:
hf_train = h2o.H2OFrame(df_train)

x = hf_train.columns
y = "label"
x.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [14]:
# For binary classification, response should be a factor
hf_train[y] = hf_train[y].asfactor()
# hf_test[y] = hf_test[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=hf_train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_AllModels_1_AutoML_1_20220429_155157,0.00781321,0.0142299,0.0534371,0.00285552
DeepLearning_grid_1_AutoML_1_20220429_155157_model_1,0.0087384,0.0196248,0.0558055,0.00311425
StackedEnsemble_BestOfFamily_1_AutoML_1_20220429_155157,0.00904575,0.0140448,0.0540539,0.00292183
DeepLearning_grid_2_AutoML_1_20220429_155157_model_1,0.00907443,0.0278224,0.0632846,0.00400494
GLM_1_AutoML_1_20220429_155157,0.00978632,0.0159005,0.0595617,0.00354759
DeepLearning_grid_3_AutoML_1_20220429_155157_model_1,0.0105705,0.0374155,0.0663098,0.00439699
DeepLearning_1_AutoML_1_20220429_155157,0.0175362,0.0349525,0.0914218,0.00835795
GBM_3_AutoML_1_20220429_155157,0.0189324,0.0293009,0.0786759,0.00618989
GBM_grid_1_AutoML_1_20220429_155157_model_1,0.0206423,0.0252334,0.0763242,0.00582539
GBM_4_AutoML_1_20220429_155157,0.0210719,0.0293167,0.0842514,0.0070983




In [15]:
model = aml.leader

model

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_AllModels_1_AutoML_1_20220429_155157

No model summary for this model

ModelMetricsMultinomialGLM: stackedensemble
** Reported on train data. **

MSE: 1.3270033511069588e-06
RMSE: 0.0011519563147563187

ModelMetricsMultinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.0028555212834940015
RMSE: 0.053437077797106394

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.996927,0.002833,0.994889,0.998352,0.998291,0.993103,1.0
1,auc,,0.0,,,,,
2,err,0.003073,0.002833,0.005111,0.001647,0.001709,0.006897,0.0
3,err_count,1.8,1.643168,3.0,1.0,1.0,4.0,0.0
4,logloss,0.013622,0.008,0.011158,0.01529,0.013085,0.025379,0.003198
5,max_per_class_error,0.023417,0.019846,0.05,0.023256,0.009346,0.034483,0.0
6,mean_per_class_accuracy,0.992395,0.007634,0.983556,0.995349,0.998131,0.984941,1.0
7,mean_per_class_error,0.007605,0.007634,0.016444,0.004651,0.001869,0.015059,0.0
8,mse,0.0028,0.002094,0.003255,0.001886,0.002391,0.006057,0.00041
9,null_deviance,1393.2993,61.210255,1407.1559,1467.1542,1398.3177,1397.0126,1296.8563




In [16]:
path = file_manager.filename_from_data_dir('output/h2o/models')

h2o.save_model(model=model, path=path, force=True)

'/home/valmir/dev/python/intent_classifier/data/output/h2o/models/StackedEnsemble_AllModels_1_AutoML_1_20220429_155157'

In [20]:
model_path = file_manager.filename_from_data_dir('output/h2o/models/StackedEnsemble_AllModels_1_AutoML_1_20220428_200646')

older_leader_model = h2o.load_model(model_path)

In [25]:
older_leader_model.full_parameters

{'model_id': {'__meta': {'schema_version': 3,
   'schema_name': 'ModelParameterSchemaV3',
   'schema_type': 'Iced'},
  'name': 'model_id',
  'label': 'model_id',
  'help': 'Destination id for this model; auto-generated if not specified.',
  'required': False,
  'type': 'Key<Model>',
  'default_value': None,
  'actual_value': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'StackedEnsemble_AllModels_1_AutoML_1_20220428_200646',
   'type': 'Key<Model>',
   'URL': '/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20220428_200646'},
  'input_value': None,
  'level': 'critical',
  'values': [],
  'is_member_of_frames': [],
  'is_mutually_exclusive_with': [],
  'gridable': False},
 'training_frame': {'__meta': {'schema_version': 3,
   'schema_name': 'ModelParameterSchemaV3',
   'schema_type': 'Iced'},
  'name': 'training_frame',
  'label': 'training_frame',
  'help': 'Id of the training data frame.',
  'required': False,
  'type': 

In [22]:
model.base_models

['DeepLearning_grid_1_AutoML_1_20220429_155157_model_1',
 'DeepLearning_grid_2_AutoML_1_20220429_155157_model_1',
 'GLM_1_AutoML_1_20220429_155157',
 'DeepLearning_grid_3_AutoML_1_20220429_155157_model_1',
 'DeepLearning_1_AutoML_1_20220429_155157',
 'GBM_3_AutoML_1_20220429_155157',
 'GBM_grid_1_AutoML_1_20220429_155157_model_1',
 'GBM_4_AutoML_1_20220429_155157',
 'GBM_5_AutoML_1_20220429_155157',
 'GBM_1_AutoML_1_20220429_155157',
 'GBM_grid_1_AutoML_1_20220429_155157_model_2',
 'XGBoost_2_AutoML_1_20220429_155157',
 'GBM_2_AutoML_1_20220429_155157',
 'XGBoost_3_AutoML_1_20220429_155157',
 'XGBoost_grid_1_AutoML_1_20220429_155157_model_1',
 'XGBoost_grid_1_AutoML_1_20220429_155157_model_2',
 'XGBoost_1_AutoML_1_20220429_155157',
 'XGBoost_grid_1_AutoML_1_20220429_155157_model_3',
 'XRT_1_AutoML_1_20220429_155157',
 'DRF_1_AutoML_1_20220429_155157']

In [38]:
hf_test = h2o.H2OFrame(df_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [39]:
hf_preds = aml.predict(hf_test)

hf_preds

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




In [41]:
preds = hf_preds.as_data_frame().predict.to_numpy()

print(preds)

[1 1 1 ... 2 2 1]


In [42]:
correct_predict = np.equal(preds, y_test).sum()

correct_predict

1247

In [43]:
correct_predict / len(y_test)

0.996802557953637

In [45]:
lb



In [46]:
aml.leader.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'StackedEnsemble_AllModels_1_AutoML_1_20220428_200646',
   'type': 'Key<Model>',
   'URL': '/3/Models/StackedEnsemble_AllModels_1_AutoML_1_20220428_200646'},
  'input': None},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_1_20220428_200646_training_py_9_sid_a0e1',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_1_20220428_200646_training_py_9_sid_a0e1'},
  'input': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_1_20220428_200646_training_py_9_sid_a0e1',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_1_20220428_200646_training_py_9_sid_a0e1'}},
 'response_column': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'sch

In [47]:
aml.leader.metalearner()['name']



'metalearner_AUTO_StackedEnsemble_AllModels_1_AutoML_1_20220428_200646'

In [48]:
aml.leader.metalearner().model_id

'metalearner_AUTO_StackedEnsemble_AllModels_1_AutoML_1_20220428_200646'

In [49]:
df_higher_sentences = df_annotated[df_annotated['distance'] > 0.25]

df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(pipeline.dict_labels)

df_higher_sentences.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(pipeline.dict_labels)


Unnamed: 0,label,distance,label_index
count,21627.0,21627.0,21627.0
mean,31.385999,0.422186,2.258889
std,17.675638,0.107617,1.409358
min,0.0,0.250006,1.0
25%,17.0,0.334341,1.0
50%,32.0,0.411661,2.0
75%,47.0,0.498158,3.0
max,59.0,0.834035,5.0


In [51]:
df_to_predict =  pipeline.generate_df_from_x_y(
    x_data=df_higher_sentences['embeddings'].to_numpy(),
    y_data=df_higher_sentences['label_index'].to_numpy()
)

df_to_predict.head()

Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511,label
0,-0.008343,0.033368,-0.046442,-0.052166,-0.091297,-0.001843,0.033507,0.027626,-0.082742,0.012627,...,-0.069063,0.066489,0.016825,0.01951,0.005963,0.065603,-0.042441,0.007935,0.070579,2
1,0.067861,-0.056069,-0.001313,-0.030145,-0.092034,-0.000798,0.052008,0.061108,-0.091092,-0.012032,...,-0.049187,0.080142,0.014424,0.043665,-0.016288,0.010631,-0.091025,0.029442,-0.074986,1
2,0.018875,0.024117,0.03509,-0.112785,-0.055182,0.048042,0.011638,-0.006258,-0.033919,0.03544,...,-0.024105,0.022007,0.080454,-0.077422,-0.02558,-0.028778,-0.025105,-0.043184,0.004809,3
3,0.05065,-0.018382,0.015175,-0.008381,-0.0674,0.014663,-0.012952,-0.030758,-0.084573,0.020417,...,-0.015316,0.04748,0.010318,-0.033296,-0.040025,-0.044433,0.038009,0.058241,-0.065861,1
4,0.049683,-0.024636,-0.020439,-0.010502,-0.151096,0.027749,0.034429,0.048915,-0.07228,-0.030891,...,-0.060599,0.040682,-0.005987,-0.03796,0.071119,0.019034,-0.126513,0.064421,-0.061999,1


In [52]:
hf_to_predict = h2o.H2OFrame(df_to_predict)

hf_preds = aml.predict(hf_to_predict)

hf_preds

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%




In [54]:
predicts = hf_preds.as_data_frame().predict.to_numpy()

correct_label = df_to_predict.label.to_numpy()

correct_predict = np.equal(predicts, correct_label).sum()

correct_predict / len(correct_label)

0.7781014472649929