In [1]:
from sklearn.model_selection import train_test_split

from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

import pandas as pd
import numpy as np
from glob import glob
from src.data_helper import DataHelper
from src.data_helper import DataHelper

In [2]:
annotatted_filename = DataHelper.generate_filename_from_data_dir(f'output/patient/k100/use/annotated_sentences.csv')

embedding_filename = DataHelper.generate_filename_from_data_dir(f'output/patient/k100/use/annotated_sentences.csv')

paths = glob(f'{DataHelper.generate_filename_from_data_dir("models/use/text_emb_patient.json")}/*.json')

df_embeddings = DataHelper.read_multiple_files(paths)

df_annotated = pd.read_csv(annotatted_filename)

In [3]:
dict_embeddings = pd.Series(df_embeddings.embeddings.map(lambda x: np.array(x[0])).to_list(),index=df_embeddings.txt).to_dict()

len(dict_embeddings.keys())

26647

In [4]:
dict_labels = {
  'inform' : 1,
  'inform_symptoms': 2,
  'inform_medicine': 3,
  'greeting': 4,
  'request_inform': 5
}

In [5]:
df_temp = df_annotated[df_annotated['distance'] <= 0.25]

df_temp.count()

txt              4168
label            4168
distance         4168
intent           4168
annotated_txt    4168
dtype: int64

In [45]:
distant_sentences = df_annotated[df_annotated['distance'] > 0.25]

selected_distant_sentences  = distant_sentences['txt'].sample(n=1000, random_state=42)

selected_distant_sentences.head()

453                          Sobre a febre quero entender
7389                       Mais isso já faz um mês e meio
10612                                        E pra tosse?
998      Quando eu viro para o outro lado da cama acelera
11519                     Quero saber o que devemos tomar
Name: txt, dtype: object

In [48]:
selected_distant_sentences.to_csv(DataHelper.generate_filename_from_data_dir('output/sentences_to_classify.csv'), index=False)

In [62]:
df_classified_manual = pd.read_csv(DataHelper.generate_filename_from_data_dir('output/sentences_classified.csv'))

df_classified_manual['label_index'] = df_classified_manual['label'].map(dict_labels)

df_classified_manual['embeddings'] = df_classified_manual['txt'].map(dict_embeddings)

df_classified_manual.drop(['correct_label', 'used_intents', 'intents'], axis=1).head()

Unnamed: 0,txt,label,label_index,embeddings
0,Sobre a febre quero entender,request_inform,5,"[-0.022668906000000003, -0.061208144000000006,..."
1,Mais isso já faz um mês e meio,inform,1,"[-0.03546224, -0.008965515, -0.018063422000000..."
2,E pra tosse?,request_inform,5,"[0.022191908, -0.029259719000000003, -0.019401..."
3,Quando eu viro para o outro lado da cama acelera,request_inform,5,"[-0.072670385, -0.039203867, -0.04282206, -0.0..."
4,Quero saber o que devemos tomar,request_inform,5,"[0.027665587000000002, -0.0006697571, 0.022551..."


In [63]:
print(set(dict_labels.keys()), set(df_classified_manual.label.unique()))

set(dict_labels.keys()) == set(df_classified_manual.label.unique())

{'inform', 'inform_symptoms', 'inform_medicine', 'greeting', 'request_inform'} {'inform', 'inform_symptoms', 'inform_medicine', 'greeting', 'request_inform'}


True

In [64]:
df_classified_manual.label_index.unique()

array([5, 1, 2, 4, 3])

In [65]:
df_classified_manual.head()

Unnamed: 0,txt,label,correct_label,used_intents,intents,label_index,embeddings
0,Sobre a febre quero entender,request_inform,,inform,inform,5,"[-0.022668906000000003, -0.061208144000000006,..."
1,Mais isso já faz um mês e meio,inform,,inform_symptoms,inform_symptoms,1,"[-0.03546224, -0.008965515, -0.018063422000000..."
2,E pra tosse?,request_inform,,inform_medicine,inform_medicine,5,"[0.022191908, -0.029259719000000003, -0.019401..."
3,Quando eu viro para o outro lado da cama acelera,request_inform,,greeting,greeting,5,"[-0.072670385, -0.039203867, -0.04282206, -0.0..."
4,Quero saber o que devemos tomar,request_inform,,request_inform,request_inform,5,"[0.027665587000000002, -0.0006697571, 0.022551..."


In [23]:
df_temp['embeddings'] = df_temp['txt'].map(dict_embeddings)

df_temp['label_index'] = df_temp['intent'].map(dict_labels)

df_temp.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['embeddings'] = df_temp['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp['label_index'] = df_temp['intent'].map(dict_labels)


Unnamed: 0,txt,label,distance,intent,annotated_txt,embeddings,label_index
0,Há dois tias estou com dor na garganta,15,0.232881,inform_symptoms,Há dois tias estou com [dor](SINTOMA) na [garg...,"[-0.05788306, 0.042827144000000004, -0.0357614...",2
3,Sim,12,0.09996,inform,Sim,"[0.11549624, -0.010514308, 0.02550796800000000...",1
7,Não,18,0.191976,inform,Não,"[0.10408107, -0.02228124, 0.00076653576, -0.01...",1
12,Ok,4,0.15104,inform,Ok,"[0.1262432, -0.01299934, 0.064029716, -0.00782...",1
14,Tá certo,4,0.159767,inform,Tá certo,"[0.12865065, -0.020153655000000003, 0.03307859...",1


In [24]:
X = df_temp['embeddings'].to_numpy()
y = df_temp['label_index'].to_numpy()

In [25]:
def generate_df_from_X_y(X_data, y_data):
    vectors = [{f'V_{index:03d}': value for index, value in enumerate(embedding)} for embedding in X_data]

    df_data = pd.DataFrame(data=vectors)

    df_data['label'] = y_data

    return df_data


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(len(X_train))

{label: (y_train == label).sum() for label in set(y_train)}

2917


{1: 469, 2: 1725, 3: 117, 4: 401, 5: 205}

In [27]:
print(len(y_test))

{label: (y_test == label).sum() for label in set(y_test)}

1251


{1: 175, 2: 764, 3: 36, 4: 189, 5: 87}

In [28]:
df_train = generate_df_from_X_y(X_train, y_train)
df_test = generate_df_from_X_y(X_test, y_test)

df_test = df_test.drop(['label'], axis=1)

In [29]:
import h2o
from h2o.automl import H2OAutoML

# Start the H2O cluster (locally)
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.14.1" 2022-02-08; OpenJDK Runtime Environment (build 11.0.14.1+1-Ubuntu-0ubuntu1.21.10); OpenJDK 64-Bit Server VM (build 11.0.14.1+1-Ubuntu-0ubuntu1.21.10, mixed mode, sharing)
  Starting server from /home/valmir/dev/python/intent_classifier/venv/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp9s331jj0
  JVM stdout: /tmp/tmp9s331jj0/h2o_valmir_started_from_python.out
  JVM stderr: /tmp/tmp9s331jj0/h2o_valmir_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Fortaleza
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.4
H2O_cluster_version_age:,20 days
H2O_cluster_name:,H2O_from_python_valmir_xt79c7
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.777 Gb
H2O_cluster_total_cores:,16
H2O_cluster_allowed_cores:,16


In [30]:
hf_train = h2o.H2OFrame(df_train)

x = hf_train.columns
y = "label"
x.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [31]:
# For binary classification, response should be a factor
hf_train[y] = hf_train[y].asfactor()
# hf_test[y] = hf_test[y].asfactor()

# Run AutoML for 20 base models
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=x, y=y, training_frame=hf_train)

# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)

AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537,0.00616933,0.0165079,0.0578427,0.00334577
StackedEnsemble_BestOfFamily_8_AutoML_1_20220420_93537,0.00714494,0.0166253,0.0589281,0.00347252
DeepLearning_grid_3_AutoML_1_20220420_93537_model_1,0.00714494,0.027339,0.0586968,0.00344532
DeepLearning_grid_2_AutoML_1_20220420_93537_model_1,0.00717362,0.0272746,0.0560298,0.00313933
StackedEnsemble_AllModels_7_AutoML_1_20220420_93537,0.00757817,0.0153178,0.055127,0.00303899
StackedEnsemble_AllModels_4_AutoML_1_20220420_93537,0.00757817,0.0160628,0.0570544,0.00325521
DeepLearning_grid_1_AutoML_1_20220420_93537_model_1,0.00757817,0.023915,0.0550651,0.00303217
StackedEnsemble_BestOfFamily_7_AutoML_1_20220420_93537,0.00780642,0.0195802,0.0580466,0.0033694
StackedEnsemble_AllModels_5_AutoML_1_20220420_93537,0.0084279,0.0195522,0.0594211,0.00353086
StackedEnsemble_AllModels_2_AutoML_1_20220420_93537,0.00847153,0.0190885,0.0670299,0.00449301




In [32]:
df_test.head()

Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_502,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511
0,0.128188,0.023643,0.054551,-0.0256,-0.08381,0.062776,0.042874,0.008592,-0.092103,0.037006,...,-0.048022,-0.001864,-0.00346,0.016848,0.025093,-0.033908,0.039552,0.085565,0.020285,0.082097
1,0.112307,0.006911,-0.03988,-0.014899,-0.080023,0.061922,0.069085,-0.001377,-0.055079,-0.014418,...,-0.066611,-0.050252,-0.016871,-0.015402,0.00337,-0.014091,0.030076,0.051044,0.023399,-0.034249
2,0.124153,-0.04293,-0.002698,0.00689,-0.128321,0.049727,0.046278,0.052569,-0.066167,0.048772,...,-0.091975,-0.052914,0.038186,0.038575,0.027861,0.024273,0.020635,0.101188,0.005618,0.013726
3,-0.019081,0.006883,-0.036536,-0.042907,-0.037945,0.008759,0.026516,-0.054927,-0.069322,0.024437,...,-0.064943,-0.077694,0.079303,0.037798,-0.029797,-0.017276,0.027369,0.023527,0.061263,0.058431
4,-0.031653,-0.003482,-0.021699,0.002539,-0.099389,-0.005727,0.080924,0.028786,-0.0928,-0.044258,...,-0.07619,0.034061,0.050258,0.039479,0.04279,-0.009819,0.000736,0.022744,0.030173,0.083213


In [33]:
hf_test = h2o.H2OFrame(df_test)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [34]:
hf_preds = aml.predict(hf_test)

hf_preds

stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,p1,p2,p3,p4,p5
1,0.997466,0.000420563,0.000554762,0.000836424,0.00072201
1,0.997664,0.000414748,0.000547091,0.000662185,0.000712027
1,0.99752,0.000415778,0.00054845,0.000802289,0.000713796
2,0.000397664,0.998554,0.000298588,0.00036152,0.000388606
4,0.000382544,0.000218737,0.000288535,0.998735,0.000375522
2,0.00040266,0.998585,0.000298598,0.000325393,0.000388618
3,0.0025055,0.00140841,0.991644,0.00202454,0.00241792
2,0.00039928,0.998588,0.000298599,0.000325395,0.00038862
4,0.000470506,0.000264912,0.000349443,0.99846,0.000454793
2,0.000395472,0.998592,0.0002986,0.000325396,0.000388621




In [35]:
preds = hf_preds.as_data_frame().predict.to_numpy()

preds

array([1, 1, 1, ..., 2, 2, 1])

In [36]:
len(preds), len(y_test)

(1251, 1251)

In [37]:
correct_predict = np.equal(preds, y_test).sum()

correct_predict

1246

In [38]:
correct_predict / len(y_test)

0.9960031974420464

In [39]:
lb

model_id,mean_per_class_error,logloss,rmse,mse
StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537,0.00616933,0.0165079,0.0578427,0.00334577
StackedEnsemble_BestOfFamily_8_AutoML_1_20220420_93537,0.00714494,0.0166253,0.0589281,0.00347252
DeepLearning_grid_3_AutoML_1_20220420_93537_model_1,0.00714494,0.027339,0.0586968,0.00344532
DeepLearning_grid_2_AutoML_1_20220420_93537_model_1,0.00717362,0.0272746,0.0560298,0.00313933
StackedEnsemble_AllModels_7_AutoML_1_20220420_93537,0.00757817,0.0153178,0.055127,0.00303899
StackedEnsemble_AllModels_4_AutoML_1_20220420_93537,0.00757817,0.0160628,0.0570544,0.00325521
DeepLearning_grid_1_AutoML_1_20220420_93537_model_1,0.00757817,0.023915,0.0550651,0.00303217
StackedEnsemble_BestOfFamily_7_AutoML_1_20220420_93537,0.00780642,0.0195802,0.0580466,0.0033694
StackedEnsemble_AllModels_5_AutoML_1_20220420_93537,0.0084279,0.0195522,0.0594211,0.00353086
StackedEnsemble_AllModels_2_AutoML_1_20220420_93537,0.00847153,0.0190885,0.0670299,0.00449301




In [40]:
aml.leader.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537',
   'type': 'Key<Model>',
   'URL': '/3/Models/StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537'},
  'input': None},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_1_20220420_93537_training_py_1_sid_b577',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_1_20220420_93537_training_py_1_sid_b577'},
  'input': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'AutoML_1_20220420_93537_training_py_1_sid_b577',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/AutoML_1_20220420_93537_training_py_1_sid_b577'}},
 'response_column': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'sch

In [41]:
aml.leader.metalearner()['name']



'metalearner_AUTO_StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537'

In [42]:
aml.leader.metalearner().model_id

'metalearner_AUTO_StackedEnsemble_BestOfFamily_5_AutoML_1_20220420_93537'

In [45]:
df_higher_sentences = df_annotated[df_annotated['distance'] > 0.25]

df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(dict_labels)

df_higher_sentences.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(dict_labels)


Unnamed: 0,label,distance,label_index
count,21627.0,21627.0,21627.0
mean,31.385999,0.422186,2.258889
std,17.675638,0.107617,1.409358
min,0.0,0.250006,1.0
25%,17.0,0.334341,1.0
50%,32.0,0.411661,2.0
75%,47.0,0.498158,3.0
max,59.0,0.834035,5.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(dict_labels)


Unnamed: 0,label,distance,label_index
count,21627.0,21627.0,21627.0
mean,31.385999,0.422186,2.258889
std,17.675638,0.107617,1.409358
min,0.0,0.250006,1.0
25%,17.0,0.334341,1.0
50%,32.0,0.411661,2.0
75%,47.0,0.498158,3.0
max,59.0,0.834035,5.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['embeddings'] = df_higher_sentences['txt'].map(dict_embeddings)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_higher_sentences['label_index'] = df_higher_sentences['intent'].map(dict_labels)


Unnamed: 0,label,distance,label_index
count,21627.0,21627.0,21627.0
mean,31.385999,0.422186,2.258889
std,17.675638,0.107617,1.409358
min,0.0,0.250006,1.0
25%,17.0,0.334341,1.0
50%,32.0,0.411661,2.0
75%,47.0,0.498158,3.0
max,59.0,0.834035,5.0


In [46]:
df_higher_sentences.label_index.unique()

array([2, 1, 3, 4, 5])

In [67]:
data_to_gerante_df = df_higher_sentences

df_to_predict = generate_df_from_X_y(X_data=data_to_gerante_df['embeddings'].to_numpy(), y_data=data_to_gerante_df['label_index'].to_numpy())

print(df_to_predict.count())

df_to_predict.head()

V_000    21627
V_001    21627
V_002    21627
V_003    21627
V_004    21627
         ...  
V_508    21627
V_509    21627
V_510    21627
V_511    21627
label    21627
Length: 513, dtype: int64


Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511,label
0,-0.008343,0.033368,-0.046442,-0.052166,-0.091297,-0.001843,0.033507,0.027626,-0.082742,0.012627,...,-0.069063,0.066489,0.016825,0.01951,0.005963,0.065603,-0.042441,0.007935,0.070579,2
1,0.067861,-0.056069,-0.001313,-0.030145,-0.092034,-0.000798,0.052008,0.061108,-0.091092,-0.012032,...,-0.049187,0.080142,0.014424,0.043665,-0.016288,0.010631,-0.091025,0.029442,-0.074986,1
2,0.018875,0.024117,0.03509,-0.112785,-0.055182,0.048042,0.011638,-0.006258,-0.033919,0.03544,...,-0.024105,0.022007,0.080454,-0.077422,-0.02558,-0.028778,-0.025105,-0.043184,0.004809,3
3,0.05065,-0.018382,0.015175,-0.008381,-0.0674,0.014663,-0.012952,-0.030758,-0.084573,0.020417,...,-0.015316,0.04748,0.010318,-0.033296,-0.040025,-0.044433,0.038009,0.058241,-0.065861,1
4,0.049683,-0.024636,-0.020439,-0.010502,-0.151096,0.027749,0.034429,0.048915,-0.07228,-0.030891,...,-0.060599,0.040682,-0.005987,-0.03796,0.071119,0.019034,-0.126513,0.064421,-0.061999,1


In [68]:
df_to_predict_manual = generate_df_from_X_y(X_data=df_classified_manual['embeddings'].to_numpy(), y_data=df_classified_manual['label_index'].to_numpy())

print(df_to_predict_manual.count())

df_to_predict_manual.head()

V_000    1000
V_001    1000
V_002    1000
V_003    1000
V_004    1000
         ... 
V_508    1000
V_509    1000
V_510    1000
V_511    1000
label    1000
Length: 513, dtype: int64


Unnamed: 0,V_000,V_001,V_002,V_003,V_004,V_005,V_006,V_007,V_008,V_009,...,V_503,V_504,V_505,V_506,V_507,V_508,V_509,V_510,V_511,label
0,-0.022669,-0.061208,0.003736,0.007147,-0.082954,0.050146,-0.00265,0.031844,-0.054984,0.034181,...,-0.028898,0.012946,0.003666,-0.037578,-0.031985,0.037607,-0.044444,-0.021451,0.090896,5
1,-0.035462,-0.008966,-0.018063,0.010098,-0.010659,-0.001407,-0.000123,-0.000145,-0.044095,0.04053,...,-0.02989,-0.016296,0.071277,-0.046813,0.001874,0.001296,-0.100866,0.077414,-0.056887,1
2,0.022192,-0.02926,-0.019401,-0.029497,0.032366,0.046809,0.027017,-0.063078,-0.06565,-0.003921,...,-0.054714,-0.006916,0.045183,-0.023252,0.000828,0.01387,0.022861,0.038083,0.102305,5
3,-0.07267,-0.039204,-0.042822,-0.032342,-0.113388,-0.01236,0.022496,0.048088,-0.064497,-0.05684,...,-0.002986,0.087587,-0.016486,-0.059934,-0.070585,-0.029925,-0.003761,-0.00096,-0.066411,5
4,0.027666,-0.00067,0.022552,-0.010471,-0.061674,0.009946,0.003083,0.042823,-0.019612,-0.050977,...,0.035239,-0.13084,-0.062465,-0.034523,0.032786,-0.000173,0.001779,0.041951,0.1351,5


In [48]:
data_to_gerante_df['label_index'].count()

21627

In [49]:
df_to_predict.label.count()

21627

In [50]:
df_to_predict.label.unique()

array([2, 1, 3, 4, 5])

In [51]:
hf_to_predict = h2o.H2OFrame(df_to_predict)

hf_preds = aml.predict(hf_to_predict)

hf_preds

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%


predict,p1,p2,p3,p4,p5
2,0.000958942,0.996942,0.000559078,0.000812507,0.000727628
1,0.997555,0.000411365,0.000542629,0.000784658,0.00070622
2,0.000414755,0.998461,0.00029856,0.000437181,0.00038857
1,0.997507,0.00041891,0.000552581,0.000802675,0.000719172
1,0.9975,0.000419117,0.000552855,0.000808978,0.000719529
1,0.997537,0.000413032,0.000544827,0.000795956,0.000709081
1,0.997484,0.000424318,0.000559716,0.000803242,0.000728458
1,0.577188,0.377869,0.0143015,0.0161818,0.0144595
1,0.997492,0.000421507,0.000556007,0.000806805,0.000723631
1,0.994475,0.000813062,0.0010725,0.00224364,0.00139584




In [52]:
predicts = hf_preds.as_data_frame().predict.to_numpy()

correct_label = df_to_predict.label.to_numpy()

len(predicts), len(correct_label)

(21627, 21627)

In [53]:
correct_predict = np.equal(predicts, correct_label).sum()

correct_predict

16785

In [54]:
correct_predict / len(predicts)

0.7761131918435289

In [72]:
hf_preds_manual= aml.predict( h2o.H2OFrame(df_to_predict_manual))

predicts_manual = hf_preds_manual.as_data_frame().predict.to_numpy()

correct_label_manual = df_to_predict_manual.label.to_numpy()

print(len(predicts_manual), len(correct_label))

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
1000 1000


In [73]:
correct_predict_manual = np.equal(predicts_manual, correct_label).sum()

correct_predict_manual / len(predicts_manual)

0.72